"""Scrape Pinduoduo (mobile.yangkeduo.com) search results into an Excel file.

The script pages through the site's search API for a fixed keyword, keeps the
goods whose name contains that keyword, and writes them to
``output_<keyword>.xlsx``.
"""
import json
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

# NOTE(security): the cookies, access tokens, ``anti_content`` and
# ``verifyauthtoken`` values below are hard-coded values captured from a
# browser session. They are kept verbatim so the script keeps working as
# before, but they presumably expire and should be loaded from the
# environment or a config file rather than committed to source control.

# Build the URL of the HTML search-result page.
url = "https://mobile.yangkeduo.com/search_result.html"
params = {
    "search_key": "卡奇尔",
    "search_type": "goods",
    "source": "index",
    "options": 1,
    "search_met_track": "manual",
    "refer_page_el_sn": 99885,
    "refer_page_name": "psnl_verification",
    "refer_page_id": "10390_1719041565192_kvy50ivy6o",
    "refer_page_sn": 10390
}
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'max-age=0',
    'cookie': 'api_uid=CiHdtmZ2bOUdZABVWAe/Ag==; _nano_fp=Xpmalp9qnqgylpdJlT_h~uAFy_JoLAWdkdOx0hVt; webp=1; jrpl=RSwIv0e0mE9DfvQqFqfWBr1n5OMeNIQR; njrpl=RSwIv0e0mE9DfvQqFqfWBr1n5OMeNIQR; dilx=bSyz2efIuySKdkq3pYfqD; PDDAccessToken=7JQFYB6SD5FTLTZHX7ECKGBHS3X64WVSHFDD6WNSBRIG6HYMA7UA121b8be; pdd_user_id=6082723443128; pdd_user_uin=X6Q3CK6ATURUPGYQNQFRRKXTA4_GEXDA; pdd_vds=gaLdNmQInGaEaEPtGsoGbsmdtsoELyywIGbxQmGxymPdGxGmPsExGwtyidnw; pdd_vds=gaDMTZbYuZDflZnzbceqTYDCnWbznhxClhbfxYncxcupeXmHsMeflZNvmWBf',
    'priority': 'u=0, i',
    'sec-ch-ua': '"Google Chrome";v="125", "Chromium";v="125", "Not.A/Brand";v="24"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'none',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
}

# JSON search API and the query/header template sent with every page request.
search_url = "https://mobile.yangkeduo.com/proxy/api/search"
search_params = {
    'pdduid': ['6082723443128'],
    'item_ver': ['lzqq'],
    'coupon_price_flag': ['1'],
    'source': ['index'],
    'search_met': ['manual'],
    'track_data': ['refer_page_id,10390_1719049931008_5u3fqawayd'],
    'list_id': ['es8f7ccl2j'],
    'sort': ['default'],
    'q': ['卡奇尔'],
    'page': ['1'],
    'is_new_query': ['1'],
    'size': ['50'],
    'flip': [
        '0;0;0;0;5bf1496c-7eeb-d769-a083-82c0d12ed6ea;/20;0;0;7b959a79a7090fc30d130739d29aa8bc'
    ],
    'anti_content': [
        '0asWtqzyuiHy89e2f4yBpVQxDnKq0cKpaCDfBqdIBEXkFJmcmkUuqc0G7SCjqeyqkOjm8S5weKE2vgBVVZWAcZsNQ_2Zn-8jAcvyPgPvy_oeYAZnX4xfwQv2irT-bCSWp3olgt_ye8J1fmK0JwCv4grwJb2Lch9K30JiQQtorqeKXbT82rqDuZRqBCLCKaL6T9M_M8nCDsJeiprOkfyCKqLiWYbFyu_Lcq8fBlYfGTNQJqQlOC-COwVxTvhneqlBLDsvK6hclKPASBI57JBwJISDAwrqQcDtBvw83KDcAKKqnkYLmZNlD3qez967wH002cExn_KU9whqHeaZ-r4aijQg8zl2qPfVzj5eOCXuVSIbm1amTHVrnOdRCQ4aHGqPX9kvzdp51p3qgMuaKIGP5VpXjmryr7cbceiTd90XfHjyQR7N_JdOSwMS7t2OIaJdeAVYrVn4V32sZVywBv2j906jH-yEkKgIhcDD2Fj3FY419v1t7_9u5Lf743A9tmk_F20YdDW2XlB2ihPbCU2Jm09VS3mr35U-ELq3cIL3pIHYMKHOsxec2mHxc8hW58KsJstEekc7TskebFNRQCtlWefoR61qwaHNxPo5GX9NCXLMzFY0bMiaUuIlR4GQHQyH4tBm77YYLQoPMcdliO6HLRyi2GNmONE6gJTLmFIydqOldamgVfSpF6wG8VlCir7fn4hwOJErGIENrvW4vYKi6kQrym8Kcgfk3SOT1eJ1T9f'
    ]
}
search_headers = {
    'accept': 'application/json, text/plain, */*',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cookie': 'api_uid=CiHdtmZ2bOUdZABVWAe/Ag==; _nano_fp=Xpmalp9qnqgylpdJlT_h~uAFy_JoLAWdkdOx0hVt; webp=1; jrpl=RSwIv0e0mE9DfvQqFqfWBr1n5OMeNIQR; njrpl=RSwIv0e0mE9DfvQqFqfWBr1n5OMeNIQR; dilx=bSyz2efIuySKdkq3pYfqD; PDDAccessToken=7JQFYB6SD5FTLTZHX7ECKGBHS3X64WVSHFDD6WNSBRIG6HYMA7UA121b8be; pdd_user_id=6082723443128; pdd_user_uin=X6Q3CK6ATURUPGYQNQFRRKXTA4_GEXDA; pdd_vds=gajYSeplVNVezYFYkdjDMDJeKTFugxKNFCgCSTFxgdKDWupuVcHLkYjLHYSL; pdd_vds=gaDMTZbYuZDflZnzbceqTYDCnWbznhxClhbfxYncxcupeXmHsMeflZNvmWBf',
    'priority': 'u=1, i',
    'referer': 'https://mobile.yangkeduo.com/search_result.html?search_key=%E5%8D%A1%E5%A5%87%E5%B0%94&search_type=goods&source=index&options=1&search_met_track=manual&refer_page_el_sn=99884&refer_page_name=psnl_verification&refer_page_id=10390_1719054832104_uglp9uac0t&refer_page_sn=10390&page_id=10015_1719053308384_qv1mgz2pki&is_back=&bsch_is_search_mall=&bsch_show_active_page=&flip=0%3B0%3B0%3B0%3B726dcdf4-a6ae-210a-6afa-e4f679122426%3B%2F40%3B0%3B0%3Bd2d149c9d280203c602fa6a747caa3c2&sort_type=default&price_index=-1&filter=&opt_tag_name=&brand_tab_filter=&list_id=x8v4ig72bl',
    'sec-ch-ua': '"Google Chrome";v="125", "Chromium";v="125", "Not.A/Brand";v="24"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
    'verifyauthtoken': 'lqZp2RtQSqDimdHPpxljpA49eabc468299e2d76'
}

# Highest page number requested from the search API (pages are numbered from 1).
MAX_PAGES = 10
# Stop paging once more than this many matching goods have been collected.
MAX_RESULTS = 50
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10

# Captures the object literal assigned to ``window.rawData`` in an inline
# script. ``.`` does not match newlines, so callers strip them first.
_RAW_DATA_RE = re.compile(r"window\.rawData\s*=\s*(\{.+?\});")


# Custom exception raised when the embedded page data cannot be decoded.
class ConversionError(Exception):
    pass


def extract_data_from_html(content):
    """Extract the ``window.rawData`` object embedded in a search-result page.

    Only ``<script>`` tags that are direct children of ``<body>`` are
    inspected; the first one containing a ``window.rawData = {...};``
    assignment is decoded as JSON.

    Args:
        content: HTML markup of the page.

    Returns:
        The decoded ``rawData`` value, or ``None`` when the page has no
        ``<body>`` or no script with a ``window.rawData`` assignment.

    Raises:
        ConversionError: if the assignment is found but is not valid JSON.
    """
    # Parse the HTML with BeautifulSoup.
    soup = BeautifulSoup(content, "html.parser")

    # Locate the body tag; a fragment without one has nothing to extract.
    body_tag = soup.find("body")
    if body_tag is None:
        return None

    # Walk the first-level script tags inside the body.
    for script_tag in body_tag.find_all("script", recursive=False):
        script_content = script_tag.string
        # ``.string`` is None for external (``src=``) or multi-child scripts.
        if not script_content:
            continue

        compressed_string = script_content.replace("\n", "")
        match = _RAW_DATA_RE.search(compressed_string)
        if not match:
            continue

        raw_data_value = match.group(1)
        try:
            return json.loads(raw_data_value)
        except (ValueError, TypeError) as e:
            # An earlier version fell back to eval() here and then raised
            # anyway. The eval result was never used, and evaluating scraped
            # page content is arbitrary code execution, so it was dropped.
            print(f"Error converting value : {e}")
            raise ConversionError(f"Error converting value : {e}") from e

    return None


def write_stores_data_to_excel(data):
    """Return the server-rendered result list from a parsed ``rawData`` dict.

    Despite its name this function writes nothing; it only looks up
    ``data['stores']['store']['data']['ssrListData']['list']``.

    Args:
        data: decoded ``rawData`` mapping (or a falsy value).

    Returns:
        The ``list`` entry, or ``None`` when ``data`` is falsy or does not
        have the expected structure (including a missing ``searchKey``).
    """
    if not data:
        print("No data to write to Excel.")
        return None

    try:
        ssr_list_data = data['stores']['store']['data']['ssrListData']
        stores = ssr_list_data['list']
        # The value is unused, but a payload without 'searchKey' has always
        # been rejected as malformed, so the lookup is kept as a check.
        _ = ssr_list_data['searchKey']
    except (KeyError, TypeError) as e:
        print(f"write_stores_data_to_excel: Error parse value : {e}")
        return None
    return stores


def write_goods_data_to_excel(data, search_key):
    """Collect the goods in a search API response whose name has ``search_key``.

    Despite its name this function writes nothing; it returns the rows that
    the caller later writes to Excel. Each goods name is printed as it is
    seen. A malformed item is reported and skipped so that it does not
    discard the remaining items of the page.

    Args:
        data: decoded JSON response; goods are read from
            ``data['items'][i]['item_data']['goods_model']``.
        search_key: substring that ``goods_name`` must contain.

    Returns:
        A list of ``{"goods_id", "goods_name", "link_url"}`` dicts (possibly
        empty), or ``None`` when ``data`` is falsy.
    """
    if not data:
        print("No data to write to Excel.")
        return None

    goods = []
    try:
        items = iter(data['items'])
    except (KeyError, TypeError) as e:
        print(f"write_goods_data_to_excel: Error parse value : {e}")
        return goods

    for item in items:
        try:
            goods_model = item['item_data']['goods_model']
            good = {
                "goods_id": goods_model['goods_id'],
                "goods_name": goods_model['goods_name'],
                "link_url": goods_model['link_url']
            }
            print(good["goods_name"])
            if search_key in good["goods_name"]:
                goods.append(good)
        except (KeyError, TypeError) as e:
            print(f"write_goods_data_to_excel: Error parse value : {e}")
    return goods


def main():
    """Page through the search API and write matching goods to an Excel file."""
    search_key = '卡奇尔'
    output_file = f"output_{search_key}.xlsx"
    combined_list = []

    try:
        # The captured first-page request uses page=1, so start there.
        for page in range(1, MAX_PAGES + 1):
            # Build a per-request copy instead of mutating the shared template.
            page_params = {
                **search_params,
                'page': [str(page)],
                'q': [search_key],
            }
            response = requests.get(
                search_url,
                headers=search_headers,
                params=page_params,
                timeout=REQUEST_TIMEOUT,
            )
            if response.status_code != 200:
                # Request failed: report it and stop paging.
                print(
                    f"API Request failed with status code: {response.status_code}"
                )
                break

            goods = write_goods_data_to_excel(response.json(), search_key)
            if goods:
                combined_list.extend(goods)
            if len(combined_list) > MAX_RESULTS:
                break

        if combined_list:
            df = pd.DataFrame(combined_list)
            # Write the DataFrame to the Excel file.
            df.to_excel(output_file, index=False, engine='xlsxwriter')
        else:
            print("No data found in the JSON file.")
    except ConversionError as e:
        print(e)
    except Exception as e:
        # Top-level boundary: report any network/JSON/Excel failure and exit.
        print(f"An error occurred: {e}")


if __name__ == "__main__":
    main()