|
@@ -0,0 +1,128 @@
|
|
|
|
import json
import os
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup
|
|
|
|
+
|
|
|
|
# Request configuration: target URL, query-string parameters and browser-like
# headers sent with the search request.
url = "https://mobile.yangkeduo.com/search_result.html"
params = {
    "search_key": "卡奇尔",
    "search_type": "goods",
    "source": "index",
    "options": 1,
    "search_met_track": "manual",
    "refer_page_el_sn": 99885,
    "refer_page_name": "psnl_verification",
    "refer_page_id": "10390_1719041565192_kvy50ivy6o",
    "refer_page_sn": 10390,
}
payload = {}

# The session cookie can be supplied through the PDD_COOKIE environment
# variable so that credentials do not have to live in the source tree.
# NOTE(review): the fallback literal below embeds what looks like a real
# access token and user id. It is kept only so existing runs keep working;
# rotate the token and delete the fallback once PDD_COOKIE is set everywhere.
_DEFAULT_COOKIE = (
    'api_uid=CiHdtmZ2bOUdZABVWAe/Ag==; _nano_fp=Xpmalp9qnqgylpdJlT_h~uAFy_JoLAWdkdOx0hVt; webp=1; jrpl=RSwIv0e0mE9DfvQqFqfWBr1n5OMeNIQR; njrpl=RSwIv0e0mE9DfvQqFqfWBr1n5OMeNIQR; dilx=bSyz2efIuySKdkq3pYfqD; PDDAccessToken=7JQFYB6SD5FTLTZHX7ECKGBHS3X64WVSHFDD6WNSBRIG6HYMA7UA121b8be; pdd_user_id=6082723443128; pdd_user_uin=X6Q3CK6ATURUPGYQNQFRRKXTA4_GEXDA; pdd_vds=gaLLNOQonbnLInGENOaiEionoNLiNOIotILGmynNOILtGLPQmNPoNmiOoQNo; pdd_vds=gaLLNOQonbnOGQaEGbiIPyiaEatOiELtGtiELILONnIInGmoNPGtmmnINEiP'
)

headers = {
    'accept':
    'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-language':
    'zh-CN,zh;q=0.9',
    'cache-control':
    'max-age=0',
    'cookie':
    os.environ.get("PDD_COOKIE", _DEFAULT_COOKIE),
    'priority':
    'u=0, i',
    'sec-ch-ua':
    '"Google Chrome";v="125", "Chromium";v="125", "Not.A/Brand";v="24"',
    'sec-ch-ua-mobile':
    '?0',
    'sec-ch-ua-platform':
    '"macOS"',
    'sec-fetch-dest':
    'document',
    'sec-fetch-mode':
    'navigate',
    'sec-fetch-site':
    'none',
    'sec-fetch-user':
    '?1',
    'upgrade-insecure-requests':
    '1',
    'user-agent':
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
}
|
|
|
|
+
|
|
|
|
+
|
|
|
|
# Custom exception type used by this script.
class ConversionError(Exception):
    """Raised when an extracted value cannot be converted to a Python object."""
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def _parse_raw_data(script_text):
    """Decode the object assigned to ``window.rawData`` in a script body.

    Returns the decoded object, or None when the script contains no
    ``window.rawData = {...`` assignment.

    Raises:
        ConversionError: the assignment is present but is not valid JSON.
    """
    # Line breaks are dropped before decoding, so a payload wrapped across
    # several lines still decodes.
    flattened = script_text.replace("\n", "")
    assignment = re.search(r"window\.rawData\s*=\s*(?=\{)", flattened)
    if assignment is None:
        return None

    # raw_decode reads exactly one JSON document starting at the opening
    # brace, so a "};" sequence inside a string value cannot cut it short.
    # The text comes from a remote page and is never passed to eval().
    try:
        raw_data, _ = json.JSONDecoder().raw_decode(flattened,
                                                    assignment.end())
    except ValueError as e:
        print(f"Error converting value : {e}")
        raise ConversionError(f"Error converting value : {e}") from e
    return raw_data


def extract_data_from_html():
    """Fetch the search page and return the data assigned to ``window.rawData``.

    Sends a GET request built from the module-level ``url``, ``headers`` and
    ``params``, then scans the ``<script>`` tags that are direct children of
    ``<body>`` for a ``window.rawData = {...}`` assignment.

    Returns:
        The decoded object from the first matching script, or None when the
        page has no ``<body>`` or no script contains the assignment.

    Raises:
        ConversionError: the assignment was found but is not valid JSON.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url, headers=headers, params=params, timeout=30)

    soup = BeautifulSoup(response.content, "html.parser")
    body_tag = soup.find("body")
    if body_tag is None:
        return None

    # Only first-level scripts are inspected; nested ones are ignored.
    for script_tag in body_tag.find_all("script", recursive=False):
        script_content = script_tag.string
        # .string is None for scripts without a single text child
        # (for example external scripts loaded through src=).
        if not script_content:
            continue
        raw_data = _parse_raw_data(script_content)
        if raw_data is not None:
            return raw_data
    return None
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def write_data_to_excel(data, columns=None):
    """Write the goods list found in *data* to ``output_<searchKey>.xlsx``.

    The rows are read from ``data['stores']['store']['data']['ssrListData']``:
    its ``list`` entry supplies the rows and its ``searchKey`` entry is used
    in the output file name.

    Args:
        data: Nested mapping holding the entries described above. A falsy
            value means there is nothing to write.
        columns: Optional list of column names to keep, in output order.
            When omitted, every column of the goods list is written.

    Returns:
        The path of the written file, or None when nothing was written.

    Raises:
        KeyError: *columns* names a column that the goods list lacks.
    """
    if not data:
        print("No data to write to Excel.")
        return None

    try:
        ssr_list_data = data['stores']['store']['data']['ssrListData']
        goods = ssr_list_data['list']
        search_key = ssr_list_data['searchKey']
    except (KeyError, TypeError) as e:
        print(f"Error parse value : {e}")
        goods = None

    if not goods:
        print("No data found in the JSON file.")
        return None

    frame = pd.DataFrame(goods)
    if columns:
        frame = frame[columns]

    # The search key comes from the scraped payload, so characters that are
    # path separators or are invalid in file names are replaced before use.
    safe_key = re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", str(search_key))
    output_file = f"output_{safe_key}.xlsx"
    frame.to_excel(output_file, index=False, engine='xlsxwriter')
    return output_file
|
|
|
|
+
|
|
|
|
+
|
|
|
|
if __name__ == "__main__":
    # Script entry point: scrape the search page, then export the selected
    # columns of the goods list to an Excel file.
    try:
        raw_data = extract_data_from_html()
        # Each column is listed once; repeating a name would write the same
        # column into the sheet twice.
        write_data_to_excel(
            raw_data, columns=['goodsID', 'goodsName', 'linkURL'])
    except ConversionError as e:
        print(e)
    except Exception as e:
        # Top-level boundary: report any other failure instead of a traceback.
        print(f"An error occurred: {e}")
|