# main.py — scrape Pinduoduo mobile search results and export to Excel.
  1. import requests
  2. from bs4 import BeautifulSoup
  3. import json
  4. import re
  5. import pandas as pd
# Build the search URL and query parameters for the Pinduoduo mobile
# search-result page (keyword search for "卡奇尔").
url = "https://mobile.yangkeduo.com/search_result.html"
params = {
    "search_key": "卡奇尔",
    "search_type": "goods",
    "source": "index",
    "options": 1,
    "search_met_track": "manual",
    # refer_page_* values were captured from a real browser session;
    # presumably they help the request pass the site's bot checks — TODO confirm.
    "refer_page_el_sn": 99885,
    "refer_page_name": "psnl_verification",
    "refer_page_id": "10390_1719041565192_kvy50ivy6o",
    "refer_page_sn": 10390
}
# NOTE(review): payload is never used — the GET below sends no request body.
payload = {}
# Browser-like headers copied from a Chrome 125 session on macOS.
# NOTE(review): the cookie embeds a personal access token (PDDAccessToken)
# and session identifiers; these expire and must be refreshed before running.
headers = {
    'accept':
    'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-language':
    'zh-CN,zh;q=0.9',
    'cache-control':
    'max-age=0',
    'cookie':
    'api_uid=CiHdtmZ2bOUdZABVWAe/Ag==; _nano_fp=Xpmalp9qnqgylpdJlT_h~uAFy_JoLAWdkdOx0hVt; webp=1; jrpl=RSwIv0e0mE9DfvQqFqfWBr1n5OMeNIQR; njrpl=RSwIv0e0mE9DfvQqFqfWBr1n5OMeNIQR; dilx=bSyz2efIuySKdkq3pYfqD; PDDAccessToken=7JQFYB6SD5FTLTZHX7ECKGBHS3X64WVSHFDD6WNSBRIG6HYMA7UA121b8be; pdd_user_id=6082723443128; pdd_user_uin=X6Q3CK6ATURUPGYQNQFRRKXTA4_GEXDA; pdd_vds=gaLLNOQonbnLInGENOaiEionoNLiNOIotILGmynNOILtGLPQmNPoNmiOoQNo; pdd_vds=gaLLNOQonbnOGQaEGbiIPyiaEatOiELtGtiELILONnIInGmoNPGtmmnINEiP',
    'priority':
    'u=0, i',
    'sec-ch-ua':
    '"Google Chrome";v="125", "Chromium";v="125", "Not.A/Brand";v="24"',
    'sec-ch-ua-mobile':
    '?0',
    'sec-ch-ua-platform':
    '"macOS"',
    'sec-fetch-dest':
    'document',
    'sec-fetch-mode':
    'navigate',
    'sec-fetch-site':
    'none',
    'sec-fetch-user':
    '?1',
    'upgrade-insecure-requests':
    '1',
    'user-agent':
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
}
  50. # 自定义异常类
  51. class ConversionError(Exception):
  52. pass
  53. def extract_data_from_html():
  54. response = requests.request("GET", url, headers=headers, params=params)
  55. # 使用 BeautifulSoup 解析 HTML 结果
  56. soup = BeautifulSoup(response.content, "html.parser")
  57. # 找到 body 标签
  58. body_tag = soup.find("body")
  59. # 在 body 标签内部找到第一级的 script 标签
  60. first_level_scripts = body_tag.find_all("script", recursive=False)
  61. # 遍历第一级的 script 标签,并将其内容转换为 Python 字典
  62. for script_tag in first_level_scripts:
  63. script_content = script_tag.string
  64. compressed_string = re.sub(r"\n", "", script_content)
  65. if compressed_string:
  66. # 使用正则表达式提取 window.rawData 赋值语句
  67. match = re.search(r"window\.rawData\s*=\s*(\{.+?\});",
  68. compressed_string)
  69. if match:
  70. raw_data_value = match.group(1)
  71. print(raw_data_value)
  72. # 尝试使用 json.loads() 将值转换为 Python 字典对象
  73. try:
  74. raw_data = json.loads(raw_data_value)
  75. except (ValueError, TypeError) as e:
  76. # 如果 JSON 解析失败,则使用 eval() 函数尝试解析
  77. raw_data = eval(raw_data_value)
  78. print(f"Error converting value : {e}")
  79. raise ConversionError(f"Error converting value : {e}")
  80. return raw_data
  81. return None
  82. def write_data_to_excel(data, columns=None):
  83. if data:
  84. try:
  85. list = data['stores']['store']['data']['ssrListData']['list']
  86. searchKey = data['stores']['store']['data']['ssrListData'][
  87. 'searchKey']
  88. except (KeyError, TypeError) as e:
  89. print(f"Error parse value : {e}")
  90. list = None
  91. if list:
  92. df = pd.DataFrame(list)
  93. # 如果用户指定了列名,则使用指定的列名
  94. if columns:
  95. df = df[columns]
  96. # 将 DataFrame 写入 Excel 文件
  97. output_file = f"output_{searchKey}.xlsx"
  98. df.to_excel(output_file, index=False, engine='xlsxwriter')
  99. else:
  100. print("No data found in the JSON file.")
  101. else:
  102. print("No data to write to Excel.")
  103. if __name__ == "__main__":
  104. try:
  105. raw_data = extract_data_from_html()
  106. write_data_to_excel(
  107. raw_data, columns=['goodsID', 'goodsName', 'goodsName', 'linkURL'])
  108. except ConversionError as e:
  109. print(e)
  110. except Exception as e:
  111. print(f"An error occurred: {e}")