main.py 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210
  1. import requests
  2. from bs4 import BeautifulSoup
  3. import json
  4. import re
  5. import pandas as pd
  6. # 构建 URL
  7. url = "https://mobile.yangkeduo.com/search_result.html"
  8. params = {
  9. "search_key": "卡奇尔",
  10. "search_type": "goods",
  11. "source": "index",
  12. "options": 1,
  13. "search_met_track": "manual",
  14. "refer_page_el_sn": 99885,
  15. "refer_page_name": "psnl_verification",
  16. "refer_page_id": "10390_1719041565192_kvy50ivy6o",
  17. "refer_page_sn": 10390
  18. }
  19. headers = {
  20. 'accept':
  21. 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
  22. 'accept-language':
  23. 'zh-CN,zh;q=0.9',
  24. 'cache-control':
  25. 'max-age=0',
  26. 'cookie':
  27. 'api_uid=CiHdtmZ2bOUdZABVWAe/Ag==; _nano_fp=Xpmalp9qnqgylpdJlT_h~uAFy_JoLAWdkdOx0hVt; webp=1; jrpl=RSwIv0e0mE9DfvQqFqfWBr1n5OMeNIQR; njrpl=RSwIv0e0mE9DfvQqFqfWBr1n5OMeNIQR; dilx=bSyz2efIuySKdkq3pYfqD; PDDAccessToken=7JQFYB6SD5FTLTZHX7ECKGBHS3X64WVSHFDD6WNSBRIG6HYMA7UA121b8be; pdd_user_id=6082723443128; pdd_user_uin=X6Q3CK6ATURUPGYQNQFRRKXTA4_GEXDA; pdd_vds=gaLdNmQInGaEaEPtGsoGbsmdtsoELyywIGbxQmGxymPdGxGmPsExGwtyidnw; pdd_vds=gaDMTZbYuZDflZnzbceqTYDCnWbznhxClhbfxYncxcupeXmHsMeflZNvmWBf',
  28. 'priority':
  29. 'u=0, i',
  30. 'sec-ch-ua':
  31. '"Google Chrome";v="125", "Chromium";v="125", "Not.A/Brand";v="24"',
  32. 'sec-ch-ua-mobile':
  33. '?0',
  34. 'sec-ch-ua-platform':
  35. '"macOS"',
  36. 'sec-fetch-dest':
  37. 'document',
  38. 'sec-fetch-mode':
  39. 'navigate',
  40. 'sec-fetch-site':
  41. 'none',
  42. 'sec-fetch-user':
  43. '?1',
  44. 'upgrade-insecure-requests':
  45. '1',
  46. 'user-agent':
  47. 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
  48. }
  49. search_url = "https://mobile.yangkeduo.com/proxy/api/search"
  50. search_params = {
  51. 'pdduid': ['6082723443128'],
  52. 'item_ver': ['lzqq'],
  53. 'coupon_price_flag': ['1'],
  54. 'source': ['index'],
  55. 'search_met': ['manual'],
  56. 'track_data': ['refer_page_id,10390_1719049931008_5u3fqawayd'],
  57. 'list_id': ['es8f7ccl2j'],
  58. 'sort': ['default'],
  59. 'q': ['卡奇尔'],
  60. 'page': ['1'],
  61. 'is_new_query': ['1'],
  62. 'size': ['50'],
  63. 'flip': [
  64. '0;0;0;0;5bf1496c-7eeb-d769-a083-82c0d12ed6ea;/20;0;0;7b959a79a7090fc30d130739d29aa8bc'
  65. ],
  66. 'anti_content': [
  67. '0asWtqzyuiHy89e2f4yBpVQxDnKq0cKpaCDfBqdIBEXkFJmcmkUuqc0G7SCjqeyqkOjm8S5weKE2vgBVVZWAcZsNQ_2Zn-8jAcvyPgPvy_oeYAZnX4xfwQv2irT-bCSWp3olgt_ye8J1fmK0JwCv4grwJb2Lch9K30JiQQtorqeKXbT82rqDuZRqBCLCKaL6T9M_M8nCDsJeiprOkfyCKqLiWYbFyu_Lcq8fBlYfGTNQJqQlOC-COwVxTvhneqlBLDsvK6hclKPASBI57JBwJISDAwrqQcDtBvw83KDcAKKqnkYLmZNlD3qez967wH002cExn_KU9whqHeaZ-r4aijQg8zl2qPfVzj5eOCXuVSIbm1amTHVrnOdRCQ4aHGqPX9kvzdp51p3qgMuaKIGP5VpXjmryr7cbceiTd90XfHjyQR7N_JdOSwMS7t2OIaJdeAVYrVn4V32sZVywBv2j906jH-yEkKgIhcDD2Fj3FY419v1t7_9u5Lf743A9tmk_F20YdDW2XlB2ihPbCU2Jm09VS3mr35U-ELq3cIL3pIHYMKHOsxec2mHxc8hW58KsJstEekc7TskebFNRQCtlWefoR61qwaHNxPo5GX9NCXLMzFY0bMiaUuIlR4GQHQyH4tBm77YYLQoPMcdliO6HLRyi2GNmONE6gJTLmFIydqOldamgVfSpF6wG8VlCir7fn4hwOJErGIENrvW4vYKi6kQrym8Kcgfk3SOT1eJ1T9f'
  68. ]
  69. }
  70. search_headers = {
  71. 'accept': 'application/json, text/plain, */*',
  72. 'accept-language': 'zh-CN,zh;q=0.9',
  73. 'cookie':
  74. 'api_uid=CiHdtmZ2bOUdZABVWAe/Ag==; _nano_fp=Xpmalp9qnqgylpdJlT_h~uAFy_JoLAWdkdOx0hVt; webp=1; jrpl=RSwIv0e0mE9DfvQqFqfWBr1n5OMeNIQR; njrpl=RSwIv0e0mE9DfvQqFqfWBr1n5OMeNIQR; dilx=bSyz2efIuySKdkq3pYfqD; PDDAccessToken=7JQFYB6SD5FTLTZHX7ECKGBHS3X64WVSHFDD6WNSBRIG6HYMA7UA121b8be; pdd_user_id=6082723443128; pdd_user_uin=X6Q3CK6ATURUPGYQNQFRRKXTA4_GEXDA; pdd_vds=gajYSeplVNVezYFYkdjDMDJeKTFugxKNFCgCSTFxgdKDWupuVcHLkYjLHYSL; pdd_vds=gaDMTZbYuZDflZnzbceqTYDCnWbznhxClhbfxYncxcupeXmHsMeflZNvmWBf',
  75. 'priority': 'u=1, i',
  76. 'referer':
  77. 'https://mobile.yangkeduo.com/search_result.html?search_key=%E5%8D%A1%E5%A5%87%E5%B0%94&search_type=goods&source=index&options=1&search_met_track=manual&refer_page_el_sn=99884&refer_page_name=psnl_verification&refer_page_id=10390_1719054832104_uglp9uac0t&refer_page_sn=10390&page_id=10015_1719053308384_qv1mgz2pki&is_back=&bsch_is_search_mall=&bsch_show_active_page=&flip=0%3B0%3B0%3B0%3B726dcdf4-a6ae-210a-6afa-e4f679122426%3B%2F40%3B0%3B0%3Bd2d149c9d280203c602fa6a747caa3c2&sort_type=default&price_index=-1&filter=&opt_tag_name=&brand_tab_filter=&list_id=x8v4ig72bl',
  78. 'sec-ch-ua':
  79. '"Google Chrome";v="125", "Chromium";v="125", "Not.A/Brand";v="24"',
  80. 'sec-ch-ua-mobile': '?0',
  81. 'sec-ch-ua-platform': '"macOS"',
  82. 'sec-fetch-dest': 'empty',
  83. 'sec-fetch-mode': 'cors',
  84. 'sec-fetch-site': 'same-origin',
  85. 'user-agent':
  86. 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
  87. 'verifyauthtoken': 'lqZp2RtQSqDimdHPpxljpA49eabc468299e2d76'
  88. }
  89. # 自定义异常类
  90. class ConversionError(Exception):
  91. pass
  92. def extract_data_from_html(content):
  93. # 使用 BeautifulSoup 解析 HTML 结果
  94. soup = BeautifulSoup(content, "html.parser")
  95. # 找到 body 标签
  96. body_tag = soup.find("body")
  97. # 在 body 标签内部找到第一级的 script 标签
  98. first_level_scripts = body_tag.find_all("script", recursive=False)
  99. # 遍历第一级的 script 标签,并将其内容转换为 Python 字典
  100. for script_tag in first_level_scripts:
  101. script_content = script_tag.string
  102. compressed_string = re.sub(r"\n", "", script_content)
  103. if compressed_string:
  104. # 使用正则表达式提取 window.rawData 赋值语句
  105. match = re.search(r"window\.rawData\s*=\s*(\{.+?\});",
  106. compressed_string)
  107. if match:
  108. raw_data_value = match.group(1)
  109. # 尝试使用 json.loads() 将值转换为 Python 字典对象
  110. try:
  111. raw_data = json.loads(raw_data_value)
  112. except (ValueError, TypeError) as e:
  113. # 如果 JSON 解析失败,则使用 eval() 函数尝试解析
  114. raw_data = eval(raw_data_value)
  115. print(f"Error converting value : {e}")
  116. raise ConversionError(f"Error converting value : {e}")
  117. return raw_data
  118. return None
  119. def write_stores_data_to_excel(data):
  120. if data:
  121. try:
  122. stores = data['stores']['store']['data']['ssrListData']['list']
  123. searchKey = data['stores']['store']['data']['ssrListData'][
  124. 'searchKey']
  125. except (KeyError, TypeError) as e:
  126. print(f"write_stores_data_to_excel: Error parse value : {e}")
  127. stores = None
  128. else:
  129. print("No data to write to Excel.")
  130. stores = None
  131. return stores
  132. def write_goods_data_to_excel(data, search_key):
  133. if data:
  134. try:
  135. goods = []
  136. items = data['items']
  137. for item in items:
  138. goods_model = item['item_data']['goods_model']
  139. goods_id = goods_model['goods_id']
  140. goods_name = goods_model['goods_name']
  141. link_url = goods_model['link_url']
  142. print(goods_name)
  143. if search_key in goods_name:
  144. good = {
  145. "goods_id": goods_id,
  146. "goods_name": goods_name,
  147. "link_url": link_url
  148. }
  149. # 将 JSON 对象添加到 JSON 数组中
  150. goods.append(good)
  151. except (KeyError, TypeError) as e:
  152. print(f"write_goods_data_to_excel: Error parse value : {e}")
  153. else:
  154. goods = None
  155. print("No data to write to Excel.")
  156. return goods
  157. if __name__ == "__main__":
  158. try:
  159. search_key = '卡奇尔'
  160. output_file = f"output_{search_key}.xlsx"
  161. combined_list = []
  162. for i in range(10):
  163. search_params['page'] = [i]
  164. search_params['q'] = [search_key]
  165. response = requests.request("GET",
  166. search_url,
  167. headers=search_headers,
  168. params=search_params)
  169. if response.status_code == 200:
  170. data = response.json()
  171. goods = write_goods_data_to_excel(data, search_key)
  172. if goods:
  173. combined_list.extend(goods)
  174. if len(combined_list) > 50:
  175. break
  176. else:
  177. # 请求失败,处理错误
  178. print(
  179. f"API Request failed with status code: {response.status_code}"
  180. )
  181. break
  182. if len(combined_list) != 0:
  183. df = pd.DataFrame(combined_list)
  184. # 将 DataFrame 写入 Excel 文件
  185. df.to_excel(output_file, index=False, engine='xlsxwriter')
  186. else:
  187. print("No data found in the JSON file.")
  188. except ConversionError as e:
  189. print(e)
  190. except Exception as e:
  191. print(f"An error occurred: {e}")