6 月之前 · 9b8055ff8e
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,162 @@
 
				+# Byte-compiled / optimized / DLL files
			
 
				+__pycache__/
			
 
				+*.py[cod]
			
 
				+*$py.class
			
 
				+
			
 
				+# C extensions
			
 
				+*.so
			
 
				+
			
 
				+# Distribution / packaging
			
 
				+.Python
			
 
				+build/
			
 
				+develop-eggs/
			
 
				+dist/
			
 
				+downloads/
			
 
				+eggs/
			
 
				+.eggs/
			
 
				+lib/
			
 
				+lib64/
			
 
				+parts/
			
 
				+sdist/
			
 
				+var/
			
 
				+wheels/
			
 
				+share/python-wheels/
			
 
				+*.egg-info/
			
 
				+.installed.cfg
			
 
				+*.egg
			
 
				+MANIFEST
			
 
				+
			
 
				+# PyInstaller
			
 
				+#  Usually these files are written by a python script from a template
			
 
				+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
			
 
				+*.manifest
			
 
				+*.spec
			
 
				+
			
 
				+# Installer logs
			
 
				+pip-log.txt
			
 
				+pip-delete-this-directory.txt
			
 
				+
			
 
				+# Unit test / coverage reports
			
 
				+htmlcov/
			
 
				+.tox/
			
 
				+.nox/
			
 
				+.coverage
			
 
				+.coverage.*
			
 
				+.cache
			
 
				+nosetests.xml
			
 
				+coverage.xml
			
 
				+*.cover
			
 
				+*.py,cover
			
 
				+.hypothesis/
			
 
				+.pytest_cache/
			
 
				+cover/
			
 
				+
			
 
				+# Translations
			
 
				+*.mo
			
 
				+*.pot
			
 
				+
			
 
				+# Django stuff:
			
 
				+*.log
			
 
				+local_settings.py
			
 
				+db.sqlite3
			
 
				+db.sqlite3-journal
			
 
				+
			
 
				+# Flask stuff:
			
 
				+instance/
			
 
				+.webassets-cache
			
 
				+
			
 
				+# Scrapy stuff:
			
 
				+.scrapy
			
 
				+
			
 
				+# Sphinx documentation
			
 
				+docs/_build/
			
 
				+
			
 
				+# PyBuilder
			
 
				+.pybuilder/
			
 
				+target/
			
 
				+
			
 
				+# Jupyter Notebook
			
 
				+.ipynb_checkpoints
			
 
				+
			
 
				+# IPython
			
 
				+profile_default/
			
 
				+ipython_config.py
			
 
				+
			
 
				+# pyenv
			
 
				+#   For a library or package, you might want to ignore these files since the code is
			
 
				+#   intended to run in multiple environments; otherwise, check them in:
			
 
				+# .python-version
			
 
				+
			
 
				+# pipenv
			
 
				+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
			
 
				+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
			
 
				+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
			
 
				+#   install all needed dependencies.
			
 
				+#Pipfile.lock
			
 
				+
			
 
				+# poetry
			
 
				+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
			
 
				+#   This is especially recommended for binary packages to ensure reproducibility, and is more
			
 
				+#   commonly ignored for libraries.
			
 
				+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
			
 
				+#poetry.lock
			
 
				+
			
 
				+# pdm
			
 
				+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
			
 
				+#pdm.lock
			
 
				+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
			
 
				+#   in version control.
			
 
				+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
			
 
				+.pdm.toml
			
 
				+.pdm-python
			
 
				+.pdm-build/
			
 
				+
			
 
				+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
			
 
				+__pypackages__/
			
 
				+
			
 
				+# Celery stuff
			
 
				+celerybeat-schedule
			
 
				+celerybeat.pid
			
 
				+
			
 
				+# SageMath parsed files
			
 
				+*.sage.py
			
 
				+
			
 
				+# Environments
			
 
				+.env
			
 
				+.venv
			
 
				+env/
			
 
				+venv/
			
 
				+ENV/
			
 
				+env.bak/
			
 
				+venv.bak/
			
 
				+
			
 
				+# Spyder project settings
			
 
				+.spyderproject
			
 
				+.spyproject
			
 
				+
			
 
				+# Rope project settings
			
 
				+.ropeproject
			
 
				+
			
 
				+# mkdocs documentation
			
 
				+/site
			
 
				+
			
 
				+# mypy
			
 
				+.mypy_cache/
			
 
				+.dmypy.json
			
 
				+dmypy.json
			
 
				+
			
 
				+# Pyre type checker
			
 
				+.pyre/
			
 
				+
			
 
				+# pytype static type analyzer
			
 
				+.pytype/
			
 
				+
			
 
				+# Cython debug symbols
			
 
				+cython_debug/
			
 
				+
			
 
				+# PyCharm
			
 
				+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
			
 
				+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
			
 
				+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
			
 
				+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
			
 
				+#.idea/
			
--- a/README.md
+++ b/README.md
@@ -0,0 +1,32 @@
 
				+# kaqier
			
 
				+
			
 
				+PDD 商品数据解析
			
 
				+
			
 
				+## 开发与测试
			
 
				+
			
 
				+1、创建虚拟环境
			
 
				+
			
 
				+```
			
 
				+python3 -m venv venv # 创建环境
			
 
				+. venv/bin/activate # 激活环境
			
 
				+deactivate # 退出环境
			
 
				+
			
 
				+```
			
 
				+
			
 
				+2、安装依赖
			
 
				+
			
 
				+```
			
 
				+pip3 install -r requirements.txt
			
 
				+```
			
 
				+
			
 
				+3、导出依赖
			
 
				+
			
 
				+```
			
 
				+pip3 freeze > requirements.txt
			
 
				+```
			
 
				+
			
 
				+4、运行与测试
			
 
				+
			
 
				+```
			
 
				+python3 main.py
			
 
				+```
			
--- a/data.json
+++ b/data.json
--- a/main.py
+++ b/main.py
@@ -0,0 +1,128 @@
 
import json
import os
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup
			
 
				+
			
 
				+# 构建 URL
			
 
				+url = "https://mobile.yangkeduo.com/search_result.html"
			
 
				+params = {
			
 
				+    "search_key": "卡奇尔",
			
 
				+    "search_type": "goods",
			
 
				+    "source": "index",
			
 
				+    "options": 1,
			
 
				+    "search_met_track": "manual",
			
 
				+    "refer_page_el_sn": 99885,
			
 
				+    "refer_page_name": "psnl_verification",
			
 
				+    "refer_page_id": "10390_1719041565192_kvy50ivy6o",
			
 
				+    "refer_page_sn": 10390
			
 
				+}
			
 
				+payload = {}
			
 
				+headers = {
			
 
				+    'accept':
			
 
				+    'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
			
 
				+    'accept-language':
			
 
				+    'zh-CN,zh;q=0.9',
			
 
				+    'cache-control':
			
 
				+    'max-age=0',
			
 
				+    'cookie':
			
 
				+    'api_uid=CiHdtmZ2bOUdZABVWAe/Ag==; _nano_fp=Xpmalp9qnqgylpdJlT_h~uAFy_JoLAWdkdOx0hVt; webp=1; jrpl=RSwIv0e0mE9DfvQqFqfWBr1n5OMeNIQR; njrpl=RSwIv0e0mE9DfvQqFqfWBr1n5OMeNIQR; dilx=bSyz2efIuySKdkq3pYfqD; PDDAccessToken=7JQFYB6SD5FTLTZHX7ECKGBHS3X64WVSHFDD6WNSBRIG6HYMA7UA121b8be; pdd_user_id=6082723443128; pdd_user_uin=X6Q3CK6ATURUPGYQNQFRRKXTA4_GEXDA; pdd_vds=gaLLNOQonbnLInGENOaiEionoNLiNOIotILGmynNOILtGLPQmNPoNmiOoQNo; pdd_vds=gaLLNOQonbnOGQaEGbiIPyiaEatOiELtGtiELILONnIInGmoNPGtmmnINEiP',
			
 
				+    'priority':
			
 
				+    'u=0, i',
			
 
				+    'sec-ch-ua':
			
 
				+    '"Google Chrome";v="125", "Chromium";v="125", "Not.A/Brand";v="24"',
			
 
				+    'sec-ch-ua-mobile':
			
 
				+    '?0',
			
 
				+    'sec-ch-ua-platform':
			
 
				+    '"macOS"',
			
 
				+    'sec-fetch-dest':
			
 
				+    'document',
			
 
				+    'sec-fetch-mode':
			
 
				+    'navigate',
			
 
				+    'sec-fetch-site':
			
 
				+    'none',
			
 
				+    'sec-fetch-user':
			
 
				+    '?1',
			
 
				+    'upgrade-insecure-requests':
			
 
				+    '1',
			
 
				+    'user-agent':
			
 
				+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
			
 
				+}
			
 
				+
			
 
				+
			
 
				+# 自定义异常类
			
 
				+class ConversionError(Exception):
			
 
				+    pass
			
 
				+
			
 
				+
			
 
				+def extract_data_from_html():
			
 
				+    response = requests.request("GET", url, headers=headers, params=params)
			
 
				+
			
 
				+    # 使用 BeautifulSoup 解析 HTML 结果
			
 
				+    soup = BeautifulSoup(response.content, "html.parser")
			
 
				+
			
 
				+    # 找到 body 标签
			
 
				+    body_tag = soup.find("body")
			
 
				+
			
 
				+    # 在 body 标签内部找到第一级的 script 标签
			
 
				+    first_level_scripts = body_tag.find_all("script", recursive=False)
			
 
				+
			
 
				+    # 遍历第一级的 script 标签,并将其内容转换为 Python 字典
			
 
				+    for script_tag in first_level_scripts:
			
 
				+        script_content = script_tag.string
			
 
				+        compressed_string = re.sub(r"\n", "", script_content)
			
 
				+        if compressed_string:
			
 
				+            # 使用正则表达式提取 window.rawData 赋值语句
			
 
				+            match = re.search(r"window\.rawData\s*=\s*(\{.+?\});",
			
 
				+                              compressed_string)
			
 
				+            if match:
			
 
				+                raw_data_value = match.group(1)
			
 
				+                print(raw_data_value)
			
 
				+                # 尝试使用 json.loads() 将值转换为 Python 字典对象
			
 
				+                try:
			
 
				+                    raw_data = json.loads(raw_data_value)
			
 
				+                except (ValueError, TypeError) as e:
			
 
				+                    # 如果 JSON 解析失败,则使用 eval() 函数尝试解析
			
 
				+                    raw_data = eval(raw_data_value)
			
 
				+                    print(f"Error converting value : {e}")
			
 
				+                    raise ConversionError(f"Error converting value : {e}")
			
 
				+
			
 
				+                return raw_data
			
 
				+    return None
			
 
				+
			
 
				+
			
 
				+def write_data_to_excel(data, columns=None):
			
 
				+    if data:
			
 
				+        try:
			
 
				+            list = data['stores']['store']['data']['ssrListData']['list']
			
 
				+            searchKey = data['stores']['store']['data']['ssrListData'][
			
 
				+                'searchKey']
			
 
				+        except (KeyError, TypeError) as e:
			
 
				+            print(f"Error parse value : {e}")
			
 
				+            list = None
			
 
				+
			
 
				+        if list:
			
 
				+            df = pd.DataFrame(list)
			
 
				+
			
 
				+            # 如果用户指定了列名,则使用指定的列名
			
 
				+            if columns:
			
 
				+                df = df[columns]
			
 
				+
			
 
				+            # 将 DataFrame 写入 Excel 文件
			
 
				+            output_file = f"output_{searchKey}.xlsx"
			
 
				+            df.to_excel(output_file, index=False, engine='xlsxwriter')
			
 
				+        else:
			
 
				+            print("No data found in the JSON file.")
			
 
				+    else:
			
 
				+        print("No data to write to Excel.")
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    try:
			
 
				+        raw_data = extract_data_from_html()
			
 
				+        write_data_to_excel(
			
 
				+            raw_data, columns=['goodsID', 'goodsName', 'goodsName', 'linkURL'])
			
 
				+    except ConversionError as e:
			
 
				+        print(e)
			
 
				+    except Exception as e:
			
 
				+        print(f"An error occurred: {e}")
			
--- a/output_卡奇尔.xlsx
+++ b/output_卡奇尔.xlsx
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,18 @@
 
				+beautifulsoup4==4.12.3
			
 
				+bs4==0.0.2
			
 
				+certifi==2024.6.2
			
 
				+charset-normalizer==3.3.2
			
 
				+et-xmlfile==1.1.0
			
 
				+idna==3.7
			
 
				+lxml==5.2.2
			
 
				+numpy==2.0.0
			
 
				+openpyxl==3.1.4
			
 
				+pandas==2.2.2
			
 
				+python-dateutil==2.9.0.post0
			
 
				+pytz==2024.1
			
 
				+requests==2.32.3
			
 
				+six==1.16.0
			
 
				+soupsieve==2.5
			
 
				+tzdata==2024.1
			
 
				+urllib3==2.2.2
			
 
				+XlsxWriter==3.2.0
			
--- a/test.py
+++ b/test.py
@@ -0,0 +1,16 @@
 
				+import json
			
 
				+import pandas as pd
			
 
				+
			
 
				+with open('data.json', 'r') as f:
			
 
				+    data = json.load(f)
			
 
				+
			
 
				+try:
			
 
				+    list = data['stores']['store']['data']['ssrListData']['list']
			
 
				+except (KeyError, TypeError) as e:
			
 
				+    print(f"Error parse value : {e}")
			
 
				+    list = None
			
 
				+
			
 
				+df = pd.DataFrame(list)
			
 
				+
			
 
				+# 将 DataFrame 写入 Excel 文件
			
 
				+df.to_excel('output.xlsx', index=False, engine='xlsxwriter')