10 bulan lalu · 9b8055ff8e
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,162 @@
 
																+# Byte-compiled / optimized / DLL files
															
 
																+__pycache__/
															
 
																+*.py[cod]
															
 
																+*$py.class
															
 
																+
															
 
																+# C extensions
															
 
																+*.so
															
 
																+
															
 
																+# Distribution / packaging
															
 
																+.Python
															
 
																+build/
															
 
																+develop-eggs/
															
 
																+dist/
															
 
																+downloads/
															
 
																+eggs/
															
 
																+.eggs/
															
 
																+lib/
															
 
																+lib64/
															
 
																+parts/
															
 
																+sdist/
															
 
																+var/
															
 
																+wheels/
															
 
																+share/python-wheels/
															
 
																+*.egg-info/
															
 
																+.installed.cfg
															
 
																+*.egg
															
 
																+MANIFEST
															
 
																+
															
 
																+# PyInstaller
															
 
																+#  Usually these files are written by a python script from a template
															
 
																+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
															
 
																+*.manifest
															
 
																+*.spec
															
 
																+
															
 
																+# Installer logs
															
 
																+pip-log.txt
															
 
																+pip-delete-this-directory.txt
															
 
																+
															
 
																+# Unit test / coverage reports
															
 
																+htmlcov/
															
 
																+.tox/
															
 
																+.nox/
															
 
																+.coverage
															
 
																+.coverage.*
															
 
																+.cache
															
 
																+nosetests.xml
															
 
																+coverage.xml
															
 
																+*.cover
															
 
																+*.py,cover
															
 
																+.hypothesis/
															
 
																+.pytest_cache/
															
 
																+cover/
															
 
																+
															
 
																+# Translations
															
 
																+*.mo
															
 
																+*.pot
															
 
																+
															
 
																+# Django stuff:
															
 
																+*.log
															
 
																+local_settings.py
															
 
																+db.sqlite3
															
 
																+db.sqlite3-journal
															
 
																+
															
 
																+# Flask stuff:
															
 
																+instance/
															
 
																+.webassets-cache
															
 
																+
															
 
																+# Scrapy stuff:
															
 
																+.scrapy
															
 
																+
															
 
																+# Sphinx documentation
															
 
																+docs/_build/
															
 
																+
															
 
																+# PyBuilder
															
 
																+.pybuilder/
															
 
																+target/
															
 
																+
															
 
																+# Jupyter Notebook
															
 
																+.ipynb_checkpoints
															
 
																+
															
 
																+# IPython
															
 
																+profile_default/
															
 
																+ipython_config.py
															
 
																+
															
 
																+# pyenv
															
 
																+#   For a library or package, you might want to ignore these files since the code is
															
 
																+#   intended to run in multiple environments; otherwise, check them in:
															
 
																+# .python-version
															
 
																+
															
 
																+# pipenv
															
 
																+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
															
 
																+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
															
 
																+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
															
 
																+#   install all needed dependencies.
															
 
																+#Pipfile.lock
															
 
																+
															
 
																+# poetry
															
 
																+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
															
 
																+#   This is especially recommended for binary packages to ensure reproducibility, and is more
															
 
																+#   commonly ignored for libraries.
															
 
																+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
															
 
																+#poetry.lock
															
 
																+
															
 
																+# pdm
															
 
																+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
															
 
																+#pdm.lock
															
 
																+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
															
 
																+#   in version control.
															
 
																+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
															
 
																+.pdm.toml
															
 
																+.pdm-python
															
 
																+.pdm-build/
															
 
																+
															
 
																+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
															
 
																+__pypackages__/
															
 
																+
															
 
																+# Celery stuff
															
 
																+celerybeat-schedule
															
 
																+celerybeat.pid
															
 
																+
															
 
																+# SageMath parsed files
															
 
																+*.sage.py
															
 
																+
															
 
																+# Environments
															
 
																+.env
															
 
																+.venv
															
 
																+env/
															
 
																+venv/
															
 
																+ENV/
															
 
																+env.bak/
															
 
																+venv.bak/
															
 
																+
															
 
																+# Spyder project settings
															
 
																+.spyderproject
															
 
																+.spyproject
															
 
																+
															
 
																+# Rope project settings
															
 
																+.ropeproject
															
 
																+
															
 
																+# mkdocs documentation
															
 
																+/site
															
 
																+
															
 
																+# mypy
															
 
																+.mypy_cache/
															
 
																+.dmypy.json
															
 
																+dmypy.json
															
 
																+
															
 
																+# Pyre type checker
															
 
																+.pyre/
															
 
																+
															
 
																+# pytype static type analyzer
															
 
																+.pytype/
															
 
																+
															
 
																+# Cython debug symbols
															
 
																+cython_debug/
															
 
																+
															
 
																+# PyCharm
															
 
																+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
															
 
																+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
															
 
																+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
															
 
																+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
															
 
																+#.idea/
															
--- a/README.md
+++ b/README.md
@@ -0,0 +1,32 @@
 
																+# kaqier
															
 
																+
															
 
																+PDD 商品数据解析
															
 
																+
															
 
																+## 开发与测试
															
 
																+
															
 
																+1、创建虚拟环境
															
 
																+
															
 
																+```
															
 
																+python3 -m venv venv # 创建环境
															
 
																+. venv/bin/activate # 激活环境
															
 
																+deactivate # 退出环境
															
 
																+
															
 
																+```
															
 
																+
															
 
																+2、安装依赖
															
 
																+
															
 
																+```
															
 
																+pip3 install -r requirements.txt
															
 
																+```
															
 
																+
															
 
																+3、导出依赖
															
 
																+
															
 
																+```
															
 
																+pip3 freeze > requirements.txt
															
 
																+```
															
 
																+
															
 
																+4、运行与测试
															
 
																+
															
 
																+```
															
 
																+python3 main.py
															
 
																+```
															
--- a/data.json
+++ b/data.json
--- a/main.py
+++ b/main.py
@@ -0,0 +1,128 @@
 
import json
import os
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup
															
 
																+
															
 
																+# 构建 URL
															
 
																+url = "https://mobile.yangkeduo.com/search_result.html"
															
 
																+params = {
															
 
																+    "search_key": "卡奇尔",
															
 
																+    "search_type": "goods",
															
 
																+    "source": "index",
															
 
																+    "options": 1,
															
 
																+    "search_met_track": "manual",
															
 
																+    "refer_page_el_sn": 99885,
															
 
																+    "refer_page_name": "psnl_verification",
															
 
																+    "refer_page_id": "10390_1719041565192_kvy50ivy6o",
															
 
																+    "refer_page_sn": 10390
															
 
																+}
															
 
																+payload = {}
															
 
																+headers = {
															
 
																+    'accept':
															
 
																+    'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
															
 
																+    'accept-language':
															
 
																+    'zh-CN,zh;q=0.9',
															
 
																+    'cache-control':
															
 
																+    'max-age=0',
															
 
																+    'cookie':
															
 
																+    'api_uid=CiHdtmZ2bOUdZABVWAe/Ag==; _nano_fp=Xpmalp9qnqgylpdJlT_h~uAFy_JoLAWdkdOx0hVt; webp=1; jrpl=RSwIv0e0mE9DfvQqFqfWBr1n5OMeNIQR; njrpl=RSwIv0e0mE9DfvQqFqfWBr1n5OMeNIQR; dilx=bSyz2efIuySKdkq3pYfqD; PDDAccessToken=7JQFYB6SD5FTLTZHX7ECKGBHS3X64WVSHFDD6WNSBRIG6HYMA7UA121b8be; pdd_user_id=6082723443128; pdd_user_uin=X6Q3CK6ATURUPGYQNQFRRKXTA4_GEXDA; pdd_vds=gaLLNOQonbnLInGENOaiEionoNLiNOIotILGmynNOILtGLPQmNPoNmiOoQNo; pdd_vds=gaLLNOQonbnOGQaEGbiIPyiaEatOiELtGtiELILONnIInGmoNPGtmmnINEiP',
															
 
																+    'priority':
															
 
																+    'u=0, i',
															
 
																+    'sec-ch-ua':
															
 
																+    '"Google Chrome";v="125", "Chromium";v="125", "Not.A/Brand";v="24"',
															
 
																+    'sec-ch-ua-mobile':
															
 
																+    '?0',
															
 
																+    'sec-ch-ua-platform':
															
 
																+    '"macOS"',
															
 
																+    'sec-fetch-dest':
															
 
																+    'document',
															
 
																+    'sec-fetch-mode':
															
 
																+    'navigate',
															
 
																+    'sec-fetch-site':
															
 
																+    'none',
															
 
																+    'sec-fetch-user':
															
 
																+    '?1',
															
 
																+    'upgrade-insecure-requests':
															
 
																+    '1',
															
 
																+    'user-agent':
															
 
																+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
															
 
																+}
															
 
																+
															
 
																+
															
 
																+# 自定义异常类
															
 
																+class ConversionError(Exception):
															
 
																+    pass
															
 
																+
															
 
																+
															
 
																+def extract_data_from_html():
															
 
																+    response = requests.request("GET", url, headers=headers, params=params)
															
 
																+
															
 
																+    # 使用 BeautifulSoup 解析 HTML 结果
															
 
																+    soup = BeautifulSoup(response.content, "html.parser")
															
 
																+
															
 
																+    # 找到 body 标签
															
 
																+    body_tag = soup.find("body")
															
 
																+
															
 
																+    # 在 body 标签内部找到第一级的 script 标签
															
 
																+    first_level_scripts = body_tag.find_all("script", recursive=False)
															
 
																+
															
 
																+    # 遍历第一级的 script 标签,并将其内容转换为 Python 字典
															
 
																+    for script_tag in first_level_scripts:
															
 
																+        script_content = script_tag.string
															
 
																+        compressed_string = re.sub(r"\n", "", script_content)
															
 
																+        if compressed_string:
															
 
																+            # 使用正则表达式提取 window.rawData 赋值语句
															
 
																+            match = re.search(r"window\.rawData\s*=\s*(\{.+?\});",
															
 
																+                              compressed_string)
															
 
																+            if match:
															
 
																+                raw_data_value = match.group(1)
															
 
																+                print(raw_data_value)
															
 
																+                # 尝试使用 json.loads() 将值转换为 Python 字典对象
															
 
																+                try:
															
 
																+                    raw_data = json.loads(raw_data_value)
															
 
																+                except (ValueError, TypeError) as e:
															
 
																+                    # 如果 JSON 解析失败,则使用 eval() 函数尝试解析
															
 
																+                    raw_data = eval(raw_data_value)
															
 
																+                    print(f"Error converting value : {e}")
															
 
																+                    raise ConversionError(f"Error converting value : {e}")
															
 
																+
															
 
																+                return raw_data
															
 
																+    return None
															
 
																+
															
 
																+
															
 
																+def write_data_to_excel(data, columns=None):
															
 
																+    if data:
															
 
																+        try:
															
 
																+            list = data['stores']['store']['data']['ssrListData']['list']
															
 
																+            searchKey = data['stores']['store']['data']['ssrListData'][
															
 
																+                'searchKey']
															
 
																+        except (KeyError, TypeError) as e:
															
 
																+            print(f"Error parse value : {e}")
															
 
																+            list = None
															
 
																+
															
 
																+        if list:
															
 
																+            df = pd.DataFrame(list)
															
 
																+
															
 
																+            # 如果用户指定了列名,则使用指定的列名
															
 
																+            if columns:
															
 
																+                df = df[columns]
															
 
																+
															
 
																+            # 将 DataFrame 写入 Excel 文件
															
 
																+            output_file = f"output_{searchKey}.xlsx"
															
 
																+            df.to_excel(output_file, index=False, engine='xlsxwriter')
															
 
																+        else:
															
 
																+            print("No data found in the JSON file.")
															
 
																+    else:
															
 
																+        print("No data to write to Excel.")
															
 
																+
															
 
																+
															
 
																+if __name__ == "__main__":
															
 
																+    try:
															
 
																+        raw_data = extract_data_from_html()
															
 
																+        write_data_to_excel(
															
 
																+            raw_data, columns=['goodsID', 'goodsName', 'goodsName', 'linkURL'])
															
 
																+    except ConversionError as e:
															
 
																+        print(e)
															
 
																+    except Exception as e:
															
 
																+        print(f"An error occurred: {e}")
															
--- a/output_卡奇尔.xlsx
+++ b/output_卡奇尔.xlsx
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,18 @@
 
																+beautifulsoup4==4.12.3
															
 
																+bs4==0.0.2
															
 
																+certifi==2024.6.2
															
 
																+charset-normalizer==3.3.2
															
 
																+et-xmlfile==1.1.0
															
 
																+idna==3.7
															
 
																+lxml==5.2.2
															
 
																+numpy==2.0.0
															
 
																+openpyxl==3.1.4
															
 
																+pandas==2.2.2
															
 
																+python-dateutil==2.9.0.post0
															
 
																+pytz==2024.1
															
 
																+requests==2.32.3
															
 
																+six==1.16.0
															
 
																+soupsieve==2.5
															
 
																+tzdata==2024.1
															
 
																+urllib3==2.2.2
															
 
																+XlsxWriter==3.2.0
															
--- a/test.py
+++ b/test.py
@@ -0,0 +1,16 @@
 
																+import json
															
 
																+import pandas as pd
															
 
																+
															
 
																+with open('data.json', 'r') as f:
															
 
																+    data = json.load(f)
															
 
																+
															
 
																+try:
															
 
																+    list = data['stores']['store']['data']['ssrListData']['list']
															
 
																+except (KeyError, TypeError) as e:
															
 
																+    print(f"Error parse value : {e}")
															
 
																+    list = None
															
 
																+
															
 
																+df = pd.DataFrame(list)
															
 
																+
															
 
																+# 将 DataFrame 写入 Excel 文件
															
 
																+df.to_excel('output.xlsx', index=False, engine='xlsxwriter')