Ver código fonte

first commit

Ben 6 meses atrás
commit
9b8055ff8e
7 arquivos alterados com 357 adições e 0 exclusões
  1. 162 0
      .gitignore
  2. 32 0
      README.md
  3. 1 0
      data.json
  4. 128 0
      main.py
  5. BIN
      output_卡奇尔.xlsx
  6. 18 0
      requirements.txt
  7. 16 0
      test.py

+ 162 - 0
.gitignore

@@ -0,0 +1,162 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/

+ 32 - 0
README.md

@@ -0,0 +1,32 @@
+# kaqier
+
+PDD 商品数据解析
+
+## 开发与测试
+
+1、创建虚拟环境
+
+```
+python3 -m venv venv # 创建环境
+. venv/bin/activate # 激活环境
+deactivate # 退出环境
+
+```
+
+2、安装依赖
+
+```
+pip3 install -r requirements.txt
+```
+
+3、导出依赖
+
+```
+pip3 freeze > requirements.txt
+```
+
+4、运行与测试
+
+```
+python3 main.py
+```

Diferenças do arquivo suprimidas por serem muito extensas
+ 1 - 0
data.json


+ 128 - 0
main.py

@@ -0,0 +1,128 @@
+import requests
+from bs4 import BeautifulSoup
+import json
+import re
+import pandas as pd
+
# Request configuration for the PDD (Pinduoduo) mobile search page.
# NOTE: the cookie below carries a session token captured from a browser;
# the request only works while that session is still valid.
url = "https://mobile.yangkeduo.com/search_result.html"

# Query-string parameters for the search request.
params = {
    "search_key": "卡奇尔",
    "search_type": "goods",
    "source": "index",
    "options": 1,
    "search_met_track": "manual",
    "refer_page_el_sn": 99885,
    "refer_page_name": "psnl_verification",
    "refer_page_id": "10390_1719041565192_kvy50ivy6o",
    "refer_page_sn": 10390,
}

# Currently unused request body placeholder (GET request sends no body).
payload = {}

# Browser-like headers copied from a real Chrome session so the page is
# served as it would be to a browser.
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'max-age=0',
    'cookie': 'api_uid=CiHdtmZ2bOUdZABVWAe/Ag==; _nano_fp=Xpmalp9qnqgylpdJlT_h~uAFy_JoLAWdkdOx0hVt; webp=1; jrpl=RSwIv0e0mE9DfvQqFqfWBr1n5OMeNIQR; njrpl=RSwIv0e0mE9DfvQqFqfWBr1n5OMeNIQR; dilx=bSyz2efIuySKdkq3pYfqD; PDDAccessToken=7JQFYB6SD5FTLTZHX7ECKGBHS3X64WVSHFDD6WNSBRIG6HYMA7UA121b8be; pdd_user_id=6082723443128; pdd_user_uin=X6Q3CK6ATURUPGYQNQFRRKXTA4_GEXDA; pdd_vds=gaLLNOQonbnLInGENOaiEionoNLiNOIotILGmynNOILtGLPQmNPoNmiOoQNo; pdd_vds=gaLLNOQonbnOGQaEGbiIPyiaEatOiELtGtiELILONnIInGmoNPGtmmnINEiP',
    'priority': 'u=0, i',
    'sec-ch-ua': '"Google Chrome";v="125", "Chromium";v="125", "Not.A/Brand";v="24"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'none',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
}
+
+
class ConversionError(Exception):
    """Raised when the extracted ``window.rawData`` payload cannot be
    converted into a Python dictionary."""
+
+
def extract_data_from_html():
    """Fetch the PDD search-result page and extract its ``window.rawData``.

    Returns:
        dict: the parsed ``window.rawData`` payload, or ``None`` when no
        top-level ``<script>`` tag contains a ``window.rawData = {...};``
        assignment (or the page has no ``<body>``).

    Raises:
        ConversionError: when the extracted payload is not valid JSON.
    """
    # Timeout so a stalled connection cannot hang the script forever.
    response = requests.get(url, headers=headers, params=params, timeout=30)

    # Parse the returned HTML.
    soup = BeautifulSoup(response.content, "html.parser")
    body_tag = soup.find("body")
    if body_tag is None:
        # Defensive: an anti-bot/error page may have no <body> at all.
        return None

    # Only direct <script> children of <body> are inspected.
    for script_tag in body_tag.find_all("script", recursive=False):
        script_content = script_tag.string
        if not script_content:
            # .string is None for empty or nested scripts; the original
            # code crashed inside re.sub() on that case.
            continue
        compressed = script_content.replace("\n", "")

        # Extract the right-hand side of the window.rawData assignment.
        match = re.search(r"window\.rawData\s*=\s*(\{.+?\});", compressed)
        if not match:
            continue

        raw_data_value = match.group(1)
        try:
            return json.loads(raw_data_value)
        except (ValueError, TypeError) as e:
            # The original fell back to eval() here, which executes
            # arbitrary expressions from an untrusted web page and whose
            # result was discarded before raising anyway. Report the
            # failure instead of evaluating it.
            print(f"Error converting value : {e}")
            raise ConversionError(f"Error converting value : {e}") from e
    return None
+
+
+def write_data_to_excel(data, columns=None):
+    if data:
+        try:
+            list = data['stores']['store']['data']['ssrListData']['list']
+            searchKey = data['stores']['store']['data']['ssrListData'][
+                'searchKey']
+        except (KeyError, TypeError) as e:
+            print(f"Error parse value : {e}")
+            list = None
+
+        if list:
+            df = pd.DataFrame(list)
+
+            # 如果用户指定了列名,则使用指定的列名
+            if columns:
+                df = df[columns]
+
+            # 将 DataFrame 写入 Excel 文件
+            output_file = f"output_{searchKey}.xlsx"
+            df.to_excel(output_file, index=False, engine='xlsxwriter')
+        else:
+            print("No data found in the JSON file.")
+    else:
+        print("No data to write to Excel.")
+
+
if __name__ == "__main__":
    try:
        raw_data = extract_data_from_html()
        # One entry per output column. The original listed 'goodsName'
        # twice, which duplicated that column in the spreadsheet.
        write_data_to_excel(raw_data,
                            columns=['goodsID', 'goodsName', 'linkURL'])
    except ConversionError as e:
        print(e)
    except Exception as e:
        # Last-resort boundary handler so the script exits with a message
        # instead of a traceback.
        print(f"An error occurred: {e}")

BIN
output_卡奇尔.xlsx


+ 18 - 0
requirements.txt

@@ -0,0 +1,18 @@
+beautifulsoup4==4.12.3
+bs4==0.0.2
+certifi==2024.6.2
+charset-normalizer==3.3.2
+et-xmlfile==1.1.0
+idna==3.7
+lxml==5.2.2
+numpy==2.0.0
+openpyxl==3.1.4
+pandas==2.2.2
+python-dateutil==2.9.0.post0
+pytz==2024.1
+requests==2.32.3
+six==1.16.0
+soupsieve==2.5
+tzdata==2024.1
+urllib3==2.2.2
+XlsxWriter==3.2.0

+ 16 - 0
test.py

@@ -0,0 +1,16 @@
import json
import pandas as pd

# Offline fixture test: read a previously captured rawData payload from
# data.json and reproduce the Excel export done by main.py.
# Explicit encoding: the payload contains CJK text.
with open('data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

try:
    # 'list' is the key name in the payload; use a non-shadowing local
    # name instead of the builtin `list`.
    records = data['stores']['store']['data']['ssrListData']['list']
except (KeyError, TypeError) as e:
    print(f"Error parse value : {e}")
    records = None

if records:
    df = pd.DataFrame(records)
    # Write the DataFrame to an Excel file (requires XlsxWriter).
    df.to_excel('output.xlsx', index=False, engine='xlsxwriter')
else:
    # The original wrote an empty output.xlsx even when parsing failed;
    # skip the write instead of emitting a bogus file.
    print("No data found in the JSON file.")