
use get_recommender_pages instead of get_related_pages

Ben 1 year ago
parent commit ea5d771230
2 changed files with 18 additions and 4 deletions
  1. README.md (+6 -1)
  2. spider.py (+12 -3)

+ 6 - 1
README.md

@@ -26,12 +26,17 @@ curl --location 'https://api.semanticscholar.org/graph/v1/paper/61822dc4ea365e14
 --header 'x-api-key: B4YUQrO6w07Zyx9LN8V3p5Lg0WrrGDK520fWJfYD'
 ```
 
-Related papers `related-papers`:
+~~Related papers `related-papers`:~~ (the web endpoint requires human verification)
 ```bash
 curl --location 'https://www.semanticscholar.org/api/1/paper/61822dc4ea365e1499fbdae7958aa317ad78f39f/related-papers?limit=15&recommenderType=relatedPapers' \
 --header 'Cookie: tid=rBIABmR91dK7TwAJJIRdAg=='
 ```
 
+Recommended papers:
+```bash
+curl --location 'https://api.semanticscholar.org/recommendations/v1/papers/forpaper/61822dc4ea365e1499fbdae7958aa317ad78f39f?fields=url,abstract,authors'
+```
+
 ### Data dictionary
 
 ```json

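For context, a minimal Python sketch of the recommendations call described in the README above (assuming the `requests` library and a placeholder API key; the `recommendedPapers` response key matches what `get_recommender_pages` reads in the spider.py diff below):

```python
# Minimal sketch of the recommendations call shown in the README above.
# The API key and paper ID are placeholders; 'fields' mirrors the README example.
import requests

S2_API_KEY = "<your-api-key>"
PAPER_ID = "61822dc4ea365e1499fbdae7958aa317ad78f39f"

rsp = requests.get(
    f"https://api.semanticscholar.org/recommendations/v1/papers/forpaper/{PAPER_ID}",
    headers={"x-api-key": S2_API_KEY},
    params={"fields": "url,abstract,authors"},
)
rsp.raise_for_status()
for paper in rsp.json()["recommendedPapers"]:
    print(paper["url"])
```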
+ 12 - 3
spider.py

@@ -117,7 +117,7 @@ def worker(q):
                 filter = {'corpusId': corpus_id}
                 update = {'$set': data}
                 result = papers_data.update_one(filter, update, upsert=True)
-                # mark_data_as_consumed(corpus_id)
+                mark_data_as_consumed(corpus_id)
                 print(result.upserted_id, "inserted successfully")
         except Exception as error:
             # handle the exception
@@ -148,6 +148,7 @@ def get_references(paper_id):
     return list(edge['citedPaper'] for edge in edges)
 
 
+# This web endpoint requires human verification (CAPTCHA)
 def get_related_pages(paper_id):
     rsp = requests.get(url=f'https://www.semanticscholar.org/api/1/paper/{paper_id}/related-papers?limit=10&recommenderType=relatedPapers',
                        headers={'x-api-key': S2_API_KEY},
@@ -156,6 +157,14 @@ def get_related_pages(paper_id):
     return rsp.json()['papers']
 
 
+def get_recommender_pages(paper_id):
+    rsp = requests.get(url=f'https://api.semanticscholar.org/recommendations/v1/papers/forpaper/{paper_id}',
+                       headers={'x-api-key': S2_API_KEY},
+                       params={'fields': QUERY_FIELDS2})
+    rsp.raise_for_status()
+    return rsp.json()['recommendedPapers']
+
+
 def get_citation_edges(**req_kwargs):
     """This helps with API endpoints that involve paging."""
     page_size = 1000
@@ -182,7 +191,7 @@ def fetch_data(paper_id):
     # print(paper)
     data['citations'] = get_citations(paper_id)
     data['references'] = get_references(paper_id)
-    data['relatedPages'] = get_related_pages(paper_id)
+    data['recommenderPages'] = get_recommender_pages(paper_id)
 
     return data if isinstance(data, dict) else None
 
@@ -193,7 +202,7 @@ def onSigInt(signo, frame):
 
 if __name__ == "__main__":
     # Main-process exit signal
-    signal.signal(signal.SIGINT, onSigInt)
+    # signal.signal(signal.SIGINT, onSigInt)
 
     parser = argparse.ArgumentParser(description="Crawl data from URLs")
     parser.add_argument(