2 Commits 6f26b448a2 ... f85a75602d

Author SHA1 Message Date
  Ben f85a75602d fix papers_data.create_index 1 year ago
  Ben ea5d771230 use get_recommender_pages instead of get_related_pages 1 year ago
2 changed files with 23 additions and 7 deletions
  1. README.md (+6 −1)
  2. spider.py (+17 −6)

+ 6 - 1
README.md

@@ -26,12 +26,17 @@ curl --location 'https://api.semanticscholar.org/graph/v1/paper/61822dc4ea365e14
 --header 'x-api-key: B4YUQrO6w07Zyx9LN8V3p5Lg0WrrGDK520fWJfYD'
 ```
 
-Related papers `related-papers`:
+~~Related papers `related-papers`:~~ (the web endpoint is protected by human verification)
 ```bash
 curl --location 'https://www.semanticscholar.org/api/1/paper/61822dc4ea365e1499fbdae7958aa317ad78f39f/related-papers?limit=15&recommenderType=relatedPapers' \
 --header 'Cookie: tid=rBIABmR91dK7TwAJJIRdAg=='
 ```
 
+Recommended papers:
+```bash
+curl --location 'https://api.semanticscholar.org/recommendations/v1/papers/forpaper/61822dc4ea365e1499fbdae7958aa317ad78f39f?fields=url,abstract,authors'
+```
+
 ### Data dictionary
 
 ```json

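For context, a minimal standalone sketch of the recommendations call that the README documents and that `get_recommender_pages` implements below. The helper name, the `fields` default, and the `S2_API_KEY` placeholder are illustrative assumptions, not part of the commit:

```python
import requests

S2_API_KEY = "YOUR_API_KEY"  # placeholder; spider.py loads its own S2_API_KEY


def fetch_recommendations(paper_id, fields="url,abstract,authors"):
    # Recommendations endpoint from the README; unlike the old related-papers
    # endpoint on www.semanticscholar.org, it is not behind human verification.
    rsp = requests.get(
        f"https://api.semanticscholar.org/recommendations/v1/papers/forpaper/{paper_id}",
        headers={"x-api-key": S2_API_KEY},
        params={"fields": fields},
    )
    rsp.raise_for_status()
    # The response wraps the results in a 'recommendedPapers' list.
    return rsp.json()["recommendedPapers"]


if __name__ == "__main__":
    for paper in fetch_recommendations("61822dc4ea365e1499fbdae7958aa317ad78f39f"):
        print(paper.get("url"))
```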
+ 17 - 6
spider.py

@@ -58,7 +58,7 @@ def add_paper(file_path):
 
 
 def crawl_data():
-    papers_data.create_index("corpusid", unique=True)
+    papers_data.create_index("corpusId", unique=True)
 
     # Create the task queue and worker threads
     q = Queue()
@@ -73,12 +73,14 @@ def crawl_data():
 
     # Read URLs from the database and add them to the task queue
     for data in papers.find():
+        url = data["url"]
+        corpusid = data["corpusid"]
         if 'consumed' in data.keys() and data['consumed'] is True:
+            print(corpusid, "already inserted")
             continue
         # print(data['corpusid'])
         # print(data['url'])
-        url = data["url"]
-        corpusid = data["corpusid"]
+
         q.put((url, corpusid))
         break
 
@@ -117,7 +119,7 @@ def worker(q):
                 filter = {'corpusId': corpus_id}
                 update = {'$set': data}
                 result = papers_data.update_one(filter, update, upsert=True)
-                # mark_data_as_consumed(corpus_id)
+                mark_data_as_consumed(corpus_id)
                 print(result.upserted_id, "inserted successfully")
         except Exception as error:
             # handle the exception
@@ -148,6 +150,7 @@ def get_references(paper_id):
     return list(edge['citedPaper'] for edge in edges)
 
 
+# This endpoint is protected by human verification (CAPTCHA)
 def get_related_pages(paper_id):
     rsp = requests.get(url=f'https://www.semanticscholar.org/api/1/paper/{paper_id}/related-papers?limit=10&recommenderType=relatedPapers',
                        headers={'x-api-key': S2_API_KEY},
@@ -156,6 +159,14 @@ def get_related_pages(paper_id):
     return rsp.json()['papers']
 
 
+def get_recommender_pages(paper_id):
+    rsp = requests.get(url=f'https://api.semanticscholar.org/recommendations/v1/papers/forpaper/{paper_id}',
+                       headers={'x-api-key': S2_API_KEY},
+                       params={'fields': QUERY_FIELDS2})
+    rsp.raise_for_status()
+    return rsp.json()['recommendedPapers']
+
+
 def get_citation_edges(**req_kwargs):
     """This helps with API endpoints that involve paging."""
     page_size = 1000
@@ -182,7 +193,7 @@ def fetch_data(paper_id):
     # print(paper)
     data['citations'] = get_citations(paper_id)
     data['references'] = get_references(paper_id)
-    data['relatedPages'] = get_related_pages(paper_id)
+    data['recommenderPages'] = get_recommender_pages(paper_id)
 
     return data if isinstance(data, dict) else None
 
@@ -193,7 +204,7 @@ def onSigInt(signo, frame):
 
 if __name__ == "__main__":
     # Exit signal for the main process
-    signal.signal(signal.SIGINT, onSigInt)
+    # signal.signal(signal.SIGINT, onSigInt)
 
     parser = argparse.ArgumentParser(description="Crawl data from URLs")
     parser.add_argument(