
Add web scraping APIs and improve data formatting

Ben 1 year ago
Origin
Commit
6f26b448a2
2 changed files with 2742 additions and 16 deletions
  1. paper_spider.papers_data.json: +2660 -0
  2. spider.py: +82 -16

File diff suppressed because it is too large
+ 2660 - 0
paper_spider.papers_data.json


+ 82 - 16
spider.py

@@ -1,10 +1,18 @@
+import requests
+import os
 import signal
 import argparse
 import json
 import pymongo
 from queue import Queue
 from threading import Thread
-from urllib.request import urlopen
+from urllib.parse import urlparse
+
+# S2_API_KEY = os.getenv('S2_API_KEY')
+S2_API_KEY = 'b4YUQrO6w07Zyx9LN8V3p5Lg0WrrGDK520fWJfYd'
+QUERY_FIELDS1 = 'paperId,corpusId,title,authors,year,url,tldr,venue,externalIds,fieldsOfStudy,s2FieldsOfStudy,abstract,citationCount,referenceCount,publicationTypes,influentialCitationCount,publicationDate,journal'
+QUERY_FIELDS2 = 'paperId,corpusId,title,authors,year,url,venue,externalIds,fieldsOfStudy,s2FieldsOfStudy,abstract,citationCount,referenceCount,publicationTypes,influentialCitationCount,publicationDate,journal'
+QUERY_FIELDS3 = 'paperId,corpusId,title,authors'
 
 # Read database parameters from the config file
 with open("config.json", "r") as f:
@@ -87,8 +95,9 @@ def crawl_data():
         t.join()
 
 
-def mark_data_as_consumed(id):
-    papers.update_one({'_id': id}, {'$set': {'consumed': True}})
+def mark_data_as_consumed(corpus_id):
+    result = papers.update_one({'corpusid': corpus_id}, {
+                               '$set': {'consumed': True}})
 
 
 def worker(q):
@@ -96,16 +105,20 @@ def worker(q):
         item = q.get()
         if item is None:
             break
-        print('crawling data: {}'.format(item[0]))
+        url = urlparse(item[0]).path
+        paper_id = url.split('/')[-1]
+        corpus_id = item[1]
+        print('crawling {} data: {}'.format(corpus_id, url))
+
         try:
-            data = fetch_data(item[0])
-            data = {
-                'url': item[0],
-                'corpusid': item[1]
-            }
+            data = fetch_data(paper_id)
             if data is not None:
-                papers_data.insert_one(data)
-                mark_data_as_consumed(item[1])
+                # papers_data.insert_one(data)
+                filter = {'corpusId': corpus_id}
+                update = {'$set': data}
+                result = papers_data.update_one(filter, update, upsert=True)
+                # mark_data_as_consumed(corpus_id)
+                print(result.upserted_id, "inserted successfully")
         except Exception as error:
             # handle the exception
             print("An exception occurred:", error)
@@ -113,11 +126,64 @@ def worker(q):
             q.task_done()
 
 
-def fetch_data(url):
-    response = urlopen(url)
-    # time.sleep(5)
-    data = None
-    # data = json.load(response)
+def get_paper(paper_id):
+    rsp = requests.get(f'https://api.semanticscholar.org/graph/v1/paper/{paper_id}',
+                       headers={'x-api-key': S2_API_KEY},
+                       params={'fields': QUERY_FIELDS1})
+    rsp.raise_for_status()
+    return rsp.json()
+
+
+def get_citations(paper_id):
+    edges = get_citation_edges(url=f'https://api.semanticscholar.org/graph/v1/paper/{paper_id}/citations',
+                               headers={'x-api-key': S2_API_KEY},
+                               params={'fields': QUERY_FIELDS2})
+    return list(edge['citingPaper'] for edge in edges)
+
+
+def get_references(paper_id):
+    edges = get_citation_edges(url=f'https://api.semanticscholar.org/graph/v1/paper/{paper_id}/references',
+                               headers={'x-api-key': S2_API_KEY},
+                               params={'fields': QUERY_FIELDS2})
+    return list(edge['citedPaper'] for edge in edges)
+
+
+def get_related_pages(paper_id):
+    rsp = requests.get(url=f'https://www.semanticscholar.org/api/1/paper/{paper_id}/related-papers?limit=10&recommenderType=relatedPapers',
+                       headers={'x-api-key': S2_API_KEY},
+                       params={'fields': QUERY_FIELDS3})
+    rsp.raise_for_status()
+    return rsp.json()['papers']
+
+
+def get_citation_edges(**req_kwargs):
+    """This helps with API endpoints that involve paging."""
+    page_size = 1000
+    offset = 0
+    while True:
+        req_kwargs.setdefault('params', dict())
+        req_kwargs['params']['limit'] = page_size
+        req_kwargs['params']['offset'] = offset
+        rsp = requests.get(**req_kwargs)
+        rsp.raise_for_status()
+
+        page = rsp.json()["data"]
+        for element in page:
+            yield element
+
+        if len(page) < page_size:
+            break  # no more pages
+        offset += page_size
+
+
+def fetch_data(paper_id):
+    print("fetching data:", paper_id)
+    data = get_paper(paper_id)
+    # print(paper)
+    data['citations'] = get_citations(paper_id)
+    data['references'] = get_references(paper_id)
+    data['relatedPages'] = get_related_pages(paper_id)
+
     return data if isinstance(data, dict) else None
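
The citations and references endpoints in this diff are consumed through the paging generator `get_citation_edges`. Below is a minimal, self-contained sketch of that paging pattern for the Semantic Scholar Graph API. The paper id is a placeholder taken from the public API documentation, and the API key is assumed to come from the `S2_API_KEY` environment variable rather than being hard-coded.

```python
import os
import requests

S2_API_KEY = os.getenv('S2_API_KEY')  # assumption: key supplied via environment
FIELDS = 'paperId,corpusId,title,authors'


def iter_citation_edges(paper_id, page_size=1000):
    """Yield citation edges page by page until the API returns a short page."""
    offset = 0
    while True:
        rsp = requests.get(
            f'https://api.semanticscholar.org/graph/v1/paper/{paper_id}/citations',
            headers={'x-api-key': S2_API_KEY} if S2_API_KEY else {},
            params={'fields': FIELDS, 'limit': page_size, 'offset': offset})
        rsp.raise_for_status()
        page = rsp.json()['data']
        yield from page
        if len(page) < page_size:
            break  # last page reached
        offset += page_size


if __name__ == '__main__':
    # placeholder paper id used in the Semantic Scholar API docs
    for edge in iter_citation_edges('649def34f8be52c8b66281af98ae884c09aef38b'):
        print(edge['citingPaper']['title'])
```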
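The reworked `worker` also switches from `insert_one` plus `mark_data_as_consumed` to an upsert keyed by `corpusId`. A minimal sketch of that write path is below; the database and collection names mirror the `paper_spider.papers_data` naming seen in this commit but are assumptions here, as is the local MongoDB instance.

```python
import pymongo

client = pymongo.MongoClient('mongodb://localhost:27017')  # assumed local instance
papers_data = client['paper_spider']['papers_data']        # assumed db/collection names


def upsert_paper(corpus_id, data):
    """Insert the paper document or update it in place, keyed by corpusId."""
    result = papers_data.update_one({'corpusId': corpus_id},
                                    {'$set': data},
                                    upsert=True)
    # upserted_id is None when an existing document was updated rather than inserted
    return result.upserted_id


if __name__ == '__main__':
    print(upsert_paper(12345678, {'title': 'example', 'citationCount': 0}))
```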