Add web scraping APIs and improve data formatting

Ben committed 1 year ago
commit 6f26b448a2
2 changed files with 2742 additions and 16 deletions:
  1. paper_spider.papers_data.json   + 2660 - 0
  2. spider.py   + 82 - 16

paper_spider.papers_data.json   + 2660 - 0
File diff suppressed because it is too large

spider.py   + 82 - 16

@@ -1,10 +1,18 @@
+import requests
+import os
 import signal
 import argparse
 import json
 import pymongo
 from queue import Queue
 from threading import Thread
-from urllib.request import urlopen
+from urllib.parse import urlparse
+
+# S2_API_KEY = os.getenv('S2_API_KEY')
+S2_API_KEY = 'b4YUQrO6w07Zyx9LN8V3p5Lg0WrrGDK520fWJfYd'
+QUERY_FIELDS1 = 'paperId,corpusId,title,authors,year,url,tldr,venue,externalIds,fieldsOfStudy,s2FieldsOfStudy,abstract,citationCount,referenceCount,publicationTypes,influentialCitationCount,publicationDate,journal'
+QUERY_FIELDS2 = 'paperId,corpusId,title,authors,year,url,venue,externalIds,fieldsOfStudy,s2FieldsOfStudy,abstract,citationCount,referenceCount,publicationTypes,influentialCitationCount,publicationDate,journal'
+QUERY_FIELDS3 = 'paperId,corpusId,title,authors'
 
 # Read database parameters from the config file
 with open("config.json", "r") as f:
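The new constants wire spider.py to the Semantic Scholar Graph API: S2_API_KEY is sent as the x-api-key header and the QUERY_FIELDS lists choose which paper attributes each request returns. A minimal sketch of that request shape, assuming the key is read from the environment as the commented-out line intends (the trimmed field list and function name here are illustrative, not part of the commit):

    import os
    import requests

    S2_API_KEY = os.getenv('S2_API_KEY', '')   # assumed: key supplied via environment variable
    FIELDS = 'paperId,corpusId,title,authors,year,citationCount'   # illustrative subset of QUERY_FIELDS1

    def get_paper_sketch(paper_id):
        # One Graph API call: paper endpoint + field selection + API key header.
        rsp = requests.get(
            f'https://api.semanticscholar.org/graph/v1/paper/{paper_id}',
            headers={'x-api-key': S2_API_KEY},
            params={'fields': FIELDS},
            timeout=30,
        )
        rsp.raise_for_status()
        return rsp.json()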
@@ -87,8 +95,9 @@ def crawl_data():
         t.join()
 
 
-def mark_data_as_consumed(id):
-    papers.update_one({'_id': id}, {'$set': {'consumed': True}})
+def mark_data_as_consumed(corpus_id):
+    result = papers.update_one({'corpusid': corpus_id}, {
+                               '$set': {'consumed': True}})
 
 
 def worker(q):
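mark_data_as_consumed now matches the queued paper by its corpusid field instead of MongoDB's _id before setting the consumed flag. A standalone sketch of that update (the connection and collection names are assumptions; spider.py takes them from config.json):

    import pymongo

    # Assumed local MongoDB; the real host and database come from config.json.
    client = pymongo.MongoClient('mongodb://localhost:27017/')
    papers = client['paper_spider']['papers']

    def mark_data_as_consumed(corpus_id):
        # Flag the paper as crawled so it is not queued again.
        result = papers.update_one({'corpusid': corpus_id},
                                   {'$set': {'consumed': True}})
        return result.modified_count   # 1 if a matching document was updated, else 0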
@@ -96,16 +105,20 @@ def worker(q):
         item = q.get()
         if item is None:
             break
-        print('crawling data: {}'.format(item[0]))
+        url = urlparse(item[0]).path
+        paper_id = url.split('/')[-1]
+        corpus_id = item[1]
+        print('crawling {} data: {}'.format(corpus_id, url))
+
         try:
-            data = fetch_data(item[0])
-            data = {
-                'url': item[0],
-                'corpusid': item[1]
-            }
+            data = fetch_data(paper_id)
             if data is not None:
-                papers_data.insert_one(data)
-                mark_data_as_consumed(item[1])
+                # papers_data.insert_one(data)
+                filter = {'corpusId': corpus_id}
+                update = {'$set': data}
+                result = papers_data.update_one(filter, update, upsert=True)
+                # mark_data_as_consumed(corpus_id)
+                print(result.upserted_id, "inserted successfully")
         except Exception as error:
             # handle the exception
             print("An exception occurred:", error)
@@ -113,11 +126,64 @@ def worker(q):
             q.task_done()
 
 
-def fetch_data(url):
-    response = urlopen(url)
-    # time.sleep(5)
-    data = None
-    # data = json.load(response)
+def get_paper(paper_id):
+    rsp = requests.get(f'https://api.semanticscholar.org/graph/v1/paper/{paper_id}',
+                       headers={'x-api-key': S2_API_KEY},
+                       params={'fields': QUERY_FIELDS1})
+    rsp.raise_for_status()
+    return rsp.json()
+
+
+def get_citations(paper_id):
+    edges = get_citation_edges(url=f'https://api.semanticscholar.org/graph/v1/paper/{paper_id}/citations',
+                               headers={'x-api-key': S2_API_KEY},
+                               params={'fields': QUERY_FIELDS2})
+    return list(edge['citingPaper'] for edge in edges)
+
+
+def get_references(paper_id):
+    edges = get_citation_edges(url=f'https://api.semanticscholar.org/graph/v1/paper/{paper_id}/references',
+                               headers={'x-api-key': S2_API_KEY},
+                               params={'fields': QUERY_FIELDS2})
+    return list(edge['citedPaper'] for edge in edges)
+
+
+def get_related_pages(paper_id):
+    rsp = requests.get(url=f'https://www.semanticscholar.org/api/1/paper/{paper_id}/related-papers?limit=10&recommenderType=relatedPapers',
+                       headers={'x-api-key': S2_API_KEY},
+                       params={'fields': QUERY_FIELDS3})
+    rsp.raise_for_status()
+    return rsp.json()['papers']
+
+
+def get_citation_edges(**req_kwargs):
+    """This helps with API endpoints that involve paging."""
+    page_size = 1000
+    offset = 0
+    while True:
+        req_kwargs.setdefault('params', dict())
+        req_kwargs['params']['limit'] = page_size
+        req_kwargs['params']['offset'] = offset
+        rsp = requests.get(**req_kwargs)
+        rsp.raise_for_status()
+
+        page = rsp.json()["data"]
+        for element in page:
+            yield element
+
+        if len(page) < page_size:
+            break  # no more pages
+        offset += page_size
+
+
+def fetch_data(paper_id):
+    print("fetching data:", paper_id)
+    data = get_paper(paper_id)
+    # print(paper)
+    data['citations'] = get_citations(paper_id)
+    data['references'] = get_references(paper_id)
+    data['relatedPages'] = get_related_pages(paper_id)
+
     return data if isinstance(data, dict) else None
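get_citation_edges pages through the citations and references endpoints with an offset/limit cursor and stops once a short page comes back, while fetch_data stitches the paper record, its citation edges, and the related-papers list into one document. A usage sketch, assuming it runs inside spider.py next to the helpers above (the paper hash is illustrative; a valid S2_API_KEY is required and raise_for_status raises on HTTP errors):

    # Illustrative Semantic Scholar paper hash; requires a valid S2_API_KEY.
    paper_id = '649def34f8be52c8b66281af98ae884c09aef38b'

    record = fetch_data(paper_id)   # paper fields + citations + references + relatedPages
    if record is not None:
        print(record['title'], record['citationCount'])
        print(len(record['citations']), 'citing papers')
        print(len(record['references']), 'referenced papers')
        print(len(record['relatedPages']), 'related papers')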