@@ -1,10 +1,18 @@
+import requests
+import os
 import signal
 import argparse
 import json
 import pymongo
 from queue import Queue
 from threading import Thread
-from urllib.request import urlopen
+from urllib.parse import urlparse
+
+# S2_API_KEY = os.getenv('S2_API_KEY')
+S2_API_KEY = 'b4YUQrO6w07Zyx9LN8V3p5Lg0WrrGDK520fWJfYd'
+QUERY_FIELDS1 = 'paperId,corpusId,title,authors,year,url,tldr,venue,externalIds,fieldsOfStudy,s2FieldsOfStudy,abstract,citationCount,referenceCount,publicationTypes,influentialCitationCount,publicationDate,journal'
+QUERY_FIELDS2 = 'paperId,corpusId,title,authors,year,url,venue,externalIds,fieldsOfStudy,s2FieldsOfStudy,abstract,citationCount,referenceCount,publicationTypes,influentialCitationCount,publicationDate,journal'
+QUERY_FIELDS3 = 'paperId,corpusId,title,authors'

 # Read the database parameters from the config file
 with open("config.json", "r") as f:
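Note: the hunk above commits the API key in plain text next to a commented-out os.getenv call. A minimal environment-based variant (a sketch only, assuming the key is exported as S2_API_KEY before the script runs) would be:

    # Read the Semantic Scholar API key from the environment instead of the source.
    S2_API_KEY = os.getenv('S2_API_KEY')
    if not S2_API_KEY:
        raise RuntimeError('S2_API_KEY is not set')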
@@ -87,8 +95,9 @@ def crawl_data():
        t.join()


-def mark_data_as_consumed(id):
-    papers.update_one({'_id': id}, {'$set': {'consumed': True}})
+def mark_data_as_consumed(corpus_id):
+    result = papers.update_one({'corpusid': corpus_id}, {
+                               '$set': {'consumed': True}})


 def worker(q):
@@ -96,16 +105,20 @@ def worker(q):
         item = q.get()
         if item is None:
             break
-        print('crawling data: {}'.format(item[0]))
+        url = urlparse(item[0]).path
+        paper_id = url.split('/')[-1]
+        corpus_id = item[1]
+        print('crawling {} data: {}'.format(corpus_id, url))
+
         try:
-            data = fetch_data(item[0])
-            data = {
-                'url': item[0],
-                'corpusid': item[1]
-            }
+            data = fetch_data(paper_id)
             if data is not None:
-                papers_data.insert_one(data)
-                mark_data_as_consumed(item[1])
+                # papers_data.insert_one(data)
+                filter = {'corpusId': corpus_id}
+                update = {'$set': data}
+                result = papers_data.update_one(filter, update, upsert=True)
+                # mark_data_as_consumed(corpus_id)
+                print(result.upserted_id, "inserted successfully")
         except Exception as error:
             # handle the exception
             print("An exception occurred:", error)
@@ -113,11 +126,64 @@ def worker(q):
         q.task_done()


-def fetch_data(url):
-    response = urlopen(url)
-    # time.sleep(5)
-    data = None
-    # data = json.load(response)
+def get_paper(paper_id):
+    rsp = requests.get(f'https://api.semanticscholar.org/graph/v1/paper/{paper_id}',
+                       headers={'x-api-key': S2_API_KEY},
+                       params={'fields': QUERY_FIELDS1})
+    rsp.raise_for_status()
+    return rsp.json()
+
+
+def get_citations(paper_id):
+    edges = get_citation_edges(url=f'https://api.semanticscholar.org/graph/v1/paper/{paper_id}/citations',
+                               headers={'x-api-key': S2_API_KEY},
+                               params={'fields': QUERY_FIELDS2})
+    return list(edge['citingPaper'] for edge in edges)
+
+
+def get_references(paper_id):
+    edges = get_citation_edges(url=f'https://api.semanticscholar.org/graph/v1/paper/{paper_id}/references',
+                               headers={'x-api-key': S2_API_KEY},
+                               params={'fields': QUERY_FIELDS2})
+    return list(edge['citedPaper'] for edge in edges)
+
+
+def get_related_pages(paper_id):
+    rsp = requests.get(url=f'https://www.semanticscholar.org/api/1/paper/{paper_id}/related-papers?limit=10&recommenderType=relatedPapers',
+                       headers={'x-api-key': S2_API_KEY},
+                       params={'fields': QUERY_FIELDS3})
+    rsp.raise_for_status()
+    return rsp.json()['papers']
+
+
+def get_citation_edges(**req_kwargs):
+    """This helps with API endpoints that involve paging."""
+    page_size = 1000
+    offset = 0
+    while True:
+        req_kwargs.setdefault('params', dict())
+        req_kwargs['params']['limit'] = page_size
+        req_kwargs['params']['offset'] = offset
+        rsp = requests.get(**req_kwargs)
+        rsp.raise_for_status()
+
+        page = rsp.json()["data"]
+        for element in page:
+            yield element
+
+        if len(page) < page_size:
+            break  # no more pages
+        offset += page_size
+
+
+def fetch_data(paper_id):
+    print("fetching data:", paper_id)
+    data = get_paper(paper_id)
+    # print(paper)
+    data['citations'] = get_citations(paper_id)
+    data['references'] = get_references(paper_id)
+    data['relatedPages'] = get_related_pages(paper_id)
+
     return data if isinstance(data, dict) else None
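To exercise the new fetch path end to end without the queue and worker threads, a short smoke test can be run from a REPL with the module imported (a sketch only: the paper id is a placeholder to replace with a real Semantic Scholar paper id, and a valid API key and network access are assumed):

    # Hypothetical smoke test, not part of the patch: fetch a single paper
    # directly and report how much citation/reference data came back.
    sample_paper_id = 'REPLACE_WITH_A_SEMANTIC_SCHOLAR_PAPER_ID'
    sample = fetch_data(sample_paper_id)
    if sample is not None:
        print(sample.get('title'))
        print(len(sample['citations']), 'citations,',
              len(sample['references']), 'references,',
              len(sample['relatedPages']), 'related papers')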