@@ -58,7 +58,7 @@ def add_paper(file_path):
 
 
 def crawl_data():
-    papers_data.create_index("corpusid", unique=True)
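+    # Unique index on 'corpusId' matches the key used by worker()'s upsert filter, so each paper is stored once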
+    papers_data.create_index("corpusId", unique=True)
 
     # Create the task queue and threads
     q = Queue()
@@ -73,12 +73,14 @@ def crawl_data():
 
     # Read URLs from the database and add them to the task queue
     for data in papers.find():
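+        # Read url/corpusid before the consumed check so the id can be logged for skipped papers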
+        url = data["url"]
+        corpusid = data["corpusid"]
         if 'consumed' in data.keys() and data['consumed'] is True:
+            print(corpusid, "already inserted")
             continue
         # print(data['corpusid'])
         # print(data['url'])
-        url = data["url"]
-        corpusid = data["corpusid"]
+
         q.put((url, corpusid))
         break
@@ -117,7 +119,7 @@ def worker(q):
         filter = {'corpusId': corpus_id}
         update = {'$set': data}
         result = papers_data.update_one(filter, update, upsert=True)
-        # mark_data_as_consumed(corpus_id)
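+        # Record the paper as consumed only after a successful upsert; failed papers stay unmarked and are retried on the next run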
+        mark_data_as_consumed(corpus_id)
         print(result.upserted_id, "inserted successfully")
     except Exception as error:
         # handle the exception
@@ -148,6 +150,7 @@ def get_references(paper_id):
     return list(edge['citedPaper'] for edge in edges)
 
 
+# The endpoint is gated by human verification (CAPTCHA)
 def get_related_pages(paper_id):
     rsp = requests.get(url=f'https://www.semanticscholar.org/api/1/paper/{paper_id}/related-papers?limit=10&recommenderType=relatedPapers',
                        headers={'x-api-key': S2_API_KEY},
@@ -156,6 +159,14 @@ def get_related_pages(paper_id):
     return rsp.json()['papers']
 
 
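+# Use the official Recommendations API rather than the CAPTCHA-gated related-papers endpoint above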
+def get_recommender_pages(paper_id):
+    rsp = requests.get(url=f'https://api.semanticscholar.org/recommendations/v1/papers/forpaper/{paper_id}',
+                       headers={'x-api-key': S2_API_KEY},
+                       params={'fields': QUERY_FIELDS2})
+    rsp.raise_for_status()
+    return rsp.json()['recommendedPapers']
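+# Usage sketch (hypothetical paper_id; assumes QUERY_FIELDS2 requests 'title'):
+#   for paper in get_recommender_pages(paper_id):
+#       print(paper['title'])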
+
+
 def get_citation_edges(**req_kwargs):
     """This helps with API endpoints that involve paging."""
     page_size = 1000
@@ -182,7 +193,7 @@ def fetch_data(paper_id):
     # print(paper)
     data['citations'] = get_citations(paper_id)
     data['references'] = get_references(paper_id)
-    data['relatedPages'] = get_related_pages(paper_id)
+    data['recommenderPages'] = get_recommender_pages(paper_id)
 
     return data if isinstance(data, dict) else None
@@ -193,7 +204,7 @@ def onSigInt(signo, frame):
 
 if __name__ == "__main__":
     # Exit signal for the main process
-    signal.signal(signal.SIGINT, onSigInt)
+    # signal.signal(signal.SIGINT, onSigInt)
 
     parser = argparse.ArgumentParser(description="Crawl data from URLs")
     parser.add_argument(
|