@@ -117,7 +117,7 @@ def worker(q):
             filter = {'corpusId': corpus_id}
             update = {'$set': data}
             result = papers_data.update_one(filter, update, upsert=True)
-            # mark_data_as_consumed(corpus_id)
+            mark_data_as_consumed(corpus_id)
             print(result.upserted_id, "inserted successfully")
         except Exception as error:
             # handle the exception
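
Note that `update_one(..., upsert=True)` only inserts when no document matches the filter; on a plain update, `result.upserted_id` is `None`, so the "inserted successfully" message carries an id only for fresh inserts. The re-enabled `mark_data_as_consumed` helper is not shown in this diff; a minimal sketch of what it presumably does, assuming a pymongo collection named `queue_data` with a boolean `consumed` flag (both names are hypothetical):

def mark_data_as_consumed(corpus_id):
    # Hypothetical sketch -- the real helper is defined elsewhere in the file.
    # Flip a flag on the queue entry so this paper is not fetched again.
    queue_data.update_one(
        {'corpusId': corpus_id},
        {'$set': {'consumed': True}},
    )
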
@@ -148,6 +148,7 @@ def get_references(paper_id):
     return list(edge['citedPaper'] for edge in edges)
 
 
+# This endpoint is gated by human verification (CAPTCHA)
 def get_related_pages(paper_id):
     rsp = requests.get(url=f'https://www.semanticscholar.org/api/1/paper/{paper_id}/related-papers?limit=10&recommenderType=relatedPapers',
                        headers={'x-api-key': S2_API_KEY},
@@ -156,6 +157,14 @@ def get_related_pages(paper_id):
     return rsp.json()['papers']
 
 
+def get_recommender_pages(paper_id):
+    rsp = requests.get(url=f'https://api.semanticscholar.org/recommendations/v1/papers/forpaper/{paper_id}',
+                       headers={'x-api-key': S2_API_KEY},
+                       params={'fields': QUERY_FIELDS2})
+    rsp.raise_for_status()
+    return rsp.json()['recommendedPapers']
+
+
 def get_citation_edges(**req_kwargs):
     """This helps with API endpoints that involve paging."""
     page_size = 1000
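
The new `get_recommender_pages` switches from the internal `related-papers` endpoint to the public Recommendations API, which does not sit behind the CAPTCHA. Since `rsp.raise_for_status()` raises on rate-limit responses (typically HTTP 429), callers may want a small retry wrapper; a minimal sketch, with a hypothetical wrapper name and arbitrary retry/backoff constants:

import time
import requests

def fetch_recommendations_with_retry(paper_id, retries=3, backoff=2.0):
    # Retry only on HTTP 429 (rate limiting); re-raise anything else.
    for attempt in range(retries):
        try:
            return get_recommender_pages(paper_id)
        except requests.HTTPError as err:
            if err.response is not None and err.response.status_code == 429 \
                    and attempt < retries - 1:
                time.sleep(backoff * (attempt + 1))  # linear backoff
            else:
                raise
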
@@ -182,7 +191,7 @@ def fetch_data(paper_id):
     # print(paper)
     data['citations'] = get_citations(paper_id)
     data['references'] = get_references(paper_id)
-    data['relatedPages'] = get_related_pages(paper_id)
+    data['recommenderPages'] = get_recommender_pages(paper_id)
 
     return data if isinstance(data, dict) else None
 
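
Because the upsert in `worker` uses `$set`, documents crawled before this change keep their stale `relatedPages` field alongside the new `recommenderPages` one. If mixed documents are a problem, a one-off cleanup over the same `papers_data` collection could look like this (a sketch, not part of the diff):

# Drop the stale field from previously crawled documents.
papers_data.update_many(
    {'relatedPages': {'$exists': True}},
    {'$unset': {'relatedPages': ''}},
)
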
@@ -193,7 +202,7 @@ def onSigInt(signo, frame):
 
 if __name__ == "__main__":
     # exit signal for the main process
-    signal.signal(signal.SIGINT, onSigInt)
+    # signal.signal(signal.SIGINT, onSigInt)
 
     parser = argparse.ArgumentParser(description="Crawl data from URLs")
     parser.add_argument(
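
Commenting out the registration means Ctrl-C falls back to Python's default behaviour of raising `KeyboardInterrupt` in the main thread. The body of `onSigInt` is not shown in this diff; a typical shape for such a handler, purely as an illustrative sketch:

import signal
import sys

def onSigInt(signo, frame):
    # Illustrative only -- the real handler is defined earlier in the file.
    print("SIGINT received, shutting down...")
    sys.exit(0)

# Re-enabling it is the one commented-out line:
# signal.signal(signal.SIGINT, onSigInt)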