@@ -80,7 +80,7 @@ def add_paper(file_path):
 def crawl_data():
-    papers_data.create_index("corpusId", unique=True)
+    # papers_data.create_index("corpusId", unique=True)

     # Create the task queue and worker threads
     q = Queue(TASK_QUEUE_LEN)
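A note on the line that was commented out, as a minimal sketch assuming `papers_data` is a pymongo `Collection` (the connection string and db/collection names below are hypothetical): `create_index()` itself is idempotent and safe to call on every run, but a unique index on `corpusId` makes `insert_one()` raise `DuplicateKeyError` for a repeated paper, so inserts need a guard if the index stays enabled.

    from pymongo import MongoClient
    from pymongo.errors import DuplicateKeyError

    client = MongoClient('mongodb://localhost:27017')  # hypothetical connection string
    papers_data = client['papers']['papers_data']      # hypothetical db/collection names

    # Idempotent: calling this again with the same spec is a no-op.
    papers_data.create_index("corpusId", unique=True)

    def save_paper(doc):
        try:
            papers_data.insert_one(doc)
        except DuplicateKeyError:
            # The unique index rejects a second document with the same corpusId.
            print(doc.get("corpusId"), "already stored, skipping")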
@@ -94,14 +94,20 @@ def crawl_data():
         threads.append(t)

     # Read URLs from the database and add them to the task queue
-    for data in papers.find({'$or': [{'consumed': {'$exists': False}}, {'consumed': False}]}):
-        if quit_flag:
+    while True:
+        try:
+            for data in papers.find({'$or': [{'consumed': {'$exists': False}}, {'consumed': False}]}):
+                if quit_flag:
+                    break
+                if 'consumed' in data and data['consumed']:
+                    print(data['corpusid'], "already inserted")
+                    continue
+                print('add {} to the task queue'.format(data['corpusid']))
+                q.put((data['url'], data['corpusid']))
             break
-        if 'consumed' in data and data['consumed']:
-            print(data['corpusid'], "already inserted")
+        except Exception as e:
+            print('crawl_data error', e)
             continue
-        print('add {} to the task queue'.format(data['corpusid']))
-        q.put((data['url'], data['corpusid']))

     #
     print("Waiting for the task queue to complete...")
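The reshaped loop above is a retry wrapper around the cursor: on a long crawl the server can expire a cursor mid-iteration (e.g. `pymongo.errors.CursorNotFound`), and re-issuing `find()` resumes the pass, while the `consumed` check keeps retries idempotent. A distilled sketch of the pattern, assuming `papers` is a pymongo `Collection` and that `should_quit` and the queue name are stand-ins:

    import pymongo

    def enqueue_unconsumed(papers, q, should_quit):
        # Documents never marked consumed, or explicitly marked not consumed.
        query = {'$or': [{'consumed': {'$exists': False}}, {'consumed': False}]}
        while True:
            try:
                for data in papers.find(query):
                    if should_quit():
                        return
                    if data.get('consumed'):
                        continue  # marked by a worker since the query started
                    q.put((data['url'], data['corpusid']))
                return  # cursor exhausted without error: done
            except pymongo.errors.PyMongoError as e:
                # Cursor timed out or the connection dropped: re-run the query.
                print('crawl_data error, retrying find():', e)
                continue

The "Waiting for the task queue to complete" step that follows then presumably blocks on `q.join()`, which returns once every enqueued item has been matched by a worker's `q.task_done()` call.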
@@ -213,6 +219,9 @@ def fetch_data(paper_id):
     data['citations'] = get_citations(paper_id)
     data['references'] = get_references(paper_id)
     data['recommendedPapers'] = get_recommended_papers(paper_id)
+    print('>>> fetch data OK, citations: {0}, references: {1}, recommendedPapers: {2}'.format(
+        len(data.get('citations', [])), len(data.get('references', [])), len(data.get('recommendedPapers', []))
+    ))

     return data if isinstance(data, dict) else None
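One caveat in the new logging line, shown as a small sketch: `data.get('citations', [])` only falls back to `[]` when the key is missing, so if `get_citations()` ever returns `None` the key exists with value `None` and `len()` raises `TypeError`. An `or []` guard covers both cases (the sample dict below is hypothetical):

    def count(data, field):
        # `or []` handles both a missing key and an explicit None value.
        return len(data.get(field) or [])

    data = {'citations': None, 'references': [1, 2]}  # hypothetical sample
    print('>>> fetch data OK, citations: {0}, references: {1}, recommendedPapers: {2}'.format(
        count(data, 'citations'), count(data, 'references'), count(data, 'recommendedPapers')))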