
modify: add try/except and a retry mechanism when iterating over data by the consumed flag; add fetch-data logging

tangs 1 year ago
commit 9e5881c49b
1 changed file with 16 additions and 7 deletions

+ 16 - 7
spider.py

@@ -80,7 +80,7 @@ def add_paper(file_path):
 
 
 def crawl_data():
-    papers_data.create_index("corpusId", unique=True)
+    # papers_data.create_index("corpusId", unique=True)
 
     # Create the task queue and worker threads
     q = Queue(TASK_QUEUE_LEN)
@@ -94,14 +94,20 @@ def crawl_data():
         threads.append(t)
 
     # Read URLs from the database and add them to the task queue
-    for data in papers.find({'$or': [{'consumed': {'$exists': False}}, {'consumed': False}]}):
-        if quit_flag:
+    while True:
+        try:
+            for data in papers.find({'$or': [{'consumed': {'$exists': False}}, {'consumed': False}]}):
+                if quit_flag:
+                    break
+                if 'consumed' in data and data['consumed']:
+                    print(data['corpusid'], "already inserted")
+                    continue
+                print('add {} to the task queue'.format(data['corpusid']))
+                q.put((data['url'], data['corpusid']))
             break
-        if 'consumed' in data and data['consumed']:
-            print(data['corpusid'], "already inserted")
+        except Exception as e:
+            print('crawl_data error', e)
             continue
-        print('add {} to the task queue'.format(data['corpusid']))
-        q.put((data['url'], data['corpusid']))
 
     #
     print("Waitting for the task queue to complete...")
@@ -213,6 +219,9 @@ def fetch_data(paper_id):
     data['citations'] = get_citations(paper_id)
     data['references'] = get_references(paper_id)
     data['recommendedPapers'] = get_recommended_papers(paper_id)
+    print('>>> fetch data OK, citations: {0}, references: {1}, recommendedPapers: {2}'.format(
+        len(data.get('citations', [])), len(data.get('references', [])), len(data.get('recommendedPapers', []))
+    ))
 
     return data if isinstance(data, dict) else None
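
For reference, the queue-filling section of crawl_data reads as follows after this commit (a sketch assembled from the diff above; papers, q, quit_flag, and the worker threads are defined elsewhere in spider.py):

    # Retry wrapper introduced by this commit: if iterating the MongoDB
    # cursor raises, log the error and restart the scan from the top.
    while True:
        try:
            # Only documents not yet marked consumed are selected.
            for data in papers.find({'$or': [{'consumed': {'$exists': False}},
                                             {'consumed': False}]}):
                if quit_flag:
                    break
                if 'consumed' in data and data['consumed']:
                    print(data['corpusid'], "already inserted")
                    continue
                print('add {} to the task queue'.format(data['corpusid']))
                q.put((data['url'], data['corpusid']))
            break  # cursor exhausted (or quit requested): stop retrying
        except Exception as e:
            print('crawl_data error', e)
            continue  # restart the whole scan

Note that the retry is unbounded and catches every Exception. Each retry re-runs papers.find() from the beginning; documents that workers marked consumed in the meantime are filtered out by the query (and by the in-loop check), so completed work is skipped rather than duplicated. A stricter variant would cap the number of retries or catch only the expected cursor failures (e.g. pymongo.errors.CursorNotFound on long-lived cursors).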