
fix papers_data.create_index

Ben committed 1 year ago
parent commit f85a75602d
1 changed file with 5 additions and 3 deletions

+ 5 - 3
spider.py

@@ -58,7 +58,7 @@ def add_paper(file_path):
 
 
 def crawl_data():
-    papers_data.create_index("corpusid", unique=True)
+    papers_data.create_index("corpusId", unique=True)
 
     # Create the task queue and worker threads
     q = Queue()
@@ -73,12 +73,14 @@ def crawl_data():
 
     # Read URLs from the database and add them to the task queue
     for data in papers.find():
+        url = data["url"]
+        corpusid = data["corpusid"]
         if 'consumed' in data.keys() and data['consumed'] is True:
+            print(corpusid, "already inserted")
             continue
         # print(data['corpusid'])
         # print(data['url'])
-        url = data["url"]
-        corpusid = data["corpusid"]
+
         q.put((url, corpusid))
         break
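
For context, a minimal sketch of why the index key casing matters: MongoDB field names are case-sensitive, so a unique index on "corpusid" would not deduplicate documents that actually store the key as "corpusId". The connection string, database name, and sample documents below are assumptions for illustration only, not taken from this repository.

    # Sketch under assumed setup: shows pymongo's unique index rejecting a
    # second document with the same value for the exact field name "corpusId".
    from pymongo import MongoClient
    from pymongo.errors import DuplicateKeyError

    client = MongoClient("mongodb://localhost:27017")   # assumed connection string
    papers_data = client["papers"]["papers_data"]       # assumed database/collection names

    # Unique index on the camelCase key as stored in the documents.
    papers_data.create_index("corpusId", unique=True)

    try:
        papers_data.insert_one({"corpusId": 123, "title": "first copy"})
        papers_data.insert_one({"corpusId": 123, "title": "second copy"})  # violates the index
    except DuplicateKeyError:
        print("duplicate corpusId rejected by the unique index")

Had the index been created on "corpusid" instead, both inserts would succeed, since neither document contains that lowercase field, which is the mismatch this commit corrects.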