Parcourir la source

fix add_paper

Ben il y a 1 an
Parent
commit
4cbd861f0a
1 fichier modifié avec 12 ajouts et 4 suppressions
  1. 12 4
      spider.py

+ 12 - 4
spider.py

@@ -30,13 +30,21 @@ def read_file(filename):
 
 
 def add_paper(file_path):
+    papers.create_index("corpusid", unique=True)
     # 读取 paper 文件,存入数据库
     data_list = read_file(file_path)
-    print(len(data_list))
     # 批量插入数据
-    result = papers.insert_many(data_list)
-    # 输出插入结果
-    print(result.inserted_ids)
+    inserted_ids = 0
+
+    try:
+        result = papers.insert_many(data_list, ordered=False)
+        inserted_ids = len(result.inserted_ids)
+    except pymongo.errors.BulkWriteError as e:
+        inserted_ids = e.details['nInserted']
+    finally:
+        # 输出插入结果
+        print("总插入数据: {0}, 已插入数据: {1}, 已存在数据: {2}" .format(
+            len(data_list), inserted_ids, papers.count_documents({})))
 
 
 def crawl_data():