ソースを参照

modify 分批插入paper列表

tangs 11ヶ月前
コミット
70be831bed
1 ファイル変更、15 行追加、2 行削除
  1. 15 2
      spider.py

+ 15 - 2
spider.py

@@ -53,8 +53,21 @@ def add_paper(file_path):
     inserted_ids = 0
 
     try:
-        result = papers.insert_many(data_list, ordered=False)
-        inserted_ids = len(result.inserted_ids)
+        sub_list = []
+        for line in data_list:
+            sub_list.append(line)
+
+            if len(sub_list) == 2000:
+                result = papers.insert_many(sub_list, ordered=False)
+                inserted_ids += len(result.inserted_ids)
+                sub_list = []
+
+        if sub_list:
+            result = papers.insert_many(sub_list, ordered=False)
+            inserted_ids += len(result.inserted_ids)
+            sub_list = []
+
+        print('-------process', inserted_ids, '/', len(data_list))
     except pymongo.errors.BulkWriteError as e:
         inserted_ids = e.details['nInserted']
     finally: