
modify: change paper file reading to line-by-line reading

tangs 11 months ago
parent
commit
ae068d4e5a
1 changed file with 13 additions and 11 deletions

spider.py  +13 -11

@@ -48,32 +48,34 @@ def read_file(filename):
 def add_paper(file_path):
     papers.create_index("corpusid", unique=True)
     # read the paper file and store it in the database
-    data_list = read_file(file_path)
+    # data_list = read_file(file_path)
     # insert the data in batches
     inserted_ids = 0
 
     try:
         sub_list = []
-        for line in data_list:
-            sub_list.append(line)
+        with open(file_path, 'r') as f:
+            for line in f:
+                line_dict = json.loads(line)
+                sub_list.append(line_dict)
 
-            if len(sub_list) == 2000:
+                if len(sub_list) == 2000:
+                    result = papers.insert_many(sub_list, ordered=False)
+                    inserted_ids += len(result.inserted_ids)
+                    sub_list = []
+
+            if sub_list:
                 result = papers.insert_many(sub_list, ordered=False)
                 inserted_ids += len(result.inserted_ids)
                 sub_list = []
 
-        if sub_list:
-            result = papers.insert_many(sub_list, ordered=False)
-            inserted_ids += len(result.inserted_ids)
-            sub_list = []
-
-        print('-------process', inserted_ids, '/', len(data_list))
+        print('-------process', inserted_ids, '/', '7318795')
     except pymongo.errors.BulkWriteError as e:
         inserted_ids = e.details['nInserted']
     finally:
        # print the insertion result
        print("Total records: {0}, newly inserted: {1}, already in collection: {2}".format(
-            len(data_list), inserted_ids, papers.count_documents({})))
+            7318795, inserted_ids, papers.count_documents({})))
 
 
 def crawl_data():
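
The change replaces the all-at-once read_file call with streaming: each line of the dump is parsed with json.loads and buffered, and every 2000 documents the buffer is flushed to MongoDB with insert_many(..., ordered=False), so duplicate corpusid values hitting the unique index do not abort the rest of a batch. A minimal standalone sketch of that pattern follows; the connection URI, database and collection names, and file name are placeholders rather than values from this repository, and unlike the commit it catches BulkWriteError per batch instead of around the whole loop.

import json

import pymongo
from pymongo.errors import BulkWriteError

# Placeholder connection details; the real spider.py defines its own client and collection.
client = pymongo.MongoClient("mongodb://localhost:27017")
papers = client["papers_db"]["papers"]

BATCH_SIZE = 2000  # same batch size as the commit


def flush(batch):
    # Insert one batch; with ordered=False, duplicate-key errors on the unique
    # corpusid index are reported but the remaining documents are still inserted.
    try:
        result = papers.insert_many(batch, ordered=False)
        return len(result.inserted_ids)
    except BulkWriteError as e:
        return e.details["nInserted"]


def insert_jsonl(file_path):
    # Stream a newline-delimited JSON file and insert it in fixed-size batches,
    # so the whole dump never has to fit in memory at once.
    papers.create_index("corpusid", unique=True)
    inserted = 0
    batch = []
    with open(file_path, "r") as f:
        for line in f:
            batch.append(json.loads(line))
            if len(batch) == BATCH_SIZE:
                inserted += flush(batch)
                batch = []
    if batch:  # final partial batch
        inserted += flush(batch)
    return inserted


# Hypothetical usage:
# print(insert_jsonl("papers.jsonl"), "documents inserted")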