|
@@ -30,13 +30,21 @@ def read_file(filename):
|
|
|
|
|
|
|
|
|
|
def add_paper(file_path):
    """Load paper records from *file_path* and bulk-insert them into ``papers``.

    A unique index on ``corpusid`` is ensured first, then the records
    returned by ``read_file`` are inserted in one unordered batch, so a
    failing document does not stop the remaining ones from being written.
    A one-line summary is printed afterwards: number of records read,
    number actually inserted, and the total number of documents now in the
    collection.

    Args:
        file_path: Path handed to ``read_file``; it is expected to yield a
            list of documents (assumption -- confirm against ``read_file``).

    Returns:
        None. The result is reported on stdout only.
    """
    # Ensure the unique index so records with an existing "corpusid" are
    # rejected by the server instead of being stored twice.
    papers.create_index("corpusid", unique=True)

    # Read the paper file; its records are what gets stored in the database.
    data_list = read_file(file_path)

    inserted_count = 0

    # insert_many() rejects an empty document list, so skip the round trip
    # when there is nothing to insert and still print the summary below.
    if data_list:
        try:
            # ordered=False: keep inserting the remaining documents after an
            # individual document fails.
            result = papers.insert_many(data_list, ordered=False)
        except pymongo.errors.BulkWriteError as e:
            # Some documents were rejected (presumably duplicate "corpusid"
            # values); the error details still report how many were written.
            inserted_count = e.details['nInserted']
        else:
            inserted_count = len(result.inserted_ids)

    # Print the insert summary. This is deliberately not in a ``finally``
    # clause: after an unexpected failure, querying the collection again
    # could raise a second error and mask the original one.
    print("总插入数据: {0}, 已插入数据: {1}, 已存在数据: {2}".format(
        len(data_list), inserted_count, papers.count_documents({})))
|
|
|
|
|
|
|
|
|
|
def crawl_data():
|
|
def crawl_data():
|