@@ -58,7 +58,7 @@ def add_paper(file_path):
 
 
 def crawl_data():
-    papers_data.create_index("corpusid", unique=True)
+    papers_data.create_index("corpusId", unique=True)
 
     # Create the task queue and worker threads
     q = Queue()
@@ -73,12 +73,14 @@ def crawl_data():
 
     # Read URLs from the database and add them to the task queue
     for data in papers.find():
+        url = data["url"]
+        corpusid = data["corpusid"]
         if 'consumed' in data.keys() and data['consumed'] is True:
+            print(corpusid, "already inserted")
             continue
         # print(data['corpusid'])
         # print(data['url'])
-        url = data["url"]
-        corpusid = data["corpusid"]
+
        q.put((url, corpusid))
         break
 
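For context, here is a minimal sketch of the consumer side this change assumes: worker threads pull (url, corpusid) pairs off the queue, fetch the paper, store it in papers_data keyed on the unique "corpusId" index, and mark the source record as consumed so the new 'consumed' check skips it on later runs. The connection string, database and collection names, thread count, and the download/update logic below are illustrative assumptions, not part of the diff.

# Sketch only -- names and behavior below are assumptions for illustration.
import threading
from queue import Queue

import requests                      # assumed HTTP client for fetching papers
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")    # assumed connection string
db = client["s2"]                                    # assumed database name
papers = db["papers"]            # source collection iterated in crawl_data()
papers_data = db["papers_data"]  # target collection indexed on "corpusId"

q = Queue()

def worker():
    # Each worker consumes (url, corpusid) tuples produced by crawl_data().
    while True:
        url, corpusid = q.get()
        try:
            resp = requests.get(url, timeout=30)
            resp.raise_for_status()
            # Upsert keyed on the unique "corpusId" index created in the diff.
            papers_data.update_one(
                {"corpusId": corpusid},
                {"$set": {"corpusId": corpusid, "content": resp.content}},
                upsert=True,
            )
            # Mark the source record so the 'consumed' check skips it next run.
            papers.update_one({"corpusid": corpusid},
                              {"$set": {"consumed": True}})
        finally:
            q.task_done()

for _ in range(4):   # assumed number of worker threads
    threading.Thread(target=worker, daemon=True).start()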