提交 27e85f48 作者: 薛凌堃

习近平讲话脚本调整

上级 8bec5313
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
...@@ -59,12 +59,13 @@ def newsdata(art_content_dict,art_type_dict,dic_lables): ...@@ -59,12 +59,13 @@ def newsdata(art_content_dict,art_type_dict,dic_lables):
try: try:
del post_dict['is_repeat'] del post_dict['is_repeat']
del post_dict['tags'] del post_dict['tags']
del post_dict['title_pd']
# 发送kafka # 发送kafka
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], max_request_size=1024 * 1024 * 20) # producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], max_request_size=1024 * 1024 * 20)
kafka_result = producer.send("research_center_fourth", # kafka_result = producer.send("research_center_fourth",
json.dumps(post_dict, ensure_ascii=False).encode('utf8')) # json.dumps(post_dict, ensure_ascii=False).encode('utf8'))
#
print(kafka_result.get(timeout=10)) # print(kafka_result.get(timeout=10))
dic_result = { dic_result = {
'success': 'ture', 'success': 'ture',
...@@ -122,21 +123,22 @@ def get_content(): ...@@ -122,21 +123,22 @@ def get_content():
except: except:
print('请求错误1') print('请求错误1')
continue continue
for data_dict in data_list[::-1]: # for data_dict in data_list[::-1]:
for data_dict in data_list[:1]:
article_id = data_dict['article_id'] article_id = data_dict['article_id']
print(type(article_id)) print(type(article_id))
is_article_id = db_storage.find_one({'id': f"1534423014825668610{article_id}"}) # is_article_id = db_storage.find_one({'id': f"1534423014825668610{article_id}"})
if is_article_id: # if is_article_id:
continue # continue
title = data_dict['title'] title = data_dict['title'] # 采集到的标题
pub_time = data_dict['input_date'] pub_time = data_dict['input_date']
current_date = datetime.now() current_date = datetime.now()
yesterday = current_date - timedelta(days=1) yesterday = current_date - timedelta(days=1)
# 格式化日期 # 格式化日期
yesterday_date = yesterday.strftime("%Y-%m-%d") yesterday_date = yesterday.strftime("%Y-%m-%d")
if pub_time <= yesterday_date: # if pub_time <= yesterday_date:
continue # continue
title_dict_list = db_storage.find({'title': title, 'is_repeat': ''}) title_dict_list = db_storage.find({'title_pd': title.replace(' ', ''), 'is_repeat': ''}) # 如果找到一样的标题 判断三天之内是否有重复的
is_repeat = '' is_repeat = ''
for title_dict in title_dict_list: for title_dict in title_dict_list:
pub_time1 = title_dict['publishDate'] pub_time1 = title_dict['publishDate']
...@@ -152,6 +154,14 @@ def get_content(): ...@@ -152,6 +154,14 @@ def get_content():
doc_href = pq(href_text) doc_href = pq(href_text)
content_html1 = str(doc_href('.d2txt_con.clearfix')) content_html1 = str(doc_href('.d2txt_con.clearfix'))
content_html2 = str(doc_href('.editor.clearfix')) content_html2 = str(doc_href('.editor.clearfix'))
#rtodo: 找到标题并拼接
title1 = doc_href('.d2txt.clearfix h2').text()
title2 = doc_href('.d2txt.clearfix h1').text()
title3 = doc_href('.d2txt.clearfix h3').text()
if title1 == '' and title3 == '':
title_final = title
else:
title_final = title1 + ' ' + title2 + ' ' + title3
except: except:
print('请求错误2') print('请求错误2')
continue continue
...@@ -170,7 +180,8 @@ def get_content(): ...@@ -170,7 +180,8 @@ def get_content():
origin = data_dict['origin_name'] origin = data_dict['origin_name']
a_dict = { a_dict = {
'id': "1534423014825668610" + article_id, 'id': "1534423014825668610" + article_id,
'title': title, 'title': title_final,
'title_pd': title,
'author': '', 'author': '',
'origin': origin, 'origin': origin,
'contentWithTag': content_html, 'contentWithTag': content_html,
...@@ -183,6 +194,7 @@ def get_content(): ...@@ -183,6 +194,7 @@ def get_content():
} }
art_content_dict[article_id] = a_dict art_content_dict[article_id] = a_dict
db_a_dict = a_dict.copy() db_a_dict = a_dict.copy()
db_a_dict['title_pd'] = title.replace(' ', '')
db_storage.insert_one(db_a_dict) db_storage.insert_one(db_a_dict)
if is_repeat == '': if is_repeat == '':
print(href) print(href)
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
请 注册 或者 登录 后发表评论