提交 27e85f48 作者: 薛凌堃

习近平讲话脚本调整

上级 8bec5313
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
......@@ -59,12 +59,13 @@ def newsdata(art_content_dict,art_type_dict,dic_lables):
try:
del post_dict['is_repeat']
del post_dict['tags']
del post_dict['title_pd']
# 发送kafka
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], max_request_size=1024 * 1024 * 20)
kafka_result = producer.send("research_center_fourth",
json.dumps(post_dict, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10))
# producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], max_request_size=1024 * 1024 * 20)
# kafka_result = producer.send("research_center_fourth",
# json.dumps(post_dict, ensure_ascii=False).encode('utf8'))
#
# print(kafka_result.get(timeout=10))
dic_result = {
'success': 'ture',
......@@ -122,21 +123,22 @@ def get_content():
except:
print('请求错误1')
continue
for data_dict in data_list[::-1]:
# for data_dict in data_list[::-1]:
for data_dict in data_list[:1]:
article_id = data_dict['article_id']
print(type(article_id))
is_article_id = db_storage.find_one({'id': f"1534423014825668610{article_id}"})
if is_article_id:
continue
title = data_dict['title']
# is_article_id = db_storage.find_one({'id': f"1534423014825668610{article_id}"})
# if is_article_id:
# continue
title = data_dict['title'] # 采集到的标题
pub_time = data_dict['input_date']
current_date = datetime.now()
yesterday = current_date - timedelta(days=1)
# 格式化日期
yesterday_date = yesterday.strftime("%Y-%m-%d")
if pub_time <= yesterday_date:
continue
title_dict_list = db_storage.find({'title': title, 'is_repeat': ''})
# if pub_time <= yesterday_date:
# continue
title_dict_list = db_storage.find({'title_pd': title.replace(' ', ''), 'is_repeat': ''}) # 如果找到一样的标题 判断三天之内是否有重复的
is_repeat = ''
for title_dict in title_dict_list:
pub_time1 = title_dict['publishDate']
......@@ -152,6 +154,14 @@ def get_content():
doc_href = pq(href_text)
content_html1 = str(doc_href('.d2txt_con.clearfix'))
content_html2 = str(doc_href('.editor.clearfix'))
#rtodo: 找到标题并拼接
title1 = doc_href('.d2txt.clearfix h2').text()
title2 = doc_href('.d2txt.clearfix h1').text()
title3 = doc_href('.d2txt.clearfix h3').text()
if title1 == '' and title3 == '':
title_final = title
else:
title_final = title1 + ' ' + title2 + ' ' + title3
except:
print('请求错误2')
continue
......@@ -170,7 +180,8 @@ def get_content():
origin = data_dict['origin_name']
a_dict = {
'id': "1534423014825668610" + article_id,
'title': title,
'title': title_final,
'title_pd': title,
'author': '',
'origin': origin,
'contentWithTag': content_html,
......@@ -183,6 +194,7 @@ def get_content():
}
art_content_dict[article_id] = a_dict
db_a_dict = a_dict.copy()
db_a_dict['title_pd'] = title.replace(' ', '')
db_storage.insert_one(db_a_dict)
if is_repeat == '':
print(href)
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论