Commit 4f718511 Author: 薛凌堃

Xi Jinping speeches

Parent 7ef6f432
# -*- coding: utf-8 -*-
# @Author: MENG
# @Time : 2022-3-18
import redis
import requests
from langid import langid
from pyquery import PyQuery as pq
import time
import json
import pymongo
from kafka import KafkaProducer
from requests.packages import urllib3

urllib3.disable_warnings()
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').caiji['人民网-习讲话数据库_copy']
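

# newsdata(): enrich each crawled article with label and metadata fields, write its
# tags back to MongoDB, and push non-duplicate records to the Kafka topic
# "research_center_fourth".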
def newsdata(art_content_dict, art_type_dict, dic_lables):
    for key, value in art_content_dict.items():
        labels = []
        tags = art_type_dict.get(key)
        if tags is None:
            tags = []
        value_new = value
        value_new['tags'] = tags
        # TODO: map the crawled tags onto labels via the dic_lables lookup
        for tag in tags:
            labelRemarks = tag['type']
            relationName = tag['name']
            item = labelRemarks + "|" + relationName
            item_value = dic_lables[item]
            labelMark = item_value.split("|")[0]
            relationId = item_value.split("|")[1]
            label = {
                "labelMark": labelMark,
                "labelRemarks": labelRemarks,
                "relationId": relationId,
                "relationName": relationName
            }
            labels.append(label)
        value_new['labels'] = labels
        value_new['subjectId'] = "1534423014825668610"
        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        value_new['createDate'] = time_now
        value_new['checkStatus'] = "1"
        value_new['deleteFlag'] = "0"
        value_new['topNum'] = "0"
        value_new['summary'] = ""
        post_dict = value_new
        for i in range(5):
            try:
                db_storage.update_one({'id': post_dict['id']}, {'$set': {'tags': tags}})
                break
            except:
                time.sleep(2)
                continue
        if post_dict['is_repeat'] == '1':
            continue
        try:
            del post_dict['is_repeat']
            del post_dict['tags']
            # send the record to Kafka
            producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], max_request_size=1024 * 1024 * 20)
            kafka_result = producer.send("research_center_fourth",
                                         json.dumps(post_dict, ensure_ascii=False).encode('utf8'))
            print(kafka_result.get(timeout=10))
            dic_result = {
                'success': 'true',
                'message': 'operation succeeded',
                'code': '200',
            }
            print(dic_result)
            old_dic = post_dict
            try:
                del old_dic['contentWithTag']
                del old_dic['lang']
                del old_dic['labels']
                del old_dic['createDate']
                del old_dic['checkStatus']
                del old_dic['deleteFlag']
                del old_dic['topNum']
                del old_dic['summary']
                # post_url = 'http://114.116.19.92:8088/api/reptile/autoSaveXJPSpeak'
                # headers = {'Content-Type': 'application/json'}
                # resp_json = requests.post(url=post_url, headers=headers, verify=False, data=json.dumps(old_dic)).json()
                # print('pushed:', resp_json['msg'])
            except:
                print('Failed to push data to the interface, retrying!')
        except Exception as e:
            dic_result = {
                'success': 'false',
                'message': 'operation failed',
                'code': '204',
                'e': e
            }
            print(dic_result)
            time.sleep(5)
            db_storage.delete_one({'id': post_dict['id']})
            continue
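

# get_content(): crawl the paginated list API of jhsjk.people.cn, build one a_dict
# per new article (id, title, content, language, tags, ...), collect the
# de-duplicated type tags for each article id, then hand both dicts to newsdata().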
# Xi speech database: insert new data
def get_content():
@@ -23,7 +112,7 @@ def get_content():
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cookie': 'sfr=1; sso_c=0; __jsluid_h=5b9f09f6fdae46fadb89e1e02dca3238; wdcid=04fccdf5121158c0; wdses=72d07de4316a36a5; ci_session=4irerhndk5mc48dq7ldebjn5m47fqptg; wdlast=1646734820; ci_session=4irerhndk5mc48dq7ldebjn5m47fqptg'
    }
    for page in range(9, 0, -1):
        url = f"http://jhsjk.people.cn/testnew/result?keywords=&isFuzzy=0&searchArea=0&year=0&form=0&type=0&page={page}&origin=%E5%85%A8%E9%83%A8&source=2"
        try:
            resp_json = requests.request("GET", url, headers=headers, verify=False).json()
@@ -39,6 +128,8 @@ def get_content():
                continue
            title = data_dict['title']
            pub_time = data_dict['input_date']
            if pub_time <= '2023-11-06':
                continue
            title_dict_list = db_storage.find({'title': title, 'is_repeat': ''})
            is_repeat = ''
            for title_dict in title_dict_list:
@@ -60,19 +151,28 @@ def get_content():
                continue
            content_html = content_html1 + '\n' + content_html2
            content = pq(content_html).text()
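            # detect the article language; langid.classify() returns a
            # (language_code, score) tuple and only the code is kept, defaulting to 'cn'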
            lang = langid.classify(content)
            if lang[0] == '':
                lang = 'cn'
            else:
                lang = lang[0]
            if content.strip() == '':
                print(href, 'content is empty')
                continue
            origin = data_dict['origin_name']
            a_dict = {
                'id': "1534423014825668610" + article_id,
                'title': title,
                'author': '',
                'origin': origin,
                'contentWithTag': content_html,
                'content': content,
                'publishDate': pub_time,
                'sourceAddress': href,
                'tags': [],
                'lang': lang,
                'is_repeat': is_repeat
            }
            art_content_dict[article_id] = a_dict
@@ -86,9 +186,47 @@ def get_content():
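    # Each result_lists row is [category, name, query value, total pages]; rows with
    # category '类型' feed the 'form' query parameter, '时间' feeds 'year', and
    # '领域' feeds 'type' in the list URL below (total pages is currently overridden).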
    result_lists = [
        ['类型', '讲话', '706', '69'], ['类型', '会议', '701', '178'], ['类型', '活动', '702', '63'], ['类型', '考察', '703', '72'],
        ['类型', '会见', '704', '174'], ['类型', '出访', '705', '188'], ['类型', '函电', '707', '194'], ['类型', '其他', '708', '203'],
        ['时间', '2023', '2023', '11'], ['时间', '2022', '2022', '10'], ['时间', '2021', '2021', '9'],
        ['时间', '2019', '2019', '8'],
        ['时间', '2018', '2018', '7'], ['时间', '2017', '2017', '6'], ['时间', '2016', '2016', '5'],
        ['时间', '2015', '2015', '4'],
        ['时间', '2014', '2014', '3'], ['时间', '2013', '2013', '2'], ['时间', '2012', '2012', '1'],
        ['领域', '经济', '101', '18'], ['领域', '政治', '102', '21'], ['领域', '文化', '103', '14'], ['领域', '社会', '104', '15'],
        ['领域', '生态', '105', '7'], ['领域', '党建', '106', '9'], ['领域', '国防', '107', '6'], ['领域', '外交', '108', '50'],
    ]
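    # dic_lables maps a "<category>|<name>" tag key to a "<labelMark>|<relationId>"
    # string, which newsdata() splits to build the label objects sent downstream.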
    dic_lables = {
        "类型|讲话": "important_speech_type|1700334917807710209",
        "类型|会议": "important_speech_type|1700334936166178818",
        "类型|活动": "important_speech_type|1700334960560250881",
        "类型|考察": "important_speech_type|1700334978285379585",
        "类型|会见": "important_speech_type|1700335044605714433",
        "类型|出访": "important_speech_type|1700335078852206593",
        "类型|函电": "important_speech_type|1700335099689508866",
        "类型|其他": "important_speech_type|1700335118056366082",
        "时间|2012": "important_speech_time|1700334545970077697",
        "时间|2013": "important_speech_time|1700334647757447170",
        "时间|2014": "important_speech_time|1700334667915272194",
        "时间|2015": "important_speech_time|1700334686550564865",
        "时间|2016": "important_speech_time|1700334704925810689",
        "时间|2017": "important_speech_time|1700334722529304578",
        "时间|2018": "important_speech_time|1700334738320859137",
        "时间|2019": "important_speech_time|1700334758302523393",
        "时间|2020": "important_speech_time|1700334777827008514",
        "时间|2021": "important_speech_time|1700334797477322753",
        "时间|2022": "important_speech_time|1700334814468448258",
        "时间|2023": "important_speech_time|1700334832495566850",
        "领域|经济": "important_speech_area|1700335225803841537",
        "领域|政治": "important_speech_area|1700335248096567297",
        "领域|文化": "important_speech_area|1700335379638329345",
        "领域|社会": "important_speech_area|1700335412873994242",
        "领域|生态": "important_speech_area|1700335541211308033",
        "领域|党建": "important_speech_area|1700335587780665346",
        "领域|国防": "important_speech_area|1700335615895085058",
        "领域|外交": "important_speech_area|1700335820430319618",
    }
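    # Walk every category/year/area listing page and record the type tags seen for
    # each article id; the tags are de-duplicated further down before newsdata() runs.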
    for result_list in result_lists:
        sort = result_list[0]
        sort_text = result_list[1]
@@ -96,13 +234,19 @@ def get_content():
        if sort == '类型':
            form = result_list[2]
            type_ = '0'
            year = '0'
        elif sort == '时间':
            form = '0'
            type_ = '0'
            year = result_list[2]
        else:
            form = '0'
            type_ = result_list[2]
            year = '0'
        # total_page = result_list[3]
        total_page = 10
        for page in range(1, int(total_page)):
            url = f"http://jhsjk.people.cn/testnew/result?keywords=&isFuzzy=0&searchArea=0&year={year}&form={form}&type={type_}&page={page}&origin=%E5%85%A8%E9%83%A8&source=2"
            payload = {}
            try:
                resp_json = requests.request("GET", url, headers=headers, verify=False, data=payload).json()
@@ -122,52 +266,24 @@ def get_content():
                    type_lists = [type_dict]
                else:
                    type_lists = type_list + [type_dict]
                new_lst = []
                # iterate over the dict elements of the original list
                for item in type_lists:
                    # add a dict element to the new list only if it is not already there
                    if item not in new_lst:
                        new_lst.append(item)
                art_type_dict[article_id] = new_lst

    newsdata(art_content_dict, art_type_dict, dic_lables)


if __name__ == '__main__':
    try:
        get_content()
    except Exception as e:
        print(e)
        pass