提交 b2dd89c5 作者: LiuLiYuan

Merge remote-tracking branch 'origin/master'

# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
# @Author: MENG
# @Time : 2022-3-18
import redis
import requests
from langid import langid
from pyquery import PyQuery as pq
import time
import json
import pymongo
from kafka import KafkaProducer
from requests.packages import urllib3
# Disable urllib3 warnings globally (presumably because the HTTP requests in
# this file are issued with verify=False).
urllib3.disable_warnings()
# Handle to the '人民网-习讲话数据库_copy' collection of the `caiji` MongoDB
# database. In this file it is queried by title, updated with article tags and
# used to delete records whose Kafka delivery failed.
# NOTE(review): the server address and admin credentials are hard-coded in
# source; consider loading them from configuration or the environment.
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').caiji['人民网-习讲话数据库_copy']
def _build_labels(tags, dic_lables):
    """Translate scraped tag dicts into label dicts via the ``dic_lables`` map."""
    labels = []
    for tag in tags:
        label_remarks = tag['type']
        relation_name = tag['name']
        mapped = dic_lables.get(label_remarks + "|" + relation_name)
        if mapped is None:
            # An unmapped tag must not abort the whole batch; report and skip it.
            print('no label mapping for tag:', label_remarks, relation_name)
            continue
        parts = mapped.split("|")
        labels.append({
            "labelMark": parts[0],
            "labelRemarks": label_remarks,
            "relationId": parts[1],
            "relationName": relation_name
        })
    return labels


def _store_tags(article_id, tags):
    """Write ``tags`` to the MongoDB record ``article_id``, retrying up to 5 times."""
    for _ in range(5):
        try:
            db_storage.update_one({'id': article_id}, {'$set': {'tags': tags}})
            return
        except Exception:
            time.sleep(2)
    # Best effort, as before: processing continues, but the failure is visible.
    print('failed to store tags for', article_id)


def newsdata(art_content_dict,art_type_dict,dic_lables):
    """Store the tags of every scraped article and publish new articles to Kafka.

    For each article the tags found in ``art_type_dict`` are written to the
    MongoDB record with the same ``id``.  Articles whose ``is_repeat`` flag is
    ``'1'`` stop there.  Every other article is copied, stripped of the
    ``is_repeat`` and ``tags`` fields, extended with ``labels`` and the fixed
    bookkeeping fields, and sent as UTF-8 JSON to the
    ``research_center_fourth`` topic.  When sending fails, the article's
    MongoDB record is deleted (presumably so that a later run picks it up
    again -- confirm with the owner).

    The article dicts passed in are not modified; the payload is built on a
    shallow copy.

    Args:
        art_content_dict: Mapping of article key to article dict.  Each
            article must contain ``id`` and ``is_repeat``.
        art_type_dict: Mapping of article key to a list of tag dicts with
            ``type`` and ``name`` entries.  A missing key means "no tags".
        dic_lables: Mapping of ``"<type>|<name>"`` to
            ``"<labelMark>|<relationId>"``.

    Returns:
        None.
    """
    producer = None
    try:
        for key, article in art_content_dict.items():
            tags = art_type_dict.get(key)
            if tags is None:
                tags = []
            labels = _build_labels(tags, dic_lables)

            _store_tags(article['id'], tags)
            if article['is_repeat'] == '1':
                continue

            # Work on a copy so the caller's dict is left untouched.
            payload = dict(article)
            payload.pop('is_repeat', None)
            payload.pop('tags', None)
            payload['labels'] = labels
            payload['subjectId'] = "1534423014825668610"
            payload['createDate'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            payload['checkStatus'] = "1"
            payload['deleteFlag'] = "0"
            payload['topNum'] = "0"
            payload['summary'] = ""

            try:
                if producer is None:
                    # One producer is shared by the whole batch and created
                    # lazily, so a batch with nothing to send never connects.
                    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], max_request_size=1024 * 1024 * 20)
                kafka_result = producer.send("research_center_fourth",
                                             json.dumps(payload, ensure_ascii=False).encode('utf8'))
                # Block until the broker acknowledges the message.
                print(kafka_result.get(timeout=10))
                print({
                    'success': 'true',
                    'message': '操作成功',
                    'code': '200',
                })
            except Exception as e:
                print({
                    'success': 'false',
                    'message': '操作失败',
                    'code': '204',
                    'e': e
                })
                time.sleep(5)
                db_storage.delete_one({'id': article['id']})
    finally:
        if producer is not None:
            producer.close()
# 习讲话数据库 新增数据
def get_content():
......@@ -23,7 +112,7 @@ def get_content():
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cookie': 'sfr=1; sso_c=0; __jsluid_h=5b9f09f6fdae46fadb89e1e02dca3238; wdcid=04fccdf5121158c0; wdses=72d07de4316a36a5; ci_session=4irerhndk5mc48dq7ldebjn5m47fqptg; wdlast=1646734820; ci_session=4irerhndk5mc48dq7ldebjn5m47fqptg'
}
for page in range(3, 0, -1):
for page in range(9, 0, -1):
url = f"http://jhsjk.people.cn/testnew/result?keywords=&isFuzzy=0&searchArea=0&year=0&form=0&type=0&page={page}&origin=%E5%85%A8%E9%83%A8&source=2"
try:
resp_json = requests.request("GET", url, headers=headers, verify=False).json()
......@@ -39,6 +128,8 @@ def get_content():
continue
title = data_dict['title']
pub_time = data_dict['input_date']
if pub_time <= '2023-11-06':
continue
title_dict_list = db_storage.find({'title': title, 'is_repeat': ''})
is_repeat = ''
for title_dict in title_dict_list:
......@@ -60,19 +151,28 @@ def get_content():
continue
content_html = content_html1 + '\n' + content_html2
content = pq(content_html).text()
lang = langid.classify(content)
if lang == '':
lang = 'cn'
if lang[0] == '':
lang = 'cn'
else:
lang = lang[0]
if content.strip() == '':
print(href, '内容为空')
continue
origin = data_dict['origin_name']
a_dict = {
'id': article_id,
'id': "1534423014825668610" + article_id,
'title': title,
'author': '',
'origin': origin,
'content': content_html,
'contentWithTag': content_html,
'content': content,
'publishDate': pub_time,
'sourceAddress': href,
'tags': [],
'lang': lang,
'is_repeat': is_repeat
}
art_content_dict[article_id] = a_dict
......@@ -86,9 +186,47 @@ def get_content():
result_lists = [
['类型', '讲话', '706', '69'], ['类型', '会议', '701', '178'], ['类型', '活动', '702', '63'], ['类型', '考察', '703', '72'],
['类型', '会见', '704', '174'], ['类型', '出访', '705', '188'], ['类型', '函电', '707', '194'], ['类型', '其他', '708', '203'],
['时间', '2023', '2023', '11'], ['时间', '2022', '2022', '10'], ['时间', '2021', '2021', '9'],
['时间', '2019', '2019', '8'],
['时间', '2018', '2018', '7'], ['时间', '2017', '2017', '6'], ['时间', '2016', '2016', '5'],
['时间', '2015', '2015', '4'],
['时间', '2014', '2014', '3'], ['时间', '2013', '2013', '2'], ['时间', '2012', '2012', '1'],
['领域', '经济', '101', '18'], ['领域', '政治', '102', '21'], ['领域', '文化', '103', '14'], ['领域', '社会', '104', '15'],
['领域', '生态', '105', '7'], ['领域', '党建', '106', '9'], ['领域', '国防', '107', '6'], ['领域', '外交', '108', '50'],
]
dic_lables = {
"类型|讲话": "important_speech_type|1700334917807710209",
"类型|会议": "important_speech_type|1700334936166178818",
"类型|活动": "important_speech_type|1700334960560250881",
"类型|考察": "important_speech_type|1700334978285379585",
"类型|会见": "important_speech_type|1700335044605714433",
"类型|出访": "important_speech_type|1700335078852206593",
"类型|函电": "important_speech_type|1700335099689508866",
"类型|其他": "important_speech_type|1700335118056366082",
"时间|2012": "important_speech_time|1700334545970077697",
"时间|2013": "important_speech_time|1700334647757447170",
"时间|2014": "important_speech_time|1700334667915272194",
"时间|2015": "important_speech_time|1700334686550564865",
"时间|2016": "important_speech_time|1700334704925810689",
"时间|2017": "important_speech_time|1700334722529304578",
"时间|2018": "important_speech_time|1700334738320859137",
"时间|2019": "important_speech_time|1700334758302523393",
"时间|2020": "important_speech_time|1700334777827008514",
"时间|2021": "important_speech_time|1700334797477322753",
"时间|2022": "important_speech_time|1700334814468448258",
"时间|2023": "important_speech_time|1700334832495566850",
"领域|经济": "important_speech_area|1700335225803841537",
"领域|政治": "important_speech_area|1700335248096567297",
"领域|文化": "important_speech_area|1700335379638329345",
"领域|社会": "important_speech_area|1700335412873994242",
"领域|生态": "important_speech_area|1700335541211308033",
"领域|党建": "important_speech_area|1700335587780665346",
"领域|国防": "important_speech_area|1700335615895085058",
"领域|外交": "important_speech_area|1700335820430319618",
}
for result_list in result_lists:
sort = result_list[0]
sort_text = result_list[1]
......@@ -96,13 +234,19 @@ def get_content():
if sort == '类型':
form = result_list[2]
type_ = '0'
year = '0'
elif sort == '时间':
form = '0'
type_ = '0'
year = result_list[2]
else:
form = '0'
type_ = result_list[2]
year = '0'
# total_page = result_list[3]
total_page = 2
total_page = 10
for page in range(1, int(total_page)):
url = f"http://jhsjk.people.cn/testnew/result?keywords=&isFuzzy=0&searchArea=0&year=0&form={form}&type={type_}&page={page}&origin=%E5%85%A8%E9%83%A8&source=2"
url = f"http://jhsjk.people.cn/testnew/result?keywords=&isFuzzy=0&searchArea=0&year={year}&form={form}&type={type_}&page={page}&origin=%E5%85%A8%E9%83%A8&source=2"
payload = {}
try:
resp_json = requests.request("GET", url, headers=headers, verify=False, data=payload).json()
......@@ -122,52 +266,24 @@ def get_content():
type_lists = [type_dict]
else:
type_lists = type_list + [type_dict]
art_type_dict[article_id] = type_lists
for key, value in art_content_dict.items():
tags = art_type_dict.get(key)
if tags is None:
tags = []
value['tags'] = tags
post_dict = value
db_storage.update_one({'id': post_dict['id']}, {'$set': {'tags': tags}})
if post_dict['is_repeat'] == '1':
continue
try:
del post_dict['is_repeat']
# labels = []
# for tags_dict in post_dict['tags']:
# labels_dict = {
# 'abelRemarks': tags_dict.get('type'),
# 'relationName': tags_dict.get('name'),
# }
# labels.append(labels_dict)
# aaa_dict = {
# 'sid': '1533647545473859586',
# 'title': post_dict['title'],
# 'content': '',
# 'contentWithTag': post_dict['content'],
# 'summary': '',
# 'author': '',
# 'origin': post_dict['origin'],
# 'publishDate': post_dict['publishDate'],
# 'sourceAddress': post_dict['sourceAddress'],
# 'labels': labels
# }
post_url = 'http://114.116.19.92:8088/api/reptile/autoSaveXJPSpeak'
headers = {'Content-Type': 'application/json'}
resp_json = requests.post(url=post_url, headers=headers, verify=False, data=json.dumps(post_dict)).json()
print('推送:', resp_json['msg'])
except:
print('数据传接口失败,正在重试!')
time.sleep(5)
db_storage.delete_one({'id': post_dict['id']})
continue
new_lst = []
# 遍历原列表的字典元素
for item in type_lists:
# 如果字典元素不在新列表中,则添加到新列表中
if item not in new_lst:
new_lst.append(item)
art_type_dict[article_id] = new_lst
newsdata(art_content_dict, art_type_dict, dic_lables)
if __name__ == '__main__':
    # Script entry point: run one scraping pass and report any error that
    # escapes it instead of terminating with a traceback.
    try:
        get_content()
    except Exception as err:
        print(err)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论