Merge remote-tracking branch 'origin/master'

b2dd89c5 · LiuLiYuan · d7b3c3cf · 4f718511 · b2dd89c5
--- a/习近平讲话/1.py
+++ b/习近平讲话/1.py
 # -*- coding: utf-8 -*-
 # -*- coding: utf-8 -*-
 # @Author: MENG
 # @Time  : 2022-3-18
+import redis
 import requests
+from langid import langid
 from pyquery import PyQuery as pq
 import time
 import json
 import pymongo
+from kafka import KafkaProducer
 from requests.packages import urllib3
 # Silence urllib3 warnings globally; the HTTP requests in this script are sent with verify=False.
 urllib3.disable_warnings()
 # Module-level MongoDB collection handle shared by the functions in this script.
 # NOTE(review): the MongoDB host, username and password are hard-coded in source here;
 # consider loading them from environment/config and rotating the exposed password.
 db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').caiji['人民网-习讲话数据库_copy']
+def newsdata(art_content_dict,art_type_dict,dic_lables):
+    """Attach tags/labels to each crawled article, store the tags and publish to Kafka.
+
+    Args:
+        art_content_dict: Mapping of article key -> article dict. Every article
+            dict is mutated in place and must contain ``id`` and ``is_repeat``.
+        art_type_dict: Mapping of article key -> list of tag dicts, each holding
+            ``type`` and ``name``. A missing key means the article has no tags.
+        dic_lables: Mapping of ``"<type>|<name>"`` to
+            ``"<labelMark>|<relationId>"`` used to build the ``labels`` list.
+
+    Side effects:
+        * Writes ``tags`` to the matching document in ``db_storage`` (up to
+          five attempts, two seconds apart).
+        * Articles whose ``is_repeat`` is ``'1'`` are not published.
+        * Every other article is sent as UTF-8 JSON to the
+          ``research_center_fourth`` Kafka topic; when that fails the
+          document is deleted from ``db_storage``.
+    """
+    # One producer is shared by the whole batch and created lazily, so a
+    # connection failure is still handled by the per-article failure branch.
+    producer = None
+    try:
+        for key, value in art_content_dict.items():
+            tags = art_type_dict.get(key)
+            if tags is None:
+                tags = []
+
+            labels = []
+            for tag in tags:
+                labelRemarks = tag['type']
+                relationName = tag['name']
+                item = labelRemarks + "|" + relationName
+                item_value = dic_lables.get(item)
+                if item_value is None:
+                    # An unmapped tag must not abort the remaining articles.
+                    print('label mapping not found, skipped:', item)
+                    continue
+                labelMark, relationId = item_value.split("|")[:2]
+                labels.append({
+                    "labelMark": labelMark,
+                    "labelRemarks": labelRemarks,
+                    "relationId": relationId,
+                    "relationName": relationName
+                })
+
+            # Key insertion order is kept as-is because it is the order of
+            # the fields in the JSON message sent below.
+            post_dict = value
+            post_dict['tags'] = tags
+            post_dict['labels'] = labels
+            post_dict['subjectId'] = "1534423014825668610"
+            post_dict['createDate'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+            post_dict['checkStatus'] = "1"
+            post_dict['deleteFlag'] = "0"
+            post_dict['topNum'] = "0"
+            post_dict['summary'] = ""
+
+            # Best-effort tag update: retry a few times, then carry on.
+            for _ in range(5):
+                try:
+                    db_storage.update_one({'id': post_dict['id']}, {'$set': {'tags': tags}})
+                    break
+                except Exception:
+                    time.sleep(2)
+
+            if post_dict['is_repeat'] == '1':
+                continue
+
+            try:
+                # These two fields are internal bookkeeping and are not published.
+                del post_dict['is_repeat']
+                del post_dict['tags']
+                if producer is None:
+                    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], max_request_size=1024 * 1024 * 20)
+                kafka_result = producer.send("research_center_fourth",
+                                             json.dumps(post_dict, ensure_ascii=False).encode('utf8'))
+                # Block until the broker acknowledges the message (or times out).
+                print(kafka_result.get(timeout=10))
+                print({
+                    'success': 'true',
+                    'message': '操作成功',
+                    'code': '200',
+                })
+            except Exception as e:
+                print({
+                    'success': 'false',
+                    'message': '操作失败',
+                    'code': '204',
+                    'e': e
+                })
+                time.sleep(5)
+                # Remove the stored document when the article could not be published.
+                db_storage.delete_one({'id': post_dict['id']})
+                continue
+
+            # The legacy HTTP push (autoSaveXJPSpeak) is disabled. The article is
+            # still reduced to the field set that push used, so the in-memory
+            # dict ends up in the same state as before.
+            for legacy_key in ('contentWithTag', 'lang', 'labels', 'createDate',
+                               'checkStatus', 'deleteFlag', 'topNum', 'summary'):
+                post_dict.pop(legacy_key, None)
+    finally:
+        # Release the broker connection even when an article raised.
+        if producer is not None:
+            producer.close()
 # 习讲话数据库 新增数据
 def get_content():
@@ -23,7 +112,7 @@ def get_content():
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cookie': 'sfr=1; sso_c=0; __jsluid_h=5b9f09f6fdae46fadb89e1e02dca3238; wdcid=04fccdf5121158c0; wdses=72d07de4316a36a5; ci_session=4irerhndk5mc48dq7ldebjn5m47fqptg; wdlast=1646734820; ci_session=4irerhndk5mc48dq7ldebjn5m47fqptg'
    }
-    for page in range(3, 0, -1):
+    for page in range(9, 0, -1):
        url = f"http://jhsjk.people.cn/testnew/result?keywords=&isFuzzy=0&searchArea=0&year=0&form=0&type=0&page={page}&origin=%E5%85%A8%E9%83%A8&source=2"
        try:
            resp_json = requests.request("GET", url, headers=headers, verify=False).json()
@@ -39,6 +128,8 @@ def get_content():
                continue
            title = data_dict['title']
            pub_time = data_dict['input_date']
+            if pub_time <= '2023-11-06':
+                continue
            title_dict_list = db_storage.find({'title': title, 'is_repeat': ''})
            is_repeat = ''
            for title_dict in title_dict_list:
@@ -60,19 +151,28 @@ def get_content():
                continue
            content_html = content_html1 + '\n' + content_html2
            content = pq(content_html).text()
+            lang = langid.classify(content)
+            if lang == '':
+                lang = 'cn'
+            if lang[0] == '':
+                lang = 'cn'
+            else:
+                lang = lang[0]
            if content.strip() == '':
                print(href, '内容为空')
                continue
            origin = data_dict['origin_name']
            a_dict = {
-                'id': article_id,
+                'id': "1534423014825668610" + article_id,
                'title': title,
                'author': '',
                'origin': origin,
-                'content': content_html,
+                'contentWithTag': content_html,
+                'content': content,
                'publishDate': pub_time,
                'sourceAddress': href,
                'tags': [],
+                'lang': lang,
                'is_repeat': is_repeat
            }
            art_content_dict[article_id] = a_dict
@@ -86,9 +186,47 @@ def get_content():
    result_lists = [
        ['类型', '讲话', '706', '69'], ['类型', '会议', '701', '178'], ['类型', '活动', '702', '63'], ['类型', '考察', '703', '72'],
        ['类型', '会见', '704', '174'], ['类型', '出访', '705', '188'], ['类型', '函电', '707', '194'], ['类型', '其他', '708', '203'],
+        ['时间', '2023', '2023', '11'], ['时间', '2022', '2022', '10'], ['时间', '2021', '2021', '9'],
+        ['时间', '2019', '2019', '8'],
+        ['时间', '2018', '2018', '7'], ['时间', '2017', '2017', '6'], ['时间', '2016', '2016', '5'],
+        ['时间', '2015', '2015', '4'],
+        ['时间', '2014', '2014', '3'], ['时间', '2013', '2013', '2'], ['时间', '2012', '2012', '1'],
        ['领域', '经济', '101', '18'], ['领域', '政治', '102', '21'], ['领域', '文化', '103', '14'], ['领域', '社会', '104', '15'],
        ['领域', '生态', '105', '7'], ['领域', '党建', '106', '9'], ['领域', '国防', '107', '6'], ['领域', '外交', '108', '50'],
    ]
+    dic_lables = {
+        "类型|讲话": "important_speech_type|1700334917807710209",
+        "类型|会议": "important_speech_type|1700334936166178818",
+        "类型|活动": "important_speech_type|1700334960560250881",
+        "类型|考察": "important_speech_type|1700334978285379585",
+        "类型|会见": "important_speech_type|1700335044605714433",
+        "类型|出访": "important_speech_type|1700335078852206593",
+        "类型|函电": "important_speech_type|1700335099689508866",
+        "类型|其他": "important_speech_type|1700335118056366082",
+        "时间|2012": "important_speech_time|1700334545970077697",
+        "时间|2013": "important_speech_time|1700334647757447170",
+        "时间|2014": "important_speech_time|1700334667915272194",
+        "时间|2015": "important_speech_time|1700334686550564865",
+        "时间|2016": "important_speech_time|1700334704925810689",
+        "时间|2017": "important_speech_time|1700334722529304578",
+        "时间|2018": "important_speech_time|1700334738320859137",
+        "时间|2019": "important_speech_time|1700334758302523393",
+        "时间|2020": "important_speech_time|1700334777827008514",
+        "时间|2021": "important_speech_time|1700334797477322753",
+        "时间|2022": "important_speech_time|1700334814468448258",
+        "时间|2023": "important_speech_time|1700334832495566850",
+        "领域|经济": "important_speech_area|1700335225803841537",
+        "领域|政治": "important_speech_area|1700335248096567297",
+        "领域|文化": "important_speech_area|1700335379638329345",
+        "领域|社会": "important_speech_area|1700335412873994242",
+        "领域|生态": "important_speech_area|1700335541211308033",
+        "领域|党建": "important_speech_area|1700335587780665346",
+        "领域|国防": "important_speech_area|1700335615895085058",
+        "领域|外交": "important_speech_area|1700335820430319618",
+    }
    for result_list in result_lists:
        sort = result_list[0]
        sort_text = result_list[1]
@@ -96,13 +234,19 @@ def get_content():
        if sort == '类型':
            form = result_list[2]
            type_ = '0'
+            year = '0'
+        elif sort == '时间':
+            form = '0'
+            type_ = '0'
+            year = result_list[2]
        else:
            form = '0'
            type_ = result_list[2]
+            year = '0'
        # total_page = result_list[3]
-        total_page = 2
+        total_page = 10
        for page in range(1, int(total_page)):
-            url = f"http://jhsjk.people.cn/testnew/result?keywords=&isFuzzy=0&searchArea=0&year=0&form={form}&type={type_}&page={page}&origin=%E5%85%A8%E9%83%A8&source=2"
+            url = f"http://jhsjk.people.cn/testnew/result?keywords=&isFuzzy=0&searchArea=0&year={year}&form={form}&type={type_}&page={page}&origin=%E5%85%A8%E9%83%A8&source=2"
            payload = {}
            try:
                resp_json = requests.request("GET", url, headers=headers, verify=False, data=payload).json()
@@ -122,52 +266,24 @@ def get_content():
                        type_lists = [type_dict]
                    else:
                        type_lists = type_list + [type_dict]
-                    art_type_dict[article_id] = type_lists
+                    new_lst = []
-    for key, value in art_content_dict.items():
-        tags = art_type_dict.get(key)
+                    # 遍历原列表的字典元素
-        if tags is None:
+                    for item in type_lists:
-            tags = []
+                        # 如果字典元素不在新列表中，则添加到新列表中
-        value['tags'] = tags
+                        if item not in new_lst:
-        post_dict = value
+                            new_lst.append(item)
-        db_storage.update_one({'id': post_dict['id']}, {'$set': {'tags': tags}})
-        if post_dict['is_repeat'] == '1':
+                    art_type_dict[article_id] = new_lst
-            continue
-        try:
+    newsdata(art_content_dict, art_type_dict, dic_lables)
-            del post_dict['is_repeat']
-            # labels = []
-            # for tags_dict in post_dict['tags']:
-            #     labels_dict = {
-            #         'abelRemarks': tags_dict.get('type'),
-            #         'relationName': tags_dict.get('name'),
-            #     }
-            #     labels.append(labels_dict)
-            # aaa_dict = {
-            #     'sid': '1533647545473859586',
-            #     'title': post_dict['title'],
-            #     'content': '',
-            #     'contentWithTag': post_dict['content'],
-            #     'summary': '',
-            #     'author': '',
-            #     'origin': post_dict['origin'],
-            #     'publishDate': post_dict['publishDate'],
-            #     'sourceAddress': post_dict['sourceAddress'],
-            #     'labels': labels
-            # }
-            post_url = 'http://114.116.19.92:8088/api/reptile/autoSaveXJPSpeak'
-            headers = {'Content-Type': 'application/json'}
-            resp_json = requests.post(url=post_url, headers=headers, verify=False, data=json.dumps(post_dict)).json()
-            print('推送：', resp_json['msg'])
-        except:
-            print('数据传接口失败，正在重试！')
-            time.sleep(5)
-            db_storage.delete_one({'id': post_dict['id']})
-            continue
 # Script entry point: run get_content() once when the file is executed directly.
 if __name__ == '__main__':
     try:
         get_content()
     except Exception as e:
         # Best-effort run: only the exception message is printed (no traceback)
         # and the script then ends normally.
+        print(e)
         pass