习近平讲话脚本调整

27e85f48 · 薛凌堃 · 8bec5313 · 27e85f48
--- a/习近平讲话/1.py
+++ b/习近平讲话/1.py
 # -*- coding: utf-8 -*-
@@ -59,12 +59,13 @@ def newsdata(art_content_dict,art_type_dict,dic_lables):
        try:
            del post_dict['is_repeat']
            del post_dict['tags']
+            del post_dict['title_pd']
            # 发送kafka
-            producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], max_request_size=1024 * 1024 * 20)
+            # producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], max_request_size=1024 * 1024 * 20)
-            kafka_result = producer.send("research_center_fourth",
+            # kafka_result = producer.send("research_center_fourth",
-                                         json.dumps(post_dict, ensure_ascii=False).encode('utf8'))
+            #                              json.dumps(post_dict, ensure_ascii=False).encode('utf8'))
+            #
-            print(kafka_result.get(timeout=10))
+            # print(kafka_result.get(timeout=10))
            dic_result = {
                'success': 'ture',
@@ -122,21 +123,22 @@ def get_content():
        except:
            print('请求错误1')
            continue
-        for data_dict in data_list[::-1]:
+        # for data_dict in data_list[::-1]:
+        for data_dict in data_list[:1]:
            article_id = data_dict['article_id']
            print(type(article_id))
-            is_article_id = db_storage.find_one({'id': f"1534423014825668610{article_id}"})
+            # is_article_id = db_storage.find_one({'id': f"1534423014825668610{article_id}"})
-            if is_article_id:
+            # if is_article_id:
-                continue
+            #     continue
-            title = data_dict['title']
+            title = data_dict['title'] # 采集到的标题
            pub_time = data_dict['input_date']
            current_date = datetime.now()
            yesterday = current_date - timedelta(days=1)
            # 格式化日期
            yesterday_date = yesterday.strftime("%Y-%m-%d")
-            if pub_time <= yesterday_date:
+            # if pub_time <= yesterday_date:
-                continue
+            #     continue
-            title_dict_list = db_storage.find({'title': title, 'is_repeat': ''})
+            title_dict_list = db_storage.find({'title_pd': title.replace(' ', ''), 'is_repeat': ''})  # 如果找到一样的标题 判断三天之内是否有重复的
            is_repeat = ''
            for title_dict in title_dict_list:
                pub_time1 = title_dict['publishDate']
@@ -152,6 +154,14 @@ def get_content():
                doc_href = pq(href_text)
                content_html1 = str(doc_href('.d2txt_con.clearfix'))
                content_html2 = str(doc_href('.editor.clearfix'))
+                # TODO: 找到标题并拼接
+                title1 = doc_href('.d2txt.clearfix h2').text()
+                title2 = doc_href('.d2txt.clearfix h1').text()
+                title3 = doc_href('.d2txt.clearfix h3').text()
+                if title1 == '' and title3 == '':
+                    title_final = title
+                else:
+                    title_final = title1 + ' ' + title2 + ' ' + title3
            except:
                print('请求错误2')
                continue
@@ -170,7 +180,8 @@ def get_content():
            origin = data_dict['origin_name']
            a_dict = {
                'id': "1534423014825668610" + article_id,
-                'title': title,
+                'title': title_final,
+                'title_pd': title,
                'author': '',
                'origin': origin,
                'contentWithTag': content_html,
@@ -183,6 +194,7 @@ def get_content():
            }
            art_content_dict[article_id] = a_dict
            db_a_dict = a_dict.copy()
+            db_a_dict['title_pd'] = title.replace(' ', '')
            db_storage.insert_one(db_a_dict)
            if is_repeat == '':
                print(href)