提交 61eecac3 作者: 薛凌堃

习近平讲话

上级 d53906a2
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
...@@ -10,8 +10,9 @@ import json ...@@ -10,8 +10,9 @@ import json
import pymongo import pymongo
from kafka import KafkaProducer from kafka import KafkaProducer
from requests.packages import urllib3 from requests.packages import urllib3
from datetime import datetime, timedelta
urllib3.disable_warnings() urllib3.disable_warnings()
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').caiji['人民网-习讲话数据库_copy'] db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').ZZSN['人民网-习讲话数据库_copy']
def newsdata(art_content_dict,art_type_dict,dic_lables): def newsdata(art_content_dict,art_type_dict,dic_lables):
for key, value in art_content_dict.items(): for key, value in art_content_dict.items():
...@@ -112,7 +113,7 @@ def get_content(): ...@@ -112,7 +113,7 @@ def get_content():
'Accept-Language': 'zh-CN,zh;q=0.9', 'Accept-Language': 'zh-CN,zh;q=0.9',
'Cookie': 'sfr=1; sso_c=0; __jsluid_h=5b9f09f6fdae46fadb89e1e02dca3238; wdcid=04fccdf5121158c0; wdses=72d07de4316a36a5; ci_session=4irerhndk5mc48dq7ldebjn5m47fqptg; wdlast=1646734820; ci_session=4irerhndk5mc48dq7ldebjn5m47fqptg' 'Cookie': 'sfr=1; sso_c=0; __jsluid_h=5b9f09f6fdae46fadb89e1e02dca3238; wdcid=04fccdf5121158c0; wdses=72d07de4316a36a5; ci_session=4irerhndk5mc48dq7ldebjn5m47fqptg; wdlast=1646734820; ci_session=4irerhndk5mc48dq7ldebjn5m47fqptg'
} }
for page in range(9, 0, -1): for page in range(1, 0, -1):
url = f"http://jhsjk.people.cn/testnew/result?keywords=&isFuzzy=0&searchArea=0&year=0&form=0&type=0&page={page}&origin=%E5%85%A8%E9%83%A8&source=2" url = f"http://jhsjk.people.cn/testnew/result?keywords=&isFuzzy=0&searchArea=0&year=0&form=0&type=0&page={page}&origin=%E5%85%A8%E9%83%A8&source=2"
try: try:
resp_json = requests.request("GET", url, headers=headers, verify=False).json() resp_json = requests.request("GET", url, headers=headers, verify=False).json()
...@@ -128,7 +129,11 @@ def get_content(): ...@@ -128,7 +129,11 @@ def get_content():
continue continue
title = data_dict['title'] title = data_dict['title']
pub_time = data_dict['input_date'] pub_time = data_dict['input_date']
if pub_time <= '2023-11-06': current_date = datetime.now()
yesterday = current_date - timedelta(days=1)
# 格式化日期
yesterday_date = yesterday.strftime("%Y-%m-%d")
if pub_time <= yesterday_date:
continue continue
title_dict_list = db_storage.find({'title': title, 'is_repeat': ''}) title_dict_list = db_storage.find({'title': title, 'is_repeat': ''})
is_repeat = '' is_repeat = ''
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论