Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
王景浩
zzsn_spider
Commits
4f718511
提交
4f718511
authored
12月 01, 2023
作者:
薛凌堃
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
习近平讲话
上级
7ef6f432
隐藏空白字符变更
内嵌
并排
正在显示
1 个修改的文件
包含
162 行增加
和
46 行删除
+162
-46
1.py
习近平讲话/1.py
+162
-46
没有找到文件。
习近平讲话/1.py
浏览文件 @
4f718511
# -*-
coding: utf-8 -*-
# -*-
coding: utf-8 -*-
# -*- coding: utf-8 -*-
# @Author: MENG
# @Time : 2022-3-18
import
redis
import
requests
from
langid
import
langid
from
pyquery
import
PyQuery
as
pq
import
time
import
json
import
pymongo
from
kafka
import
KafkaProducer
from
requests.packages
import
urllib3
# Suppress urllib3 warnings; the scraper below issues requests with verify=False.
urllib3.disable_warnings()

# NOTE(review): the MongoDB host and credentials are hard-coded in source —
# consider moving them to configuration or environment variables.
_mongo_client = pymongo.MongoClient(
    'mongodb://114.115.221.202:27017',
    username='admin',
    password='zzsn@9988',
)

# Collection handle used by the rest of the module to store scraped articles.
db_storage = _mongo_client.caiji['人民网-习讲话数据库_copy']
# Constants attached to / used for every published article.
_SUBJECT_ID = "1534423014825668610"
_KAFKA_SERVERS = ['114.115.159.144:9092']
_KAFKA_TOPIC = "research_center_fourth"
_KAFKA_MAX_REQUEST_SIZE = 1024 * 1024 * 20


def _build_labels(tags, dic_lables):
    """Translate scraped tag dicts into label records via ``dic_lables``."""
    labels = []
    for tag in tags:
        label_remarks = tag['type']
        relation_name = tag['name']
        mapped = dic_lables.get(label_remarks + "|" + relation_name)
        if mapped is None:
            # An unmapped tag used to raise KeyError and abort the whole run;
            # skip just this tag instead.
            print('no label mapping for tag:', label_remarks, relation_name)
            continue
        parts = mapped.split("|")
        labels.append({
            "labelMark": parts[0],
            "labelRemarks": label_remarks,
            "relationId": parts[1],
            "relationName": relation_name,
        })
    return labels


def _store_tags(article_id, tags, attempts=5, delay=2):
    """Write ``tags`` onto the stored article, retrying on failure."""
    last_error = None
    for _ in range(attempts):
        try:
            db_storage.update_one({'id': article_id}, {'$set': {'tags': tags}})
            return True
        except Exception as err:  # best-effort: retry, then report
            last_error = err
            time.sleep(delay)
    print('failed to store tags for', article_id, ':', last_error)
    return False


def newsdata(art_content_dict, art_type_dict, dic_lables):
    """Label each scraped article, persist its tags and publish it to Kafka.

    For every article the tags found in ``art_type_dict`` are written to the
    ``db_storage`` collection (up to 5 attempts, 2 s apart). Articles whose
    ``is_repeat`` is ``'1'`` are not published. All others are sent as UTF-8
    JSON to the Kafka topic, without the ``is_repeat`` and ``tags`` keys and
    with ``labels``, ``subjectId``, ``createDate``, ``checkStatus``,
    ``deleteFlag``, ``topNum`` and ``summary`` added. If publishing fails, the
    article is deleted from ``db_storage`` and processing moves on to the next
    one.

    The outgoing message is built on a copy, so the dicts inside
    ``art_content_dict`` are not modified.

    Args:
        art_content_dict: Mapping of article key -> article dict; each article
            must contain ``'id'``.
        art_type_dict: Mapping of article key -> list of tag dicts, each with
            ``'type'`` and ``'name'``. Missing keys mean "no tags".
        dic_lables: Mapping of ``"<type>|<name>"`` ->
            ``"<labelMark>|<relationId>"``.

    Returns:
        None.
    """
    producer = None
    try:
        for key, value in art_content_dict.items():
            tags = art_type_dict.get(key)
            if tags is None:
                tags = []

            # Work on a copy so the caller's article dict is left intact.
            post_dict = dict(value)
            post_dict['tags'] = tags
            post_dict['labels'] = _build_labels(tags, dic_lables)
            post_dict['subjectId'] = _SUBJECT_ID
            post_dict['createDate'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            post_dict['checkStatus'] = "1"
            post_dict['deleteFlag'] = "0"
            post_dict['topNum'] = "0"
            post_dict['summary'] = ""

            _store_tags(post_dict['id'], tags)

            # Duplicates only get their tags refreshed; they are not re-published.
            if post_dict.get('is_repeat') == '1':
                continue

            try:
                post_dict.pop('is_repeat', None)
                post_dict.pop('tags', None)
                if producer is None:
                    # Created lazily and reused for all articles; a connection
                    # failure here counts as a failed publish for this article.
                    producer = KafkaProducer(
                        bootstrap_servers=_KAFKA_SERVERS,
                        max_request_size=_KAFKA_MAX_REQUEST_SIZE,
                    )
                kafka_result = producer.send(
                    _KAFKA_TOPIC,
                    json.dumps(post_dict, ensure_ascii=False).encode('utf8'),
                )
                # Block until the broker acknowledges (or the send fails).
                print(kafka_result.get(timeout=10))
                print({
                    'success': 'true',
                    'message': '操作成功',
                    'code': '200',
                })
            except Exception as e:
                print({
                    'success': 'false',
                    'message': '操作失败',
                    'code': '204',
                    'e': e,
                })
                time.sleep(5)
                # Remove the stored article so it is no longer recorded as
                # collected when publishing failed.
                db_storage.delete_one({'id': post_dict['id']})
    finally:
        if producer is not None:
            producer.close()
# 习讲话数据库 新增数据
def
get_content
():
...
...
@@ -23,7 +112,7 @@ def get_content():
'Accept-Language'
:
'zh-CN,zh;q=0.9'
,
'Cookie'
:
'sfr=1; sso_c=0; __jsluid_h=5b9f09f6fdae46fadb89e1e02dca3238; wdcid=04fccdf5121158c0; wdses=72d07de4316a36a5; ci_session=4irerhndk5mc48dq7ldebjn5m47fqptg; wdlast=1646734820; ci_session=4irerhndk5mc48dq7ldebjn5m47fqptg'
}
for
page
in
range
(
3
,
0
,
-
1
):
for
page
in
range
(
9
,
0
,
-
1
):
url
=
f
"http://jhsjk.people.cn/testnew/result?keywords=&isFuzzy=0&searchArea=0&year=0&form=0&type=0&page={page}&origin=
%
E5
%85%
A8
%
E9
%83%
A8&source=2"
try
:
resp_json
=
requests
.
request
(
"GET"
,
url
,
headers
=
headers
,
verify
=
False
)
.
json
()
...
...
@@ -39,6 +128,8 @@ def get_content():
continue
title
=
data_dict
[
'title'
]
pub_time
=
data_dict
[
'input_date'
]
if
pub_time
<=
'2023-11-06'
:
continue
title_dict_list
=
db_storage
.
find
({
'title'
:
title
,
'is_repeat'
:
''
})
is_repeat
=
''
for
title_dict
in
title_dict_list
:
...
...
@@ -60,19 +151,28 @@ def get_content():
continue
content_html
=
content_html1
+
'
\n
'
+
content_html2
content
=
pq
(
content_html
)
.
text
()
lang
=
langid
.
classify
(
content
)
if
lang
==
''
:
lang
=
'cn'
if
lang
[
0
]
==
''
:
lang
=
'cn'
else
:
lang
=
lang
[
0
]
if
content
.
strip
()
==
''
:
print
(
href
,
'内容为空'
)
continue
origin
=
data_dict
[
'origin_name'
]
a_dict
=
{
'id'
:
article_id
,
'id'
:
"1534423014825668610"
+
article_id
,
'title'
:
title
,
'author'
:
''
,
'origin'
:
origin
,
'content'
:
content_html
,
'contentWithTag'
:
content_html
,
'content'
:
content
,
'publishDate'
:
pub_time
,
'sourceAddress'
:
href
,
'tags'
:
[],
'lang'
:
lang
,
'is_repeat'
:
is_repeat
}
art_content_dict
[
article_id
]
=
a_dict
...
...
@@ -86,9 +186,47 @@ def get_content():
result_lists
=
[
[
'类型'
,
'讲话'
,
'706'
,
'69'
],
[
'类型'
,
'会议'
,
'701'
,
'178'
],
[
'类型'
,
'活动'
,
'702'
,
'63'
],
[
'类型'
,
'考察'
,
'703'
,
'72'
],
[
'类型'
,
'会见'
,
'704'
,
'174'
],
[
'类型'
,
'出访'
,
'705'
,
'188'
],
[
'类型'
,
'函电'
,
'707'
,
'194'
],
[
'类型'
,
'其他'
,
'708'
,
'203'
],
[
'时间'
,
'2023'
,
'2023'
,
'11'
],
[
'时间'
,
'2022'
,
'2022'
,
'10'
],
[
'时间'
,
'2021'
,
'2021'
,
'9'
],
[
'时间'
,
'2019'
,
'2019'
,
'8'
],
[
'时间'
,
'2018'
,
'2018'
,
'7'
],
[
'时间'
,
'2017'
,
'2017'
,
'6'
],
[
'时间'
,
'2016'
,
'2016'
,
'5'
],
[
'时间'
,
'2015'
,
'2015'
,
'4'
],
[
'时间'
,
'2014'
,
'2014'
,
'3'
],
[
'时间'
,
'2013'
,
'2013'
,
'2'
],
[
'时间'
,
'2012'
,
'2012'
,
'1'
],
[
'领域'
,
'经济'
,
'101'
,
'18'
],
[
'领域'
,
'政治'
,
'102'
,
'21'
],
[
'领域'
,
'文化'
,
'103'
,
'14'
],
[
'领域'
,
'社会'
,
'104'
,
'15'
],
[
'领域'
,
'生态'
,
'105'
,
'7'
],
[
'领域'
,
'党建'
,
'106'
,
'9'
],
[
'领域'
,
'国防'
,
'107'
,
'6'
],
[
'领域'
,
'外交'
,
'108'
,
'50'
],
]
dic_lables
=
{
"类型|讲话"
:
"important_speech_type|1700334917807710209"
,
"类型|会议"
:
"important_speech_type|1700334936166178818"
,
"类型|活动"
:
"important_speech_type|1700334960560250881"
,
"类型|考察"
:
"important_speech_type|1700334978285379585"
,
"类型|会见"
:
"important_speech_type|1700335044605714433"
,
"类型|出访"
:
"important_speech_type|1700335078852206593"
,
"类型|函电"
:
"important_speech_type|1700335099689508866"
,
"类型|其他"
:
"important_speech_type|1700335118056366082"
,
"时间|2012"
:
"important_speech_time|1700334545970077697"
,
"时间|2013"
:
"important_speech_time|1700334647757447170"
,
"时间|2014"
:
"important_speech_time|1700334667915272194"
,
"时间|2015"
:
"important_speech_time|1700334686550564865"
,
"时间|2016"
:
"important_speech_time|1700334704925810689"
,
"时间|2017"
:
"important_speech_time|1700334722529304578"
,
"时间|2018"
:
"important_speech_time|1700334738320859137"
,
"时间|2019"
:
"important_speech_time|1700334758302523393"
,
"时间|2020"
:
"important_speech_time|1700334777827008514"
,
"时间|2021"
:
"important_speech_time|1700334797477322753"
,
"时间|2022"
:
"important_speech_time|1700334814468448258"
,
"时间|2023"
:
"important_speech_time|1700334832495566850"
,
"领域|经济"
:
"important_speech_area|1700335225803841537"
,
"领域|政治"
:
"important_speech_area|1700335248096567297"
,
"领域|文化"
:
"important_speech_area|1700335379638329345"
,
"领域|社会"
:
"important_speech_area|1700335412873994242"
,
"领域|生态"
:
"important_speech_area|1700335541211308033"
,
"领域|党建"
:
"important_speech_area|1700335587780665346"
,
"领域|国防"
:
"important_speech_area|1700335615895085058"
,
"领域|外交"
:
"important_speech_area|1700335820430319618"
,
}
for
result_list
in
result_lists
:
sort
=
result_list
[
0
]
sort_text
=
result_list
[
1
]
...
...
@@ -96,13 +234,19 @@ def get_content():
if
sort
==
'类型'
:
form
=
result_list
[
2
]
type_
=
'0'
year
=
'0'
elif
sort
==
'时间'
:
form
=
'0'
type_
=
'0'
year
=
result_list
[
2
]
else
:
form
=
'0'
type_
=
result_list
[
2
]
year
=
'0'
# total_page = result_list[3]
total_page
=
2
total_page
=
10
for
page
in
range
(
1
,
int
(
total_page
)):
url
=
f
"http://jhsjk.people.cn/testnew/result?keywords=&isFuzzy=0&searchArea=0&year=
0
&form={form}&type={type_}&page={page}&origin=
%
E5
%85%
A8
%
E9
%83%
A8&source=2"
url
=
f
"http://jhsjk.people.cn/testnew/result?keywords=&isFuzzy=0&searchArea=0&year=
{year}
&form={form}&type={type_}&page={page}&origin=
%
E5
%85%
A8
%
E9
%83%
A8&source=2"
payload
=
{}
try
:
resp_json
=
requests
.
request
(
"GET"
,
url
,
headers
=
headers
,
verify
=
False
,
data
=
payload
)
.
json
()
...
...
@@ -122,52 +266,24 @@ def get_content():
type_lists
=
[
type_dict
]
else
:
type_lists
=
type_list
+
[
type_dict
]
art_type_dict
[
article_id
]
=
type_lists
for
key
,
value
in
art_content_dict
.
items
():
tags
=
art_type_dict
.
get
(
key
)
if
tags
is
None
:
tags
=
[]
value
[
'tags'
]
=
tags
post_dict
=
value
db_storage
.
update_one
({
'id'
:
post_dict
[
'id'
]},
{
'$set'
:
{
'tags'
:
tags
}})
if
post_dict
[
'is_repeat'
]
==
'1'
:
continue
try
:
del
post_dict
[
'is_repeat'
]
# labels = []
# for tags_dict in post_dict['tags']:
# labels_dict = {
# 'abelRemarks': tags_dict.get('type'),
# 'relationName': tags_dict.get('name'),
# }
# labels.append(labels_dict)
# aaa_dict = {
# 'sid': '1533647545473859586',
# 'title': post_dict['title'],
# 'content': '',
# 'contentWithTag': post_dict['content'],
# 'summary': '',
# 'author': '',
# 'origin': post_dict['origin'],
# 'publishDate': post_dict['publishDate'],
# 'sourceAddress': post_dict['sourceAddress'],
# 'labels': labels
# }
post_url
=
'http://114.116.19.92:8088/api/reptile/autoSaveXJPSpeak'
headers
=
{
'Content-Type'
:
'application/json'
}
resp_json
=
requests
.
post
(
url
=
post_url
,
headers
=
headers
,
verify
=
False
,
data
=
json
.
dumps
(
post_dict
))
.
json
()
print
(
'推送:'
,
resp_json
[
'msg'
])
except
:
print
(
'数据传接口失败,正在重试!'
)
time
.
sleep
(
5
)
db_storage
.
delete_one
({
'id'
:
post_dict
[
'id'
]})
continue
new_lst
=
[]
# 遍历原列表的字典元素
for
item
in
type_lists
:
# 如果字典元素不在新列表中,则添加到新列表中
if
item
not
in
new_lst
:
new_lst
.
append
(
item
)
art_type_dict
[
article_id
]
=
new_lst
newsdata
(
art_content_dict
,
art_type_dict
,
dic_lables
)
# Script entry point: run one scrape pass when executed directly.
if __name__ == '__main__':
    try:
        get_content()
    except Exception as err:
        # Top-level boundary: print the error and let the script end normally.
        print(err)
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论