Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
1
合并请求
1
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
丁双波
zzsn_spider
Commits
b2dd89c5
提交
b2dd89c5
authored
12月 02, 2023
作者:
LiuLiYuan
浏览文件
操作
浏览文件
下载
差异文件
Merge remote-tracking branch 'origin/master'
上级
d7b3c3cf
4f718511
隐藏空白字符变更
内嵌
并排
正在显示
1 个修改的文件
包含
162 行增加
和
46 行删除
+162
-46
1.py
习近平讲话/1.py
+162
-46
没有找到文件。
习近平讲话/1.py
浏览文件 @
b2dd89c5
# -*-
coding: utf-8 -*-
# -*-
coding: utf-8 -*-
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
# @Author: MENG
# @Author: MENG
# @Time : 2022-3-18
# @Time : 2022-3-18
import
redis
import
requests
import
requests
from
langid
import
langid
from
pyquery
import
PyQuery
as
pq
from
pyquery
import
PyQuery
as
pq
import
time
import
time
import
json
import
json
import
pymongo
import
pymongo
from
kafka
import
KafkaProducer
from
requests.packages
import
urllib3
from
requests.packages
import
urllib3
urllib3
.
disable_warnings
()
urllib3
.
disable_warnings
()
db_storage
=
pymongo
.
MongoClient
(
'mongodb://114.115.221.202:27017'
,
username
=
'admin'
,
password
=
'zzsn@9988'
)
.
caiji
[
'人民网-习讲话数据库_copy'
]
db_storage
=
pymongo
.
MongoClient
(
'mongodb://114.115.221.202:27017'
,
username
=
'admin'
,
password
=
'zzsn@9988'
)
.
caiji
[
'人民网-习讲话数据库_copy'
]
def _build_labels(tags, dic_lables):
    """Translate crawled tag dicts into label dicts.

    Each tag is looked up in ``dic_lables`` under the key ``"<type>|<name>"``;
    the mapped value has the form ``"<labelMark>|<relationId>"``.

    Tags without a mapping are reported and skipped. Previously a single
    unmapped tag raised ``KeyError`` outside any ``try`` and aborted the
    processing of every remaining article.
    """
    labels = []
    for tag in tags:
        label_remarks = tag['type']
        relation_name = tag['name']
        mapping = dic_lables.get(label_remarks + "|" + relation_name)
        if mapping is None:
            print('no label mapping for tag:', label_remarks, relation_name)
            continue
        parts = mapping.split("|")
        labels.append({
            "labelMark": parts[0],
            "labelRemarks": label_remarks,
            "relationId": parts[1],
            "relationName": relation_name,
        })
    return labels


def _close_producer(producer):
    """Close a Kafka producer, reporting (not raising) any failure."""
    if producer is None:
        return
    try:
        producer.close()
    except Exception as e:
        print('failed to close kafka producer:', e)


def newsdata(art_content_dict, art_type_dict, dic_lables):
    """Enrich crawled articles, store their tags and publish them to Kafka.

    For every article in ``art_content_dict`` this:

    1. attaches the tags found in ``art_type_dict`` (empty list if none) and
       the labels derived from them through ``dic_lables``;
    2. fills in the fixed bookkeeping fields (``subjectId``, ``createDate``,
       ``checkStatus``, ``deleteFlag``, ``topNum``, ``summary``);
    3. writes the tags to the ``db_storage`` collection (up to 5 attempts);
    4. unless the article is flagged ``is_repeat == '1'``, sends it as UTF-8
       JSON to the ``research_center_fourth`` Kafka topic. If sending fails,
       the article's record is deleted from ``db_storage``.

    The article dicts are modified in place, exactly as before: the fields
    above are added, and for published articles ``is_repeat``, ``tags`` and
    the legacy-interface fields are removed.

    :param art_content_dict: mapping of article key -> article dict; each
        article dict is expected to carry at least ``id`` and ``is_repeat``.
    :param art_type_dict: mapping of article key -> list of tag dicts with
        ``type`` and ``name`` entries.
    :param dic_lables: mapping ``"<type>|<name>"`` ->
        ``"<labelMark>|<relationId>"``.
    :return: None
    """
    # One producer is created lazily and reused for the whole batch instead
    # of building (and never closing) a new one per article.
    producer = None
    try:
        for key, post_dict in art_content_dict.items():
            tags = art_type_dict.get(key)
            if tags is None:
                tags = []
            post_dict['tags'] = tags
            # TODO: labels mapping
            post_dict['labels'] = _build_labels(tags, dic_lables)
            post_dict['subjectId'] = "1534423014825668610"
            post_dict['createDate'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            post_dict['checkStatus'] = "1"
            post_dict['deleteFlag'] = "0"
            post_dict['topNum'] = "0"
            post_dict['summary'] = ""

            # Store the tags, retrying on transient database errors.
            for _ in range(5):
                try:
                    db_storage.update_one({'id': post_dict['id']}, {'$set': {'tags': tags}})
                    break
                except Exception:
                    time.sleep(2)
            else:
                print('failed to update tags after 5 attempts:', post_dict['id'])

            # Articles flagged as repeats are not published.
            if post_dict['is_repeat'] == '1':
                continue

            try:
                del post_dict['is_repeat']
                del post_dict['tags']
                # Send to Kafka.
                if producer is None:
                    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'],
                                             max_request_size=1024 * 1024 * 20)
                kafka_result = producer.send("research_center_fourth",
                                             json.dumps(post_dict, ensure_ascii=False).encode('utf8'))
                print(kafka_result.get(timeout=10))
                dic_result = {
                    'success': 'ture',
                    'message': '操作成功',
                    'code': '200',
                }
                print(dic_result)
                # The legacy HTTP push to the autoSaveXJPSpeak endpoint is
                # disabled; only the field stripping done for its payload is
                # kept, so the in-place effect on the article dict is the
                # same. Deleting dict keys can only raise KeyError.
                try:
                    for field in ('contentWithTag', 'lang', 'labels', 'createDate',
                                  'checkStatus', 'deleteFlag', 'topNum', 'summary'):
                        del post_dict[field]
                except KeyError:
                    print('数据传接口失败,正在重试!')
            except Exception as e:
                dic_result = {
                    'success': 'false',
                    'message': '操作失败',
                    'code': '204',
                    'e': e
                }
                print(dic_result)
                time.sleep(5)
                # Remove the stored record of the article that failed to send.
                db_storage.delete_one({'id': post_dict['id']})
                # Drop the possibly broken producer so the next article gets
                # a fresh connection, as the per-article producer used to.
                stale, producer = producer, None
                _close_producer(stale)
                continue
    finally:
        _close_producer(producer)
# 习讲话数据库 新增数据
# 习讲话数据库 新增数据
def
get_content
():
def
get_content
():
...
@@ -23,7 +112,7 @@ def get_content():
...
@@ -23,7 +112,7 @@ def get_content():
'Accept-Language'
:
'zh-CN,zh;q=0.9'
,
'Accept-Language'
:
'zh-CN,zh;q=0.9'
,
'Cookie'
:
'sfr=1; sso_c=0; __jsluid_h=5b9f09f6fdae46fadb89e1e02dca3238; wdcid=04fccdf5121158c0; wdses=72d07de4316a36a5; ci_session=4irerhndk5mc48dq7ldebjn5m47fqptg; wdlast=1646734820; ci_session=4irerhndk5mc48dq7ldebjn5m47fqptg'
'Cookie'
:
'sfr=1; sso_c=0; __jsluid_h=5b9f09f6fdae46fadb89e1e02dca3238; wdcid=04fccdf5121158c0; wdses=72d07de4316a36a5; ci_session=4irerhndk5mc48dq7ldebjn5m47fqptg; wdlast=1646734820; ci_session=4irerhndk5mc48dq7ldebjn5m47fqptg'
}
}
for
page
in
range
(
3
,
0
,
-
1
):
for
page
in
range
(
9
,
0
,
-
1
):
url
=
f
"http://jhsjk.people.cn/testnew/result?keywords=&isFuzzy=0&searchArea=0&year=0&form=0&type=0&page={page}&origin=
%
E5
%85%
A8
%
E9
%83%
A8&source=2"
url
=
f
"http://jhsjk.people.cn/testnew/result?keywords=&isFuzzy=0&searchArea=0&year=0&form=0&type=0&page={page}&origin=
%
E5
%85%
A8
%
E9
%83%
A8&source=2"
try
:
try
:
resp_json
=
requests
.
request
(
"GET"
,
url
,
headers
=
headers
,
verify
=
False
)
.
json
()
resp_json
=
requests
.
request
(
"GET"
,
url
,
headers
=
headers
,
verify
=
False
)
.
json
()
...
@@ -39,6 +128,8 @@ def get_content():
...
@@ -39,6 +128,8 @@ def get_content():
continue
continue
title
=
data_dict
[
'title'
]
title
=
data_dict
[
'title'
]
pub_time
=
data_dict
[
'input_date'
]
pub_time
=
data_dict
[
'input_date'
]
if
pub_time
<=
'2023-11-06'
:
continue
title_dict_list
=
db_storage
.
find
({
'title'
:
title
,
'is_repeat'
:
''
})
title_dict_list
=
db_storage
.
find
({
'title'
:
title
,
'is_repeat'
:
''
})
is_repeat
=
''
is_repeat
=
''
for
title_dict
in
title_dict_list
:
for
title_dict
in
title_dict_list
:
...
@@ -60,19 +151,28 @@ def get_content():
...
@@ -60,19 +151,28 @@ def get_content():
continue
continue
content_html
=
content_html1
+
'
\n
'
+
content_html2
content_html
=
content_html1
+
'
\n
'
+
content_html2
content
=
pq
(
content_html
)
.
text
()
content
=
pq
(
content_html
)
.
text
()
lang
=
langid
.
classify
(
content
)
if
lang
==
''
:
lang
=
'cn'
if
lang
[
0
]
==
''
:
lang
=
'cn'
else
:
lang
=
lang
[
0
]
if
content
.
strip
()
==
''
:
if
content
.
strip
()
==
''
:
print
(
href
,
'内容为空'
)
print
(
href
,
'内容为空'
)
continue
continue
origin
=
data_dict
[
'origin_name'
]
origin
=
data_dict
[
'origin_name'
]
a_dict
=
{
a_dict
=
{
'id'
:
article_id
,
'id'
:
"1534423014825668610"
+
article_id
,
'title'
:
title
,
'title'
:
title
,
'author'
:
''
,
'author'
:
''
,
'origin'
:
origin
,
'origin'
:
origin
,
'content'
:
content_html
,
'contentWithTag'
:
content_html
,
'content'
:
content
,
'publishDate'
:
pub_time
,
'publishDate'
:
pub_time
,
'sourceAddress'
:
href
,
'sourceAddress'
:
href
,
'tags'
:
[],
'tags'
:
[],
'lang'
:
lang
,
'is_repeat'
:
is_repeat
'is_repeat'
:
is_repeat
}
}
art_content_dict
[
article_id
]
=
a_dict
art_content_dict
[
article_id
]
=
a_dict
...
@@ -86,9 +186,47 @@ def get_content():
...
@@ -86,9 +186,47 @@ def get_content():
result_lists
=
[
result_lists
=
[
[
'类型'
,
'讲话'
,
'706'
,
'69'
],
[
'类型'
,
'会议'
,
'701'
,
'178'
],
[
'类型'
,
'活动'
,
'702'
,
'63'
],
[
'类型'
,
'考察'
,
'703'
,
'72'
],
[
'类型'
,
'讲话'
,
'706'
,
'69'
],
[
'类型'
,
'会议'
,
'701'
,
'178'
],
[
'类型'
,
'活动'
,
'702'
,
'63'
],
[
'类型'
,
'考察'
,
'703'
,
'72'
],
[
'类型'
,
'会见'
,
'704'
,
'174'
],
[
'类型'
,
'出访'
,
'705'
,
'188'
],
[
'类型'
,
'函电'
,
'707'
,
'194'
],
[
'类型'
,
'其他'
,
'708'
,
'203'
],
[
'类型'
,
'会见'
,
'704'
,
'174'
],
[
'类型'
,
'出访'
,
'705'
,
'188'
],
[
'类型'
,
'函电'
,
'707'
,
'194'
],
[
'类型'
,
'其他'
,
'708'
,
'203'
],
[
'时间'
,
'2023'
,
'2023'
,
'11'
],
[
'时间'
,
'2022'
,
'2022'
,
'10'
],
[
'时间'
,
'2021'
,
'2021'
,
'9'
],
[
'时间'
,
'2019'
,
'2019'
,
'8'
],
[
'时间'
,
'2018'
,
'2018'
,
'7'
],
[
'时间'
,
'2017'
,
'2017'
,
'6'
],
[
'时间'
,
'2016'
,
'2016'
,
'5'
],
[
'时间'
,
'2015'
,
'2015'
,
'4'
],
[
'时间'
,
'2014'
,
'2014'
,
'3'
],
[
'时间'
,
'2013'
,
'2013'
,
'2'
],
[
'时间'
,
'2012'
,
'2012'
,
'1'
],
[
'领域'
,
'经济'
,
'101'
,
'18'
],
[
'领域'
,
'政治'
,
'102'
,
'21'
],
[
'领域'
,
'文化'
,
'103'
,
'14'
],
[
'领域'
,
'社会'
,
'104'
,
'15'
],
[
'领域'
,
'经济'
,
'101'
,
'18'
],
[
'领域'
,
'政治'
,
'102'
,
'21'
],
[
'领域'
,
'文化'
,
'103'
,
'14'
],
[
'领域'
,
'社会'
,
'104'
,
'15'
],
[
'领域'
,
'生态'
,
'105'
,
'7'
],
[
'领域'
,
'党建'
,
'106'
,
'9'
],
[
'领域'
,
'国防'
,
'107'
,
'6'
],
[
'领域'
,
'外交'
,
'108'
,
'50'
],
[
'领域'
,
'生态'
,
'105'
,
'7'
],
[
'领域'
,
'党建'
,
'106'
,
'9'
],
[
'领域'
,
'国防'
,
'107'
,
'6'
],
[
'领域'
,
'外交'
,
'108'
,
'50'
],
]
]
dic_lables
=
{
"类型|讲话"
:
"important_speech_type|1700334917807710209"
,
"类型|会议"
:
"important_speech_type|1700334936166178818"
,
"类型|活动"
:
"important_speech_type|1700334960560250881"
,
"类型|考察"
:
"important_speech_type|1700334978285379585"
,
"类型|会见"
:
"important_speech_type|1700335044605714433"
,
"类型|出访"
:
"important_speech_type|1700335078852206593"
,
"类型|函电"
:
"important_speech_type|1700335099689508866"
,
"类型|其他"
:
"important_speech_type|1700335118056366082"
,
"时间|2012"
:
"important_speech_time|1700334545970077697"
,
"时间|2013"
:
"important_speech_time|1700334647757447170"
,
"时间|2014"
:
"important_speech_time|1700334667915272194"
,
"时间|2015"
:
"important_speech_time|1700334686550564865"
,
"时间|2016"
:
"important_speech_time|1700334704925810689"
,
"时间|2017"
:
"important_speech_time|1700334722529304578"
,
"时间|2018"
:
"important_speech_time|1700334738320859137"
,
"时间|2019"
:
"important_speech_time|1700334758302523393"
,
"时间|2020"
:
"important_speech_time|1700334777827008514"
,
"时间|2021"
:
"important_speech_time|1700334797477322753"
,
"时间|2022"
:
"important_speech_time|1700334814468448258"
,
"时间|2023"
:
"important_speech_time|1700334832495566850"
,
"领域|经济"
:
"important_speech_area|1700335225803841537"
,
"领域|政治"
:
"important_speech_area|1700335248096567297"
,
"领域|文化"
:
"important_speech_area|1700335379638329345"
,
"领域|社会"
:
"important_speech_area|1700335412873994242"
,
"领域|生态"
:
"important_speech_area|1700335541211308033"
,
"领域|党建"
:
"important_speech_area|1700335587780665346"
,
"领域|国防"
:
"important_speech_area|1700335615895085058"
,
"领域|外交"
:
"important_speech_area|1700335820430319618"
,
}
for
result_list
in
result_lists
:
for
result_list
in
result_lists
:
sort
=
result_list
[
0
]
sort
=
result_list
[
0
]
sort_text
=
result_list
[
1
]
sort_text
=
result_list
[
1
]
...
@@ -96,13 +234,19 @@ def get_content():
...
@@ -96,13 +234,19 @@ def get_content():
if
sort
==
'类型'
:
if
sort
==
'类型'
:
form
=
result_list
[
2
]
form
=
result_list
[
2
]
type_
=
'0'
type_
=
'0'
year
=
'0'
elif
sort
==
'时间'
:
form
=
'0'
type_
=
'0'
year
=
result_list
[
2
]
else
:
else
:
form
=
'0'
form
=
'0'
type_
=
result_list
[
2
]
type_
=
result_list
[
2
]
year
=
'0'
# total_page = result_list[3]
# total_page = result_list[3]
total_page
=
2
total_page
=
10
for
page
in
range
(
1
,
int
(
total_page
)):
for
page
in
range
(
1
,
int
(
total_page
)):
url
=
f
"http://jhsjk.people.cn/testnew/result?keywords=&isFuzzy=0&searchArea=0&year=
0
&form={form}&type={type_}&page={page}&origin=
%
E5
%85%
A8
%
E9
%83%
A8&source=2"
url
=
f
"http://jhsjk.people.cn/testnew/result?keywords=&isFuzzy=0&searchArea=0&year=
{year}
&form={form}&type={type_}&page={page}&origin=
%
E5
%85%
A8
%
E9
%83%
A8&source=2"
payload
=
{}
payload
=
{}
try
:
try
:
resp_json
=
requests
.
request
(
"GET"
,
url
,
headers
=
headers
,
verify
=
False
,
data
=
payload
)
.
json
()
resp_json
=
requests
.
request
(
"GET"
,
url
,
headers
=
headers
,
verify
=
False
,
data
=
payload
)
.
json
()
...
@@ -122,52 +266,24 @@ def get_content():
...
@@ -122,52 +266,24 @@ def get_content():
type_lists
=
[
type_dict
]
type_lists
=
[
type_dict
]
else
:
else
:
type_lists
=
type_list
+
[
type_dict
]
type_lists
=
type_list
+
[
type_dict
]
art_type_dict
[
article_id
]
=
type_lists
new_lst
=
[]
for
key
,
value
in
art_content_dict
.
items
():
tags
=
art_type_dict
.
get
(
key
)
# 遍历原列表的字典元素
if
tags
is
None
:
for
item
in
type_lists
:
tags
=
[]
# 如果字典元素不在新列表中,则添加到新列表中
value
[
'tags'
]
=
tags
if
item
not
in
new_lst
:
post_dict
=
value
new_lst
.
append
(
item
)
db_storage
.
update_one
({
'id'
:
post_dict
[
'id'
]},
{
'$set'
:
{
'tags'
:
tags
}})
if
post_dict
[
'is_repeat'
]
==
'1'
:
art_type_dict
[
article_id
]
=
new_lst
continue
try
:
newsdata
(
art_content_dict
,
art_type_dict
,
dic_lables
)
del
post_dict
[
'is_repeat'
]
# labels = []
# for tags_dict in post_dict['tags']:
# labels_dict = {
# 'abelRemarks': tags_dict.get('type'),
# 'relationName': tags_dict.get('name'),
# }
# labels.append(labels_dict)
# aaa_dict = {
# 'sid': '1533647545473859586',
# 'title': post_dict['title'],
# 'content': '',
# 'contentWithTag': post_dict['content'],
# 'summary': '',
# 'author': '',
# 'origin': post_dict['origin'],
# 'publishDate': post_dict['publishDate'],
# 'sourceAddress': post_dict['sourceAddress'],
# 'labels': labels
# }
post_url
=
'http://114.116.19.92:8088/api/reptile/autoSaveXJPSpeak'
headers
=
{
'Content-Type'
:
'application/json'
}
resp_json
=
requests
.
post
(
url
=
post_url
,
headers
=
headers
,
verify
=
False
,
data
=
json
.
dumps
(
post_dict
))
.
json
()
print
(
'推送:'
,
resp_json
[
'msg'
])
except
:
print
(
'数据传接口失败,正在重试!'
)
time
.
sleep
(
5
)
db_storage
.
delete_one
({
'id'
:
post_dict
[
'id'
]})
continue
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
try
:
try
:
get_content
()
get_content
()
except
Exception
as
e
:
except
Exception
as
e
:
print
(
e
)
pass
pass
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论