Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
王景浩
zzsn_spider
Commits
394de490
提交
394de490
authored
3月 02, 2024
作者:
LiuLiYuan
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
中央经济会议 03/02
上级
ee09212f
隐藏空白字符变更
内嵌
并排
正在显示
2 个修改的文件
包含
132 行增加
和
1 行删除
+132
-1
zyjjhy.py
comData/important_meeting/zyjjhy.py
+131
-0
zyqmshggldxzhy19.py
comData/important_meeting/zyqmshggldxzhy19.py
+1
-1
没有找到文件。
comData/important_meeting/zyjjhy.py
0 → 100644
浏览文件 @
394de490
# Scraper for the Central Economic Work Conference (中央经济会议) pages on 12371.cn.
import datetime
import json
import re
import time
import redis
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
from base import BaseCore

# Project-shared helper; sqlFlg=False skips SQL connection setup.
baseCore = BaseCore.BaseCore(sqlFlg=False)
log = baseCore.getLogger()
# Redis set store used for crawled-URL de-duplication (db 5).
# NOTE(review): host/port/password are hard-coded — consider moving to config/env.
r = redis.Redis(host='114.116.90.53', port=6380, password='clbzzsn', db=5)
# Browser-like headers so www.12371.cn serves the regular HTML page.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Referer': 'http://www.12371.cn/',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'cross-site',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0',
    'sec-ch-ua': '"Not A(Brand";v="99", "Microsoft Edge";v="121", "Chromium";v="121"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}
def is_member_containing_string(key, string):
    """Return True if any member of the Redis set *key* contains *string*.

    Walks the set incrementally with SSCAN so large sets are not fetched
    in a single round trip. Members are stored as bytes and decoded as
    UTF-8 before the substring test.
    """
    scan_cursor = '0'
    while True:
        scan_cursor, batch = r.sscan(key, scan_cursor)
        # Substring match against each member of this SSCAN page.
        if any(string in item.decode("utf-8") for item in batch):
            return True
        # A cursor of zero means the scan has wrapped around — nothing matched.
        if scan_cursor in (b'0', 0):
            return False
def sendKafka(dic_info):
    """Publish one article dict to the 'research_center_fourth' Kafka topic.

    Args:
        dic_info: article payload dict; must contain a 'title' key (used in logs).

    Returns:
        True if the send was handed to the producer and flushed, False on any
        failure. Best-effort by design: errors are logged, never raised, so the
        caller can decide whether to mark the URL as processed.
    """
    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
    try:
        producer.send("research_center_fourth",
                      json.dumps(dic_info, ensure_ascii=False).encode('utf8'))
        log.info(f'{dic_info["title"]}发送kafka成功')
        return True
    except Exception as e:
        log.info(f'{dic_info["title"]}发送kafka异常==={e}')
        return False
    finally:
        # Fix: the producer was created per call but never closed, leaking the
        # connection; send() is also asynchronous, so without close() (which
        # flushes pending messages) a buffered message could be lost after
        # returning True.
        producer.close()
def getData(year, summary, url):
    """Fetch and parse one conference article page into a publishable dict.

    Args:
        year: year label (e.g. '2023年') prefixed onto the title as '(year)'.
        summary: pre-extracted summary text to embed in the payload.
        url: absolute URL of the article detail page on 12371.cn.

    Returns:
        dict with the fields expected by the downstream Kafka consumer
        (id, title, origin, contentWithTag, content, summary, publishDate,
        sid, subjectId, sourceAddress, checkStatus, deleteFlag, createDate).
    """
    req = requests.get(url, headers=headers)
    # The site does not always declare its charset; trust the detected one.
    req.encoding = req.apparent_encoding
    soup = BeautifulSoup(req.text, 'html.parser')

    # Publish date sits inside an <i class="time"> like
    # '发布时间:2023年12月12日 09:30 来源:...'.
    timeTag = soup.find('i', class_='time').text.strip()
    publishDate = timeTag.split('发布时间:')[1].split('来源:')[0].strip()
    # NOTE(review): format reconstructed from the page layout — confirm the
    # separator between the date and the HH:MM part against a live page.
    publishDate = datetime.datetime.strptime(publishDate, "%Y年%m月%d日 %H:%M")
    publishDate = publishDate.strftime('%Y-%m-%d %H:%M:%S')

    title = soup.find('h1', class_='big_title').text.strip()
    title = f'({year})' + title

    contentWithTag = soup.find('div', class_='word')
    # Drop the embedded video-player box when present. Fix: guard against the
    # element being absent, which previously raised AttributeError on None.
    player = contentWithTag.find('div', class_='bfq_img1220')
    if player is not None:
        player.decompose()

    # Strip '延伸阅读' (further-reading) markers and any paragraph that is a
    # link stub. Fix: the original decomposed p and then still called
    # p.find('a') on the destroyed tag (and could decompose it twice);
    # decide once, decompose at most once.
    for p in contentWithTag.find_all('p'):
        if p.text.strip() == '延伸阅读' or p.find('a') is not None:
            p.decompose()

    # Remove inline scripts so .text yields clean article prose.
    for script in contentWithTag.find_all('script'):
        script.decompose()
    content = contentWithTag.text

    time_now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    dic_info = {
        # Millisecond timestamp appended to the subject id for uniqueness.
        'id': '1681549573150879745' + str(int(time.time() * 1000)),
        'title': title,
        'origin': '共产党员网',
        'contentWithTag': str(contentWithTag),
        'content': content,
        'summary': summary,
        'publishDate': publishDate,
        'sid': '1691634024094507010',
        'subjectId': '1681549573150879745',
        'sourceAddress': url,
        'checkStatus': 1,
        'deleteFlag': 0,
        'createDate': time_now,
    }
    return dic_info
def doJonb():
    """Crawl the newest Central Economic Work Conference articles from 12371.cn.

    Locates the latest conference entry on the special-topic index page,
    follows it to the article list, and pushes each unseen '【详细】' detail
    article to Kafka, recording its URL in the Redis set *info_code*.
    Stops at the first already-recorded URL (presumably the list is
    newest-first, so everything after it was crawled earlier — confirm).
    """
    info_code = 'IN-20230816-0006'
    index_url = 'https://www.12371.cn/special/lczyjjgzhy/'

    index_resp = requests.get(index_url, headers=headers)
    index_resp.encoding = index_resp.apparent_encoding
    index_soup = BeautifulSoup(index_resp.text, 'html.parser')

    # Latest-conference box: heading links to the article list, <p> holds
    # the summary blurb, and the heading text carries the year.
    title_box = index_soup.find('div', attrs={'id': 'page_body'}).find(
        'div', class_='dyw638_title_jj')
    heading = title_box.find('h2')
    list_href = heading.find('a').get('href')
    summary = title_box.find('p').text
    year = re.findall(r'\d+年', heading.text.strip())[0]

    list_resp = requests.get(list_href, headers=headers)
    list_resp.encoding = list_resp.apparent_encoding
    list_soup = BeautifulSoup(list_resp.text, 'html.parser')

    for anchor in list_soup.find('div', class_='word').find_all('a'):
        if anchor.text.strip() != '【详细】':
            continue
        detail_href = anchor.get('href')
        # First already-seen URL ends the whole run (not just this item).
        if is_member_containing_string(info_code, detail_href):
            return
        article = getData(year, summary, detail_href)
        # Only mark the URL as done once Kafka accepted the message.
        if sendKafka(article):
            r.sadd(info_code, detail_href)


if __name__ == '__main__':
    doJonb()
comData/important_meeting/zyqmshggldxzhy19.py
浏览文件 @
394de490
...
...
@@ -52,7 +52,7 @@ headers = {
if
__name__
==
"__main__"
:
# 中央全面深化改革委员会会议
r
=
redis
.
Redis
(
host
=
'114.115.236.206'
,
port
=
6379
,
password
=
'clbzzsn'
,
db
=
5
)
r
=
redis
.
Redis
(
host
=
'114.116.90.53'
,
port
=
6380
,
password
=
'clbzzsn'
,
db
=
5
)
# 中央全面深化改革领导小组会议
# url_list = ['https://www.12371.cn/special/zyqmshggldxzhy19/', 'https://www.12371.cn/special/zyqmshggldxzhy19/']
url
=
'https://www.12371.cn/special/zyqmshggldxzhy19/'
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论