Commit 8fb1c602
Authored Sep 11, 2023 by 薛凌堃

Merge remote-tracking branch 'origin/master'

Parents: 7424d8e4, abf7739a

Showing 1 changed file with 527 additions and 306 deletions.

comData/policylaw/2.py  (+527 −306, view file @ 8fb1c602)
@@ -12,7 +12,7 @@ from bs4 import BeautifulSoup
from kafka import KafkaProducer
from pyquery import PyQuery as pq
from requests.packages import urllib3
from requests.adapters import HTTPAdapter
from BaseCore import BaseCore

baseCore = BaseCore()
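The HTTPAdapter import is what the rest of this diff uses to make sessions survive flaky connections. For context, the mount pattern the commit introduces, in its minimal form (target URL illustrative):

import requests
from requests.adapters import HTTPAdapter

# Retry failed connection attempts up to 3 times, per scheme.
s = requests.session()
s.mount('https://', HTTPAdapter(max_retries=3))
s.mount('http://', HTTPAdapter(max_retries=3))
resp = s.get('https://example.com', timeout=10)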
@@ -116,7 +116,7 @@ def sendKafka(dic_news):
        # Transmission succeeded; write it to the log.
        state = 1
        takeTime = baseCore.getTimeCost(start_time, time.time())
        # return True
        return True
    except Exception as e:
@@ -130,6 +130,7 @@ def sendKafka(dic_news):
        e = 'Kafka操作失败'
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
+       return False

def redefid(idList):
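With sendKafka now returning True/False, every call site below can gate persistence on delivery. The producer setup itself is outside this hunk; a minimal sketch of the shape the diff implies, with broker address and topic name assumed rather than taken from the repo:

import json
from kafka import KafkaProducer

def sendKafka(dic_news):
    # Broker and topic are placeholders; the real config lives elsewhere in 2.py.
    try:
        producer = KafkaProducer(
            bootstrap_servers='127.0.0.1:9092',
            value_serializer=lambda v: json.dumps(v, ensure_ascii=False).encode('utf-8'))
        producer.send('policy', dic_news).get(timeout=30)  # block until the broker acks
        return True   # caller persists the record
    except Exception:
        return False  # caller skips save_data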
@@ -140,8 +141,39 @@ def redefid(idList):

def remove_dup():
    pass

# 国务院文件 (State Council documents)
def get_content1():
+   def getPageConunt(a_list, url, headers, s):
+       data = {
+           "code": "18122f54c5c",
+           "thirdPartyCode": "thirdparty_code_107",
+           "thirdPartyTableId": 30,
+           "resultFields": ["pub_url", "maintitle", "fwzh", "cwrq", "publish_time"],
+           "trackTotalHits": "true",
+           "searchFields": [{"fieldName": "maintitle", "searchWord": ""}],
+           "isPreciseSearch": 0,
+           "sorts": [{"sortField": "publish_time", "sortOrder": "DESC"}],
+           "childrenInfoIds": [[a_list[1]]],
+           "pageSize": 20,
+           "pageNo": 1
+       }
+       data = json.dumps(data)
+       ip = baseCore.get_proxy()
+       res = s.post(url=url, headers=headers, data=data, verify=False, proxies=ip)
+       # The response body is JSON.
+       res_text = json.loads(res.text)
+       pageCount = res_text['result']['data']['pager']['pageCount']
+       return pageCount
+
+   def getList(a_list, url, headers, pageNo, s):
+       # POST payload; identical to getPageConunt's except for pageNo.
+       data = {
+           "code": "18122f54c5c",
+           "thirdPartyCode": "thirdparty_code_107",
+           "thirdPartyTableId": 30,
+           "resultFields": ["pub_url", "maintitle", "fwzh", "cwrq", "publish_time"],
+           "trackTotalHits": "true",
+           "searchFields": [{"fieldName": "maintitle", "searchWord": ""}],
+           "isPreciseSearch": 0,
+           "sorts": [{"sortField": "publish_time", "sortOrder": "DESC"}],
+           "childrenInfoIds": [[a_list[1]]],
+           "pageSize": 20,
+           "pageNo": pageNo
+       }
+       data = json.dumps(data)
+       ip = baseCore.get_proxy()
+       res = s.post(url=url, headers=headers, data=data, verify=False, proxies=ip)
+       res_text = json.loads(res.text)
+       page_list = res_text['result']['data']['list']
+       return page_list

    start_time = time.time()
    num = 0
    # athenaAppKey / athenaAppName are needed to get past the site's validation.
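getPageConunt ("Count" is misspelled in the source) and getList build byte-identical payloads except for pageNo. A hedged refactor sketch that keeps the diff's field values (buildPayload is a name invented here, not in the repo):

import json

def buildPayload(a_list, pageNo):
    # Shared POST body for the page-count probe and the page fetch;
    # only pageNo differs between the two call sites.
    return json.dumps({
        "code": "18122f54c5c",
        "thirdPartyCode": "thirdparty_code_107",
        "thirdPartyTableId": 30,
        "resultFields": ["pub_url", "maintitle", "fwzh", "cwrq", "publish_time"],
        "trackTotalHits": "true",
        "searchFields": [{"fieldName": "maintitle", "searchWord": ""}],
        "isPreciseSearch": 0,
        "sorts": [{"sortField": "publish_time", "sortOrder": "DESC"}],
        "childrenInfoIds": [[a_list[1]]],
        "pageSize": 20,
        "pageNo": pageNo,
    })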
@@ -172,86 +204,142 @@ def get_content1():
    result_list = [['国令', "1108"], ['国发', "1107"], ['国函', "1106"], ['国发明电', "1105"],
                   ['国办发', "1104"], ['国办函', "1103"], ['国办发明电', "1102"], ['其他', "1101"]]
-   try:
-       for a_list in result_list:
-           s = requests.session()
-           s.keep_alive = False
-           pageNo = 1
-           pcodeJiguan = a_list[0]
-           # POST payload.
-           data = {
-               "code": "18122f54c5c",
-               "thirdPartyCode": "thirdparty_code_107",
-               "thirdPartyTableId": 30,
-               "resultFields": ["pub_url", "maintitle", "fwzh", "cwrq", "publish_time"],
-               "trackTotalHits": "true",
-               "searchFields": [{"fieldName": "maintitle", "searchWord": ""}],
-               "isPreciseSearch": 0,
-               "sorts": [{"sortField": "publish_time", "sortOrder": "DESC"}],
-               "childrenInfoIds": [[a_list[1]]],
-               "pageSize": 20,
-               "pageNo": pageNo
-           }
-           data = json.dumps(data)
-           res = s.post(url=url, headers=headers, data=data, verify=False)
-           # The response body is JSON.
-           res_text = json.loads(res.text)
-           page_list = res_text['result']['data']['list']
-           s.close()
-           for page in page_list:
-               # Pull the fields we need.
-               title = page['maintitle']
-               pub_time1 = page['publish_time']
-               pub_time2 = page['cwrq']
-               pub_code = page['fwzh']
-               href = page['pub_url']
-               # Skip anything already crawled.
-               is_href = db_storage.find_one({'网址': href})
-               if is_href:
-                   continue
-               resp_href = requests.get(url=href, headers=headers_, verify=False)
-               resp_href.encoding = resp_href.apparent_encoding
-               i_html = resp_href.text
-               if '您访问的页面不存在或已删除' in i_html:
-                   continue
-               i_soup = BeautifulSoup(i_html, 'html.parser')
-               source = str(i_soup.find_all('tbody')[0])
-               pub_org = source.split('<td><b>发文机关:</b></td>')[1].split('<td>')[1].split('</td>')[0]
-               child_type = source.split('<td class="w340 zcwj_ztfl">')[1].split('</td>')[0]
-               content = str(i_soup.find('table', attrs={'class': 'pages_content'}))
-               fu_jian_result = re.findall('href="(.*?)"', content)
-               fu_jian_href_list = []
-               if len(fu_jian_result) > 0:
-                   for fu_jian_re in fu_jian_result:
-                       if '.doc' in fu_jian_re or '.pdf' in fu_jian_re or '.xls' in fu_jian_re or '.zip' in fu_jian_re \
-                               or '.rar' in fu_jian_re or '.ppt' in fu_jian_re or '.PDF' in fu_jian_re or '.DOC' in fu_jian_re \
-                               or '.XLS' in fu_jian_re or '.ZIP' in fu_jian_re or '.RAR' in fu_jian_re:
-                           fu_jian_href = fu_jian_re
-                           fu_jian_href_list.append(fu_jian_href)
-               result_dict = {
-                   '标题': title,
-                   '来源': '',
-                   '发文机关': pub_org,
-                   '发文字号': pub_code,
-                   '内容-未去标签': content,
-                   '附件网址': fu_jian_href_list,
-                   '发布时间': pub_time1,
-                   '成文时间': pub_time2,
-                   '主题分类': child_type,
-                   '网址': href,
-                   '归属': pcodeJiguan,
-                   '信息来源': '国务院文件',
-                   'tid': 1766,
-               }
-               resp_href.close()
-               print(title)
-               # save_data(result_dict)
-               # time.sleep(1)
-               num += 1
-       except:
-           pass
-   except:
-       pass
-   end_time = time.time()
-   print(f'共抓取{num}条数据,共耗时{start_time - end_time}')
+   for a_list in result_list:
+       s = requests.session()
+       s.mount('https://', HTTPAdapter(max_retries=3))
+       s.mount('http://', HTTPAdapter(max_retries=3))
+       s.keep_alive = False
+       pcodeJiguan = a_list[0]
+       try:
+           pageCount = getPageConunt(a_list, url, headers, s)
+           for pageNo in range(1, pageCount + 1):
+               try:
+                   try:
+                       page_list = getList(a_list, url, headers, pageNo, s)
+                   except:
+                       s.close()
+                       page_list = getList(a_list, url, headers, pageNo, s)
+                   for page in page_list:
+                       id_list = []
+                       # Pull the fields we need.
+                       title = page['maintitle']         # title
+                       pub_time1 = page['publish_time']  # publish time
+                       pub_time2 = page['cwrq']          # date written
+                       pub_code = page['fwzh']           # issue number
+                       href = page['pub_url']            # URL
+                       # Skip anything already crawled.
+                       is_href = db_storage.find_one({'网址': href})
+                       if is_href:
+                           log.info('已采集----------跳过')
+                           continue
+                       try:
+                           resp_href = requests.get(url=href, headers=headers_, verify=False)
+                           resp_href.encoding = resp_href.apparent_encoding
+                           i_html = resp_href.text
+                           if '您访问的页面不存在或已删除' in i_html:
+                               # log.error(f'{title}...{href}...页面不存在或已删除')
+                               continue
+                           i_soup = BeautifulSoup(i_html, 'html.parser')
+                           i_soup = paserUrl(i_soup, href)
+                           source = str(i_soup.find_all('tbody')[0])
+                           pub_org = source.split('<td><b>发文机关:</b></td>')[1].split('<td>')[1].split('</td>')[0]  # issuing organ
+                           child_type = source.split('<td class="w340 zcwj_ztfl">')[1].split('</td>')[0]             # topic category
+                           contentWithTag = i_soup.find('div', class_='wrap mxxgkwrap mxxgkwrap_gwywj') \
+                               .find('table', class_='border-table noneBorder pages_content')
+                           # Remove the "scan the QR code" block.
+                           contentWithTag.find('div', attrs={'id': 'div_div'}).decompose()
+                           content = contentWithTag.text  # body text without tags
+                           fu_jian_soup = contentWithTag.find_all('a')
+                           time.sleep(0.5)
+                           for file in fu_jian_soup:
+                               try:
+                                   file_href = file['href']
+                               except Exception as e:
+                                   log.info(f'---{href}--------{e}-------')
+                                   continue
+                               if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
+                                       or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
+                                       or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
+                                   file_name = file.text.strip()
+                                   retData = baseCore.uploadToserver(file_href, '1766')
+                                   if retData['state']:
+                                       pass
+                                   else:
+                                       continue
+                                   att_id, full_path = baseCore.tableUpdate(retData, '国务院文件', file_name, num)
+                                   id_list.append(att_id)
+                                   # todo: point the link in the soup at the returned address
+                                   file['href'] = 'http://114.115.215.96/' + full_path
+                       except:
+                           log.error(f'{title}...{href}...获取内容失败')
+                           continue
+                       # todo: once the links are rewritten the attachments live on the file server
+                       time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+                       # todo: fields sent to Kafka
+                       dic_news = {
+                           'attachmentIds': id_list,               # attachment ids
+                           'author': '',                           # author
+                           'content': content,                     # body text without tags
+                           'contentWithTag': str(contentWithTag),  # body text with tags
+                           'createDate': time_now,                 # creation time
+                           'deleteFlag': 0,                        # delete flag (0 default, 1 deleted)
+                           'id': '',
+                           'labels': [{'relationId': "1766", 'relationName': "国务院文件", 'labelMark': "policy"}],
+                           'origin': '',                           # publishing body
+                           'organ': pub_org,                       # issuing organ
+                           'topicClassification': child_type,      # policy category
+                           'issuedNumber': pub_code,               # issue number
+                           'publishDate': pub_time1,               # publish time
+                           'writtenDate': pub_time2,               # date written
+                           'sid': '1697458829758697473',           # source id
+                           'sourceAddress': href[0],               # original link (as written this indexes the URL's first character)
+                           'summary': '',                          # summary
+                           'title': title                          # title
+                       }
+                       # print(dic_news)
+                       flag = sendKafka(dic_news)
+                       if flag:
+                           save_data(dic_news)
+                       num += 1
+               except:
+                   log.error(f'{pcodeJiguan}...第{pageNo}页获取列表失败')
+                   continue
+       except:
+           log.error(f'{pcodeJiguan}...获取总数失败')
+           continue
+   end_time = time.time()
+   print(f'共抓取{num}条数据,共耗时{start_time - end_time}')
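The attachment handling added above repeats throughout the file: filter links by extension, upload, collect the attachment id, rewrite the href to the file server. Condensed into one sketch (rewrite_attachments is a name invented here; uploadToserver/tableUpdate semantics are inferred from the call sites):

ATTACHMENT_EXTS = ('.pdf', '.doc', '.docx', '.xls', '.zip', '.rar', '.ppt')

def rewrite_attachments(contentWithTag, tid, num):
    # Upload every attachment-looking link, then point it at the file server.
    id_list = []
    for file in contentWithTag.find_all('a'):
        file_href = file.get('href', '')
        # .lower() covers the .PDF/.DOC/.XLS/.ZIP/.RAR variants spelled out in the diff.
        if not file_href.lower().endswith(ATTACHMENT_EXTS):
            continue
        retData = baseCore.uploadToserver(file_href, tid)
        if not retData['state']:
            continue
        att_id, full_path = baseCore.tableUpdate(retData, '国务院文件', file.text.strip(), num)
        id_list.append(att_id)
        file['href'] = 'http://114.115.215.96/' + full_path
    return id_list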
# 国务院部门文件 (State Council ministry documents)
def get_content2():
+   def getTotalpage(bmfl, headers, session):
+       ip = baseCore.get_proxy()
+       pageNo = 1
+       time.sleep(2)
+       # Build the request URL.
+       url_ = f'https://sousuo.www.gov.cn/search-gov/data?t=zhengcelibrary_bm&q=&timetype=timeqb&mintime=&maxtime=&sort=score&sortType=1&searchfield=title&puborg=&pcodeYear=&pcodeNum=&filetype=&p={pageNo}&n=20&inpro=&bmfl={bmfl}&dup=&orpro=&type=gwyzcwjk'
+       resp = session.get(url=url_, headers=headers, verify=False, proxies=ip)
+       resp_text = resp.text
+       resp_json = json.loads(resp_text)
+       totalpage = resp_json['searchVO']['totalpage']
+       return totalpage
+
+   def getContentList(bmfl, pageNo, headers, session):
+       ip = baseCore.get_proxy()
+       url_ = f'https://sousuo.www.gov.cn/search-gov/data?t=zhengcelibrary_bm&q=&timetype=timeqb&mintime=&maxtime=&sort=score&sortType=1&searchfield=title&puborg=&pcodeYear=&pcodeNum=&filetype=&p={pageNo}&n=20&inpro=&bmfl={bmfl}&dup=&orpro=&type=gwyzcwjk'
+       # The response body is JSON.
+       resp = session.get(url=url_, headers=headers, verify=False, proxies=ip)
+       resp_text = resp.text
+       resp_json = json.loads(resp_text)
+       content_list = resp_json['searchVO']['listVO']
+       return content_list
+
+   session = requests.session()
+   session.mount('https://', HTTPAdapter(max_retries=3))
+   session.mount('http://', HTTPAdapter(max_retries=3))
+   session.keep_alive = False
    start_time = time.time()
    num = 0
    result_list = ['外交部', '国家发展和改革委员会', '教育部', '科学技术部', '工业和信息化部',
                   '国家民族事务委员会', '公安部', '国家安全部', '民政部', '司法部', '财政部', ...
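Both helpers pass baseCore.get_proxy() straight into requests as proxies=ip, so get_proxy() presumably returns the scheme-to-URL mapping requests expects. Illustrative shape only; the addresses and the stand-in function are made up here, the real pool lives in BaseCore:

import requests

def get_proxy():
    # Stand-in for baseCore.get_proxy().
    return {'http': 'http://127.0.0.1:8080', 'https': 'http://127.0.0.1:8080'}

resp = requests.get('https://sousuo.www.gov.cn/', verify=False,
                    proxies=get_proxy(), timeout=10)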
@@ -271,171 +359,261 @@ def get_content2():
    for bmfl in result_list:
        try:
-           pageNo = 0
-           time.sleep(2)
-           # Build the request URL.
-           url_ = f'https://sousuo.www.gov.cn/search-gov/data?t=zhengcelibrary_bm&q=&timetype=timeqb&mintime=&maxtime=&sort=score&sortType=1&searchfield=title&puborg=&pcodeYear=&pcodeNum=&filetype=&p={pageNo}&n=20&inpro=&bmfl={bmfl}&dup=&orpro=&type=gwyzcwjk'
-           try:
-               # The response body is JSON.
-               resp = requests.get(url=url_, headers=headers, verify=False)
-               resp_text = resp.text
-               resp_json = json.loads(resp_text)
-               content_list = resp_json['searchVO']['listVO']
-               resp.close()
-           except:
-               continue
-           for content_dict in content_list:
-               href = content_dict['url']        # detail page
-               title = content_dict['title']     # title
-               pub_code = content_dict['pcode']  # issue number
-               try:
-                   pub_time = int(content_dict['pubtime'] / 1000)  # publish time
-                   pub_time1 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(pub_time))
-               except:
-                   pub_time1 = ''
-               try:
-                   p_time = int(content_dict['ptime'] / 1000)  # date written
-                   pub_time2 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(p_time))
-               except:
-                   pub_time2 = ''
-               pub_org = content_dict['puborg']  # issuing organ
-               try:
-                   child_type = content_dict['childtype']  # topic category
-               except:
-                   child_type = ''
-               # Skip anything already crawled.
-               is_href = db_storage.find_one({'网址': href})
-               if is_href:
-                   continue
-               try:
-                   resp = requests.get(url=href, headers=headers, verify=False)
-                   resp.encoding = 'utf-8'
-                   resp_text = resp.text
-                   soup = BeautifulSoup(resp_text, 'html.parser')
-                   time.sleep(1)
-                   content = str(soup.find('div', attrs={'class': 'pages_content mhide'}))
-                   fu_jian_result = re.findall('href="(.*?)"', content)
-                   fu_jian_href_list = []
-                   if len(fu_jian_result) > 0:
-                       for fu_jian_re in fu_jian_result:
-                           if '.doc' in fu_jian_re or '.pdf' in fu_jian_re or '.xls' in fu_jian_re or '.zip' in fu_jian_re \
-                                   or '.rar' in fu_jian_re or '.ppt' in fu_jian_re or '.PDF' in fu_jian_re or '.DOC' in fu_jian_re \
-                                   or '.XLS' in fu_jian_re or '.ZIP' in fu_jian_re or '.RAR' in fu_jian_re:
-                               fu_jian_href = href.split('content')[0] + fu_jian_re
-                               fu_jian_href_list.append(fu_jian_href)
-                   resp.close()
-                   result_dict = {
-                       '标题': title,
-                       '来源': '',
-                       '发文机关': pub_org,
-                       '发文字号': pub_code,
-                       '内容-未去标签': content,
-                       '附件网址': fu_jian_href_list,
-                       '发布时间': pub_time1,
-                       '成文时间': pub_time2,
-                       '主题分类': child_type,
-                       '网址': href,
-                       '归属': bmfl,
-                       '信息来源': '国务院部门文件',
-                       'tid': 1699,
-                   }
-                   print(title)
-                   save_data(result_dict)
-                   num += 1
-               except:
-                   pass
-       except:
-           pass
+           totalpage = getTotalpage(bmfl, headers, session)
+           for pageNo in range(1, totalpage + 1):
+               try:
+                   try:
+                       content_list = getContentList(bmfl, pageNo, headers, session)
+                   except:
+                       session.close()
+                       content_list = getContentList(bmfl, pageNo, headers, session)
+                   for content_dict in content_list:
+                       id_list = []
+                       href = content_dict['url']        # detail page
+                       title = content_dict['title']     # title
+                       pub_code = content_dict['pcode']  # issue number
+                       try:
+                           pub_time = int(content_dict['pubtime'] / 1000)  # publish time
+                           pub_time1 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(pub_time))
+                       except:
+                           pub_time1 = ''
+                       try:
+                           p_time = int(content_dict['ptime'] / 1000)  # date written
+                           pub_time2 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(p_time))
+                       except:
+                           pub_time2 = ''
+                       pub_org = content_dict['puborg']  # issuing organ
+                       try:
+                           child_type = content_dict['childtype']  # topic category
+                       except:
+                           child_type = ''
+                       # Skip anything already crawled.
+                       is_href = db_storage.find_one({'网址': href})
+                       if is_href:
+                           log.info('已采集----------跳过')
+                           continue
+                       try:
+                           resp = requests.get(url=href, headers=headers, verify=False)
+                           resp.encoding = resp.apparent_encoding
+                           resp_text = resp.text
+                           soup = BeautifulSoup(resp_text, 'html.parser')
+                           soup = paserUrl(soup, href)
+                           time.sleep(0.5)
+                           contentWithTag = soup.find('div', attrs={'class': 'pages_content mhide'})
+                           content = contentWithTag.text
+                           fu_jian_soup = contentWithTag.find_all('a')
+                           for file in fu_jian_soup:
+                               try:
+                                   file_href = file['href']
+                               except Exception as e:
+                                   log.info(f'---{href}--------{e}-------')
+                                   continue
+                               if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
+                                       or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
+                                       or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
+                                   file_name = file.text.strip()
+                                   retData = baseCore.uploadToserver(file_href, '1699')
+                                   if retData['state']:
+                                       pass
+                                   else:
+                                       continue
+                                   att_id, full_path = baseCore.tableUpdate(retData, '国务院文件', file_name, num)
+                                   id_list.append(att_id)
+                                   # todo: point the link in the soup at the returned address
+                                   file['href'] = 'http://114.115.215.96/' + full_path
+                       except:
+                           print(f'{title}...{href}获取内容失败')
+                           continue
+                       time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+                       # todo: fields sent to Kafka
+                       dic_news = {
+                           'attachmentIds': id_list,               # attachment ids
+                           'author': '',                           # author
+                           'content': content,                     # body text without tags
+                           'contentWithTag': str(contentWithTag),  # body text with tags
+                           'createDate': time_now,                 # creation time
+                           'deleteFlag': 0,                        # delete flag (0 default, 1 deleted)
+                           'id': '',
+                           'labels': [{'relationId': "1699", 'relationName': "国务院各部委文件", 'labelMark': "policy"}],
+                           'origin': '',                           # publishing body
+                           'organ': pub_org,                       # issuing organ
+                           'topicClassification': child_type,      # policy category
+                           'issuedNumber': pub_code,               # issue number
+                           'publishDate': pub_time1,               # publish time
+                           'writtenDate': pub_time2,               # date written
+                           'sid': '1697458829758697473',           # source id
+                           'sourceAddress': href,                  # original link
+                           'summary': '',                          # summary
+                           'title': title                          # title
+                       }
+                       # print(dic_news)
+                       flag = sendKafka(dic_news)
+                       if flag:
+                           save_data(dic_news)
+                       num += 1
+               except:
+                   print(f'{bmfl}...第{pageNo}页获取信息列表失败')
+                   continue
+       except:
+           print(f'{bmfl}...获取页数失败')
+           continue
    end_time = time.time()
    print(f'共抓取{num}条数据,耗时{end_time - start_time}')
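The pubtime/ptime fields arrive as epoch milliseconds, hence the int(.../1000) before time.strftime. Worked through standalone with a made-up timestamp:

import time

pubtime_ms = 1694390400000  # e.g. a value from content_dict['pubtime']
pub_time1 = time.strftime("%Y-%m-%d %H:%M:%S",
                          time.localtime(int(pubtime_ms / 1000)))
print(pub_time1)  # '2023-09-11 08:00:00' on a UTC+8 machine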
# 国务院国有资产监督管理委员会-政策发布 (SASAC policy releases)
def get_content3():
-   start_time = time.time()
-   num = 0
-   url = "http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index.html"
-   try:
-       # GET request; SSL verification has to be disabled.
-       href_resp = requests.request("GET", url, headers=headers, verify=False)
-       resp_text = href_resp.content.decode('UTF-8')
-       doc_resp = pq(resp_text)
-       doc_items = doc_resp('.zsy_conlist li').items()
-       time.sleep(1)
-       for doc_item in doc_items:
-           # Pull the fields we need.
-           try:
-               href_ = doc_item('a').attr('href')
-               if href_ is None:
-                   continue
-               href = f'http://www.sasac.gov.cn{href_.replace("../../..", "")}'
-               # Skip anything already crawled.
-               is_href = db_storage.find_one({'网址': href})
-               if is_href:
-                   continue
-               title = doc_item('a').attr('title')
-               pub_time = doc_item('span').text().replace('[', '').replace(']', '')
-               resp_href = requests.request("GET", href, headers=headers, verify=False)
-               doc_href = pq(resp_href.content)
-               time.sleep(1)
-               content_html = str(doc_href('.zsy_comain').remove('style').remove('#qr_container'))
-               content = pq(content_html).text()
-           except:
-               continue
-           if content.strip() == '':
-               continue
-           try:
-               org_content = doc_href('.zsy_cotitle').text()
-               org = re.findall('文章来源:(.*?)发布时间:', org_content)[0].strip()
-           except:
-               org = ''
-           try:
-               resp_href.encoding = 'utf-8'
-               resp_text_ = BeautifulSoup(resp_href.text, 'html.parser')
-               zsy_comain = resp_text_.find('div', attrs={'class': 'zsy_comain'})
-               p_list = zsy_comain.findAll('p')
-               pub_hao = ''
-               for p in p_list:
-                   p = str(p.text)
-                   if '号' in p and '〔' in p and '〕' in p or '[' in p and ']' in p and '号' in p or '【' in p and '】' in p and '号' in p:
-                       try:
-                           pub_hao = p.split('日')[1].split('自')[0].strip().lstrip()
-                       except:
-                           pub_hao = p.strip().lstrip()
-                       break
-           except:
-               pub_hao = ''
-           if len(pub_hao) > 45:
-               pub_hao = ''
-           result_dict = {
-               '标题': title,
-               '来源': org,
-               '发文机关': '',
-               '发文字号': pub_hao,
-               '内容-未去标签': content_html,
-               '附件网址': [],
-               '发布时间': pub_time,
-               '成文时间': '',
-               '主题分类': '',
-               '网址': href,
-               '归属': '国务院国资委',
-               '信息来源': '国务院国资委',
-               'tid': 1642,
-           }
-           save_data(result_dict)
-           print(title)
-           num += 1
-   except:
-       pass
-   end_time = time.time()
-   print(f'共抓取{num}条数据,耗时{end_time - start_time}')
+   def getPage():
+       url = "http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index.html"
+       req = requests.get(url, headers=headers, verify=False)
+       req.encoding = req.apparent_encoding
+       soup = BeautifulSoup(req.text, 'html.parser')
+       totalpage = re.findall("maxPageNum = (.*);", soup.select('#pag_2603340')[0].text)[0]
+       return int(totalpage)
+
+   def sendContent(href, headers, title, pub_time, num):
+       id_list = []
+       resp_href = requests.request("GET", href, headers=headers, verify=False)
+       resp_href.encoding = resp_href.apparent_encoding
+       soup = BeautifulSoup(resp_href.text, 'lxml')
+       soup = paserUrl(soup, href)
+       doc_href = soup.find('div', class_='zsy_content')
+       try:
+           org_content = doc_href.select('.zsy_cotitle')[0]
+           org = re.findall('文章来源:(.*?)发布时间:', org_content)[0].strip()
+       except:
+           org = ''
+       contentWithTag = doc_href.find('div', class_='zsy_comain')
+       # Strip the QR-code, share and "related" widgets.
+       contentWithTag.select('#qr_container')[0].decompose()
+       contentWithTag.find('div', attrs={'id': 'div_div'}).decompose()
+       contentWithTag.find('div', class_='related').decompose()
+       contentWithTag.find('div', class_='jiathis_style_24x24').decompose()
+       try:
+           p_list = contentWithTag.findAll('p')
+           pub_hao = ''
+           for p in p_list:
+               p = str(p.text)
+               if '号' in p and '〔' in p and '〕' in p or '[' in p and ']' in p and '号' in p or '【' in p and '】' in p and '号' in p:
+                   try:
+                       pub_hao = p.split('日')[1].split('自')[0].strip().lstrip()
+                   except:
+                       pub_hao = p.strip().lstrip()
+                   break
+       except:
+           pub_hao = ''
+       if len(pub_hao) > 15:
+           pub_hao = ''
+       content = contentWithTag.text
+       fu_jian_soup = contentWithTag.find_all('a')
+       for file in fu_jian_soup:
+           try:
+               file_href = file['href']
+           except Exception as e:
+               log.info(f'---{href}--------{e}-------')
+               continue
+           if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
+                   or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
+                   or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
+               file_name = file.text.strip()
+               retData = baseCore.uploadToserver(file_href, '1642')
+               if retData['state']:
+                   pass
+               else:
+                   continue
+               att_id, full_path = baseCore.tableUpdate(retData, '国务院国资委', file_name, num)
+               id_list.append(att_id)
+               # todo: point the link in the soup at the returned address
+               file['href'] = 'http://114.115.215.96/' + full_path
+       time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+       # todo: fields sent to Kafka
+       dic_news = {
+           'attachmentIds': id_list,               # attachment ids
+           'author': '',                           # author
+           'content': content,                     # body text without tags
+           'contentWithTag': str(contentWithTag),  # body text with tags
+           'createDate': time_now,                 # creation time
+           'deleteFlag': 0,                        # delete flag (0 default, 1 deleted)
+           'id': '',
+           'labels': [{'relationId': "1642", 'relationName': "国务院国资委", 'labelMark': "policy"}],
+           'origin': '',                           # publishing body
+           'organ': org,                           # issuing organ
+           'topicClassification': '',              # policy category
+           'issuedNumber': pub_hao,                # issue number
+           'publishDate': pub_time,                # publish time
+           'writtenDate': '',                      # date written
+           'sid': '1697458829758697473',           # source id
+           'sourceAddress': href,                  # original link
+           'summary': '',                          # summary
+           'title': title                          # title
+       }
+       # print(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
+
+   def partTwo():
+       start_time = time.time()
+       num = 0
+       totalpage = getPage()
+       for page in range(1, totalpage):
+           url = f"http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index_2603340_{page}.html"
+           href_resp = requests.request("GET", url, headers=headers, verify=False)
+           resp_text = href_resp.content.decode('UTF-8')
+           li_list = resp_text.split('<li>')
+           del (li_list[0])
+           for li in li_list:
+               id_list = []
+               href_ = li.split('<a href="')[1].split('" target=')[0]
+               title = li.split('title="')[1].split('">')[0]
+               href = f'http://www.sasac.gov.cn{href_.replace("../../..", "")}'
+               pub_time = li.split('<span>[')[1].split(']</span>')[0]
+               # Skip anything already crawled.
+               is_href = db_storage.find_one({'网址': href})
+               if is_href:
+                   log.info('已采集----------跳过')
+                   continue
+               try:
+                   sendContent(href, headers, title, pub_time, num)
+                   num += 1
+               except:
+                   continue
+       end_time = time.time()
+       print(f'共抓取{num}条数据,耗时{end_time - start_time}')
+
+   def partOne():
+       start_time = time.time()
+       num = 0
+       url = "http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index.html"
+       try:
+           # GET request; SSL verification has to be disabled.
+           href_resp = requests.request("GET", url, headers=headers, verify=False)
+           resp_text = href_resp.content.decode('UTF-8')
+           doc_resp = pq(resp_text)
+           doc_items = doc_resp('.zsy_conlist li').items()
+           time.sleep(1)
+           for doc_item in doc_items:
+               # Pull the fields we need.
+               try:
+                   href_ = doc_item('a').attr('href')
+                   if href_ is None:
+                       continue
+                   href = f'http://www.sasac.gov.cn{href_.replace("../../..", "")}'
+                   # Skip anything already crawled.
+                   is_href = db_storage.find_one({'网址': href})
+                   if is_href:
+                       log.info('已采集----------跳过')
+                       continue
+                   title = doc_item('a').attr('title')
+                   pub_time = doc_item('span').text().replace('[', '').replace(']', '')
+               except:
+                   continue
+               try:
+                   sendContent(href, headers, title, pub_time, num)
+                   num += 1
+               except:
+                   pass
+       except:
+           pass
+       end_time = time.time()
+       print(f'共抓取{num}条数据,耗时{end_time - start_time}')
+
+   partOne()
+   partTwo()

from bs4 import BeautifulSoup
from urllib.parse import urljoin
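paserUrl is applied to each detail-page soup before hrefs are read, and the trailing BeautifulSoup/urljoin imports hint at what it does: absolutizing relative links. A plausible sketch under that assumption; the real helper is defined elsewhere in the repo:

from urllib.parse import urljoin
from bs4 import BeautifulSoup

def paserUrl(soup: BeautifulSoup, base_url: str) -> BeautifulSoup:
    # Rewrite relative href/src attributes against the page's own URL.
    for tag in soup.find_all(['a', 'img']):
        for attr in ('href', 'src'):
            if tag.has_attr(attr):
                tag[attr] = urljoin(base_url, tag[attr])
    return soup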
@@ -580,8 +758,9 @@ def bei_jing():
            'title': title
        }
        # print(dic_news)
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        # print(id)
        # id_list.append(id)
        num += 1
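Every per-province hunk from here to the end of the diff makes the same two-line change: publish to Kafka first, persist only on success. The repetition could be folded into one helper (send_and_save is a name invented here, not in the diff):

def send_and_save(dic_news):
    # Persist to storage only when the Kafka send succeeded.
    if sendKafka(dic_news):
        save_data(dic_news)
        return True
    return False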
@@ -698,9 +877,10 @@ def nei_meng_gu():
            'summary': '',
            'title': title
        }
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        num = num + 1
    except:
@@ -890,8 +1070,9 @@ def ji_lin():
                continue
            else:
                # print(dic_news)
-               sendKafka(dic_news)
-               save_data(dic_news)
+               flag = sendKafka(dic_news)
+               if flag:
+                   save_data(dic_news)
                num = num + 1
        except Exception as e:
            print(e)
@@ -1024,8 +1205,9 @@ def shang_hai():
            'summary': '',
            'title': title
        }
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        num = num + 1
    except:
        pass
@@ -1143,8 +1325,9 @@ def zhe_jiang():
            'title': title
        }
        # print(dic_news)
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        num = num + 1
    except:
@@ -1301,8 +1484,9 @@ def fu_jian():
            'title': title
        }
        # print(dic_news)
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        print(title)
        num += 1
    except:
@@ -1410,8 +1594,9 @@ def shan_dong():
            'title': title
        }
        # print(dic_news)
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        if content == '' or content == 'None':
            continue
        else:
@@ -1512,8 +1697,9 @@ def guang_dong():
            'title': title
        }
        # print(dic_news)
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        print(title)
        # save_data(result_dict)
        num = num + 1
@@ -1697,8 +1883,9 @@ def hai_nan():
            'title': title
        }
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        print(title)
        num = num + 1
@@ -1768,8 +1955,9 @@ def hai_nan():
            'summary': '',
            'title': title
        }
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        href_text.close()
        # save_data(result_dict)
        print(title)
@@ -1873,8 +2061,9 @@ def hai_nan():
            'title': title
        }
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        href_text.close()
        # save_data(result_dict)
        print(title)
@@ -1979,8 +2168,9 @@ def hai_nan():
            'title': title
        }
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        href_text.close()
        # save_data(result_dict)
        print(title)
@@ -2065,8 +2255,9 @@ def hai_nan():
            'title': title
        }
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        href_text.close()
        # save_data(result_dict)
        print(title)
@@ -2238,8 +2429,9 @@ def si_chuan():
            'title': title
        }
        # print(dic_news)
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        print(title)
        num = num + 1
@@ -2363,8 +2555,9 @@ def guang_xi():
            'title': title
        }
        # print(dic_news)
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        print(title)
        num = num + 1
    except:
@@ -2471,8 +2664,9 @@ def gui_zhou():
            'title': title
        }
        # print(dic_news)
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        print(title)
        # save_data(result_dict)
        num = num + 1
@@ -2584,8 +2778,9 @@ def yun_nan():
            'title': title
        }
        # print(dic_news)
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        print(title)
        num = num + 1
    except:
@@ -2696,8 +2891,9 @@ def yun_nan():
            'title': title
        }
        # print(dic_news)
-       # sendKafka(dic_news)
-       # save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        print(title)
        num = num + 1
@@ -2826,8 +3022,9 @@ def chong_qing():
            'title': title
        }
        # print(dic_news)
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        print(title)
        # save_data(result_dict)
        num += 1
@@ -2951,8 +3148,9 @@ def tian_jin():
            'title': title
        }
        # print(dic_news)
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        num += 1
    except:
        pass
@@ -3073,8 +3271,9 @@ def tian_jin():
            'title': title
        }
        # print(dic_news)
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        num += 1
    except:
        pass
@@ -3199,8 +3398,9 @@ def tian_jin():
            'title': title
        }
        # print(dic_news)
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        num += 1
    except:
        pass
@@ -3306,8 +3506,9 @@ def xin_jiang():
            'title': title
        }
        # print(dic_news)
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        num += 1
    except:
        pass
@@ -3403,8 +3604,9 @@ def xin_jiang():
            'title': title
        }
        # print(dic_news)
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        num += 1
        href_res.close()
    except:
@@ -3521,8 +3723,9 @@ def shan_xi():
            'title': title
        }
        # print(dic_news)
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        num += 1
    except:
        pass
@@ -3630,8 +3833,9 @@ def liao_ning():
            'title': title
        }
        # print(dic_news)
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        num += 1
    except:
        pass
@@ -3723,8 +3927,9 @@ def hei_long_jiang():
            'title': title
        }
        # print(dic_news)
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        num += 1
    except:
        pass
@@ -3836,8 +4041,9 @@ def jiang_su():
            'title': title
        }
        # print(dic_news)
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        num += 1
    except:
        pass
@@ -3930,8 +4136,9 @@ def an_hui():
            'title': title
        }
        # print(dic_news)
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        num += 1
    except:
        pass
@@ -4025,8 +4232,9 @@ def an_hui():
            'title': title
        }
        # print(dic_news)
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        num += 1
        href_res.close()
    except:
@@ -4158,8 +4366,9 @@ def jiang_xi():
            'title': title
        }
        # print(dic_news)
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        num += 1
    except:
        pass
@@ -4250,8 +4459,9 @@ def he_nan():
            'title': title
        }
        # print(dic_news)
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        num += 1
        href_res.close()
        resp_text.close()
@@ -4351,8 +4561,9 @@ def hu_nan():
            'title': title
        }
        # print(dic_news)
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        num += 1
    except:
        pass
@@ -4472,8 +4683,9 @@ def gan_su():
            'title': title
        }
        # print(dic_news)
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        num += 1
    except Exception as e:
        print(e)
@@ -4607,8 +4819,9 @@ def gan_su():
            'title': title
        }
        # print(dic_news)
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        num += 1
    except Exception as e:
        print(e)
@@ -4763,8 +4976,9 @@ def gan_su():
            'title': title
        }
        # print(dic_news)
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        num += 1
    except Exception as e:
        print(e)
@@ -4862,8 +5076,9 @@ def ning_xia():
            'title': title
        }
        # print(dic_news)
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        num += 1
    except:
        pass
@@ -4960,8 +5175,9 @@ def shanxi():
            'title': title
        }
        # print(dic_news)
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        num += 1
        res_href.close()
    except:
@@ -5053,8 +5269,9 @@ def xi_zang():
            'title': title
        }
        # print(dic_news)
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        num += 1
    except:
        pass
@@ -5148,8 +5365,9 @@ def qing_hai():
            'title': title
        }
        # print(dic_news)
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        # print(id)
        # id_list.append(id)
        num += 1
@@ -5265,8 +5483,9 @@ def qing_hai():
            'title': title
        }
        # print(dic_news)
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        # print(id)
        # id_list.append(id)
        num += 1
@@ -5363,8 +5582,9 @@ def he_bei():
            'title': title
        }
        # print(dic_news)
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        num += 1
    except:
        pass
@@ -5471,8 +5691,9 @@ def hu_bei():
            'title': title
        }
        # print(dic_news)
-       sendKafka(dic_news)
-       save_data(dic_news)
+       flag = sendKafka(dic_news)
+       if flag:
+           save_data(dic_news)
        num += 1
    except Exception as e:
        pass