丁双波 / zzsn_spider / Commits / 9ab6c127

Commit 9ab6c127, authored Sep 09, 2023 by LiuLiYuan
政策法规 9/9 (policies and regulations, batch 9/9)

Parent: eeb41ef7
Showing 1 changed file with 415 additions and 193 deletions

comData/policylaw/2.py  +415  -193
comData/policylaw/2.py @ 9ab6c127
```diff
@@ -12,7 +12,7 @@ from bs4 import BeautifulSoup
 from kafka import KafkaProducer
 from pyquery import PyQuery as pq
 from requests.packages import urllib3
+from requests.adapters import HTTPAdapter
 from BaseCore import BaseCore
 baseCore = BaseCore()
```
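The newly imported HTTPAdapter is what the reworked crawlers further down mount on their requests sessions. A minimal sketch of that retry setup, mirroring the calls that appear in the get_content1 and get_content2 hunks below:

```python
import requests
from requests.adapters import HTTPAdapter

# Retry failed connections up to 3 times on both schemes,
# matching the session setup used by the updated crawlers in this diff.
s = requests.session()
s.mount('https://', HTTPAdapter(max_retries=3))
s.mount('http://', HTTPAdapter(max_retries=3))
```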
```diff
@@ -110,7 +110,7 @@ def sendKafka(dic_news):
         # 传输成功,写入日志中
         state = 1
         takeTime = baseCore.getTimeCost(start_time, time.time())
-        # return True
+        return True
     except Exception as e:
```
```diff
@@ -124,6 +124,7 @@ def sendKafka(dic_news):
         e = 'Kafka操作失败'
         state = 0
         takeTime = baseCore.getTimeCost(start_time, time.time())
+        return False
 def redefid(idList):
     id_ = ','.join(map(str, idList))
```
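With sendKafka now returning True on success and False on failure, the call sites throughout the rest of this commit gate persistence on that flag. A minimal sketch of the pattern (sendKafka, save_data and dic_news are the names used in this file):

```python
# Only persist the record once it has been pushed to Kafka successfully.
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
```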
```diff
@@ -132,7 +133,38 @@ def redefid(idList):
 def remove_dup():
     pass
 # 国务院文件
 def get_content1():
+    def getPageConunt(a_list, url, headers, s):
+        data = {"code": "18122f54c5c", "thirdPartyCode": "thirdparty_code_107", "thirdPartyTableId": 30,
+                "resultFields": ["pub_url", "maintitle", "fwzh", "cwrq", "publish_time"],
+                "trackTotalHits": "true",
+                "searchFields": [{"fieldName": "maintitle", "searchWord": ""}],
+                "isPreciseSearch": 0,
+                "sorts": [{"sortField": "publish_time", "sortOrder": "DESC"}],
+                "childrenInfoIds": [[a_list[1]]],
+                "pageSize": 20, "pageNo": 1}
+        data = json.dumps(data)
+        ip = baseCore.get_proxy()
+        res = s.post(url=url, headers=headers, data=data, verify=False, proxies=ip)
+        # 获得结果为json格式
+        res_text = json.loads(res.text)
+        pageCount = res_text['result']['data']['pager']['pageCount']
+        return pageCount
+
+    def getList(a_list, url, headers, pageNo, s):
+        # post请求所需参数
+        data = {"code": "18122f54c5c", "thirdPartyCode": "thirdparty_code_107", "thirdPartyTableId": 30,
+                "resultFields": ["pub_url", "maintitle", "fwzh", "cwrq", "publish_time"],
+                "trackTotalHits": "true",
+                "searchFields": [{"fieldName": "maintitle", "searchWord": ""}],
+                "isPreciseSearch": 0,
+                "sorts": [{"sortField": "publish_time", "sortOrder": "DESC"}],
+                "childrenInfoIds": [[a_list[1]]],
+                "pageSize": 20, "pageNo": pageNo}
+        data = json.dumps(data)
+        ip = baseCore.get_proxy()
+        res = s.post(url=url, headers=headers, data=data, verify=False, proxies=ip)
+        res_text = json.loads(res.text)
+        page_list = res_text['result']['data']['list']
+        return page_list
+
     start_time = time.time()
     num = 0
     # 过网站验证所需 athenaAppKey athenaAppName
```
@@ -163,86 +195,142 @@ def get_content1():

```python
    result_list = [['国令', "1108"], ['国发', "1107"], ['国函', "1106"], ['国发明电', "1105"], ['国办发', "1104"],
                   ['国办函', "1103"], ['国办发明电', "1102"], ['其他', "1101"]]
    for a_list in result_list:
        s = requests.session()
        s.mount('https://', HTTPAdapter(max_retries=3))
        s.mount('http://', HTTPAdapter(max_retries=3))
        s.keep_alive = False
        pcodeJiguan = a_list[0]
        try:
            pageCount = getPageConunt(a_list, url, headers, s)
            for pageNo in range(1, pageCount + 1):
                try:
                    try:
                        page_list = getList(a_list, url, headers, pageNo, s)
                    except:
                        s.close()
                        page_list = getList(a_list, url, headers, pageNo, s)
                    for page in page_list:
                        id_list = []
                        # 获取所需信息
                        title = page['maintitle']         # 标题
                        pub_time1 = page['publish_time']  # 发布时间
                        pub_time2 = page['cwrq']          # 成文时间
                        pub_code = page['fwzh']           # 发文字号
                        href = page['pub_url']            # 网址
                        # 判断是否已经爬取过
                        is_href = db_storage.find_one({'网址': href})
                        if is_href:
                            log.info('已采集----------跳过')
                            continue
                        try:
                            resp_href = requests.get(url=href, headers=headers_, verify=False)
                            resp_href.encoding = resp_href.apparent_encoding
                            i_html = resp_href.text
                            if '您访问的页面不存在或已删除' in i_html:
                                # log.error(f'{title}...{href}...页面不存在或已删除')
                                continue
                            i_soup = BeautifulSoup(i_html, 'html.parser')
                            i_soup = paserUrl(i_soup, href)
                            source = str(i_soup.find_all('tbody')[0])
                            pub_org = source.split('<td><b>发文机关:</b></td>')[1].split('<td>')[1].split('</td>')[0]  # 发文机关
                            child_type = source.split('<td class="w340 zcwj_ztfl">')[1].split('</td>')[0]  # 主题分类
                            contentWithTag = i_soup.find('div', class_='wrap mxxgkwrap mxxgkwrap_gwywj').find('table', class_='border-table noneBorder pages_content')
                            # 去除扫一扫
                            contentWithTag.find('div', attrs={'id': 'div_div'}).decompose()
                            content = contentWithTag.text  # 不带标签正文
                            fu_jian_soup = contentWithTag.find_all('a')
                            time.sleep(0.5)
                            for file in fu_jian_soup:
                                try:
                                    file_href = file['href']
                                except Exception as e:
                                    log.info(f'---{href}--------{e}-------')
                                    continue
                                if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
                                        or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                        or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                                    file_name = file.text.strip()
                                    retData = baseCore.uploadToserver(file_href, '1766')
                                    if retData['state']:
                                        pass
                                    else:
                                        continue
                                    att_id, full_path = baseCore.tableUpdate(retData, '国务院文件', file_name, num)
                                    id_list.append(att_id)
                                    # todo:将返回的地址更新到soup
                                    file['href'] = 'http://114.115.215.96/' + full_path
                        except:
                            log.error(f'{title}...{href}...获取内容失败')
                            continue
                        # todo:替换完成之后,将附件上传至文件服务器
                        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                        # todo:传kafka字段
                        dic_news = {
                            'attachmentIds': id_list,               # 附件id
                            'author': '',                           # 作者
                            'content': content,                    # 正文不带标签
                            'contentWithTag': str(contentWithTag), # 正文带标签
                            'createDate': time_now,                # 创建时间
                            'deleteFlag': 0,                       # 是否删除(0为默认,1为删除)
                            'id': '',
                            'labels': [{'relationId': "1766", 'relationName': "国务院文件", 'labelMark': "policy"}],  # 关联标签id 关联标签名称 关联标签标识
                            'origin': '',                          # 政策发布机关
                            'organ': pub_org,                      # 政策发文机关
                            'topicClassification': child_type,     # 政策文件分类
                            'issuedNumber': pub_code,              # 发文字号
                            'publishDate': pub_time1,              # 发布时间
                            'writtenDate': pub_time2,              # 成文时间
                            'sid': '1697458829758697473',          # 信息源id
                            'sourceAddress': href[0],              # 原文链接
                            'summary': '',                         # 摘要
                            'title': title                         # 标题
                        }
                        resp_href.close()
                        # print(dic_news)
                        print(title)
                        flag = sendKafka(dic_news)
                        if flag:
                            save_data(dic_news)
                        num += 1
                except:
                    log.error(f'{pcodeJiguan}...第{pageNo}页获取列表失败')
                    continue
        except:
            log.error(f'{pcodeJiguan}...获取总数失败')
            continue
    end_time = time.time()
    print(f'共抓取{num}条数据,共耗时{start_time - end_time}')


# 国务院部门文件
def get_content2():
    def getTotalpage(bmfl, headers, session):
        ip = baseCore.get_proxy()
        pageNo = 1
        time.sleep(2)
        # 拼接url
        url_ = f'https://sousuo.www.gov.cn/search-gov/data?t=zhengcelibrary_bm&q=&timetype=timeqb&mintime=&maxtime=&sort=score&sortType=1&searchfield=title&puborg=&pcodeYear=&pcodeNum=&filetype=&p={pageNo}&n=20&inpro=&bmfl={bmfl}&dup=&orpro=&type=gwyzcwjk'
        resp = session.get(url=url_, headers=headers, verify=False, proxies=ip)
        resp_text = resp.text
        resp_json = json.loads(resp_text)
        totalpage = resp_json['searchVO']['totalpage']
        return totalpage

    def getContentList(bmfl, pageNo, headers, session):
        ip = baseCore.get_proxy()
        url_ = f'https://sousuo.www.gov.cn/search-gov/data?t=zhengcelibrary_bm&q=&timetype=timeqb&mintime=&maxtime=&sort=score&sortType=1&searchfield=title&puborg=&pcodeYear=&pcodeNum=&filetype=&p={pageNo}&n=20&inpro=&bmfl={bmfl}&dup=&orpro=&type=gwyzcwjk'
        # 请求结果为json格式
        resp = session.get(url=url_, headers=headers, verify=False, proxies=ip)
        resp_text = resp.text
        resp_json = json.loads(resp_text)
        content_list = resp_json['searchVO']['listVO']
        return content_list

    session = requests.session()
    session.mount('https://', HTTPAdapter(max_retries=3))
    session.mount('http://', HTTPAdapter(max_retries=3))
    session.keep_alive = False
    start_time = time.time()
    num = 0
    result_list = ['外交部', '国家发展和改革委员会', '教育部', '科学技术部', '工业和信息化部', '国家民族事务委员会', '公安部', '国家安全部', '民政部', '司法部', '财政部',
```
```diff
@@ -262,20 +350,16 @@ def get_content2():
     for bmfl in result_list:
         try:
-            pageNo = 0
-            # 拼接url
-            url_ = f'https://sousuo.www.gov.cn/search-gov/data?t=zhengcelibrary_bm&q=&timetype=timeqb&mintime=&maxtime=&sort=score&sortType=1&searchfield=title&puborg=&pcodeYear=&pcodeNum=&filetype=&p={pageNo}&n=20&inpro=&bmfl={bmfl}&dup=&orpro=&type=gwyzcwjk'
-            try:
-                # 请求结果为json格式
-                resp = requests.get(url=url_, headers=headers, verify=False)
-                resp_text = resp.text
-                resp_json = json.loads(resp_text)
-                content_list = resp_json['searchVO']['listVO']
-                resp.close()
-            except:
-                continue
-            for content_dict in content_list:
-                href = content_dict['url']  # 详情页
-                title = content_dict['title']  # 标题
-                pub_code = content_dict['pcode']  # 发文字号
+            totalpage = getTotalpage(bmfl, headers, session)
+            time.sleep(2)
+            for pageNo in range(1, totalpage + 1):
+                try:
+                    try:
+                        content_list = getContentList(bmfl, pageNo, headers, session)
+                    except:
+                        session.close()
+                        content_list = getContentList(bmfl, pageNo, headers, session)
+                    for content_dict in content_list:
+                        id_list = []
+                        href = content_dict['url']  # 详情页
+                        title = content_dict['title']  # 标题
+                        pub_code = content_dict['pcode']  # 发文字号
```
@@ -294,55 +378,198 @@ def get_content2():

```python
                            child_type = content_dict['childtype']  # 主题分类
                        except:
                            child_type = ''
                        # 判断是否已经爬取过
                        is_href = db_storage.find_one({'网址': href})
                        if is_href:
                            log.info('已采集----------跳过')
                            continue
                        try:
                            resp = requests.get(url=href, headers=headers, verify=False)
                            resp.encoding = resp.apparent_encoding
                            resp_text = resp.text
                            soup = BeautifulSoup(resp_text, 'html.parser')
                            time.sleep(1)
                            soup = paserUrl(soup, href)
                            contentWithTag = soup.find('div', attrs={'class': 'pages_content mhide'})
                            content = contentWithTag.text
                            fu_jian_soup = contentWithTag.find_all('a')
                            for file in fu_jian_soup:
                                try:
                                    file_href = file['href']
                                except Exception as e:
                                    log.info(f'---{href}--------{e}-------')
                                    continue
                                if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
                                        or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                        or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                                    file_name = file.text.strip()
                                    retData = baseCore.uploadToserver(file_href, '1699')
                                    if retData['state']:
                                        pass
                                    else:
                                        continue
                                    att_id, full_path = baseCore.tableUpdate(retData, '国务院文件', file_name, num)
                                    id_list.append(att_id)
                                    # todo:将返回的地址更新到soup
                                    file['href'] = 'http://114.115.215.96/' + full_path
                        except:
                            print(f'{title}...{href}获取内容失败')
                            continue
                        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                        # todo:传kafka字段
                        dic_news = {
                            'attachmentIds': id_list,               # 附件id
                            'author': '',                           # 作者
                            'content': content,                    # 正文不带标签
                            'contentWithTag': str(contentWithTag), # 正文带标签
                            'createDate': time_now,                # 创建时间
                            'deleteFlag': 0,                       # 是否删除(0为默认,1为删除)
                            'id': '',
                            'labels': [{'relationId': "1699", 'relationName': "国务院各部委文件", 'labelMark': "policy"}],  # 关联标签id 关联标签名称 关联标签标识
                            'origin': '',                          # 政策发布机关
                            'organ': pub_org,                      # 政策发文机关
                            'topicClassification': child_type,     # 政策文件分类
                            'issuedNumber': pub_code,              # 发文字号
                            'publishDate': pub_time1,              # 发布时间
                            'writtenDate': pub_time2,              # 成文时间
                            'sid': '1697458829758697473',          # 信息源id
                            'sourceAddress': href,                 # 原文链接
                            'summary': '',                         # 摘要
                            'title': title                         # 标题
                        }
                        print(title)
                        # print(dic_news)
                        flag = sendKafka(dic_news)
                        if flag:
                            save_data(dic_news)
                        num += 1
                except:
                    print(f'{bmfl}...第{pageNo}页获取信息列表失败')
                    continue
        except:
            print(f'{bmfl}...获取页数失败')
            continue
    end_time = time.time()
    print(f'共抓取{num}条数据,耗时{end_time - start_time}')


# 国务院国有资产监督管理委员会-政策发布
def get_content3():
    def getPage():
        url = "http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index.html"
        req = requests.get(url, headers=headers, verify=False)
        req.encoding = req.apparent_encoding
        soup = BeautifulSoup(req.text, 'html.parser')
        totalpage = re.findall("maxPageNum = (.*);", soup.select('#pag_2603340')[0].text)[0]
        return int(totalpage)

    def sendContent(href, headers, title, pub_time, num):
        id_list = []
        resp_href = requests.request("GET", href, headers=headers, verify=False)
        resp_href.encoding = resp_href.apparent_encoding
        soup = BeautifulSoup(resp_href.text, 'lxml')
        soup = paserUrl(soup, href)
        doc_href = soup.find('div', class_='zsy_content')
        try:
            org_content = doc_href.select('.zsy_cotitle')[0]
            org = re.findall('文章来源:(.*?)发布时间:', org_content)[0].strip()
        except:
            org = ''
        contentWithTag = doc_href.find('div', class_='zsy_comain')
        contentWithTag.select('#qr_container')[0].decompose()
        contentWithTag.find('div', attrs={'id': 'div_div'}).decompose()
        contentWithTag.find('div', class_='related').decompose()
        contentWithTag.find('div', class_='jiathis_style_24x24').decompose()
        try:
            p_list = contentWithTag.findAll('p')
            pub_hao = ''
            for p in p_list:
                p = str(p.text)
                if '号' in p and '〔' in p and '〕' in p or '[' in p and ']' in p and '号' in p or '【' in p and '】' in p and '号' in p:
                    try:
                        pub_hao = p.split('日')[1].split('自')[0].strip().lstrip()
                    except:
                        pub_hao = p.strip().lstrip()
                    break
        except:
            pub_hao = ''
        if len(pub_hao) > 15:
            pub_hao = ''
        content = contentWithTag.text
        fu_jian_soup = contentWithTag.find_all('a')
        for file in fu_jian_soup:
            try:
                file_href = file['href']
            except Exception as e:
                log.info(f'---{href}--------{e}-------')
                continue
            if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
                    or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                file_name = file.text.strip()
                retData = baseCore.uploadToserver(file_href, '1642')
                if retData['state']:
                    pass
                else:
                    continue
                att_id, full_path = baseCore.tableUpdate(retData, '国务院国资委', file_name, num)
                id_list.append(att_id)
                # todo:将返回的地址更新到soup
                file['href'] = 'http://114.115.215.96/' + full_path
        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        # todo:传kafka字段
        dic_news = {
            'attachmentIds': id_list,               # 附件id
            'author': '',                           # 作者
            'content': content,                    # 正文不带标签
            'contentWithTag': str(contentWithTag), # 正文带标签
            'createDate': time_now,                # 创建时间
            'deleteFlag': 0,                       # 是否删除(0为默认,1为删除)
            'id': '',
            'labels': [{'relationId': "1642", 'relationName': "国务院国资委", 'labelMark': "policy"}],  # 关联标签id 关联标签名称 关联标签标识
            'origin': '',                          # 政策发布机关
            'organ': org,                          # 政策发文机关
            'topicClassification': '',             # 政策文件分类
            'issuedNumber': pub_hao,               # 发文字号
            'publishDate': pub_time,               # 发布时间
            'writtenDate': '',                     # 成文时间
            'sid': '1697458829758697473',          # 信息源id
            'sourceAddress': href,                 # 原文链接
            'summary': '',                         # 摘要
            'title': title                         # 标题
        }
        # print(dic_news)
        flag = sendKafka(dic_news)
        if flag:
            save_data(dic_news)

    def partTwo():
        start_time = time.time()
        num = 0
        totalpage = getPage()
        for page in range(1, totalpage):
            url = f"http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index_2603340_{page}.html"
            href_resp = requests.request("GET", url, headers=headers, verify=False)
            resp_text = href_resp.content.decode('UTF-8')
            li_list = resp_text.split('<li>')
            del (li_list[0])
            for li in li_list:
                id_list = []
                href_ = li.split('<a href="')[1].split('" target=')[0]
                title = li.split('title="')[1].split('">')[0]
                href = f'http://www.sasac.gov.cn{href_.replace("../../..", "")}'
                pub_time = li.split('<span>[')[1].split(']</span>')[0]
                is_href = db_storage.find_one({'网址': href})
                if is_href:
                    log.info('已采集----------跳过')
                    continue
                sendContent(href, headers, title, pub_time, num)
                num += 1
        end_time = time.time()
        print(f'共抓取{num}条数据,耗时{end_time - start_time}')

    def partOne():
        start_time = time.time()
        num = 0
        url = "http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index.html"
```
@@ -363,70 +590,22 @@ def get_content3():

```python
                # 判断是否已经爬取过
                is_href = db_storage.find_one({'网址': href})
                if is_href:
                    log.info('已采集----------跳过')
                    continue
                title = doc_item('a').attr('title')
                pub_time = doc_item('span').text().replace('[', '').replace(']', '')
            except:
                continue
            try:
                sendContent(href, headers, title, pub_time, num)
                num += 1
            except:
                pass
        end_time = time.time()
        print(f'共抓取{num}条数据,耗时{end_time - start_time}')

    partOne()
    partTwo()


from bs4 import BeautifulSoup
from urllib.parse import urljoin
```
```diff
@@ -569,7 +748,8 @@ def bei_jing():
                 'title': title
             }
             # print(dic_news)
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             # print(id)
             # id_list.append(id)
```

```diff
@@ -687,8 +867,9 @@ def nei_meng_gu():
                 'summary': '',
                 'title': title
             }
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             num = num + 1
```

```diff
@@ -872,7 +1053,8 @@ def ji_lin():
                 continue
             else:
                 # print(dic_news)
-                sendKafka(dic_news)
-                save_data(dic_news)
+                flag = sendKafka(dic_news)
+                if flag:
+                    save_data(dic_news)
                 num = num + 1
         except Exception as e:
```

```diff
@@ -1006,7 +1188,8 @@ def shang_hai():
                 'summary': '',
                 'title': title
             }
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             num = num + 1
         except:
```

```diff
@@ -1123,7 +1306,8 @@ def zhe_jiang():
                 'title': title
             }
             # print(dic_news)
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             num = num + 1
```

```diff
@@ -1278,7 +1462,8 @@ def fu_jian():
                 'title': title
             }
             # print(dic_news)
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             print(title)
             num += 1
```

```diff
@@ -1386,7 +1571,8 @@ def shan_dong():
                 'title': title
             }
             # print(dic_news)
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             if content == '' or content == 'None':
                 continue
```

```diff
@@ -1485,7 +1671,8 @@ def guang_dong():
                 'title': title
             }
             # print(dic_news)
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             print(title)
             # save_data(result_dict)
```

```diff
@@ -1656,7 +1843,8 @@ def hai_nan():
                 'title': title
             }
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             print(title)
```
```diff
@@ -1724,7 +1912,8 @@ def hai_nan():
                 'summary': '',
                 'title': title
             }
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             href_text.close()
             # save_data(result_dict)
```

```diff
@@ -1826,7 +2015,8 @@ def hai_nan():
                 'title': title
             }
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             href_text.close()
             # save_data(result_dict)
```

```diff
@@ -1929,7 +2119,8 @@ def hai_nan():
                 'title': title
             }
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             href_text.close()
             # save_data(result_dict)
```

```diff
@@ -2012,7 +2203,8 @@ def hai_nan():
                 'title': title
             }
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             href_text.close()
             # save_data(result_dict)
```

```diff
@@ -2182,7 +2374,8 @@ def si_chuan():
                 'title': title
             }
             # print(dic_news)
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             print(title)
```

```diff
@@ -2304,7 +2497,8 @@ def guang_xi():
                 'title': title
             }
             # print(dic_news)
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             print(title)
             num = num + 1
```

```diff
@@ -2409,7 +2603,8 @@ def gui_zhou():
                 'title': title
             }
             # print(dic_news)
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             print(title)
             # save_data(result_dict)
```

```diff
@@ -2518,7 +2713,8 @@ def yun_nan():
                 'title': title
             }
             # print(dic_news)
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             print(title)
             num = num + 1
```

```diff
@@ -2627,8 +2823,9 @@ def yun_nan():
                 'title': title
             }
             # print(dic_news)
-            # sendKafka(dic_news)
-            # save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             print(title)
             num = num + 1
```
```diff
@@ -2751,7 +2948,8 @@ def chong_qing():
                 'title': title
             }
             # print(dic_news)
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             print(title)
             # save_data(result_dict)
```

```diff
@@ -2873,7 +3071,8 @@ def tian_jin():
                 'title': title
             }
             # print(dic_news)
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             num += 1
         except:
```

```diff
@@ -2992,7 +3191,8 @@ def tian_jin():
                 'title': title
             }
             # print(dic_news)
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             num += 1
         except:
```

```diff
@@ -3115,7 +3315,8 @@ def tian_jin():
                 'title': title
             }
             # print(dic_news)
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             num += 1
         except:
```

```diff
@@ -3221,7 +3422,8 @@ def xin_jiang():
                 'title': title
             }
             # print(dic_news)
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             num += 1
         except:
```

```diff
@@ -3318,7 +3520,8 @@ def xin_jiang():
                 'title': title
             }
             # print(dic_news)
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             num += 1
             href_res.close()
```

```diff
@@ -3436,7 +3639,8 @@ def shan_xi():
                 'title': title
             }
             # print(dic_news)
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             num += 1
         except:
```

```diff
@@ -3544,7 +3748,8 @@ def liao_ning():
                 'title': title
             }
             # print(dic_news)
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             num += 1
         except:
```

```diff
@@ -3638,7 +3843,8 @@ def hei_long_jiang():
                 'title': title
             }
             # print(dic_news)
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             num += 1
         except:
```
```diff
@@ -3751,7 +3957,8 @@ def jiang_su():
                 'title': title
             }
             # print(dic_news)
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             num += 1
         except:
```

```diff
@@ -3841,7 +4048,8 @@ def an_hui():
                 'title': title
             }
             # print(dic_news)
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             num += 1
         except:
```

```diff
@@ -3935,7 +4143,8 @@ def an_hui():
                 'title': title
             }
             # print(dic_news)
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             num += 1
             href_res.close()
```

```diff
@@ -4062,7 +4271,8 @@ def jiang_xi():
                 'title': title
             }
             # print(dic_news)
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             num += 1
         except:
```

```diff
@@ -4154,7 +4364,8 @@ def he_nan():
                 'title': title
             }
             # print(dic_news)
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             num += 1
             href_res.close()
```

```diff
@@ -4251,7 +4462,8 @@ def hu_nan():
                 'title': title
             }
             # print(dic_news)
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             num += 1
         except:
```

```diff
@@ -4372,7 +4584,8 @@ def gan_su():
                 'title': title
             }
             # print(dic_news)
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             num += 1
         except Exception as e:
```

```diff
@@ -4506,7 +4719,8 @@ def gan_su():
                 'title': title
             }
             # print(dic_news)
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             num += 1
         except Exception as e:
```

```diff
@@ -4661,7 +4875,8 @@ def gan_su():
                 'title': title
             }
             # print(dic_news)
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             num += 1
         except Exception as e:
```
```diff
@@ -4759,7 +4974,8 @@ def ning_xia():
                 'title': title
             }
             # print(dic_news)
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             num += 1
         except:
```

```diff
@@ -4857,7 +5073,8 @@ def shanxi():
                 'title': title
             }
             # print(dic_news)
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             num += 1
             res_href.close()
```

```diff
@@ -4951,7 +5168,8 @@ def xi_zang():
                 'title': title
             }
             # print(dic_news)
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             num += 1
         except:
```

```diff
@@ -5047,7 +5265,8 @@ def qing_hai():
                 'title': title
             }
             # print(dic_news)
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             # print(id)
             # id_list.append(id)
```

```diff
@@ -5164,7 +5383,8 @@ def qing_hai():
                 'title': title
             }
             # print(dic_news)
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             # print(id)
             # id_list.append(id)
```

```diff
@@ -5262,7 +5482,8 @@ def he_bei():
                 'title': title
             }
             # print(dic_news)
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             num += 1
         except:
```

```diff
@@ -5370,7 +5591,8 @@ def hu_bei():
                 'title': title
             }
             # print(dic_news)
-            sendKafka(dic_news)
-            save_data(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             num += 1
         except Exception as e:
```