丁双波 / zzsn_spider · Commits · 8fb1c602

Commit 8fb1c602, authored Sep 11, 2023 by 薛凌堃

    Merge remote-tracking branch 'origin/master'

Parents: 7424d8e4, abf7739a

Showing 1 changed file with 527 additions and 306 deletions:

    comData/policylaw/2.py  (+527, -306)
@@ -12,7 +12,7 @@ from bs4 import BeautifulSoup
from kafka import KafkaProducer
from pyquery import PyQuery as pq
from requests.packages import urllib3
from requests.adapters import HTTPAdapter
from BaseCore import BaseCore

baseCore = BaseCore()
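The added requests.adapters.HTTPAdapter import is what the reworked get_content1/get_content2 use to mount retry behaviour onto their sessions. A minimal sketch of that pattern, independent of the rest of 2.py (the target URL and timeout are placeholders, not values from the file):

import requests
from requests.adapters import HTTPAdapter

# Session that retries a failed connection up to 3 times per scheme,
# mirroring the s.mount(...) calls added further down in this diff.
s = requests.session()
s.mount('https://', HTTPAdapter(max_retries=3))
s.mount('http://', HTTPAdapter(max_retries=3))
s.keep_alive = False  # copied from the diff; requests itself does not consult this attribute

resp = s.get('https://www.gov.cn/', timeout=10)
print(resp.status_code)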
@@ -116,7 +116,7 @@ def sendKafka(dic_news):
        # 传输成功,写入日志中
        state = 1
        takeTime = baseCore.getTimeCost(start_time, time.time())
        return True
    except Exception as e:
@@ -130,6 +130,7 @@ def sendKafka(dic_news):
        e = 'Kafka操作失败'
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        return False


def redefid(idList):
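These two hunks are the behavioural core of the commit: the previously commented-out return True goes live and a return False is added on the failure path, so every caller can gate save_data on whether the Kafka send actually succeeded. A self-contained sketch of that contract (the broker address, topic name and the stub save_data are assumptions for illustration, not values taken from 2.py):

from kafka import KafkaProducer
import json

def sendKafka(dic_news):
    # Assumed broker and topic; the real values live elsewhere in 2.py.
    try:
        producer = KafkaProducer(
            bootstrap_servers='127.0.0.1:9092',
            value_serializer=lambda v: json.dumps(v, ensure_ascii=False).encode('utf-8'))
        producer.send('policy', dic_news)
        producer.flush()
        return True           # 传输成功
    except Exception:
        return False          # Kafka操作失败

def save_data(dic_news):      # stub standing in for the project's MongoDB writer
    print('saved:', dic_news['title'])

dic_news = {'title': '示例政策', 'content': '...'}
if sendKafka(dic_news):       # the pattern every collector in this commit now follows
    save_data(dic_news)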
@@ -140,8 +141,39 @@ def redefid(idList):


def remove_dup():
    pass


# 国务院文件
def get_content1():
    def getPageConunt(a_list, url, headers, s):
        data = {"code": "18122f54c5c", "thirdPartyCode": "thirdparty_code_107", "thirdPartyTableId": 30,
                "resultFields": ["pub_url", "maintitle", "fwzh", "cwrq", "publish_time"],
                "trackTotalHits": "true",
                "searchFields": [{"fieldName": "maintitle", "searchWord": ""}],
                "isPreciseSearch": 0,
                "sorts": [{"sortField": "publish_time", "sortOrder": "DESC"}],
                "childrenInfoIds": [[a_list[1]]],
                "pageSize": 20, "pageNo": 1}
        data = json.dumps(data)
        ip = baseCore.get_proxy()
        res = s.post(url=url, headers=headers, data=data, verify=False, proxies=ip)
        # 获得结果为json格式
        res_text = json.loads(res.text)
        pageCount = res_text['result']['data']['pager']['pageCount']
        return pageCount

    def getList(a_list, url, headers, pageNo, s):
        # post请求所需参数
        data = {"code": "18122f54c5c", "thirdPartyCode": "thirdparty_code_107", "thirdPartyTableId": 30,
                "resultFields": ["pub_url", "maintitle", "fwzh", "cwrq", "publish_time"],
                "trackTotalHits": "true",
                "searchFields": [{"fieldName": "maintitle", "searchWord": ""}],
                "isPreciseSearch": 0,
                "sorts": [{"sortField": "publish_time", "sortOrder": "DESC"}],
                "childrenInfoIds": [[a_list[1]]],
                "pageSize": 20, "pageNo": pageNo}
        data = json.dumps(data)
        ip = baseCore.get_proxy()
        res = s.post(url=url, headers=headers, data=data, verify=False, proxies=ip)
        res_text = json.loads(res.text)
        page_list = res_text['result']['data']['list']
        return page_list

    start_time = time.time()
    num = 0
    # 过网站验证所需 athenaAppKey athenaAppName
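getPageConunt and getList post the same search payload; the first only reads result.data.pager.pageCount, the second returns result.data.list for one page. A usage sketch of how the loop body added below drives them (url and headers are placeholders, and the two helpers are nested inside get_content1, so this illustrates the call shape rather than being runnable on its own):

import requests
from requests.adapters import HTTPAdapter

url = 'https://example.gov.cn/search/api'                    # hypothetical endpoint
headers = {'Content-Type': 'application/json;charset=UTF-8'}  # assumed header

s = requests.session()
s.mount('https://', HTTPAdapter(max_retries=3))

for a_list in [['国令', "1108"]]:
    pageCount = getPageConunt(a_list, url, headers, s)        # total pages for this category
    for pageNo in range(1, pageCount + 1):
        for page in getList(a_list, url, headers, pageNo, s):
            print(page['maintitle'], page['pub_url'])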
@@ -172,86 +204,142 @@ def get_content1():
    result_list = [['国令', "1108"], ['国发', "1107"], ['国函', "1106"], ['国发明电', "1105"], ['国办发', "1104"],
                   ['国办函', "1103"], ['国办发明电', "1102"], ['其他', "1101"]]
    try:
        for a_list in result_list:
            s = requests.session()
            s.mount('https://', HTTPAdapter(max_retries=3))
            s.mount('http://', HTTPAdapter(max_retries=3))
            s.keep_alive = False
            pcodeJiguan = a_list[0]
            try:
                pageCount = getPageConunt(a_list, url, headers, s)
                for pageNo in range(1, pageCount + 1):
                    try:
                        try:
                            page_list = getList(a_list, url, headers, pageNo, s)
                        except:
                            s.close()
                            page_list = getList(a_list, url, headers, pageNo, s)
                        for page in page_list:
                            id_list = []
                            # 获取所需信息
                            title = page['maintitle']          # 标题
                            pub_time1 = page['publish_time']   # 发布时间
                            pub_time2 = page['cwrq']           # 成文时间
                            pub_code = page['fwzh']            # 发文字号
                            href = page['pub_url']             # 网址
                            # 判断是否已经爬取过
                            is_href = db_storage.find_one({'网址': href})
                            if is_href:
                                log.info('已采集----------跳过')
                                continue
                            try:
                                resp_href = requests.get(url=href, headers=headers_, verify=False)
                                resp_href.encoding = resp_href.apparent_encoding
                                i_html = resp_href.text
                                if '您访问的页面不存在或已删除' in i_html:
                                    # log.error(f'{title}...{href}...页面不存在或已删除')
                                    continue
                                i_soup = BeautifulSoup(i_html, 'html.parser')
                                i_soup = paserUrl(i_soup, href)
                                source = str(i_soup.find_all('tbody')[0])
                                pub_org = source.split('<td><b>发文机关:</b></td>')[1].split('<td>')[1].split('</td>')[0]  # 发文机关
                                child_type = source.split('<td class="w340 zcwj_ztfl">')[1].split('</td>')[0]  # 主题分类
                                contentWithTag = i_soup.find('div', class_='wrap mxxgkwrap mxxgkwrap_gwywj').find('table', class_='border-table noneBorder pages_content')
                                # 去除扫一扫
                                contentWithTag.find('div', attrs={'id': 'div_div'}).decompose()
                                resp_href.close()
                                content = contentWithTag.text  # 不带标签正文
                                fu_jian_soup = contentWithTag.find_all('a')
                                for file in fu_jian_soup:
                                    try:
                                        file_href = file['href']
                                    except Exception as e:
                                        log.info(f'---{href}--------{e}-------')
                                        continue
                                    if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
                                            or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                            or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                                        file_name = file.text.strip()
                                        retData = baseCore.uploadToserver(file_href, '1766')
                                        if retData['state']:
                                            pass
                                        else:
                                            continue
                                        att_id, full_path = baseCore.tableUpdate(retData, '国务院文件', file_name, num)
                                        id_list.append(att_id)
                                        # todo:将返回的地址更新到soup
                                        file['href'] = 'http://114.115.215.96/' + full_path
                                # todo:替换完成之后,将附件上传至文件服务器
                                time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                                # todo:传kafka字段
                                dic_news = {
                                    'attachmentIds': id_list,                # 附件id
                                    'author': '',                            # 作者
                                    'content': content,                     # 正文不带标签
                                    'contentWithTag': str(contentWithTag),  # 正文带标签
                                    'createDate': time_now,                 # 创建时间
                                    'deleteFlag': 0,                        # 是否删除(0为默认,1为删除)
                                    'id': '',
                                    'labels': [{'relationId': "1766", 'relationName': "国务院文件", 'labelMark': "policy"}],  # 关联标签id 关联标签名称 关联标签标识
                                    'origin': '',                           # 政策发布机关
                                    'organ': pub_org,                       # 政策发文机关
                                    'topicClassification': child_type,      # 政策文件分类
                                    'issuedNumber': pub_code,               # 发文字号
                                    'publishDate': pub_time1,               # 发布时间
                                    'writtenDate': pub_time2,               # 成文时间
                                    'sid': '1697458829758697473',           # 信息源id
                                    'sourceAddress': href[0],               # 原文链接
                                    'summary': '',                          # 摘要
                                    'title': title                          # 标题
                                }
                                # print(dic_news)
                                flag = sendKafka(dic_news)
                                if flag:
                                    save_data(dic_news)
                                num += 1
                            except:
                                log.error(f'{title}...{href}...获取内容失败')
                                continue
                    except:
                        log.error(f'{pcodeJiguan}...第{pageNo}页获取列表失败')
                        continue
            except:
                log.error(f'{pcodeJiguan}...获取总数失败')
                continue
    except:
        pass
    end_time = time.time()
    print(f'共抓取{num}条数据,共耗时{start_time - end_time}')


# 国务院部门文件
def get_content2():
    def getTotalpage(bmfl, headers, session):
        ip = baseCore.get_proxy()
        pageNo = 1
        time.sleep(2)
        # 拼接url
        url_ = f'https://sousuo.www.gov.cn/search-gov/data?t=zhengcelibrary_bm&q=&timetype=timeqb&mintime=&maxtime=&sort=score&sortType=1&searchfield=title&puborg=&pcodeYear=&pcodeNum=&filetype=&p={pageNo}&n=20&inpro=&bmfl={bmfl}&dup=&orpro=&type=gwyzcwjk'
        resp = session.get(url=url_, headers=headers, verify=False, proxies=ip)
        resp_text = resp.text
        resp_json = json.loads(resp_text)
        totalpage = resp_json['searchVO']['totalpage']
        return totalpage

    def getContentList(bmfl, pageNo, headers, session):
        ip = baseCore.get_proxy()
        url_ = f'https://sousuo.www.gov.cn/search-gov/data?t=zhengcelibrary_bm&q=&timetype=timeqb&mintime=&maxtime=&sort=score&sortType=1&searchfield=title&puborg=&pcodeYear=&pcodeNum=&filetype=&p={pageNo}&n=20&inpro=&bmfl={bmfl}&dup=&orpro=&type=gwyzcwjk'
        # 请求结果为json格式
        resp = session.get(url=url_, headers=headers, verify=False, proxies=ip)
        resp_text = resp.text
        resp_json = json.loads(resp_text)
        content_list = resp_json['searchVO']['listVO']
        return content_list

    session = requests.session()
    session.mount('https://', HTTPAdapter(max_retries=3))
    session.mount('http://', HTTPAdapter(max_retries=3))
    session.keep_alive = False
    start_time = time.time()
    num = 0
    result_list = ['外交部', '国家发展和改革委员会', '教育部', '科学技术部', '工业和信息化部', '国家民族事务委员会', '公安部', '国家安全部', '民政部', '司法部', '财政部',
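Each rewritten collector repeats the same attachment flow seen above: scan the content for <a> tags, upload recognised document types with baseCore.uploadToserver, register them with baseCore.tableUpdate, keep the returned attachment id for the Kafka message, and point the link at the file server. A condensed sketch of that flow using only the calls visible in this diff (the any()-based extension check is a brevity restatement of the long or-chain, not the commit's literal code; '1766' and the file-server prefix come from the hunk above):

def handle_attachments(contentWithTag, num):
    # Sketch of the attachment handling shared by get_content1/get_content2/get_content3.
    id_list = []
    exts = ('.pdf', '.docx', '.doc', 'xls', '.zip', '.rar', '.ppt',
            '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')
    for file in contentWithTag.find_all('a'):
        file_href = file.get('href')
        if not file_href or not any(ext in file_href for ext in exts):
            continue
        file_name = file.text.strip()
        retData = baseCore.uploadToserver(file_href, '1766')   # upload to the file server
        if not retData['state']:
            continue
        att_id, full_path = baseCore.tableUpdate(retData, '国务院文件', file_name, num)
        id_list.append(att_id)
        file['href'] = 'http://114.115.215.96/' + full_path    # rewrite the link in the soup
    return id_list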
@@ -271,171 +359,261 @@ def get_content2():
    for bmfl in result_list:
        try:
            totalpage = getTotalpage(bmfl, headers, session)
            time.sleep(2)
            for pageNo in range(1, totalpage + 1):
                try:
                    try:
                        content_list = getContentList(bmfl, pageNo, headers, session)
                    except:
                        session.close()
                        content_list = getContentList(bmfl, pageNo, headers, session)
                    for content_dict in content_list:
                        id_list = []
                        href = content_dict['url']      # 详情页
                        title = content_dict['title']   # 标题
                        pub_code = content_dict['pcode']  # 发文字号
                        try:
                            pub_time = int(content_dict['pubtime'] / 1000)  # 发布时间
                            pub_time1 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(pub_time))
                        except:
                            pub_time1 = ''
                        try:
                            p_time = int(content_dict['ptime'] / 1000)  # 成文时间
                            pub_time2 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(p_time))
                        except:
                            pub_time2 = ''
                        pub_org = content_dict['puborg']  # 发文机关
                        try:
                            child_type = content_dict['childtype']  # 主题分类
                        except:
                            child_type = ''
                        # 判断是否已经爬取过
                        is_href = db_storage.find_one({'网址': href})
                        if is_href:
                            log.info('已采集----------跳过')
                            continue
                        try:
                            resp = requests.get(url=href, headers=headers, verify=False)
                            resp.encoding = resp.apparent_encoding
                            resp_text = resp.text
                            soup = BeautifulSoup(resp_text, 'html.parser')
                            soup = paserUrl(soup, href)
                            time.sleep(0.5)
                            contentWithTag = soup.find('div', attrs={'class': 'pages_content mhide'})
                            content = contentWithTag.text
                            fu_jian_soup = contentWithTag.find_all('a')
                            for file in fu_jian_soup:
                                try:
                                    file_href = file['href']
                                except Exception as e:
                                    log.info(f'---{href}--------{e}-------')
                                    continue
                                if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
                                        or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                        or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                                    file_name = file.text.strip()
                                    retData = baseCore.uploadToserver(file_href, '1699')
                                    if retData['state']:
                                        pass
                                    else:
                                        continue
                                    att_id, full_path = baseCore.tableUpdate(retData, '国务院文件', file_name, num)
                                    id_list.append(att_id)
                                    # todo:将返回的地址更新到soup
                                    file['href'] = 'http://114.115.215.96/' + full_path
                            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                            # todo:传kafka字段
                            dic_news = {
                                'attachmentIds': id_list,                # 附件id
                                'author': '',                            # 作者
                                'content': content,                     # 正文不带标签
                                'contentWithTag': str(contentWithTag),  # 正文带标签
                                'createDate': time_now,                 # 创建时间
                                'deleteFlag': 0,                        # 是否删除(0为默认,1为删除)
                                'id': '',
                                'labels': [{'relationId': "1699", 'relationName': "国务院各部委文件", 'labelMark': "policy"}],  # 关联标签id 关联标签名称 关联标签标识
                                'origin': '',                           # 政策发布机关
                                'organ': pub_org,                       # 政策发文机关
                                'topicClassification': child_type,      # 政策文件分类
                                'issuedNumber': pub_code,               # 发文字号
                                'publishDate': pub_time1,               # 发布时间
                                'writtenDate': pub_time2,               # 成文时间
                                'sid': '1697458829758697473',           # 信息源id
                                'sourceAddress': href,                  # 原文链接
                                'summary': '',                          # 摘要
                                'title': title                          # 标题
                            }
                            # print(dic_news)
                            flag = sendKafka(dic_news)
                            if flag:
                                save_data(dic_news)
                            num += 1
                        except:
                            print(f'{title}...{href}获取内容失败')
                            continue
                except:
                    print(f'{bmfl}...第{pageNo}页获取信息列表失败')
                    continue
        except:
            print(f'{bmfl}...获取页数失败')
            continue
    end_time = time.time()
    print(f'共抓取{num}条数据,耗时{end_time - start_time}')


# 国务院国有资产监督管理委员会-政策发布
def get_content3():
    def getPage():
        url = "http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index.html"
        req = requests.get(url, headers=headers, verify=False)
        req.encoding = req.apparent_encoding
        soup = BeautifulSoup(req.text, 'html.parser')
        totalpage = re.findall("maxPageNum = (.*);", soup.select('#pag_2603340')[0].text)[0]
        return int(totalpage)

    def sendContent(href, headers, title, pub_time, num):
        id_list = []
        resp_href = requests.request("GET", href, headers=headers, verify=False)
        resp_href.encoding = resp_href.apparent_encoding
        soup = BeautifulSoup(resp_href.text, 'lxml')
        soup = paserUrl(soup, href)
        doc_href = soup.find('div', class_='zsy_content')
        try:
            org_content = doc_href.select('.zsy_cotitle')[0]
            org = re.findall('文章来源:(.*?)发布时间:', org_content)[0].strip()
        except:
            org = ''
        contentWithTag = doc_href.find('div', class_='zsy_comain')
        contentWithTag.select('#qr_container')[0].decompose()
        contentWithTag.find('div', attrs={'id': 'div_div'}).decompose()
        contentWithTag.find('div', class_='related').decompose()
        contentWithTag.find('div', class_='jiathis_style_24x24').decompose()
        try:
            p_list = contentWithTag.findAll('p')
            pub_hao = ''
            for p in p_list:
                p = str(p.text)
                if '号' in p and '〔' in p and '〕' in p or '[' in p and ']' in p and '号' in p or '【' in p and '】' in p and '号' in p:
                    try:
                        pub_hao = p.split('日')[1].split('自')[0].strip().lstrip()
                    except:
                        pub_hao = p.strip().lstrip()
                    break
        except:
            pub_hao = ''
        if len(pub_hao) > 15:
            pub_hao = ''
        content = contentWithTag.text
        fu_jian_soup = contentWithTag.find_all('a')
        for file in fu_jian_soup:
            try:
                file_href = file['href']
            except Exception as e:
                log.info(f'---{href}--------{e}-------')
                continue
            if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
                    or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                file_name = file.text.strip()
                retData = baseCore.uploadToserver(file_href, '1642')
                if retData['state']:
                    pass
                else:
                    continue
                att_id, full_path = baseCore.tableUpdate(retData, '国务院国资委', file_name, num)
                id_list.append(att_id)
                # todo:将返回的地址更新到soup
                file['href'] = 'http://114.115.215.96/' + full_path
        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        # todo:传kafka字段
        dic_news = {
            'attachmentIds': id_list,                # 附件id
            'author': '',                            # 作者
            'content': content,                     # 正文不带标签
            'contentWithTag': str(contentWithTag),  # 正文带标签
            'createDate': time_now,                 # 创建时间
            'deleteFlag': 0,                        # 是否删除(0为默认,1为删除)
            'id': '',
            'labels': [{'relationId': "1642", 'relationName': "国务院国资委", 'labelMark': "policy"}],  # 关联标签id 关联标签名称 关联标签标识
            'origin': '',                           # 政策发布机关
            'organ': org,                           # 政策发文机关
            'topicClassification': '',              # 政策文件分类
            'issuedNumber': pub_hao,                # 发文字号
            'publishDate': pub_time,                # 发布时间
            'writtenDate': '',                      # 成文时间
            'sid': '1697458829758697473',           # 信息源id
            'sourceAddress': href,                  # 原文链接
            'summary': '',                          # 摘要
            'title': title                          # 标题
        }
        # print(dic_news)
        flag = sendKafka(dic_news)
        if flag:
            save_data(dic_news)

    def partTwo():
        start_time = time.time()
        num = 0
        totalpage = getPage()
        for page in range(1, totalpage):
            url = f"http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index_2603340_{page}.html"
            href_resp = requests.request("GET", url, headers=headers, verify=False)
            resp_text = href_resp.content.decode('UTF-8')
            li_list = resp_text.split('<li>')
            del (li_list[0])
            for li in li_list:
                id_list = []
                href_ = li.split('<a href="')[1].split('" target=')[0]
                title = li.split('title="')[1].split('">')[0]
                href = f'http://www.sasac.gov.cn{href_.replace("../../..", "")}'
                pub_time = li.split('<span>[')[1].split(']</span>')[0]
                # 判断是否已经爬取过
                is_href = db_storage.find_one({'网址': href})
                if is_href:
                    log.info('已采集----------跳过')
                    continue
                sendContent(href, headers, title, pub_time, num)
                num += 1
        end_time = time.time()
        print(f'共抓取{num}条数据,耗时{end_time - start_time}')

    def partOne():
        start_time = time.time()
        num = 0
        url = "http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index.html"
        try:
            # get请求,需要取消ssl验证
            href_resp = requests.request("GET", url, headers=headers, verify=False)
            resp_text = href_resp.content.decode('UTF-8')
            doc_resp = pq(resp_text)
            doc_items = doc_resp('.zsy_conlist li').items()
            time.sleep(1)
            for doc_item in doc_items:
                # 获取所需数据
                try:
                    href_ = doc_item('a').attr('href')
                    if href_ is None:
                        continue
                    href = f'http://www.sasac.gov.cn{href_.replace("../../..", "")}'
                    # 判断是否已经爬取过
                    is_href = db_storage.find_one({'网址': href})
                    if is_href:
                        log.info('已采集----------跳过')
                        continue
                    title = doc_item('a').attr('title')
                    pub_time = doc_item('span').text().replace('[', '').replace(']', '')
                except:
                    continue
                sendContent(href, headers, title, pub_time, num)
                num += 1
        except:
            pass
        end_time = time.time()
        print(f'共抓取{num}条数据,耗时{end_time - start_time}')

    partOne()
    partTwo()


from bs4 import BeautifulSoup
from urllib.parse import urljoin
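The new detail-page handling calls paserUrl(soup, href) before it collects attachment links; paserUrl itself is defined elsewhere in 2.py and never appears in these hunks. Assuming it does the usual relative-to-absolute rewrite with urljoin (an assumption based on the urllib.parse import shown as context above), a minimal equivalent would be:

from urllib.parse import urljoin
from bs4 import BeautifulSoup

def paserUrl(soup, base_url):
    # Assumed behaviour: turn relative href/src attributes into absolute URLs.
    for tag in soup.find_all(['a', 'img', 'link', 'script']):
        for attr in ('href', 'src'):
            if tag.get(attr):
                tag[attr] = urljoin(base_url, tag[attr])
    return soup

html = '<div><a href="./P020230911.pdf">附件</a></div>'
soup = paserUrl(BeautifulSoup(html, 'html.parser'), 'https://www.gov.cn/zhengce/content/')
print(soup.a['href'])  # https://www.gov.cn/zhengce/content/P020230911.pdf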
@@ -580,8 +758,9 @@ def bei_jing():
                'title': title
            }
            # print(dic_news)
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            # print(id)
            # id_list.append(id)
            num += 1

@@ -698,9 +877,10 @@ def nei_meng_gu():
                'summary': '',
                'title': title
            }
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            num = num + 1
        except:

@@ -890,8 +1070,9 @@ def ji_lin():
                    continue
                else:
                    # print(dic_news)
                    flag = sendKafka(dic_news)
                    if flag:
                        save_data(dic_news)
                    num = num + 1
            except Exception as e:
                print(e)

@@ -1024,8 +1205,9 @@ def shang_hai():
                'summary': '',
                'title': title
            }
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            num = num + 1
        except:
            pass

@@ -1143,8 +1325,9 @@ def zhe_jiang():
                'title': title
            }
            # print(dic_news)
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            num = num + 1
        except:

@@ -1301,8 +1484,9 @@ def fu_jian():
                'title': title
            }
            # print(dic_news)
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            print(title)
            num += 1
        except:

@@ -1410,8 +1594,9 @@ def shan_dong():
                'title': title
            }
            # print(dic_news)
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            if content == '' or content == 'None':
                continue
            else:

@@ -1512,8 +1697,9 @@ def guang_dong():
                'title': title
            }
            # print(dic_news)
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            print(title)
            # save_data(result_dict)
            num = num + 1

@@ -1697,8 +1883,9 @@ def hai_nan():
                'title': title
            }
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            print(title)
            num = num + 1

@@ -1768,8 +1955,9 @@ def hai_nan():
                'summary': '',
                'title': title
            }
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            href_text.close()
            # save_data(result_dict)
            print(title)

@@ -1873,8 +2061,9 @@ def hai_nan():
                'title': title
            }
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            href_text.close()
            # save_data(result_dict)
            print(title)

@@ -1979,8 +2168,9 @@ def hai_nan():
                'title': title
            }
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            href_text.close()
            # save_data(result_dict)
            print(title)

@@ -2065,8 +2255,9 @@ def hai_nan():
                'title': title
            }
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            href_text.close()
            # save_data(result_dict)
            print(title)

@@ -2238,8 +2429,9 @@ def si_chuan():
                'title': title
            }
            # print(dic_news)
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            print(title)
            num = num + 1

@@ -2363,8 +2555,9 @@ def guang_xi():
                'title': title
            }
            # print(dic_news)
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            print(title)
            num = num + 1
        except:

@@ -2471,8 +2664,9 @@ def gui_zhou():
                'title': title
            }
            # print(dic_news)
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            print(title)
            # save_data(result_dict)
            num = num + 1

@@ -2584,8 +2778,9 @@ def yun_nan():
                'title': title
            }
            # print(dic_news)
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            print(title)
            num = num + 1
        except:

@@ -2696,8 +2891,9 @@ def yun_nan():
                'title': title
            }
            # print(dic_news)
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            print(title)
            num = num + 1

@@ -2826,8 +3022,9 @@ def chong_qing():
                'title': title
            }
            # print(dic_news)
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            print(title)
            # save_data(result_dict)
            num += 1

@@ -2951,8 +3148,9 @@ def tian_jin():
                'title': title
            }
            # print(dic_news)
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            num += 1
        except:
            pass

@@ -3073,8 +3271,9 @@ def tian_jin():
                'title': title
            }
            # print(dic_news)
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            num += 1
        except:
            pass

@@ -3199,8 +3398,9 @@ def tian_jin():
                'title': title
            }
            # print(dic_news)
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            num += 1
        except:
            pass

@@ -3306,8 +3506,9 @@ def xin_jiang():
                'title': title
            }
            # print(dic_news)
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            num += 1
        except:
            pass

@@ -3403,8 +3604,9 @@ def xin_jiang():
                'title': title
            }
            # print(dic_news)
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            num += 1
            href_res.close()
        except:

@@ -3521,8 +3723,9 @@ def shan_xi():
                'title': title
            }
            # print(dic_news)
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            num += 1
        except:
            pass

@@ -3630,8 +3833,9 @@ def liao_ning():
                'title': title
            }
            # print(dic_news)
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            num += 1
        except:
            pass

@@ -3723,8 +3927,9 @@ def hei_long_jiang():
                'title': title
            }
            # print(dic_news)
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            num += 1
        except:
            pass

@@ -3836,8 +4041,9 @@ def jiang_su():
                'title': title
            }
            # print(dic_news)
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            num += 1
        except:
            pass

@@ -3930,8 +4136,9 @@ def an_hui():
                'title': title
            }
            # print(dic_news)
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            num += 1
        except:
            pass

@@ -4025,8 +4232,9 @@ def an_hui():
                'title': title
            }
            # print(dic_news)
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            num += 1
            href_res.close()
        except:

@@ -4158,8 +4366,9 @@ def jiang_xi():
                'title': title
            }
            # print(dic_news)
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            num += 1
        except:
            pass

@@ -4250,8 +4459,9 @@ def he_nan():
                'title': title
            }
            # print(dic_news)
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            num += 1
            href_res.close()
            resp_text.close()

@@ -4351,8 +4561,9 @@ def hu_nan():
                'title': title
            }
            # print(dic_news)
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            num += 1
        except:
            pass

@@ -4472,8 +4683,9 @@ def gan_su():
                'title': title
            }
            # print(dic_news)
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            num += 1
        except Exception as e:
            print(e)

@@ -4607,8 +4819,9 @@ def gan_su():
                'title': title
            }
            # print(dic_news)
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            num += 1
        except Exception as e:
            print(e)

@@ -4763,8 +4976,9 @@ def gan_su():
                'title': title
            }
            # print(dic_news)
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            num += 1
        except Exception as e:
            print(e)

@@ -4862,8 +5076,9 @@ def ning_xia():
                'title': title
            }
            # print(dic_news)
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            num += 1
        except:
            pass

@@ -4960,8 +5175,9 @@ def shanxi():
                'title': title
            }
            # print(dic_news)
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            num += 1
            res_href.close()
        except:

@@ -5053,8 +5269,9 @@ def xi_zang():
                'title': title
            }
            # print(dic_news)
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            num += 1
        except:
            pass

@@ -5148,8 +5365,9 @@ def qing_hai():
                'title': title
            }
            # print(dic_news)
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            # print(id)
            # id_list.append(id)
            num += 1

@@ -5265,8 +5483,9 @@ def qing_hai():
                'title': title
            }
            # print(dic_news)
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            # print(id)
            # id_list.append(id)
            num += 1

@@ -5363,8 +5582,9 @@ def he_bei():
                'title': title
            }
            # print(dic_news)
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            num += 1
        except:
            pass

@@ -5471,8 +5691,9 @@ def hu_bei():
                'title': title
            }
            # print(dic_news)
            flag = sendKafka(dic_news)
            if flag:
                save_data(dic_news)
            num += 1
        except Exception as e:
            pass