Commit 03de4b81, authored Oct 11, 2023 by 薛凌堃
Parent: d5ee8877

Commit message: 政策法规调整上传附件方式 — adjust how the policy & regulations crawlers upload attachments.

Showing 1 changed file with 388 additions and 152 deletions:

comData/policylaw/policy.py  (+388 −152)
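The recurring change in this commit: each crawler's attachment upload switches from baseCore.uploadToserver(file_href, tid) to baseCore.uptoOBS(file_href, tid, pathType, file_name), each function gains a pathType key prefix (e.g. 'policy/gwywj/'), and the rewritten attachment link drops the hard-coded host 'http://114.115.215.96/' in favor of the path the upload returns. A minimal sketch of the new flow, assuming only what the diff itself shows — baseCore.uptoOBS, baseCore.tableUpdate, and log are the project's own helpers; the wrapper function and its parameter names are hypothetical:

    ATTACHMENT_EXTS = ('.doc', '.docx', '.xlsx', '.pdf', '.xls', '.zip',
                       '.rar', '.ppt', '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')

    def upload_attachments(soup, tid, pathType, source_name, title, num, baseCore, log):
        # Sketch of the per-article attachment flow this commit introduces.
        # baseCore.uptoOBS / baseCore.tableUpdate are called exactly as in the
        # diff below; everything else here is a stand-in for illustration.
        id_list = []
        for file in soup.find_all('a'):
            file_href = file.get('href', '')
            if not any(ext in file_href for ext in ATTACHMENT_EXTS):
                continue
            file_name = file.text.strip()
            # old call: baseCore.uploadToserver(file_href, tid)
            retData = baseCore.uptoOBS(file_href, tid, pathType, file_name)
            if not retData['state']:
                continue
            att_id, full_path = baseCore.tableUpdate(retData, source_name, title, num)
            id_list.append(att_id)
            # old: file['href'] = 'http://114.115.215.96/' + full_path
            file['href'] = full_path  # the returned path is now used as-is
        return id_list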
@@ -91,7 +91,8 @@ def save_data(dic_news):
         '网址': dic_news['sourceAddress'],
         'tid': dic_news['labels'][0]['relationId'],
         '来源': dic_news['labels'][0]['relationName'],
-        '创建时间': dic_news['createDate']
+        '创建时间': dic_news['createDate'],
+        '带标签内容': dic_news['contentWithTag'][:100]
     }
     db_storage.insert_one(aaa_dic)
...
@@ -138,6 +139,7 @@ def remove_dup():
 # 国务院文件
 def get_content1():
+    pathType = 'policy/gwywj/'
     def getPageConunt(a_list, url, headers, s):
         data = {"code": "18122f54c5c", "thirdPartyCode": "thirdparty_code_107", "thirdPartyTableId": 30,
                 "resultFields": ["pub_url", "maintitle", "fwzh", "cwrq", "publish_time"],
...
@@ -256,7 +258,7 @@ def get_content1():
                     or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                     or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                 file_name = file.text.strip()
-                retData = baseCore.uploadToserver(file_href, '1766')
+                retData = baseCore.uptoOBS(file_href, '1766', pathType, file_name)
                 if retData['state']:
                     pass
                 else:
...
@@ -265,7 +267,7 @@ def get_content1():
                 id_list.append(att_id)
                 #todo:将返回的地址更新到soup
-                file['href'] = 'http://114.115.215.96/' + full_path
+                file['href'] = full_path
             except:
                 log.error(f'{title}...{href}...获取内容失败')
                 continue
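The same uploadToserver → uptoOBS substitution repeats below for the remaining call sites (attachment-table ids '1699' in get_content2 and '1642' in get_content3), each picking up its function's new pathType prefix; crawlers such as fu_jian and yun_nan already call uptoOBS ('1673', '1679') and keep those lines unchanged. Each converted site likewise stops prepending 'http://114.115.215.96/' to the attachment href.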
@@ -308,6 +310,7 @@ def get_content1():
 # 国务院部门文件
 def get_content2():
+    pathType = 'policy/gwybmwj/'
     def getTotalpage(bmfl, headers, session):
         ip = baseCore.get_proxy()
         pageNo = 1
...
@@ -336,6 +339,7 @@ def get_content2():
     session.keep_alive = False
     start_time = time.time()
     num = 0
+    count = 0
     result_list = ['外交部', '国家发展和改革委员会', '教育部', '科学技术部', '工业和信息化部', '国家民族事务委员会', '公安部', '国家安全部', '民政部', '司法部', '财政部',
                    '人力资源和社会保障部', '自然资源部', '生态环境部', '住房和城乡建设部', '交通运输部', '水利部', '农业农村部', '商务部', '文化和旅游部',
                    '国家卫生健康委员会',
...
@@ -396,6 +400,9 @@ def get_content2():
                 time.sleep(0.5)
                 contentWithTag = soup.find('div', attrs={'class': 'pages_content mhide'})
                 content = contentWithTag.text
+                if content == '' or content == 'None':
+                    log.info(f'----{href}---{title}---内容为空---')
+                    continue
                 fu_jian_soup = contentWithTag.find_all('a')
                 for file in fu_jian_soup:
                     try:
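From here on, most hunks also insert an empty-content guard right after the body text is extracted: when content is empty, the crawler logs 内容为空 ("content is empty") and skips the item instead of pushing a blank record to Kafka.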
@@ -407,7 +414,7 @@ def get_content2():
                     or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                     or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                 file_name = file.text.strip()
-                retData = baseCore.uploadToserver(file_href, '1699')
+                retData = baseCore.uptoOBS(file_href, '1699', pathType, file_name)
                 if retData['state']:
                     pass
                 else:
...
@@ -416,7 +423,7 @@ def get_content2():
                 id_list.append(att_id)
                 #todo:将返回的地址更新到soup
-                file['href'] = 'http://114.115.215.96/' + full_path
+                file['href'] = full_path
             except:
                 log.error(f'{title}...{href}获取内容失败')
                 continue
...
@@ -446,6 +453,7 @@ def get_content2():
                 flag = sendKafka(dic_news)
                 if flag:
                     save_data(dic_news)
+                    count += 1
                 num += 1
             except:
                 log.error(f'{bmfl}...第{pageNo}页获取信息列表失败')
...
@@ -454,10 +462,11 @@ def get_content2():
             log.error(f'{bmfl}...获取页数失败')
             continue
     end_time = time.time()
-    log.info(f'共抓取国务院部门文件{num}条数据,耗时{end_time - start_time}')
+    log.info(f'共抓取国务院部门文件{count}条数据,耗时{end_time - start_time}')

 # 国务院国有资产监督管理委员会-政策发布
 def get_content3():
+    pathType = 'policy/gyzc/'
     def getPage():
         url = "http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index.html"
         req = requests.get(url, headers=headers, verify=False)
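The other change threaded through the rest of the file is a second counter: alongside the existing num, each crawler gains a count that is incremented only after sendKafka succeeds and save_data runs, and the summary log switches from num to count, so it reports records actually saved rather than items merely visited (duplicates still bump num). A hypothetical minimal loop showing the convention — sendKafka, save_data, and log are the project's names from the diff; everything else is assumed:

    def crawl(items, is_dup, build_record, sendKafka, save_data, log):
        num = 0    # items visited, including duplicates that are skipped
        count = 0  # records actually pushed to Kafka and saved
        for item in items:
            if is_dup(item):
                num += 1
                continue
            dic_news = build_record(item)
            if sendKafka(dic_news):
                save_data(dic_news)
                count += 1
            num += 1
        log.info(f'共抓取{count}条数据')  # summary now reports real saves
        return count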
@@ -499,6 +508,9 @@ def get_content3():
         if len(pub_hao) > 15:
             pub_hao = ''
         content = contentWithTag.text
+        if content == '' or content == 'None':
+            log.info(f'----{href}----{title}----内容为空----')
+            return
         fu_jian_soup = contentWithTag.find_all('a')
         for file in fu_jian_soup:
             try:
...
@@ -510,7 +522,7 @@ def get_content3():
                     or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                     or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                 file_name = file.text.strip()
-                retData = baseCore.uploadToserver(file_href, '1642')
+                retData = baseCore.uptoOBS(file_href, '1642', pathType, file_name)
                 if retData['state']:
                     pass
                 else:
...
@@ -519,7 +531,7 @@ def get_content3():
                 id_list.append(att_id)
                 #todo:将返回的地址更新到soup
-                file['href'] = 'http://114.115.215.96/' + full_path
+                file['href'] = full_path
         time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
         #todo:传kafka字段
         dic_news = {
...
@@ -542,7 +554,7 @@ def get_content3():
             'summary': '',   #摘要
             'title': title   #标题
         }
-        # print(title)
+        # log.info(title)
         flag = sendKafka(dic_news)
         if flag:
             save_data(dic_news)
...
@@ -550,6 +562,7 @@ def get_content3():
     def partTwo():
         start_time = time.time()
         num = 0
+        count = 0
         totalpage = getPage()
         for page in range(1, totalpage):
             url = f"http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index_2603340_{page}.html"
...
@@ -570,12 +583,14 @@ def get_content3():
                 continue
             sendContent(href, headers, title, pub_time, num)
             num += 1
+            count += 1
         end_time = time.time()
-        log.info(f'共抓取国资委文件{num}条数据,耗时{end_time - start_time}')
+        log.info(f'共抓取国资委文件{count}条数据,耗时{end_time - start_time}')

     def partOne():
         start_time = time.time()
         num = 0
+        count = 0
         url = "http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index.html"
         try:
             # get请求,需要取消ssl验证
...
@@ -603,10 +618,11 @@ def get_content3():
                 continue
             sendContent(href, headers, title, pub_time, num)
             num += 1
+            count += 1
         except:
             pass
         end_time = time.time()
-        log.info(f'共抓取国资委文件{num}条数据,耗时{end_time - start_time}')
+        log.info(f'共抓取国资委文件{count}条数据,耗时{end_time - start_time}')

     partOne()
     # 增量执行需要注释掉partTwo()
@@ -614,7 +630,7 @@ def get_content3():
 # 北京
 def bei_jing():
-    num = 0
     start_time = time.time()
     pathType = 'policy/beijing/'
     # 有反爬需要使用selenium
@@ -662,6 +678,7 @@ def bei_jing():
         time.sleep(2)
         log.info(f'------{len(hrefs)}条数据-------------')
         num = 0
+        count = 0
         for href in hrefs:
             id_list = []
             title = href[1]
...
@@ -700,12 +717,15 @@ def bei_jing():
             soup = paserUrl(soup_cont, href[0])
             soup.prettify()
+            if soup.text == '' or soup.text == 'None':
+                log.info(f'----{href[0]}----{title}----内容为空----')
+                continue
             # todo:去掉扫一扫
             try:
                 soup.find('div', id='div_div').decompose()
             except:
                 continue
-            # print(title)
+            # log.info(title)
             fu_jian_soup = soup.find_all('a')
             for file in fu_jian_soup:
...
@@ -756,11 +776,10 @@ def bei_jing():
             flag = sendKafka(dic_news)
             if flag:
                 save_data(dic_news)
-            # print(id)
-            # id_list.append(id)
             num += 1
+            count += 1
         end_time = time.time()
-        log.info(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+        log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
         bro.quit()
     except Exception as e:
         log.info(e)
@@ -827,6 +846,9 @@ def nei_meng_gu():
             else:
                 i_content = i_soup.find(class_='view TRS_UEDITOR trs_paper_default')
                 content = str(i_content)
+            if i_content.text == '' or i_content.text == 'None':
+                log.info(f'{real_href}------{title}----内容为空-----')
+                continue
             # todo:内蒙古市的附件不在正文中,异步加载出来,替换不了标签,附件可上传att表中
             fujian = i_soup.find(class_='xy_zcwjxl_downloadPC_list')
             fu_jian_result = re.findall('href="(.*?)"', str(fujian))
...
@@ -849,7 +871,7 @@ def nei_meng_gu():
                     att_id, full_path = baseCore.tableUpdate(retData, '内蒙古自治区国资委', title, num)
                     id_list.append(att_id)
-            print(title)
+            log.info(title)
             time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
             # todo:传kafka字段
@@ -892,6 +914,7 @@ def ji_lin():
     pathType = 'policy/jilin/'
     start = time.time()
     num = 0
+    count = 0
     url = 'http://gzw.jl.gov.cn/zwgk/zcwj/'
     try:
         resp_text = requests.get(url=url, headers=headers, verify=False)
...
@@ -964,6 +987,9 @@ def ji_lin():
                     i_content = soup
                     contentWithTag = soup.find(class_='zsy_comain')
                     content = contentWithTag.text.strip()
+                    if content == '' or content == 'None':
+                        log.info(f'{real_href}-----{title}----内容为空')
+                        continue
                     # 发文字号
                     find_hao = i_content.find_all('p')[:3]
                     pub_hao = ''
...
@@ -1010,6 +1036,9 @@ def ji_lin():
                             p.extract()
                     contentWithTag = i_content
                     content = contentWithTag.text.strip()
+                    if content == '' or content == 'None':
+                        log.info(f'{real_href}-----{title}----内容为空')
+                        continue
                 # 找到附件上传至文件服务器
                 fj_soup = i_soup.find('div', class_='wenjianfujian')
                 fj_list = fj_soup.find_all('a')
...
@@ -1040,7 +1069,7 @@ def ji_lin():
                     soup.find('div', id='qr_container').decompose()
                 else:
                     pass
-                print(title)
+                log.info(title)
                 # print('............................................................')
                 time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                 # todo:传kafka字段
...
@@ -1073,13 +1102,14 @@ def ji_lin():
                 if flag:
                     save_data(dic_news)
                 num = num + 1
+                count += 1
             except Exception as e:
                 log.info(e)
                 pass
     except:
         pass
     end = time.time()
-    print('共', num, '条', '...........', '共耗时', end - start, '秒')
+    print('共', count, '条', '...........', '共耗时', end - start, '秒')

 # 上海
@@ -1087,6 +1117,7 @@ def shang_hai():
     start = time.time()
     pathType = 'policy/shanghai/'
     num = 0
+    count = 0
     for page in range(1, 7):
         if page == 1:
...
@@ -1111,7 +1142,7 @@ def shang_hai():
                 num += 1
                 continue
             try:
-                href = 'https://www.gzw.sh.gov.cn/shgzw_xxgk_zxgkxx/20230119/7c5e9691b2b54ff293e5d16d746d1a61.html'
+                # href = 'https://www.gzw.sh.gov.cn/shgzw_xxgk_zxgkxx/20230119/7c5e9691b2b54ff293e5d16d746d1a61.html'
                 href_text = requests.get(url=href, headers=headers, verify=False).text
                 doc_href = pq(href_text)
                 doc_href_ = BeautifulSoup(href_text, 'html.parser')
...
@@ -1120,6 +1151,9 @@ def shang_hai():
                 info_list = doc_href_.find_all('span', style='text-align: center;margin-left: 42%;')
                 pub_source = info_list[1].find('b').text.split('信息来源:')[1]
                 content = doc_href_.find('div', attrs={'class': 'detail_03'})
+                if content == '' or content == 'None':
+                    log.info(f'{href}-----{title}----内容为空')
+                    continue
                 # 将文章中的附件字段删去
                 pattern = r'\d+\.'
...
@@ -1181,7 +1215,7 @@ def shang_hai():
                 else:
                     continue
-                print(title)
+                log.info(title)
                 time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                 # todo:传kafka字段
@@ -1209,18 +1243,19 @@ def shang_hai():
                 if flag:
                     save_data(dic_news)
                 num = num + 1
+                count += 1
             except:
                 pass
     except:
         pass
     end = time.time()
-    print('共', num, '条', '...........', '共耗时', end - start, '秒')
+    print('共', count, '条', '...........', '共耗时', end - start, '秒')

 # 浙江
 def zhe_jiang():
     start = time.time()
+    pathType = 'policy/zhejiang/'
     num = 0
+    count = 0
     url = 'http://gzw.zj.gov.cn/col/col1229430928/index.html'
     try:
         res = requests.get(url, headers).content
@@ -1235,7 +1270,7 @@ def zhe_jiang():
             href = li.find('a')['href']
             pub_time = li.find('a').find('span').text
             title = li.find('a').text.replace(pub_time, '').strip()
-            # print(title)
+            # log.info(title)
             if 'http' in href:
                 href = href
             else:
...
@@ -1302,9 +1337,12 @@ def zhe_jiang():
                 # fj_href_list.append(fujian_href)
             # print(fj_href_list)
-            print(title)
+            log.info(title)
             time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
             # todo:传kafka字段
+            if content == '' or content == 'None':
+                log.info(f'{href}-----{title}----内容为空')
+                continue
             dic_news = {
                 'attachmentIds': [],
                 'author': '',
...
@@ -1329,20 +1367,21 @@ def zhe_jiang():
             flag = sendKafka(dic_news)
             if flag:
                 save_data(dic_news)
             num = num + 1
+            count += 1
         except:
             pass
     except:
         pass
     end = time.time()
-    print('共', num, '条', '...........', '共耗时', end - start, '秒')
+    print('共', count, '条', '...........', '共耗时', end - start, '秒')

 # 福建
 def fu_jian():
     error_tag = str(404)
     pathType = 'policy/fujian/'
     num = 0
+    count = 0
     start_time = time.time()
     url = 'http://gzw.fujian.gov.cn/zwgk/zcfg/'
     try:
@@ -1386,8 +1425,8 @@ def fu_jian():
                 i_html = href_text.text
                 i_soup = BeautifulSoup(i_html, 'html.parser')
                 real_href = href
-                # real_href = 'http://gzw.fujian.gov.cn/zwgk/xxgkzl/xxgkml/gfxwj/202211/t20221129_6064610.htm'
-                # print(real_href)
+                # real_href = 'http://gzw.fujian.gov.cn/zwgk/zcfg/201806/t20180619_3065065.htm'
+                print(real_href)
                 is_href = db_storage.find_one({'网址': real_href})
                 if is_href:
                     num += 1
...
@@ -1437,6 +1476,7 @@ def fu_jian():
                         if '.doc' in fj_href or '.docx' in fj_href or '.xlsx' in fj_href or '.pdf' in fj_href or '.xls' in fj_href or '.zip' in fj_href \
                                 or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
                                 or '.XLS' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
+                            print(fj_href)
                             # 找到附件后 上传至文件服务器
                             retData = baseCore.uptoOBS(fj_href, '1673', pathType, file_name)
                             if retData['state']:
@@ -1453,6 +1493,9 @@ def fu_jian():
                     pub_time = source_.split('发布时间:')[1].split('浏览量:')[0].strip().lstrip()
                     contentwithtag = i_soup.find('div', attrs={'class': 'xl_con1'})
                     content = i_soup.find('div', attrs={'class': 'xl_con1'}).text
+                    if content == '' or content == None:
+                        log.info(f'-----{href}----{title}----内容为空-----')
+                        continue
                     pub_hao = ''
                 except:
...
@@ -1460,6 +1503,9 @@ def fu_jian():
                     pub_time = ''
                     contentwithtag = i_soup.find('tabs tab_base_01 rules_con1')
                     content = contentwithtag.text.strip()
+                    if content == '' or content == None:
+                        log.info(f'-----{href}----{title}----内容为空-----')
+                        continue
                     pub_hao = contentwithtag.find_all('div', class_='rules_tit1 b-free-read-leaf').text.dtrip()
                 time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
...
@@ -1484,18 +1530,19 @@ def fu_jian():
                     'summary': '',
                     'title': title
                 }
-                # print(dic_news)
+                # log.info(dic_news)
                 flag = sendKafka(dic_news)
                 if flag:
                     save_data(dic_news)
-                    print(title)
+                    log.info(title)
                 num += 1
+                count += 1
             except:
                 pass
     except:
         pass
     end_time = time.time()
-    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')

 # 山东
 def shan_dong():
@@ -1505,6 +1552,7 @@ def shan_dong():
     }
     start = time.time()
     num = 0
+    count = 0
     url_list = ['http://gzw.shandong.gov.cn/channels/ch06086/', 'http://gzw.shandong.gov.cn/channels/ch06088/']
     for url in url_list:
         try:
...
@@ -1539,6 +1587,9 @@ def shan_dong():
                 # print(pub_time,pub_source,pub_hao)
                 content = i_soup.find(class_="wz_zoom scroll_cont ScrollStyle").text
                 contentwithtag = i_soup.find(class_="wz_zoom scroll_cont ScrollStyle")
+                if content == '' or content == None:
+                    log.info(f'-----{href}----{title}----内容为空-----')
+                    continue
                 if pub_hao == '无':
                     p_list = content.find_all('p')
                     for p in p_list:
...
@@ -1571,6 +1622,9 @@ def shan_dong():
                         i = i + 1
                 content = i_soup.find(class_="wz_zoom scroll_cont ScrollStyle").text
                 contentwithtag = i_soup.find(class_="wz_zoom scroll_cont ScrollStyle")
+                if content == '' or content == None:
+                    log.info(f'-----{href}----{title}----内容为空-----')
+                    continue
                 time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                 # todo:传kafka字段
                 dic_news = {
...
@@ -1597,23 +1651,22 @@ def shan_dong():
                 flag = sendKafka(dic_news)
                 if flag:
                     save_data(dic_news)
-                if content == '' or content == 'None':
-                    log.info(title)
-                    continue
-                else:
-                    print(title)
                 num = num + 1
+                count += 1
             except:
                 pass
     except:
         pass
     end = time.time()
-    print('共', num, '条', '...........', '共耗时', end - start, '秒')
+    print('共', count, '条', '...........', '共耗时', end - start, '秒')

 # 广东
 def guang_dong():
     start = time.time()
     pathType = 'policy/guangdong/'
     num = 0
+    count = 0
     url = 'http://gzw.gd.gov.cn/zcfg/index.html'
     try:
         resp_href = requests.get(url=url, headers=headers, verify=False)
@@ -1653,6 +1706,9 @@ def guang_dong():
             i_soup = paserUrl(i_soup, href)
             content = i_soup.find('div', attrs={'class', 'box_info'})
             contentwithTag = str(content)
+            if content == '' or content == None:
+                log.info(f'{href}-----{title}----内容为空----')
+                continue
             fu_jian_list = content.find_all('a')
             for fu_jian in fu_jian_list:
                 try:
...
@@ -1701,15 +1757,15 @@ def guang_dong():
             flag = sendKafka(dic_news)
             if flag:
                 save_data(dic_news)
-                print(title)
+                log.info(title)
             # save_data(result_dict)
             num = num + 1
+            count += 1
         except:
             pass
     except:
         pass
     end = time.time()
-    print('共', num, '条', '...........', '共耗时', end - start, '秒')
+    print('共', count, '条', '...........', '共耗时', end - start, '秒')

 # 海南
 def hai_nan():
...
@@ -1717,6 +1773,7 @@ def hai_nan():
     def hai_nan1():
         # 部门文件
         num = 0
+        count = 0
         start_time = time.time()
         for page in range(13):
             if page == 0:
@@ -1770,6 +1827,9 @@ def hai_nan():
                 except:
                     pass
                 content = contentWithTag.text
+                if content == '' or content == None:
+                    log.info(f'-----{href}----{title}----内容为空-----')
+                    continue
                 fu_jian_list = contentWithTag.find_all('a')
                 for fu_jian in fu_jian_list:
                     try:
...
@@ -1811,6 +1871,9 @@ def hai_nan():
                     topicClassification = tbody_text.split('分 类:')[1].split('发文机关:')[0].strip().lstrip()
                     contentWithTag = source.find('div', attrs={'class': 'zx-xxxqy-nr'})
                     content = contentWithTag.text
+                    if content == '' or content == None:
+                        log.info(f'-----{href}----{title}----内容为空-----')
+                        continue
                     fu_jian_list = source.find_all('a')
                     try:
                         for fu_jian in fu_jian_list:
...
@@ -1862,6 +1925,9 @@ def hai_nan():
                     topicClassification = ''
                     contentWithTag = source.find('div', attrs={'class': 'TRS_UEDITOR'})
                     content = contentWithTag.text
+                    if content == '' or content == None:
+                        log.info(f'-----{href}----{title}----内容为空-----')
+                        continue
                 time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                 # todo:传kafka字段
                 dic_news = {
@@ -1888,19 +1954,20 @@ def hai_nan():
                 flag = sendKafka(dic_news)
                 if flag:
                     save_data(dic_news)
-                    print(title)
+                    log.info(title)
+                    count += 1
                 num = num + 1
             except:
                 pass
     except:
         pass
         end_time = time.time()
-        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+        print(f'共抓取{count}条数据,共耗时{end_time - start_time}')

     def hai_nan2():
         def hai_nan_sw(page_href):
             num = 0
+            count = 0
             req = requests.get(url=page_href, headers=headers, verify=False)
             req.encoding = req.apparent_encoding
             doc_resp = BeautifulSoup(req.text, 'html.parser')
@@ -1936,6 +2003,9 @@ def hai_nan():
                     pub_time = str(pub_result[3]).split('发布日期:</strong>')[1].split('</span>')[0].strip()
                     contentWithTag = doc_href.find(class_='con_cen line mar-t2 xxgk_content_content')
                     content = contentWithTag.text
+                    if content == '' or content == None:
+                        log.info(f'-----{href}----{title}----内容为空-----')
+                        continue
                     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                     # todo:传kafka字段
                     dic_news = {
...
@@ -1961,10 +2031,11 @@ def hai_nan():
                     flag = sendKafka(dic_news)
                     if flag:
                         save_data(dic_news)
-                    href_text.close()
-                    # save_data(result_dict)
-                    print(title)
+                    log.info(title)
                     num += 1
+                    count += 1
+                    href_text.close()
                 except:
                     pass
             req.close()
@@ -1972,6 +2043,7 @@ def hai_nan():
         def hai_nan_szf(page_href):
             num = 0
+            count = 0
             req = requests.get(url=page_href, headers=headers, verify=False)
             req.encoding = req.apparent_encoding
             doc_resp = BeautifulSoup(req.text, 'html.parser')
...
@@ -2010,6 +2082,9 @@ def hai_nan():
                         pub_time = str(pub_result[3]).split('发布日期:</strong>')[1].split('</span>')[0].strip()
                         contentWithTag = doc_href.find(class_='con_cen line mar-t2 xxgk_content_content')
                         content = contentWithTag.text
+                        if content == '' or content == None:
+                            log.info(f'-----{href}----{title}----内容为空-----')
+                            continue
                     except:
                         # print(href)
                         pub_result = doc_href.find('div', attrs={'class': 'line mar-t2 con_div'})
@@ -2021,6 +2096,9 @@ def hai_nan():
                         writtenDate = ''
                         contentWithTag = doc_href.find('div', attrs={'class': 'xxgk_content_content'})
                         content = contentWithTag.text
+                        if content == '' or content == None:
+                            log.info(f'-----{href}----{title}----内容为空-----')
+                            continue
                     fu_jian_list = contentWithTag.find_all('a')
                     for fu_jian in fu_jian_list:
                         try:
...
@@ -2068,10 +2146,12 @@ def hai_nan():
                     flag = sendKafka(dic_news)
                     if flag:
                         save_data(dic_news)
+                    log.info(title)
+                    num += 1
+                    count += 1
                     href_text.close()
                     # save_data(result_dict)
-                    print(title)
-                    num += 1
                 except:
                     pass
             req.close()
@@ -2079,6 +2159,7 @@ def hai_nan():
         def hai_nan_szfbgt(page_href):
             num = 0
+            count = 0
             req = requests.get(url=page_href, headers=headers, verify=False)
             req.encoding = req.apparent_encoding
             doc_resp = BeautifulSoup(req.text, 'html.parser')
...
@@ -2127,6 +2208,9 @@ def hai_nan():
                         writtenDate = ''
                         contentWithTag = doc_href.find('div', attrs={'class': 'xxgk_content_content'})
                         content = contentWithTag.text
+                        if content == '' or content == None:
+                            log.info(f'-----{href}----{title}----内容为空-----')
+                            continue
                     fu_jian_list = contentWithTag.find_all('a')
                     if fu_jian_list:
                         for fu_jian in fu_jian_list:
@@ -2147,7 +2231,7 @@ def hai_nan():
                                 att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num)
                                 id_list.append(att_id)
                                 fu_jian['href'] = full_path
-                                print(f'----附件:{fu_jian_href}')
+                                # print(f'----附件:{fu_jian_href}')
                     else:
                         pass
                     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
...
@@ -2176,10 +2260,10 @@ def hai_nan():
                     flag = sendKafka(dic_news)
                     if flag:
                         save_data(dic_news)
-                    href_text.close()
-                    # save_data(result_dict)
-                    print(title)
+                    log.info(title)
                     num += 1
+                    count += 1
+                    href_text.close()
                 except:
                     pass
             req.close()
@@ -2187,6 +2271,7 @@ def hai_nan():
         def hai_nan_zy(page_href):
             num = 0
+            count = 0
             req = requests.get(url=page_href, headers=headers, verify=False)
             req.encoding = req.apparent_encoding
             doc_resp = BeautifulSoup(req.content, 'html.parser')
...
@@ -2240,6 +2325,9 @@ def hai_nan():
                         pub_hao = ''
                     contentWithTag = doc_href.find(class_='pages_content')
                     content = contentWithTag.text
+                    if content == '' or content == None:
+                        log.info(f'-----{i_href}----{title}----内容为空-----')
+                        continue
                     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                     # todo:传kafka字段
                     dic_news = {
...
@@ -2266,10 +2354,12 @@ def hai_nan():
                     flag = sendKafka(dic_news)
                     if flag:
                         save_data(dic_news)
+                    log.info(title)
+                    num += 1
+                    count += 1
                     href_text.close()
                     # save_data(result_dict)
-                    print(title)
-                    num += 1
                 except:
                     pass
             req.close()
...
@@ -2277,6 +2367,7 @@ def hai_nan():
         def start():
             num = 0
+            count = 0
             start_time = time.time()
             url = "https://www.hainan.gov.cn/hainan/qzcwj/zywj.shtml"
             try:
@@ -2306,7 +2397,7 @@ def hai_nan():
                     else:
                         page_href = str(url) + f'home_{page}.htm'
                     try:
-                        num += hai_nan_zy(page_href)
+                        count += hai_nan_zy(page_href)
                     except:
                         pass
                     time.sleep(1)
...
@@ -2320,7 +2411,7 @@ def hai_nan():
                     else:
                         page_href = str(url).split('list3')[0] + 'list3_{}.shtml'.format(page + 1)
                     try:
-                        num += hai_nan_sw(page_href)
+                        count += hai_nan_sw(page_href)
                     except:
                         pass
             elif url == leibie_href_list[2]:
...
@@ -2332,7 +2423,7 @@ def hai_nan():
                     else:
                         page_href = str(url).split('list3')[0] + 'list3_{}.shtml'.format(page + 1)
                     try:
-                        num += hai_nan_szf(page_href)
+                        count += hai_nan_szf(page_href)
                     except:
                         pass
             else:
...
@@ -2343,22 +2434,22 @@ def hai_nan():
                     else:
                         page_href = str(url).split('list3')[0] + 'list3_{}.shtml'.format(page + 1)
                     try:
-                        num += hai_nan_szfbgt(page_href)
+                        count += hai_nan_szfbgt(page_href)
                     except:
                         pass
         except:
             pass
         end_time = time.time()
-        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+        print(f'共抓取{count}条数据,共耗时{end_time - start_time}')

     start()
     hai_nan1()
     hai_nan2()

 # 四川
 def si_chuan():
     num = 0
+    count = 0
     pathType = 'policy/sichuan/'
     start_time = time.time()
     for page in range(1, 3):
@@ -2393,6 +2484,9 @@ def si_chuan():
             doc_href = paserUrl(doc_href, href)
             contentWithTag = doc_href.find('div', id='scrollBox')
             content = contentWithTag.text
+            if content == '' or content == None:
+                log.info(f'-----{href}----{title}----内容为空-----')
+                continue
             fu_jian_list = doc_href.find_all('a')
             for fu_jian in fu_jian_list:
...
@@ -2441,19 +2535,20 @@ def si_chuan():
             flag = sendKafka(dic_news)
             if flag:
                 save_data(dic_news)
-                print(title)
+                log.info(title)
+                count += 1
             num = num + 1
         except:
             pass
     except:
         pass
     end_time = time.time()
-    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')

 # 广西
 def guang_xi():
     num = 0
+    count = 0
     pathType = 'policy/guangxi/'
     start_time = time.time()
     url_all = """
...
@@ -2519,6 +2614,9 @@ def guang_xi():
             contentWithTag = BeautifulSoup(str(contentWithTag), 'html.parser')
             contentWithTag = paserUrl(contentWithTag, href)
             content = contentWithTag.text.strip()
+            if content == '' or content == None:
+                log.info(f'-----{href}----{title}----内容为空-----')
+                continue
             fu_jian_list = contentWithTag.find_all('a')
             for fu_jian in fu_jian_list:
...
@@ -2568,14 +2666,14 @@ def guang_xi():
             flag = sendKafka(dic_news)
             if flag:
                 save_data(dic_news)
-                print(title)
+                log.info(title)
             num = num + 1
         except:
             pass
     except:
         pass
     end_time = time.time()
-    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')

 # 贵州
 def gui_zhou():
@@ -2585,6 +2683,7 @@ def gui_zhou():
     """
     pathType = 'policy/guizhou/'
     num = 0
+    count = 0
     start_time = time.time()
     for page in range(0, 11):
         if page == 0:
...
@@ -2630,6 +2729,9 @@ def gui_zhou():
             contentWithTag = paserUrl(contentWithTag, href)
             content = contentWithTag.text.strip()
+            if content == '' or content == None:
+                log.info(f'-----{href}----{title}----内容为空-----')
+                continue
             fu_jian_list = contentWithTag.find_all('a')
             for fu_jian in fu_jian_list:
                 try:
...
@@ -2678,8 +2780,8 @@ def gui_zhou():
             flag = sendKafka(dic_news)
             if flag:
                 save_data(dic_news)
-                print(title)
+                log.info(title)
                 # save_data(result_dict)
+                count += 1
             num = num + 1
         except:
             pass
...
@@ -2697,6 +2799,7 @@ def yun_nan():
     http://gzw.yn.gov.cn/yngzw/c100040/zfxxgk_list.shtml 1
     """
     num = 0
+    count = 0
     start_time = time.time()
     for page in range(1, 6):
         if page == 1:
...
@@ -2735,6 +2838,9 @@ def yun_nan():
             contentwithTag = \
                 doc_href.select('#gknbxq_container > div > div.zfxxgk-content.zfxxgk_content')[0]
             content = contentwithTag.text
+            if content == '' or content == None:
+                log.info(f'-----{href}----{title}----内容为空-----')
+                continue
             fu_jian_list = contentwithTag.find_all('a')
             for fu_jian in fu_jian_list:
                 try:
@@ -2793,18 +2899,20 @@ def yun_nan():
                 flag = sendKafka(dic_news)
                 if flag:
                     save_data(dic_news)
-                    print(title)
+                    log.info(title)
                 num = num + 1
+                count += 1
             except:
                 pass
             resp.close()
         except:
             pass
     end_time = time.time()
-    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')

     def yun_nan2():
         num = 0
+        count = 0
         start_time = time.time()
         for page in range(1, 4):
             if page == 1:
...
@@ -2828,7 +2936,7 @@ def yun_nan():
                 num += 1
                 continue
             try:
-                print(href)
+                # print(href)
                 if '.shtml' in href:
                     res_ = requests.get(href, headers)
                     page_text_ = res_.text.encode("ISO-8859-1")
@@ -2847,6 +2955,9 @@ def yun_nan():
                     pub_hao = ''
                     contentwithTag = page.find('div', attrs={'class': 'zfxxgk-right'})
                     content = contentwithTag.text
+                    if content == '' or content == None:
+                        log.info(f'-----{href}----{title}----内容为空-----')
+                        continue
                     fu_jian_list = contentwithTag.find_all('a')
                     for fu_jian in fu_jian_list:
                         try:
...
@@ -2857,7 +2968,7 @@ def yun_nan():
                             if '.doc' in fu_jian_href or '.pdf' in fu_jian_href or '.xls' in fu_jian_href or '.zip' in fu_jian_href \
                                     or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
                                     or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
-                                print(fu_jian_href)
+                                # print(fu_jian_href)
                                 try:
                                     # 附件上传至文件服务器
                                     retData = baseCore.uptoOBS(fu_jian_href, '1679', pathType, file_name)
...
@@ -2876,9 +2987,7 @@ def yun_nan():
                 elif 'display' in href:
                     continue
                 else:
-                    content = ''
-                    contentwithTag = ''
-                    pub_hao = ''
+                    continue
                 time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                 # todo:传kafka字段
@@ -2907,8 +3016,8 @@ def yun_nan():
                 flag = sendKafka(dic_news)
                 if flag:
                     save_data(dic_news)
-                    print(title)
+                    log.info(title)
+                    count += 1
                 num = num + 1
             except:
                 pass
...
@@ -2916,7 +3025,7 @@ def yun_nan():
         except:
             pass
         end_time = time.time()
-        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+        print(f'共抓取{count}条数据,共耗时{end_time - start_time}')

     yun_nan1()
     yun_nan2()
...
@@ -2928,6 +3037,7 @@ def chong_qing():
     http://gzw.cq.gov.cn/zwgk_191/fdzdgknr/zcwj/qtwj/ 2
     """
     num = 0
+    count = 0
     pathType = 'policy/chongqing/'
     start_time = time.time()
     for page in range(0, 4):
@@ -2955,7 +3065,7 @@ def chong_qing():
                 num += 1
                 continue
             try:
-                print(href)
+                # print(href)
                 # href = 'https://gzw.cq.gov.cn/zwgk_191/fdzdgknr/zcwj/zcwj/202007/t20200728_7729850.html'
                 href_text = requests.get(url=href, headers=headers, verify=False).content
                 doc_href = pq(href_text)
...
@@ -2978,6 +3088,9 @@ def chong_qing():
                     pass
                 contentWithTag = doc_href.find('div', class_='zwxl-article')
                 content = contentWithTag.text
+                if content == '' or content == None:
+                    log.info(f'-----{href}----{title}----内容为空-----')
+                    continue
             except:
                 origin = ''
                 topicClassification = ''
...
@@ -2986,7 +3099,9 @@ def chong_qing():
                 pub_hao = ''
                 contentWithTag = doc_href.find('div', class_='zwxl-content')
                 content = contentWithTag.text
+                if content == '' or content == None:
+                    log.info(f'-----{href}----{title}----内容为空-----')
+                    continue
             fu_jian_list = contentWithTag.find_all('a')
             # print(fu_jian_list)
             for fu_jian in fu_jian_list:
...
@@ -3039,21 +3154,22 @@ def chong_qing():
             flag = sendKafka(dic_news)
             if flag:
                 save_data(dic_news)
-                print(title)
+                log.info(title)
                 # save_data(result_dict)
+                count += 1
             num += 1
         except:
             pass
     except:
         pass
     end_time = time.time()
-    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')

 # 天津
 def tian_jin():
     pathType = 'policy/tianjin/'
     def tian_jin1():
         num = 0
+        count = 0
         start_time = time.time()
         for page in range(0, 3):
             if page == 0:
@@ -3139,7 +3255,9 @@ def tian_jin():
                 if len(fu_jian_soup) < 1:
                     continue
                 content = soup.text
+                if content == '' or content == None:
+                    log.info(f'-----{href}----{title}----内容为空-----')
+                    continue
                 time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                 # todo:传kafka字段
                 dic_news = {
...
@@ -3167,18 +3285,20 @@ def tian_jin():
                 if flag:
                     save_data(dic_news)
                     num += 1
+                    count += 1
             except:
                 pass
         except:
             pass
         end_time = time.time()
-        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+        print(f'共抓取{count}条数据,共耗时{end_time - start_time}')

     def tian_jin2():
         """
         http://sasac.tj.gov.cn/ZWGK1142/zcwj/wjwj/index.html 4
         """
         num = 0
+        count = 0
         start_time = time.time()
         for page in range(0, 5):
             if page == 0:
@@ -3263,7 +3383,9 @@ def tian_jin():
                 if len(fu_jian_soup) < 1:
                     continue
                 content = soup.text
+                if content == '' or content == None:
+                    log.info(f'-----{href}----{title}----内容为空-----')
+                    continue
                 time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                 # todo:传kafka字段
                 dic_news = {
...
@@ -3291,15 +3413,17 @@ def tian_jin():
                 if flag:
                     save_data(dic_news)
                     num += 1
+                    count += 1
             except:
                 pass
         except:
             pass
         end_time = time.time()
-        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+        print(f'共抓取{count}条数据,共耗时{end_time - start_time}')

     def tian_jin3():
         num = 0
+        count = 0
         start_time = time.time()
         for page in range(1, 3):
             if page == 1:
@@ -3391,7 +3515,9 @@ def tian_jin():
...
@@ -3391,7 +3515,9 @@ def tian_jin():
if
len
(
fu_jian_soup
)
<
1
:
if
len
(
fu_jian_soup
)
<
1
:
continue
continue
content
=
soup
.
text
content
=
soup
.
text
if
content
==
''
or
content
==
None
:
log
.
info
(
f
'-----{href}----{title}----内容为空-----'
)
continue
time_now
=
time
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
,
time
.
localtime
())
time_now
=
time
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
,
time
.
localtime
())
# todo:传kafka字段
# todo:传kafka字段
dic_news
=
{
dic_news
=
{
...
@@ -3419,12 +3545,13 @@ def tian_jin():
...
@@ -3419,12 +3545,13 @@ def tian_jin():
if
flag
:
if
flag
:
save_data
(
dic_news
)
save_data
(
dic_news
)
num
+=
1
num
+=
1
count
+=
1
except
:
except
:
pass
pass
except
:
except
:
pass
pass
end_time
=
time
.
time
()
end_time
=
time
.
time
()
print
(
f
'共抓取{
num
}条数据,共耗时{end_time - start_time}'
)
print
(
f
'共抓取{
count
}条数据,共耗时{end_time - start_time}'
)
tian_jin1
()
tian_jin1
()
tian_jin2
()
tian_jin2
()
...
@@ -3435,6 +3562,7 @@ def xin_jiang():
     pathType = 'policy/xinjiang/'
     def xin_jiang1():
         num = 0
+        count = 0
         start_time = time.time()
         for page in range(1, 10):
             if page == 1:
...
@@ -3493,6 +3621,9 @@ def xin_jiang():
                 if len(fu_jian_soup) < 1:
                     continue
                 content = soup.text
+                if content == '' or content == None:
+                    log.info(f'-----{href}----{title}----内容为空-----')
+                    continue
                 pattern = r'(新国.{1,}?号)|(国资.{1,}?号)'
                 match_list = re.findall(pattern, content)
                 if len(match_list) > 0:
...
@@ -3528,15 +3659,17 @@ def xin_jiang():
                 if flag:
                     save_data(dic_news)
                     num += 1
+                    count += 1
             except:
                 pass
         except:
             pass
         end_time = time.time()
-        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+        print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
     def xin_jiang_jsbt():
         num = 0
+        count = 0
         start_time = time.time()
         for page in range(1, 6):
             if page == 1:
...
@@ -3592,6 +3725,9 @@ def xin_jiang():
                 if len(fu_jian_soup) < 1:
                     continue
                 content = soup.text
+                if content == '' or content == None:
+                    log.info(f'-----{href}----{title}----内容为空-----')
+                    continue
                 pattern = r'(新国.{1,}?号)|(国资.{1,}?号)'
                 match_list = re.findall(pattern, content)
                 if len(match_list) > 0:
...
@@ -3627,6 +3763,7 @@ def xin_jiang():
                 if flag:
                     save_data(dic_news)
                     num += 1
+                    count += 1
                 href_res.close()
             except:
                 pass
...
@@ -3634,7 +3771,7 @@ def xin_jiang():
         except:
             pass
         end_time = time.time()
-        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+        print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
     xin_jiang1()
     xin_jiang_jsbt()
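Both Xinjiang helpers recover the document number (发文字号) by scanning the page text with an alternation of prefixes, `r'(新国.{1,}?号)|(国资.{1,}?号)'`; the other provinces below swap in their own prefix (晋, 辽, 苏, 赣, 豫, 冀, ...). `re.findall` with a grouped alternation returns one tuple per match, one slot per group, so the usable value is the first non-empty slot of the first tuple. A small illustration on made-up text:

import re

pattern = r'(新国.{1,}?号)|(国资.{1,}?号)'
content = '经研究,现印发《……办法》(新国资发〔2023〕12号),请遵照执行。'
match_list = re.findall(pattern, content)
if len(match_list) > 0:
    # match_list[0] is a tuple like ('新国资发〔2023〕12号', '')
    pub_hao = next(g for g in match_list[0] if g)
    print(pub_hao)  # 新国资发〔2023〕12号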
...
@@ -3643,6 +3780,7 @@ def xin_jiang():
 def shan_xi():
     pathType = 'policy/shanxi/'
     num = 0
+    count = 0
     start_time = time.time()
     for page in range(1, 7):
         if page == 1:
...
@@ -3712,6 +3850,9 @@ def shan_xi():
             if len(fu_jian_soup) < 1:
                 continue
             content = soup.text
+            if content == '' or content == None:
+                log.info(f'-----{href}----{title}----内容为空-----')
+                continue
             pattern = r'(晋国资.{1,}?号)|(国资.{1,}?号)'
             match_list = re.findall(pattern, content)
             if len(match_list) > 0:
...
@@ -3747,17 +3888,19 @@ def shan_xi():
             if flag:
                 save_data(dic_news)
                 num += 1
+                count += 1
         except:
             pass
     except:
         pass
     end_time = time.time()
-    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
 # 辽宁
 def liao_ning():
     pathType = 'policy/liaoning/'
     num = 0
+    count = 0
     start_time = time.time()
     for page in range(1, 3):
         url = f'https://gzw.ln.gov.cn/gzw/xxgk/zc/zcfb/aa251549-{page}.shtml'
...
@@ -3823,6 +3966,9 @@ def liao_ning():
             if len(contentWithTag) < 1:
                 continue
             content = soup.text
+            if content == '' or content == None:
+                log.info(f'-----{href}----{title}----内容为空-----')
+                continue
             pattern = r'(辽国资.{1,}?号)|(国资.{1,}?号)'
             match_list = re.findall(pattern, content)
             if len(match_list) > 0:
...
@@ -3858,6 +4004,7 @@ def liao_ning():
             if flag:
                 save_data(dic_news)
                 num += 1
+                count += 1
         except:
             pass
     except:
...
@@ -3869,6 +4016,7 @@ def liao_ning():
 def hei_long_jiang():
     pathType = 'policy/heilongjiang/'
     num = 0
+    count = 0
     start_time = time.time()
     for page in range(1, 3):
         url = f'http://gzw.hlj.gov.cn/common/search/a4e4f3e94596456db749bfb0f7937cc7?_isAgg=true&_isJson=true&_pageSize=10&_template=index&_rangeTimeGte=&_channelName=&page={page}'
...
@@ -3926,6 +4074,9 @@ def hei_long_jiang():
             contentWithTag = str(soup.prettify())
             content = soup.text
+            if content == '' or content == None:
+                log.info(f'-----{href}----{title}----内容为空-----')
+                continue
             time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
             # todo:传kafka字段
             dic_news = {
...
@@ -3953,6 +4104,7 @@ def hei_long_jiang():
             if flag:
                 save_data(dic_news)
                 num += 1
+                count += 1
         except:
             pass
     except:
...
@@ -3960,11 +4112,12 @@ def hei_long_jiang():
     except:
         pass
     end_time = time.time()
-    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
 # 江苏
 def jiang_su():
     num = 0
+    count = 0
     pathType = 'policy/jiangsu/'
     start_time = time.time()
     pagestart = 1
...
@@ -4034,6 +4187,9 @@ def jiang_su():
             contentWithTag = str(soup.prettify())
             content = soup.text
+            if content == '' or content == None:
+                log.info(f'-----{href}----{title}----内容为空-----')
+                continue
             if len(pub_hao) < 1:
                 pattern = r'(苏国.{1,}?号)|(国.{1,}?号)'
                 match_list = re.findall(pattern, content)
...
@@ -4068,18 +4224,20 @@ def jiang_su():
             if flag:
                 save_data(dic_news)
                 num += 1
+                count += 1
         except:
             pass
     except:
         pass
     end_time = time.time()
-    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
 # 安徽
 def an_hui():
     pathType = 'policy/anhui/'
     def an_hui1():
         num = 0
+        count = 0
         start_time = time.time()
         for page in range(1, 4):
             url = f'http://gzw.ah.gov.cn/site/label/8888?IsAjax=1&dataType=html&_=0.4981381464472001&labelName=publicInfoList&siteId=6788071&pageSize=15&pageIndex={page}&action=list&isDate=false&dateFormat=yyyy%E5%B9%B4MM%E6%9C%88dd%E6%97%A5&length=15&organId=7031&type=4&catIds=&catId=6717051&cId=&result=&title=&fileNum=&keyWords=&file=%2Fxxgk%2FpublicInfoList_newest2020_zc'
...
@@ -4137,6 +4295,9 @@ def an_hui():
                 contentWithTag = str(soup.prettify())
                 content = soup.text
+                if content == '' or content == None:
+                    log.info(f'-----{href}----{title}----内容为空-----')
+                    continue
                 time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                 # todo:传kafka字段
                 dic_news = {
...
@@ -4164,15 +4325,17 @@ def an_hui():
                 if flag:
                     save_data(dic_news)
                     num += 1
+                    count += 1
             except:
                 pass
         except:
             pass
         end_time = time.time()
-        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+        print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
     def an_hui2():
         num = 0
+        count = 0
         start_time = time.time()
         for page in range(1, 25):
             url = f'http://gzw.ah.gov.cn/site/label/8888?_=0.5237800193505848&labelName=publicInfoList&siteId=6788071&pageSize=15&pageIndex={page}&action=list&isDate=false&dateFormat=yyyy%E5%B9%B4MM%E6%9C%88dd%E6%97%A5&length=15&organId=7031&type=4&catIds=43793891%2C43793901&catId=&cId=&result=&title=&fileNum=&keyWords=&file=%2Fxxgk%2FpublicInfoList_newest2020_zc'
...
@@ -4233,6 +4396,9 @@ def an_hui():
                 contentWithTag = str(soup.prettify())
                 content = soup.text
+                if content == '' or content == None:
+                    log.info(f'-----{href}----{title}----内容为空-----')
+                    continue
                 time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                 # todo:传kafka字段
                 dic_news = {
...
@@ -4260,6 +4426,7 @@ def an_hui():
                 if flag:
                     save_data(dic_news)
                     num += 1
+                    count += 1
                 href_res.close()
             except:
                 pass
...
@@ -4267,7 +4434,7 @@ def an_hui():
         except:
             pass
         end_time = time.time()
-        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+        print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
     an_hui1()
     an_hui2()
...
@@ -4280,6 +4447,7 @@ def jiang_xi():
     121-164
     """
     num = 0
+    count = 0
     pathType = 'policy/jiangxi/'
     start_time = time.time()
     startrecord = 1
...
@@ -4360,6 +4528,9 @@ def jiang_xi():
             contentWithTag = str(soup.prettify())
             content = soup.text
+            if content == '' or content == None:
+                log.info(f'-----{href}----{title}----内容为空-----')
+                continue
             if len(pub_hao) < 1:
                 pattern = r'(赣国资.{1,}?号)|(国.{1,}?号)'
                 match_list = re.findall(pattern, content)
...
@@ -4395,16 +4566,18 @@ def jiang_xi():
             if flag:
                 save_data(dic_news)
                 num += 1
+                count += 1
         except:
             pass
     except:
         pass
     end_time = time.time()
-    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
 # 河南
 def he_nan():
     num = 0
+    count = 0
     pathType = 'policy/henan/'
     start_time = time.time()
     for page in range(0, 7):
...
@@ -4456,6 +4629,9 @@ def he_nan():
             contentWithTag = str(soup.prettify())
             content = soup.text
+            if content == '' or content == None:
+                log.info(f'-----{href}----{title}----内容为空-----')
+                continue
             pattern = r'(豫国.{1,}?号)|(国.{1,}?号)'
             match_list = re.findall(pattern, content)
             if len(match_list) > 0:
...
@@ -4489,16 +4665,18 @@ def he_nan():
             if flag:
                 save_data(dic_news)
                 num += 1
+                count += 1
             href_res.close()
         resp_text.close()
     except:
         pass
     end_time = time.time()
-    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
 # 湖南
 def hu_nan():
     num = 0
+    count = 0
     pathType = 'policy/hunan/'
     start_time = time.time()
     for page in range(1, 7):
...
@@ -4565,6 +4743,9 @@ def hu_nan():
             contentWithTag = str(soup.prettify())
             content = soup.text
+            if content == '' or content == None:
+                log.info(f'-----{href}----{title}----内容为空-----')
+                continue
             time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
             # todo:传kafka字段
             dic_news = {
...
@@ -4592,18 +4773,20 @@ def hu_nan():
             if flag:
                 save_data(dic_news)
                 num += 1
+                count += 1
         except:
             pass
     except:
         pass
     end_time = time.time()
-    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
 # 甘肃
 def gan_su():
     pathType = 'policy/gansu/'
     def gan_su1():
         num = 0
+        count = 0
         start_time = time.time()
         bro = getDriver()
         urls = ['http://gzw.gansu.gov.cn/gzw/c115543/xxgk_list.shtml',
...
@@ -4686,6 +4869,9 @@ def gan_su():
                 # id_ = redefid(id_list)
                 contentWithTag = str(soup.prettify())
                 content = soup.text
+                if content == '' or content == None:
+                    log.info(f'-----{href}----{title}----内容为空-----')
+                    continue
                 # t = time.strptime(publishDate, "%Y年%m月%d日")
                 # publishDate = time.strftime("%Y-%m-%d %H:%M:%S", t)
                 time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
...
@@ -4715,6 +4901,7 @@ def gan_su():
                 if flag:
                     save_data(dic_news)
                     num += 1
+                    count += 1
             except Exception as e:
                 print(e)
                 pass
...
@@ -4724,6 +4911,7 @@ def gan_su():
     def gan_su2():
         num = 0
+        count = 0
         start_time = time.time()
         bro = getDriver()
         url = 'http://gzw.gansu.gov.cn/gzw/c115552/xxgk_list.shtml'
...
@@ -4821,6 +5009,9 @@ def gan_su():
                 contentWithTag = str(soup.prettify())
                 content = soup.text
+                if content == '' or content == None:
+                    log.info(f'-----{href}----{title}----内容为空-----')
+                    continue
                 if len(content) < 2:
                     continue
                 # t = time.strptime(publishDate, "%Y年%m月%d日")
...
@@ -4852,6 +5043,7 @@ def gan_su():
                 if flag:
                     save_data(dic_news)
                     num += 1
+                    count += 1
             except Exception as e:
                 print(e)
         except Exception as e:
...
@@ -4859,10 +5051,11 @@ def gan_su():
             pass
         bro.quit()
         end_time = time.time()
-        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+        print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
     def gan_su3():
         num = 0
+        count = 0
         start_time = time.time()
         # # service = Service(r'D:/chrome/103/chromedriver.exe')
         # chrome_options = webdriver.ChromeOptions()
...
@@ -4979,6 +5172,9 @@ def gan_su():
                 contentWithTag = str(soup.prettify())
                 content = soup.text
+                if content == '' or content == None:
+                    log.info(f'-----{href}----{title}----内容为空-----')
+                    continue
                 if len(content) < 2:
                     continue
                 # t = time.strptime(publishDate, "%Y年%m月%d日")
...
@@ -5010,13 +5206,14 @@ def gan_su():
                 if flag:
                     save_data(dic_news)
                     num += 1
+                    count += 1
             except Exception as e:
                 print(e)
         except:
             pass
         bro.quit()
         end_time = time.time()
-        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+        print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
     gan_su1()
     gan_su2()
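The three Gansu crawlers render pages through Selenium via the project's `getDriver()` helper and tear the browser down with `bro.quit()`. The helper's body is outside this diff; a plausible minimal equivalent, assuming headless Chrome (the options and the usage below are illustrative, not the repo's actual configuration):

from selenium import webdriver

def getDriver():
    # hypothetical reconstruction of the helper used by gan_su1/2/3
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    return webdriver.Chrome(options=chrome_options)

bro = getDriver()
try:
    bro.get('http://gzw.gansu.gov.cn/gzw/c115543/xxgk_list.shtml')
    html = bro.page_source
finally:
    bro.quit()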
...
@@ -5025,6 +5222,7 @@ def gan_su():
 # 宁夏
 def ning_xia():
     num = 0
+    count = 0
     pathType = 'policy/ningxia/'
     start_time = time.time()
     for page in range(0, 3):
...
@@ -5082,6 +5280,9 @@ def ning_xia():
             # id_ = redefid(id_list)
             contentWithTag = str(soup.prettify())
             content = soup.text
+            if content == '' or content == None:
+                log.info(f'-----{href}----{title}----内容为空-----')
+                continue
             t = time.strptime(publishDate, "%Y年%m月%d日")
             publishDate = time.strftime("%Y-%m-%d %H:%M:%S", t)
             time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
...
@@ -5111,16 +5312,18 @@ def ning_xia():
             if flag:
                 save_data(dic_news)
                 num += 1
+                count += 1
         except:
             pass
     except:
         pass
     end_time = time.time()
-    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
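`ning_xia` is the one crawler in this file that normalizes a Chinese-formatted publish date before filling the kafka dict: `time.strptime` parses the page's 2023年10月11日 style and `time.strftime` re-emits it in the `%Y-%m-%d %H:%M:%S` form the rest of the pipeline expects. Isolated, the conversion is:

import time

publishDate = '2023年10月11日'
t = time.strptime(publishDate, "%Y年%m月%d日")
publishDate = time.strftime("%Y-%m-%d %H:%M:%S", t)
print(publishDate)  # 2023-10-11 00:00:00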
 # 陕西
 def shanxi():
     num = 0
+    count = 0
     pathType = 'policy/shan_xi/'
     start_time = time.time()
     url = 'https://sxgz.shaanxi.gov.cn/newstyle/pub_newschannel.asp?chid=100127'
...
@@ -5184,6 +5387,9 @@ def shanxi():
             # id_ = redefid(id_list)
             contentWithTag = str(soup.prettify())
             content = soup.text
+            if content == '' or content == None:
+                log.info(f'-----{href}----{title}----内容为空-----')
+                continue
             time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
             # todo:传kafka字段
             dic_news = {
...
@@ -5211,6 +5417,7 @@ def shanxi():
             if flag:
                 save_data(dic_news)
                 num += 1
+                count += 1
             res_href.close()
         except:
             pass
...
@@ -5218,7 +5425,7 @@ def shanxi():
     except:
         pass
     end_time = time.time()
-    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
 # 西藏
 def xi_zang():
...
@@ -5228,6 +5435,7 @@ def xi_zang():
                 'http://gzw.lasa.gov.cn/gzw/wjzl/common_list.shtml', ]
     for url in url_list:
         num = 0
+        count = 0
         try:
             res = requests.get(url=url, headers=headers)
             res.encoding = res.apparent_encoding
...
@@ -5256,6 +5464,9 @@ def xi_zang():
                 contentWithTag = str(i_soup.find(id='NewsContent'))
                 soup = BeautifulSoup(contentWithTag, 'html.parser')
                 content = soup.text
+                if content == '' or content == None:
+                    log.info(f'-----{href}----{title}----内容为空-----')
+                    continue
                 fu_jian_soup = soup.find_all('a')
                 id_list = []
                 for file in fu_jian_soup:
...
@@ -5306,18 +5517,20 @@ def xi_zang():
                 if flag:
                     save_data(dic_news)
                     num += 1
+                    count += 1
             except:
                 pass
         except:
             pass
     end_time = time.time()
-    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
 # 青海
 def qing_hai():
     pathType = 'policy/qinghai/'
     def qing_hai1():
         num = 0
+        count = 0
         start_time = time.time()
         url_mode = 'http://gxgz.qinghai.gov.cn/xxgk/List_wj.aspx?lmid=604'
         try:
...
@@ -5353,6 +5566,9 @@ def qing_hai():
                 origin = str(page.find('div', attrs={'class': 'foot-fb'}))
                 soup = BeautifulSoup(contentWithTag, 'html.parser')
                 content = soup.text
+                if content == '' or content == None:
+                    log.info(f'-----{durl}----{title}----内容为空-----')
+                    continue
                 fu_jian_soup = soup.find_all('a')
                 id_list = []
                 for file in fu_jian_soup:
...
@@ -5364,7 +5580,7 @@ def qing_hai():
                         or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                         or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                         file_name = file.text.strip()
-                        retData = baseCore.uploadToserver(file_href, '1681')
+                        retData = baseCore.uptoOBS(file_href, '1681', pathType, file_name)
                         if retData['state']:
                             pass
                         else:
...
@@ -5405,15 +5621,17 @@ def qing_hai():
                 # print(id)
                 # id_list.append(id)
                 num += 1
+                count += 1
             except:
                 pass
         except:
             pass
         end_time = time.time()
-        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+        print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
     def qing_hai2():
         num = 0
+        count = 0
         start_time = time.time()
         urls = [
             'http://gxgz.qinghai.gov.cn/xxgk/List_wj.aspx?lmid=627',
...
@@ -5446,6 +5664,7 @@ def qing_hai():
                     durl = tr.find('a').get('href')
                     is_href = db_storage.find_one({'网址': durl})
                     if is_href:
+                        num += 1
                         log.info('已采集----------跳过')
                         continue
                     title = tr.find('a').text
...
@@ -5471,6 +5690,9 @@ def qing_hai():
                     origin = ''
                     soup = BeautifulSoup(contentWithTag, 'html.parser')
                     content = soup.text
+                    if content == '' or content == None:
+                        log.info(f'-----{durl}----{title}----内容为空-----')
+                        continue
                     fu_jian_soup = soup.find_all('a')
                     id_list = []
                     for file in fu_jian_soup:
...
@@ -5482,7 +5704,7 @@ def qing_hai():
                             or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                             or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                             file_name = file.text.strip()
-                            retData = baseCore.uploadToserver(file_href, '1681')
+                            retData = baseCore.uptoOBS(file_href, '1681', pathType, file_name)
                             if retData['state']:
                                 pass
                             else:
...
@@ -5490,7 +5712,7 @@ def qing_hai():
                             att_id, full_path = baseCore.tableUpdate(retData, '青海省国资委', file_name, num)
                             id_list.append(att_id)
                             # todo:将返回的地址更新到soup
-                            file['href'] = 'http://114.115.215.96/' + full_path
+                            file['href'] = full_path
                         # id_ = redefid(id_list)
                         contentWithTag = str(soup.prettify())
                         # todo:替换完成之后,将附件上传至文件服务器
...
@@ -5523,13 +5745,14 @@ def qing_hai():
                     # print(id)
                     # id_list.append(id)
                     num += 1
+                    count += 1
                 except:
                     pass
                 res.close()
            except:
                pass
         end_time = time.time()
-        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+        print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
     qing_hai1()
     qing_hai2()
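These hunks show the change the commit message announces: `baseCore.uploadToserver(file_href, '1681')` becomes `baseCore.uptoOBS(file_href, '1681', pathType, file_name)`, so attachments land in OBS object storage under a per-site prefix (`policy/qinghai/` here) rather than on the old file server. Both helpers live in BaseCore outside this diff; a stand-in class just to contrast the two call shapes (the return values here are invented for the demo, only the signatures come from the call sites):

class FakeBaseCore:
    """Illustrative stub; the real BaseCore methods also download and upload the file."""
    def uploadToserver(self, file_href, source_id):
        # old style: the server path is derived from the source id
        return {'state': True, 'path': '/files/' + source_id + '/' + file_href.rsplit('/', 1)[-1]}
    def uptoOBS(self, file_href, source_id, pathType, file_name):
        # new style: the OBS key is the per-site prefix plus the attachment's display name
        return {'state': True, 'path': pathType + file_name}

baseCore = FakeBaseCore()
print(baseCore.uploadToserver('http://example.com/a.pdf', '1681')['path'])
print(baseCore.uptoOBS('http://example.com/a.pdf', '1681', 'policy/qinghai/', '附件.pdf')['path'])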
...
@@ -5537,6 +5760,8 @@ def qing_hai():
 # 河北
 def he_bei():
     num = 0
+    count = 0
+    pathType = 'policy/hebei/'
     start_time = time.time()
     url = 'http://hbsa.hebei.gov.cn/Json/GFXWJ51.json'
     try:
...
@@ -5551,6 +5776,7 @@ def he_bei():
             href = 'http://hbsa.hebei.gov.cn/xxgk/GFXWJ?id=' + str(id)
             is_href = db_storage.find_one({'网址': href})
             if is_href:
+                num += 1
                 continue
             pub_time_ = info['updated']
             m = round(pub_time_ / 1000)  # 四舍五入取10位时间戳(秒级)
...
@@ -5569,7 +5795,7 @@ def he_bei():
                     or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                     or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                     file_name = file.text.strip()
-                    retData = baseCore.uploadToserver(file_href, '1668')
+                    retData = baseCore.uptoOBS(file_href, '1668', pathType, file_name)
                     if retData['state']:
                         pass
                     else:
...
@@ -5577,13 +5803,16 @@ def he_bei():
                     att_id, full_path = baseCore.tableUpdate(retData, '河北省国资委', file_name, num)
                     id_list.append(att_id)
                     # todo:将返回的地址更新到soup
-                    file['href'] = 'http://114.115.215.96/' + full_path
+                    file['href'] = full_path
             # id_ = redefid(id_list)
             contentWithTag = str(soup.prettify())
             if len(contentWithTag) < 1:
                 if len(fu_jian_soup) < 1:
                     continue
             content = soup.text
+            if content == '' or content == None:
+                log.info(f'-----{href}----{title}----内容为空-----')
+                continue
             pattern = r'(冀国.{1,}?号)|(国资.{1,}?号)'
             match_list = re.findall(pattern, content)
             if len(match_list) > 0:
...
@@ -5619,14 +5848,17 @@ def he_bei():
             if flag:
                 save_data(dic_news)
                 num += 1
+                count += 1
         except:
             pass
     end_time = time.time()
-    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
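The companion change in the same hunks: once an attachment is uploaded, the `<a>` tag inside the parsed page is repointed at the returned `full_path` alone (previously `'http://114.115.215.96/' + full_path`), and only then is `contentWithTag` re-serialized with `soup.prettify()`. A runnable miniature of that rewrite-then-serialize step (the sample HTML and path are made up):

from bs4 import BeautifulSoup

html = '<p>正文 <a href="http://hbsa.hebei.gov.cn/files/1.pdf">附件1.pdf</a></p>'
soup = BeautifulSoup(html, 'html.parser')
for file in soup.find_all('a'):
    full_path = 'policy/hebei/附件1.pdf'  # would come from baseCore.tableUpdate in the real code
    file['href'] = full_path             # new style: no http://114.115.215.96/ prefix
contentWithTag = str(soup.prettify())
print(contentWithTag)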
 # 湖北
 def hu_bei():
     num = 0
+    count = 0
+    pathType = 'policy/hubei/'
     start_time = time.time()
     hrefs = []
     url = 'http://gzw.hubei.gov.cn/zfxxgk/zc/gfxwj/'
...
@@ -5649,6 +5881,7 @@ def hu_bei():
     for href in hrefs:
         is_href = db_storage.find_one({'网址': href})
         if is_href:
+            num += 1
             continue
         try:
             driver.get(href)
...
@@ -5684,7 +5917,7 @@ def hu_bei():
                     or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                     or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                     file_name = file.text.strip()
-                    retData = baseCore.uploadToserver(file_href, '1675')
+                    retData = baseCore.uptoOBS(file_href, '1675', pathType, file_name)
                     if retData['state']:
                         pass
                     else:
...
@@ -5692,14 +5925,16 @@ def hu_bei():
                 att_id, full_path = baseCore.tableUpdate(retData, '湖北省国资委', file_name, num)
                 id_list.append(att_id)
                 # todo:将返回的地址更新到soup
-                file['href'] = 'http://114.115.215.96/' + full_path
+                file['href'] = full_path
             # id_ = redefid(id_list)
             contentWithTag = str(soup.prettify())
             if len(contentWithTag) < 1:
                 if len(fu_jian_soup) < 1:
                     continue
             content = soup.text
+            if content == '' or content == None:
+                log.info(f'-----{href}----{title}----内容为空-----')
+                continue
             time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
             # todo:传kafka字段
             dic_news = {
...
@@ -5727,48 +5962,49 @@ def hu_bei():
             if flag:
                 save_data(dic_news)
                 num += 1
+                count += 1
         except Exception as e:
             pass
     driver.close()
     end_time = time.time()
-    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
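Like the other crawlers, `he_bei` and `hu_bei` dedupe against MongoDB before fetching: `db_storage.find_one({'网址': href})` asks whether a document with that URL was stored by an earlier run, and this commit adds `num += 1` inside the skip branch so previously collected items still show up in the totals. A sketch of the lookup against a local collection (the URI and the database/collection names are illustrative, not the repo's configuration):

from pymongo import MongoClient

db_storage = MongoClient('mongodb://127.0.0.1:27017/')['zzsn']['policy']  # illustrative names
num = 0
for href in ['http://gzw.hubei.gov.cn/zfxxgk/zc/gfxwj/example.shtml']:
    is_href = db_storage.find_one({'网址': href})
    if is_href:
        num += 1  # seen on a previous run: counted, but not re-fetched
        continue
    # ... fetch, parse and save_data(...) here; save_data records the URL for next time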
 if __name__ == '__main__':
-    # get_content1()
+    get_content1()
-    # get_content2()
+    get_content2()
-    # get_content3()
+    get_content3()
-    # bei_jing()
+    bei_jing()
-    # nei_meng_gu()
+    nei_meng_gu()
     ji_lin()
-    # shang_hai()
+    shang_hai()
-    # zhe_jiang()
+    zhe_jiang()
-    # fu_jian()
+    fu_jian()
-    # shan_dong()
+    shan_dong()
-    # guang_dong()
+    guang_dong()
-    # hai_nan()
+    hai_nan()
-    # si_chuan()
+    si_chuan()
-    # guang_xi()
+    guang_xi()
-    # gui_zhou()
+    gui_zhou()
-    # yun_nan()
+    yun_nan()
-    # chong_qing()
+    chong_qing()
-    # tian_jin()
+    tian_jin()
-    # xin_jiang()
+    xin_jiang()
-    # shan_xi()
+    shan_xi()
-    # liao_ning()
+    liao_ning()
-    # hei_long_jiang()
+    hei_long_jiang()
-    # jiang_su()
+    jiang_su()
-    # an_hui()
+    an_hui()
-    # jiang_xi()
+    jiang_xi()
-    # he_nan()
+    he_nan()
-    # hu_nan()
+    hu_nan()
-    # gan_su()
+    gan_su()
-    # ning_xia()
+    ning_xia()
-    # xi_zang()
+    xi_zang()
-    # shanxi()
+    shanxi()
-    # qing_hai()
+    qing_hai()
-    # he_bei()
+    he_bei()
-    # qing_hai()
+    qing_hai()
-    # current_time = datetime.datetime.now()
+    current_time = datetime.datetime.now()
-    # midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
+    midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
-    # sleep_seconds = (midnight_time - current_time).total_seconds()
+    sleep_seconds = (midnight_time - current_time).total_seconds()
-    # time.sleep(sleep_seconds)
+    time.sleep(sleep_seconds)
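The tail of `__main__`, uncommented by this commit, makes the run self-rescheduling: it computes the seconds from now to the next midnight and sleeps that long, presumably so an outer loop or supervisor kicks off the next crawl at 00:00. The arithmetic stands alone:

import datetime
import time

current_time = datetime.datetime.now()
# today at 00:00:00, pushed forward one day = the coming midnight
midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
sleep_seconds = (midnight_time - current_time).total_seconds()
print(f'sleeping {sleep_seconds:.0f}s until {midnight_time}')
time.sleep(sleep_seconds)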