王景浩 / zzsn_spider · Commit a5ca358d
Authored Oct 21, 2023 by 刘伟刚
Merge remote-tracking branch 'origin/master'
Parents: a512c71f, 3fab40ce

Showing 2 changed files with 620 additions and 576 deletions (+620 / -576):
comData/policylaw/policy.py    +375 -365
comData/policylaw/tingtype.py  +245 -211
comData/policylaw/policy.py — view file @ a5ca358d
# _*_ coding:utf-8 _*_
"""Run the full data set once; no deduplication logic."""
import datetime
import json
import os
import re
import time

import fitz
import pymongo
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
from pyquery import PyQuery as pq
from requests.packages import urllib3
from requests.adapters import HTTPAdapter
from urllib.parse import urljoin
from BaseCore import BaseCore

baseCore = BaseCore()
urllib3.disable_warnings()
...
...
@@ -24,8 +22,8 @@ from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from lxml import etree
from random import choice
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from requests.adapters import HTTPAdapter

log = baseCore.getLogger()
taskType = '政策法规'
...
...
@@ -36,11 +34,10 @@ taskType = '政策法规'
各地方国资委
"""
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin',
                                 password='zzsn@9988').caiji['国务院_国资委_copy1']
-driver_path = r'F:\spider\cmd100\chromedriver.exe'
-chromr_bin = r'F:\spider\Google\Chrome\Application\chrome.exe'
+driver_path = r'D:\cmd100\chromedriver.exe'
+chromr_bin = r'D:\Google\Chrome\Application\chrome.exe'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
...
...
@@ -64,9 +61,10 @@ def paserUrl(html, listurl):
def getDriver():
    service = Service(driver_path)
    chrome_options = webdriver.ChromeOptions()
-   chrome_options.add_argument('--headless')
+   # chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    # chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('log-level=3')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])  # hide the "browser is controlled by automation" prompt
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")  # disable the Blink feature that exposes webdriver traces
...
...
@@ -77,6 +75,12 @@ def getDriver():
        'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36')
    # bro = webdriver.Chrome(chrome_options=chrome_options, service=service)
    bro = webdriver.Chrome(chrome_options=chrome_options, executable_path=driver_path)
    # with open('stealth.min.js') as f:
    #     js = f.read()
    #
    # bro.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    #     "source": js
    # })
    return bro


def save_data(dic_news):
...
...
@@ -203,109 +207,111 @@ def get_content1():
    s.keep_alive = False
    pcodeJiguan = a_list[0]
    try:
        # pageCount = getPageConunt(a_list, url, headers, s)
        # for pageNo in range(1, pageCount + 1):
        pageNo = 1
        try:
            try:
                page_list = getList(a_list, url, headers, pageNo, s)
            except:
                s.close()
                page_list = getList(a_list, url, headers, pageNo, s)
            for page in page_list:
                id_list = []
                # collect the fields we need
                title = page['maintitle']         # title
                pub_time1 = page['publish_time']  # publish date
                pub_time2 = page['cwrq']          # written date
                pub_code = page['fwzh']           # document number
                href = page['pub_url']            # URL
                # check whether this URL has been collected before
                is_href = db_storage.find_one({'网址': href})
                if is_href:
                    num += 1
                    log.info('已采集----------跳过')
-                   time.sleep(0.5)
-                   continue
                try:
                    resp_href = requests.get(url=href, headers=headers_, verify=False)
                    resp_href.encoding = resp_href.apparent_encoding
                    i_html = resp_href.text
                    if '您访问的页面不存在或已删除' in i_html:
                        # log.error(f'{title}...{href}...页面不存在或已删除')
                        continue
                    i_soup = BeautifulSoup(i_html, 'html.parser')
                    i_soup = paserUrl(i_soup, href)
                    source = str(i_soup.find_all('tbody')[0])
                    pub_org = source.split('<td><b>发文机关:</b></td>')[1].split('<td>')[1].split('</td>')[0]  # issuing organ
                    child_type = source.split('<td class="w340 zcwj_ztfl">')[1].split('</td>')[0]  # topic classification
                    contentWithTag = i_soup.find('div', class_='wrap mxxgkwrap mxxgkwrap_gwywj').find('table', class_='border-table noneBorder pages_content')
                    # strip the "scan the QR code" block
                    contentWithTag.find('div', attrs={'id': 'div_div'}).decompose()
                    content = contentWithTag.text  # body text without tags
                    fu_jian_soup = contentWithTag.find_all('a')
                    time.sleep(0.5)
                    for file in fu_jian_soup:
                        try:
                            file_href = file['href']
                        except Exception as e:
                            log.info(f'---{href}--------{e}-------')
                            continue
                        if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
                                or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                            file_name = file.text.strip()
                            category = os.path.splitext(file_href)[1]
                            if category not in file_name:
                                file_name = file_name + category
                            retData = baseCore.uptoOBS(file_href, '1766', file_name)
                            if retData['state']:
                                pass
                            else:
                                continue
                            att_id, full_path = baseCore.tableUpdate(retData, '国务院文件', file_name, num, pub_time1)
                            id_list.append(att_id)
                            # todo: write the returned address back into the soup
                            file['href'] = full_path
                except:
                    log.error(f'{title}...{href}...获取内容失败')
                    continue
                # todo: after the replacement, upload the attachments to the file server
                time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                # todo: fields sent to Kafka
                dic_news = {
                    'attachmentIds': id_list,               # attachment ids
                    'author': '',                           # author
                    'content': content,                    # body without tags
                    'contentWithTag': str(contentWithTag),  # body with tags
                    'createDate': time_now,                 # creation time
                    'deleteFlag': 0,                         # delete flag (0 default, 1 deleted)
                    'id': '',
                    'labels': [{'relationId': "1766", 'relationName': "国务院文件", 'labelMark': "policy"}],  # related label id / name / mark
                    'origin': '',                            # publishing authority
                    'organ': pub_org,                        # issuing organ
                    'topicClassification': child_type,       # policy topic classification
                    'issuedNumber': pub_code,                # document number
                    'publishDate': pub_time1,                # publish date
                    'writtenDate': pub_time2,                # written date
                    'sid': '1697458829758697473',            # source id
                    'sourceAddress': href,                   # original link
                    'summary': '',                           # summary
                    'title': title                           # title
                }
                # print(dic_news)
                flag = sendKafka(dic_news)
                if flag:
                    save_data(dic_news)
                num += 1
        except:
            log.error(f'{pcodeJiguan}...第{pageNo}页获取列表失败')
            continue
    except:
        log.error(f'{pcodeJiguan}...获取总数失败')
        continue
    end_time = time.time()
-   log.info(f'共抓取国务院文件{num}条数据,共耗时{start_time - end_time}')
+   log.info(f'共抓取国务院文件{num}条数据,共耗时{end_time - start_time}')
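The attachment filter above repeats an `in file_href` substring test for a dozen extensions. As a minimal illustrative sketch (not part of the diff, and assuming the extension always appears as the link's suffix), the same decision could be expressed with `os.path.splitext` and a case-insensitive set:

import os

ATTACHMENT_EXTS = {'.pdf', '.doc', '.docx', '.xls', '.xlsx', '.zip', '.rar', '.ppt'}

def is_attachment(file_href: str) -> bool:
    # Hypothetical helper: true when the link ends with a known attachment extension.
    ext = os.path.splitext(file_href)[1].lower()
    return ext in ATTACHMENT_EXTS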
# State Council department documents
def get_content2():
...
...
@@ -355,114 +361,117 @@ def get_content2():
                   '国家知识产权局', '国家档案局', '国家保密局', '国家密码管理局', '国家宗教事务局', '国务院台湾事务办公室', '国家乡村振兴局', '国家电影局']
    for bmfl in result_list:
        # try:
        #     totalpage = getTotalpage(bmfl, headers, session)
        #     for pageNo in range(1, totalpage + 1):
        # for pageNo in range(1, 6):
        pageNo = 1
        try:
            try:
                content_list = getContentList(bmfl, pageNo, headers, session)
            except:
                session.close()
                content_list = getContentList(bmfl, pageNo, headers, session)
            for content_dict in content_list:
                id_list = []
                href = content_dict['url']        # detail page
                title = content_dict['title']     # title
                pub_code = content_dict['pcode']  # document number
                try:
                    pub_time = int(content_dict['pubtime'] / 1000)  # publish date
                    pub_time1 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(pub_time))
                except:
-                   pub_time1 = ''
+                   pub_time1 = None
                try:
                    p_time = int(content_dict['ptime'] / 1000)  # written date
                    pub_time2 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(p_time))
                except:
-                   pub_time2 = ''
+                   pub_time2 = None
                pub_org = content_dict['puborg']  # issuing organ
                try:
                    child_type = content_dict['childtype']  # topic classification
                except:
                    child_type = ''
                # # check whether this URL has been collected before
                is_href = db_storage.find_one({'网址': href})
                if is_href:
                    num += 1
                    log.info('已采集----------跳过')
-                   time.sleep(0.5)
-                   continue
                try:
                    resp = requests.get(url=href, headers=headers, verify=False)
                    resp.encoding = resp.apparent_encoding
                    resp_text = resp.text
                    soup = BeautifulSoup(resp_text, 'html.parser')
                    soup = paserUrl(soup, href)
                    time.sleep(0.5)
                    contentWithTag = soup.find('div', attrs={'class': 'pages_content mhide'})
                    content = contentWithTag.text
                    if content == '' or content == 'None':
                        log.info(f'----{href}---{title}---内容为空---')
                        continue
                    fu_jian_soup = contentWithTag.find_all('a')
                    for file in fu_jian_soup:
                        try:
                            file_href = file['href']
                        except Exception as e:
                            log.info(f'---{href}--------{e}-------')
                            continue
                        if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
                                or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                            file_name = file.text.strip()
                            category = os.path.splitext(file_href)[1]
                            if category not in file_name:
                                file_name = file_name + category
                            retData = baseCore.uptoOBS(file_href, '1699', file_name)
                            if retData['state']:
                                pass
                            else:
                                continue
                            att_id, full_path = baseCore.tableUpdate(retData, '国务院文件', file_name, num, pub_time1)
                            id_list.append(att_id)
                            # todo: write the returned address back into the soup
                            file['href'] = full_path
                except:
                    log.error(f'{title}...{href}获取内容失败')
                    continue
                time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                # todo: fields sent to Kafka
                dic_news = {
                    'attachmentIds': id_list,               # attachment ids
                    'author': '',                           # author
                    'content': content,                    # body without tags
                    'contentWithTag': str(contentWithTag),  # body with tags
                    'createDate': time_now,                 # creation time
                    'deleteFlag': 0,                         # delete flag (0 default, 1 deleted)
                    'id': '',
                    'labels': [{'relationId': "1699", 'relationName': "国务院各部委文件", 'labelMark': "policy"}],  # related label id / name / mark
                    'origin': '',                            # publishing authority
                    'organ': pub_org,                        # issuing organ
                    'topicClassification': child_type,       # policy topic classification
                    'issuedNumber': pub_code,                # document number
                    'publishDate': pub_time1,                # publish date
                    'writtenDate': pub_time2,                # written date
                    'sid': '1697458829758697473',            # source id
                    'sourceAddress': href,                   # original link
                    'summary': '',                           # summary
                    'title': title                           # title
                }
                # print(dic_news)
                flag = sendKafka(dic_news)
                if flag:
                    save_data(dic_news)
                    count += 1
                num += 1
        except:
            log.error(f'{bmfl}...第{pageNo}页获取信息列表失败')
            continue
        # except:
        #     log.error(f'{bmfl}...获取页数失败')
        #     continue
    end_time = time.time()
    log.info(f'共抓取国务院部门文件{count}条数据,耗时{end_time - start_time}')
...
...
@@ -553,7 +562,7 @@ def get_content3():
                'topicClassification': '',     # policy topic classification
                'issuedNumber': pub_hao,       # document number
                'publishDate': pub_time,       # publish date
-               'writtenDate': '',             # written date
+               'writtenDate': None,           # written date
                'sid': '1697458829758697473',  # source id
                'sourceAddress': href,         # original link
                'summary': '',                 # summary
...
...
@@ -744,7 +753,7 @@ def bei_jing():
                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                file_name = file.text.strip()
                category = os.path.splitext(file_href)[1]
                if category not in file_name:
                    file_name = file_name + category
                retData = baseCore.uptoOBS(file_href, '1667', pathType, file_name)
                if retData['state']:
...
...
@@ -870,7 +879,7 @@ def nei_meng_gu():
                fu_jian_re = str(real_href).split('/t')[0] + '/' + str(fu_jian_re).split('./')[1]
                fu_jian_href = fu_jian_re
                category = os.path.splitext(fu_jian_href)[1]
                if category not in title:
                    file_name = title + category
                # print(fu_jian_href)
                # todo: upload the attachment to the file server
...
...
@@ -918,7 +927,7 @@ def nei_meng_gu():
            pass
    end = time.time()
-   print('共', num, '条', '...........', '共耗时', end - start, '秒')
+   log.info('共', num, '条', '...........', '共耗时', end - start, '秒')


# 吉林
def ji_lin():
...
...
@@ -982,7 +991,7 @@ def ji_lin():
        # print(pub_come)
        i_content = soup.find(class_='zsy_comain')
        if i_content:
-           print(real_href)
+           # print(real_href)
            # strip the "scan the QR code" block
            try:
                soup.find('div', id='qr_container').decompose()
...
...
@@ -1020,7 +1029,7 @@ def ji_lin():
                    or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                file_name = fu_jian_href.text.strip()
                category = os.path.splitext(fu_jian_href)[1]
                if category not in file_name:
                    file_name = file_name + category
                # print(fu_jian_href)
                retData = baseCore.uptoOBS(fu_jian_href, '1670', pathType, file_name)
...
...
@@ -1065,7 +1074,7 @@ def ji_lin():
                    or '.XLS' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
                # print(fj_href)
                category = os.path.splitext(fj_href)[1]
                if category not in file_name:
                    file_name = file_name + category
                retData = baseCore.uptoOBS(fj_href, '1670', pathType, file_name)
                if retData['state']:
...
...
@@ -1104,7 +1113,7 @@ def ji_lin():
                'topicClassification': '',
                'issuedNumber': '',
                'publishDate': pub_time,
-               'writtenDate': '',
+               'writtenDate': None,
                'sid': '1697458829758697473',
                'sourceAddress': real_href,
                'summary': '',
...
...
@@ -1126,7 +1135,7 @@ def ji_lin():
        except:
            pass
    end = time.time()
-   print('共', count, '条', '...........', '共耗时', end - start, '秒')
+   log.info('共', count, '条', '...........', '共耗时', end - start, '秒')


# 上海
def shang_hai():
...
...
@@ -1219,7 +1228,7 @@ def shang_hai():
                    or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
                    or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                category = os.path.splitext(fu_jian_href)[1]
                if category not in file_name:
                    file_name = file_name + category
                retData = baseCore.uptoOBS(fu_jian_href, '1671', pathType, file_name)
                if retData['state']:
...
...
@@ -1252,7 +1261,7 @@ def shang_hai():
                'topicClassification': '',
                'issuedNumber': pub_hao,
                'publishDate': pub_time,
-               'writtenDate': '',
+               'writtenDate': None,
                'sid': '1697458829758697473',
                'sourceAddress': href,
                'summary': '',
...
...
@@ -1268,7 +1277,7 @@ def shang_hai():
        except:
            pass
    end = time.time()
-   print('共', count, '条', '...........', '共耗时', end - start, '秒')
+   log.info('共', count, '条', '...........', '共耗时', end - start, '秒')


# 浙江
def zhe_jiang():
...
...
@@ -1376,7 +1385,7 @@ def zhe_jiang():
                'topicClassification': '',
                'issuedNumber': pub_hao,
                'publishDate': pub_time,
-               'writtenDate': '',
+               'writtenDate': None,
                'sid': '1697458829758697473',
                'sourceAddress': href,
                'summary': '',
...
...
@@ -1393,7 +1402,7 @@ def zhe_jiang():
        except:
            pass
    end = time.time()
-   print('共', count, '条', '...........', '共耗时', end - start, '秒')
+   log.info('共', count, '条', '...........', '共耗时', end - start, '秒')


# 福建
def fu_jian():
...
...
@@ -1445,7 +1454,7 @@ def fu_jian():
        i_soup = BeautifulSoup(i_html, 'html.parser')
        real_href = href
        # real_href = 'http://gzw.fujian.gov.cn/zwgk/zcfg/201806/t20180619_3065065.htm'
-       print(real_href)
+       # print(real_href)
        is_href = db_storage.find_one({'网址': real_href})
        if is_href:
            num += 1
...
...
@@ -1460,7 +1469,7 @@ def fu_jian():
            content = baseCore.pdf_content(resp_content)
            contentwithtag = ''
            category = os.path.splitext(real_href)[1]
            if category not in title:
                file_name = title + category
            # upload the file to the file server
            retData = baseCore.uptoOBS(real_href, '1673', pathType, file_name)
...
...
@@ -1471,7 +1480,7 @@ def fu_jian():
            att_id, full_path = baseCore.tableUpdate(retData, '福建省国资委', file_name, num, '')
            id_list.append(att_id)
            pub_hao = ''
-           pub_time = ''
+           pub_time = None
            pub_source = ''
        else:
...
...
@@ -1508,7 +1517,7 @@ def fu_jian():
                    or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
                    or '.XLS' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
                category = os.path.splitext(fj_href)[1]
                if category not in file_name:
                    file_name = file_name + category
                print(fj_href)
                # after finding the attachment, upload it to the file server
...
...
@@ -1524,7 +1533,7 @@ def fu_jian():
        except:
            pub_source = ''
-           pub_time = ''
+           pub_time = None
        contentwithtag = i_soup.find('tabs tab_base_01 rules_con1')
        content = contentwithtag.text.strip()
        if content == '' or content == None:
...
...
@@ -1548,7 +1557,7 @@ def fu_jian():
                'topicClassification': '',
                'issuedNumber': pub_hao,
                'publishDate': pub_time,
-               'writtenDate': '',
+               'writtenDate': None,
                'sid': '1697458829758697473',
                'sourceAddress': real_href,
                'summary': '',
...
...
@@ -1566,7 +1575,7 @@ def fu_jian():
        except:
            pass
    end_time = time.time()
-   print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+   log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')


# 山东
def shan_dong():
...
...
@@ -1633,7 +1642,7 @@ def shan_dong():
        for h1 in h1_list:
            title = title + str(h1.text)
        title.strip().lstrip()
-       pub_time = ''
+       pub_time = None
        span_list = source.find_all('span')
        i = 0
        for span in span_list:
...
...
@@ -1683,7 +1692,7 @@ def shan_dong():
        except:
            pass
    end = time.time()
-   print('共', count, '条', '...........', '共耗时', end - start, '秒')
+   log.info('共', count, '条', '...........', '共耗时', end - start, '秒')


# 广东
def guang_dong():
...
...
@@ -1745,7 +1754,7 @@ def guang_dong():
                    or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
                    or '.xlsx' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
                category = os.path.splitext(fj_href)[1]
                if category not in file_name:
                    file_name = file_name + category
                # upload the attachment to the file server
                retData = baseCore.uptoOBS(fj_href, '1676', pathType, file_name)
...
...
@@ -1774,7 +1783,7 @@ def guang_dong():
                'topicClassification': '',
                'issuedNumber': '',
                'publishDate': pub_time,
-               'writtenDate': '',
+               'writtenDate': None,
                'sid': '1697458829758697473',
                'sourceAddress': href,
                'summary': '',
...
...
@@ -1792,7 +1801,7 @@ def guang_dong():
        except:
            pass
    end = time.time()
-   print('共', count, '条', '...........', '共耗时', end - start, '秒')
+   log.info('共', count, '条', '...........', '共耗时', end - start, '秒')


# 海南
def hai_nan():
...
...
@@ -1869,7 +1878,7 @@ def hai_nan():
                    or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
                    or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                category = os.path.splitext(fu_jian_href)[1]
                if category not in file_name:
                    file_name = file_name + category
                # upload to the file server
                retData = baseCore.uptoOBS(fu_jian_href, '1677', pathType, file_name)
...
...
@@ -1916,7 +1925,7 @@ def hai_nan():
                    or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
                    or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                category = os.path.splitext(fu_jian_href)[1]
                if category not in file_name:
                    file_name = file_name + category
                # print(f'----附件:{fu_jian_href}-----filename:{file_name}')
                # upload the attachment to the file server
...
...
@@ -1995,7 +2004,7 @@ def hai_nan():
        except:
            pass
    end_time = time.time()
-   print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+   log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')

    def hai_nan2():
        def hai_nan_sw(page_href):
...
...
@@ -2126,7 +2135,7 @@ def hai_nan():
            pub_source = ''
            pub_time = str(pub_result.text).split('来源:')[0].lstrip().strip()
            pub_hao = ''
-           writtenDate = ''
+           writtenDate = None
            contentWithTag = doc_href.find('div', attrs={'class': 'xxgk_content_content'})
            content = contentWithTag.text
            if content == '' or content == None:
...
...
@@ -2143,7 +2152,7 @@ def hai_nan():
                    or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
                    or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                category = os.path.splitext(fu_jian_href)[1]
                if category not in file_name:
                    file_name = file_name + category
                # upload to the file server
                retData = baseCore.uptoOBS(fu_jian_href, '1677', pathType, file_name)
...
...
@@ -2241,7 +2250,7 @@ def hai_nan():
            pub_time = str(pub_result.text).split('来源:')[0].lstrip().strip()
            pub_hao = ''
            pub_source = ''
-           writtenDate = ''
+           writtenDate = None
            contentWithTag = doc_href.find('div', attrs={'class': 'xxgk_content_content'})
            content = contentWithTag.text
            if content == '' or content == None:
...
...
@@ -2259,7 +2268,7 @@ def hai_nan():
                    or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
                    or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                category = os.path.splitext(fu_jian_href)[1]
                if category not in file_name:
                    file_name = file_name + category
                # upload to the file server
                retData = baseCore.uptoOBS(fu_jian_href, '1677', pathType, file_name)
...
...
@@ -2360,7 +2369,7 @@ def hai_nan():
                0].strip()
        except:
            pub_source = ''
-           pub_time = ''
+           pub_time = None
        pub_hao = ''
        contentWithTag = doc_href.find(class_='pages_content')
        content = contentWithTag.text
...
...
@@ -2383,7 +2392,7 @@ def hai_nan():
                'topicClassification': '',
                'issuedNumber': pub_hao,
                'publishDate': pub_time,
-               'writtenDate': '',
+               'writtenDate': None,
                'sid': '1697458829758697473',
                'sourceAddress': i_href,
                'summary': '',
...
...
@@ -2479,7 +2488,7 @@ def hai_nan():
        except:
            pass
    end_time = time.time()
-   print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+   log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
    start()
    hai_nan1()
...
...
@@ -2538,7 +2547,7 @@ def si_chuan():
                    or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
                    or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                category = os.path.splitext(fu_jian_href)[1]
                if category not in file_name:
                    file_name = file_name + category
                # upload the attachment to the file server
                retData = baseCore.uptoOBS(fu_jian_href, '1678', pathType, file_name)
...
...
@@ -2567,7 +2576,7 @@ def si_chuan():
                'topicClassification': '',
                'issuedNumber': '',
                'publishDate': pub_time,
-               'writtenDate': '',
+               'writtenDate': None,
                'sid': '1697458829758697473',
                'sourceAddress': href,
                'summary': '',
...
...
@@ -2585,7 +2594,7 @@ def si_chuan():
        except:
            pass
    end_time = time.time()
-   print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+   log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')


# 广西
def guang_xi():
...
...
@@ -2671,7 +2680,7 @@ def guang_xi():
                    or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
                    or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                category = os.path.splitext(fu_jian_href)[1]
                if category not in file_name:
                    file_name = file_name + category
                # upload the attachment to the file server
                retData = baseCore.uptoOBS(fu_jian_href, '1692', pathType, file_name)
...
...
@@ -2701,7 +2710,7 @@ def guang_xi():
                'topicClassification': '',
                'issuedNumber': pub_hao,
                'publishDate': pub_time,
-               'writtenDate': '',
+               'writtenDate': None,
                'sid': '1697458829758697473',
                'sourceAddress': href,
                'summary': '',
...
...
@@ -2718,7 +2727,7 @@ def guang_xi():
        except:
            pass
    end_time = time.time()
-   print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+   log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')


# 贵州
def gui_zhou():
...
...
@@ -2788,7 +2797,7 @@ def gui_zhou():
                    or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
                    or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                category = os.path.splitext(fu_jian_href)[1]
                if category not in file_name:
                    file_name = file_name + category
                # upload the attachment to the file server
                retData = baseCore.uptoOBS(fu_jian_href, '1694', pathType, file_name)
...
...
@@ -2818,7 +2827,7 @@ def gui_zhou():
                'topicClassification': '',
                'issuedNumber': pub_hao,
                'publishDate': pub_time,
-               'writtenDate': '',
+               'writtenDate': None,
                'sid': '1697458829758697473',
                'sourceAddress': href,
                'summary': '',
...
...
@@ -2836,7 +2845,7 @@ def gui_zhou():
        except:
            pass
    end_time = time.time()
-   print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+   log.info(f'共抓取{num}条数据,共耗时{end_time - start_time}')


# 云南
def yun_nan():
...
...
@@ -2870,7 +2879,7 @@ def yun_nan():
                continue
            try:
                fu_jian_href_list = []
-               print(href)
+               # print(href)
                if '.shtml' in href:
                    href_resp = requests.get(url=href, headers=headers, verify=False)
                    href_resp.encoding = href_resp.apparent_encoding
...
...
@@ -2901,7 +2910,7 @@ def yun_nan():
                    or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                try:
                    category = os.path.splitext(fu_jian_href)[1]
                    if category not in file_name:
                        file_name = file_name + category
                    # upload the attachment to the file server
                    retData = baseCore.uptoOBS(fu_jian_href, '1679', pathType, file_name)
...
...
@@ -2939,8 +2948,8 @@ def yun_nan():
                'organ': '',
                'topicClassification': '',
                'issuedNumber': pub_hao,
-               'publishDate': '',
-               'writtenDate': '',
+               'publishDate': None,
+               'writtenDate': None,
                'sid': '1697458829758697473',
                'sourceAddress': href,
                'summary': '',
...
...
@@ -2959,7 +2968,7 @@ def yun_nan():
        except:
            pass
    end_time = time.time()
-   print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+   log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')

    def yun_nan2():
        num = 0
...
...
@@ -3022,7 +3031,7 @@ def yun_nan():
                # print(fu_jian_href)
                try:
                    category = os.path.splitext(fu_jian_href)[1]
                    if category not in file_name:
                        file_name = file_name + category
                    # upload the attachment to the file server
                    retData = baseCore.uptoOBS(fu_jian_href, '1679', pathType, file_name)
...
...
@@ -3060,7 +3069,7 @@ def yun_nan():
                'topicClassification': '',
                'issuedNumber': pub_hao,
                'publishDate': pub_time,
-               'writtenDate': '',
+               'writtenDate': None,
                'sid': '1697458829758697473',
                'sourceAddress': href,
                'summary': '',
...
...
@@ -3079,7 +3088,7 @@ def yun_nan():
        except:
            pass
    end_time = time.time()
-   print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+   log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
    yun_nan1()
    yun_nan2()
...
...
@@ -3148,8 +3157,8 @@ def chong_qing():
        except:
            origin = ''
        topicClassification = ''
-       pub_time = ''
-       writtenDate = ''
+       pub_time = None
+       writtenDate = None
        pub_hao = ''
        contentWithTag = doc_href.find('div', class_='zwxl-content')
        content = contentWithTag.text
...
...
@@ -3169,7 +3178,7 @@ def chong_qing():
                    or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                try:
                    category = os.path.splitext(fu_jian_href)[1]
                    if category not in file_name:
                        file_name = file_name + category
                    # upload the attachment to the file server
                    retData = baseCore.uptoOBS(fu_jian_href, '1693', pathType, file_name)
...
...
@@ -3219,7 +3228,7 @@ def chong_qing():
        except:
            pass
    end_time = time.time()
-   print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+   log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')


# 天津
def tian_jin():
...
...
@@ -3282,7 +3291,7 @@ def tian_jin():
        rmtag2.remove()
        contentWithTag = doc_href('div[id="zoom"]')
        if len(writtenDate) < 1:
-           writtenDate = ''
+           writtenDate = None
        if len(publishDate) < 1:
            publishDate = doc_href('meta[name="PubDate"]').attr('content')
        soup = paserUrl(str(contentWithTag), href)
...
...
@@ -3298,7 +3307,7 @@ def tian_jin():
                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                file_name = file.text.strip()
                category = os.path.splitext(file_href)[1]
                if category not in file_name:
                    file_name = file_name + category
                retData = baseCore.uptoOBS(file_href, '1683', pathType, file_name)
                if retData['state']:
...
...
@@ -3351,7 +3360,7 @@ def tian_jin():
        except:
            pass
    end_time = time.time()
-   print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+   log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')

    def tian_jin2():
        """
...
...
@@ -3413,7 +3422,7 @@ def tian_jin():
        rmtag2.remove()
        contentWithTag = doc_href('div[id="zoom"]')
        if len(writtenDate) < 1:
-           writtenDate = ''
+           writtenDate = None
        if len(publishDate) < 1:
            publishDate = doc_href('meta[name="PubDate"]').attr('content')
        soup = paserUrl(str(contentWithTag), href)
...
...
@@ -3429,7 +3438,7 @@ def tian_jin():
                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                file_name = file.text.strip()
                category = os.path.splitext(file_href)[1]
                if category not in file_name:
                    file_name = file_name + category
                retData = baseCore.uptoOBS(file_href, '1683', pathType, file_name)
                if retData['state']:
...
...
@@ -3482,7 +3491,7 @@ def tian_jin():
        except:
            pass
    end_time = time.time()
-   print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+   log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')

    def tian_jin3():
        num = 0
...
...
@@ -3507,7 +3516,7 @@ def tian_jin():
            try:
                publishDate = li.find('div', attrs={'class': 'other'}).text
            except:
-               publishDate = ''
+               publishDate = None
            if 'http' not in href:
                if '../../../' in href:
                    href = href.replace('../../../', 'https://sasac.tj.gov.cn/')
...
...
@@ -3548,7 +3557,7 @@ def tian_jin():
        rmtag2.remove()
        contentWithTag = doc_href('div[id="zoom"]')
        if len(writtenDate) < 1:
-           writtenDate = ''
+           writtenDate = None
        if len(publishDate) < 1:
            publishDate = doc_href('meta[name="PubDate"]').attr('content')
        soup = paserUrl(str(contentWithTag), href)
...
...
@@ -3564,7 +3573,7 @@ def tian_jin():
                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                file_name = file.text.strip()
                category = os.path.splitext(file_href)[1]
                if category not in file_name:
                    file_name = file_name + category
                retData = baseCore.uptoOBS(file_href, '1683', pathType, file_name)
                if retData['state']:
...
...
@@ -3617,7 +3626,7 @@ def tian_jin():
        except:
            pass
    end_time = time.time()
-   print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+   log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
    tian_jin1()
    tian_jin2()
...
...
@@ -3673,7 +3682,7 @@ def xin_jiang():
                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                file_name = file.text.strip()
                category = os.path.splitext(file_href)[1]
                if category not in file_name:
                    file_name = file_name + category
                retData = baseCore.uptoOBS(file_href, '1682', pathType, file_name)
                if retData['state']:
...
...
@@ -3717,7 +3726,7 @@ def xin_jiang():
                'topicClassification': "",
                'issuedNumber': issuedNumber,
                'publishDate': publishDate,
-               'writtenDate': "",
+               'writtenDate': None,
                'sid': '1697458829758697473',
                'sourceAddress': href,
                'summary': '',
...
...
@@ -3734,7 +3743,7 @@ def xin_jiang():
        except:
            pass
    end_time = time.time()
-   print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+   log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')

    def xin_jiang_jsbt():
        num = 0
...
...
@@ -3780,7 +3789,7 @@ def xin_jiang():
                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                file_name = file.text.strip()
                category = os.path.splitext(file_href)[1]
                if category not in file_name:
                    file_name = file_name + category
                retData = baseCore.uptoOBS(file_href, '1682', pathType, file_name)
                if retData['state']:
...
...
@@ -3824,7 +3833,7 @@ def xin_jiang():
                'topicClassification': "",
                'issuedNumber': issuedNumber,
                'publishDate': publishDate,
-               'writtenDate': "",
+               'writtenDate': None,
                'sid': '1697458829758697473',
                'sourceAddress': href,
                'summary': '',
...
...
@@ -3843,7 +3852,7 @@ def xin_jiang():
        except:
            pass
    end_time = time.time()
-   print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+   log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
    xin_jiang1()
    xin_jiang_jsbt()
...
...
@@ -3881,7 +3890,7 @@ def shan_xi():
        try:
            if ".pdf" in href:
                content = ''
-               publishDate = ''
+               publishDate = None
                origin = ''
                fu_jian_soup = [href]
                contentWithTag = ''
...
...
@@ -3908,7 +3917,7 @@ def shan_xi():
                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                file_name = file.text.strip()
                category = os.path.splitext(file_href)[1]
                if category not in file_name:
                    file_name = file_name + category
                retData = baseCore.uptoOBS(file_href, '1684', pathType, file_name)
                if retData['state']:
...
...
@@ -3952,7 +3961,7 @@ def shan_xi():
                'topicClassification': "",
                'issuedNumber': issuedNumber,
                'publishDate': publishDate,
-               'writtenDate': "",
+               'writtenDate': None,
                'sid': '1697458829758697473',
                'sourceAddress': href,
                'summary': '',
...
...
@@ -3969,7 +3978,7 @@ def shan_xi():
        except:
            pass
    end_time = time.time()
-   print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+   log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')


# 辽宁
def liao_ning():
...
...
@@ -4028,7 +4037,7 @@ def liao_ning():
                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                file_name = file.text.strip()
                category = os.path.splitext(file_href)[1]
                if category not in file_name:
                    file_name = file_name + category
                retData = baseCore.uptoOBS(file_href, '1685', pathType, file_name)
                if retData['state']:
...
...
@@ -4071,7 +4080,7 @@ def liao_ning():
                'topicClassification': "",
                'issuedNumber': issuedNumber,
                'publishDate': publishDate,
-               'writtenDate': "",
+               'writtenDate': None,
                'sid': '1697458829758697473',
                'sourceAddress': href,
                'summary': '',
...
...
@@ -4088,7 +4097,7 @@ def liao_ning():
        except:
            pass
    end_time = time.time()
-   print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+   log.info(f'共抓取{num}条数据,共耗时{end_time - start_time}')


# 黑龙江
def hei_long_jiang():
...
...
@@ -4141,7 +4150,7 @@ def hei_long_jiang():
                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                file_name = file.text.strip()
                category = os.path.splitext(file_href)[1]
                if category not in file_name:
                    file_name = file_name + category
                retData = baseCore.uptoOBS(file_href, '1687', pathType, file_name)
                if retData['state']:
...
...
@@ -4174,7 +4183,7 @@ def hei_long_jiang():
                'topicClassification': '',
                'issuedNumber': pub_hao,
                'publishDate': publishDate,
-               'writtenDate': '',
+               'writtenDate': None,
                'sid': '1697458829758697473',
                'sourceAddress': href,
                'summary': '',
...
...
@@ -4193,7 +4202,7 @@ def hei_long_jiang():
        except:
            pass
    end_time = time.time()
-   print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+   log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')


# 江苏
def jiang_su():
...
...
@@ -4257,7 +4266,7 @@ def jiang_su():
                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                file_name = file.text.strip()
                category = os.path.splitext(file_href)[1]
                if category not in file_name:
                    file_name = file_name + category
                retData = baseCore.uptoOBS(file_href, '1687', pathType, file_name)
                if retData['state']:
...
...
@@ -4314,7 +4323,7 @@ def jiang_su():
        except:
            pass
    end_time = time.time()
-   print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+   log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')


# 安徽
def an_hui():
...
...
@@ -4368,7 +4377,7 @@ def an_hui():
                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                file_name = file.text.strip()
                category = os.path.splitext(file_href)[1]
                if category not in file_name:
                    file_name = file_name + category
                retData = baseCore.uptoOBS(file_href, '1688', pathType, file_name)
                if retData['state']:
...
...
@@ -4418,7 +4427,7 @@ def an_hui():
        except:
            pass
    end_time = time.time()
-   print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+   log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')

    def an_hui2():
        num = 0
...
...
@@ -4472,7 +4481,7 @@ def an_hui():
                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                file_name = file.text.strip()
                category = os.path.splitext(file_href)[1]
                if category not in file_name:
                    file_name = file_name + category
                retData = baseCore.uptoOBS(file_href, '1688', pathType, file_name)
                if retData['state']:
...
...
@@ -4524,7 +4533,7 @@ def an_hui():
        except:
            pass
    end_time = time.time()
-   print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+   log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
    an_hui1()
    an_hui2()
...
...
@@ -4607,7 +4616,7 @@ def jiang_xi():
                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                file_name = file.text.strip()
                category = os.path.splitext(file_href)[1]
                if category not in file_name:
                    file_name = file_name + category
                retData = baseCore.uptoOBS(file_href, '1689', pathType, file_name)
                if retData['state']:
...
...
@@ -4647,7 +4656,7 @@ def jiang_xi():
                'organ': organ,
                'topicClassification': topicClassification,
                'issuedNumber': pub_hao,
-               'publishDate': '',
+               'publishDate': None,
                'writtenDate': writtenDate,
                'sid': '1697458829758697473',
                'sourceAddress': href,
...
...
@@ -4665,7 +4674,7 @@ def jiang_xi():
        except:
            pass
    end_time = time.time()
-   print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+   log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')


# 河南
def he_nan():
...
...
@@ -4711,7 +4720,7 @@ def he_nan():
                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                file_name = file.text.strip()
                category = os.path.splitext(file_href)[1]
                if category not in file_name:
                    file_name = file_name + category
                retData = baseCore.uptoOBS(file_href, '1690', pathType, file_name)
                if retData['state']:
...
...
@@ -4750,7 +4759,7 @@ def he_nan():
                'topicClassification': '',
                'issuedNumber': issuedNumber,
                'publishDate': publishDate,
-               'writtenDate': '',
+               'writtenDate': None,
                'sid': '1697458829758697473',
                'sourceAddress': href,
                'summary': '',
...
...
@@ -4767,7 +4776,7 @@ def he_nan():
        except:
            pass
    end_time = time.time()
-   print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+   log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')


# 湖南
def hu_nan():
...
...
@@ -4828,7 +4837,7 @@ def hu_nan():
                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                file_name = file.text.strip()
                category = os.path.splitext(file_href)[1]
                if category not in file_name:
                    file_name = file_name + category
                retData = baseCore.uptoOBS(file_href, '1691', pathType, file_name)
                if retData['state']:
...
...
@@ -4878,7 +4887,7 @@ def hu_nan():
        except:
            pass
    end_time = time.time()
-   print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+   log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')


# 甘肃
def gan_su():
...
...
@@ -4963,7 +4972,7 @@ def gan_su():
                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                file_name = file.text.strip()
                category = os.path.splitext(file_href)[1]
                if category not in file_name:
                    file_name = file_name + category
                retData = baseCore.uptoOBS(file_href, '1696', file_name)
                if retData['state']:
...
...
@@ -5015,7 +5024,7 @@ def gan_su():
            pass
    bro.quit()
    end_time = time.time()
-   print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+   log.info(f'共抓取{num}条数据,共耗时{end_time - start_time}')

    def gan_su2():
        num = 0
...
...
@@ -5097,7 +5106,7 @@ def gan_su():
        origin = doc('div[class="links_tab"]>table>tbody>tr:nth-child(2)>td:nth-child(2)').text()
        pub_hao = doc('div[class="links_tab"]>table>tbody>tr:nth-child(5)>td:nth-child(2)').text()
        contentWithTag = doc('div[id="content"]')
-       print(title)
+       # print(title)
        soup = paserUrl(str(contentWithTag), href)
        try:
...
...
@@ -5119,7 +5128,7 @@ def gan_su():
                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                file_name = file.text.strip()
                category = os.path.splitext(file_href)[1]
                if category not in file_name:
                    file_name = file_name + category
                log.info(f'{file_name}---{href}--')
                retData = baseCore.uptoOBS(file_href, '1696', file_name)
...
...
@@ -5176,7 +5185,7 @@ def gan_su():
            pass
    bro.quit()
    end_time = time.time()
-   print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+   log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')

    def gan_su3():
        num = 0
...
...
@@ -5260,13 +5269,13 @@ def gan_su():
        origin = doc('div[class="links_tab"]>table>tbody>tr:nth-child(2)>td:nth-child(2)').text()
        pub_hao = doc('div[class="links_tab"]>table>tbody>tr:nth-child(5)>td:nth-child(2)').text()
        contentWithTag = doc('div[id="content"]')
-       print(title)
+       # print(title)
        if len(title) == 0 or contentWithTag.text() == '':
            title = doc('div[class="main"]>h1').text().lstrip().strip()
            writtenDate = doc('div[class="main"]>div[class="clearbox"]>p:nth-child(1)').text().split('日期:')[0].split(' ')[0].lstrip().strip()
            origin = doc('div[class="main"]>div[class="clearbox"]>p:nth-child(1)').text().split('来源:')[0].lstrip().strip()
            contentWithTag = doc('div[class="detailContent"]')
-       print(title)
+       # print(title)
        soup = paserUrl(str(contentWithTag), href)
        try:
...
...
@@ -5288,7 +5297,7 @@ def gan_su():
                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                file_name = file.text.strip()
                category = os.path.splitext(file_href)[1]
                if category not in file_name:
                    file_name = file_name + category
                retData = baseCore.uptoOBS(file_href, '1696', file_name)
                if retData['state']:
...
...
@@ -5304,7 +5313,7 @@ def gan_su():
        content = soup.text
        if content == '' or content == None:
            log.info(f'-----{href}----{title}----内容为空-----')
-           print(bro.page_source)
+           # print(bro.page_source)
            continue
        if len(content) < 2:
            continue
...
...
@@ -5345,7 +5354,7 @@ def gan_su():
            pass
    bro.quit()
    end_time = time.time()
-   print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+   log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
    gan_su1()
    gan_su2()
...
...
@@ -5401,7 +5410,7 @@ def ning_xia():
                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                file_name = file.text.strip()
                category = os.path.splitext(file_href)[1]
                if category not in file_name:
                    file_name = file_name + category
                retData = baseCore.uptoOBS(file_href, '1697', pathType, file_name)
                if retData['state']:
...
...
@@ -5453,7 +5462,7 @@ def ning_xia():
        except:
            pass
    end_time = time.time()
-   print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+   log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')


# 陕西
def shanxi():
...
...
@@ -5511,7 +5520,7 @@ def shanxi():
                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                file_name = file.text.strip()
                category = os.path.splitext(file_href)[1]
                if category not in file_name:
                    file_name = file_name + category
                retData = baseCore.uptoOBS(file_href, '1680', pathType, file_name)
                if retData['state']:
...
...
@@ -5544,7 +5553,7 @@ def shanxi():
                'topicClassification': "",
                'issuedNumber': "",
                'publishDate': publishDate,
-               'writtenDate': "",
+               'writtenDate': None,
                'sid': '1697458829758697473',
                'sourceAddress': href,
                'summary': '',
...
...
@@ -5563,7 +5572,7 @@ def shanxi():
        except:
            pass
    end_time = time.time()
-   print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+   log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')


# 西藏
def xi_zang():
...
...
@@ -5617,7 +5626,7 @@ def xi_zang():
                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                file_name = file.text.strip()
                category = os.path.splitext(file_href)[1]
                if category not in file_name:
                    file_name = file_name + category
                retData = baseCore.uptoOBS(file_href, '1695', pathType, file_name)
                if retData['state']:
...
...
@@ -5647,7 +5656,7 @@ def xi_zang():
                'topicClassification': "",
                'issuedNumber': "",
                'publishDate': publishDate,
-               'writtenDate': "",
+               'writtenDate': None,
                'sid': '1697458829758697473',
                'sourceAddress': href,
                'summary': '',
...
...
@@ -5664,7 +5673,7 @@ def xi_zang():
        except:
            pass
    end_time = time.time()
-   print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+   log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')


# 青海
def qing_hai():
...
...
@@ -5722,7 +5731,7 @@ def qing_hai():
                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                file_name = file.text.strip()
                category = os.path.splitext(file_href)[1]
                if category not in file_name:
                    file_name = file_name + category
                retData = baseCore.uptoOBS(file_href, '1681', pathType, file_name)
                if retData['state']:
...
...
@@ -5771,7 +5780,7 @@ def qing_hai():
        except:
            pass
    end_time = time.time()
-   print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+   log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')

    def qing_hai2():
        num = 0
...
...
@@ -5849,7 +5858,7 @@ def qing_hai():
                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                file_name = file.text.strip()
                category = os.path.splitext(file_href)[1]
                if category not in file_name:
                    file_name = file_name + category
                retData = baseCore.uptoOBS(file_href, '1681', pathType, file_name)
                if retData['state']:
...
...
@@ -5899,7 +5908,7 @@ def qing_hai():
        except:
            pass
    end_time = time.time()
-   print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+   log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
    qing_hai1()
    qing_hai2()
...
...
@@ -5943,7 +5952,7 @@ def he_bei():
                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                file_name = file.text.strip()
                category = os.path.splitext(file_href)[1]
                if category not in file_name:
                    file_name = file_name + category
                retData = baseCore.uptoOBS(file_href, '1668', pathType, file_name)
                if retData['state']:
...
...
@@ -5987,7 +5996,7 @@ def he_bei():
                'topicClassification': "",
                'issuedNumber': issuedNumber,
                'publishDate': publishDate,
-               'writtenDate': "",
+               'writtenDate': None,
                'sid': '1697458829758697473',
                'sourceAddress': href,
                'summary': '',
...
...
@@ -6002,7 +6011,7 @@ def he_bei():
        except:
            pass
    end_time = time.time()
-   print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+   log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')


# 湖北
def hu_bei():
...
...
@@ -6068,7 +6077,7 @@ def hu_bei():
                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                file_name = file.text.strip()
                category = os.path.splitext(file_href)[1]
                if category not in file_name:
                    file_name = file_name + category
                retData = baseCore.uptoOBS(file_href, '1675', pathType, file_name)
                if retData['state']:
...
...
@@ -6120,44 +6129,45 @@ def hu_bei():
        pass
    driver.close()
    end_time = time.time()
-   print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+   log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')


if __name__ == '__main__':
-   # get_content1()
-   # get_content2()
-   # get_content3()
-   # bei_jing()
-   # nei_meng_gu()
-   # ji_lin()
-   # shang_hai()
-   # zhe_jiang()
-   # fu_jian()
-   # shan_dong()
-   # guang_dong()
-   # hai_nan()
-   # si_chuan()
-   # guang_xi()
-   # gui_zhou()
-   # yun_nan()
-   # chong_qing()
-   # tian_jin()
-   # xin_jiang()
-   # shan_xi()
-   # liao_ning()
-   # hei_long_jiang()
-   # jiang_su()
-   # an_hui()
-   # jiang_xi()
-   # he_nan()
-   # hu_nan()
+   get_content1()
+   get_content2()
+   get_content3()
+   bei_jing()
+   nei_meng_gu()
+   ji_lin()
+   shang_hai()
+   zhe_jiang()
+   fu_jian()
+   shan_dong()
+   guang_dong()
+   hai_nan()
+   si_chuan()
+   guang_xi()
+   gui_zhou()
+   yun_nan()
+   chong_qing()
+   tian_jin()
+   xin_jiang()
+   shan_xi()
+   liao_ning()
+   hei_long_jiang()
+   jiang_su()
+   an_hui()
+   jiang_xi()
+   he_nan()
+   hu_nan()
    gan_su()
-   # ning_xia()
-   # xi_zang()
-   # shanxi()
-   # qing_hai()
-   # he_bei()
-   # qing_hai()
-   # current_time = datetime.datetime.now()
-   # midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
-   # sleep_seconds = (midnight_time - current_time).total_seconds()
-   # time.sleep(sleep_seconds)
+   ning_xia()
+   xi_zang()
+   shanxi()
+   qing_hai()
+   he_bei()
+   qing_hai()
+   current_time = datetime.datetime.now()
+   midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
+   sleep_seconds = (midnight_time - current_time).total_seconds()
+   time.sleep(sleep_seconds)
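After this change the main block runs every spider once, then sleeps until midnight and exits. A minimal sketch of how a true daily loop could be wrapped around it (hypothetical, not part of the diff; `run_all_spiders` is a stand-in for the sequence of calls above):

    # Hypothetical wrapper: repeat the whole pass once per day instead of exiting after one sleep.
    while True:
        run_all_spiders()  # stand-in for the get_content*/province calls above
        now = datetime.datetime.now()
        midnight = now.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
        time.sleep((midnight - now).total_seconds())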
comData/policylaw/tingtype.py — view file @ a5ca358d
import datetime
import json
import random
import time
from urllib.parse import urljoin

import pymongo
from kafka import KafkaProducer
from tqdm import tqdm
...
...
@@ -12,15 +11,31 @@ import pymysql
import requests
from bs4 import BeautifulSoup
import urllib3
-from base.BaseCore import BaseCore
from lxml import etree
+from BaseCore import BaseCore

baseCore = BaseCore()
log = baseCore.getLogger()
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
cnx = baseCore.cnx
cursor = baseCore.cursor
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin',
                                 password='zzsn@9988').caiji['国务院_国资委_copy1']

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Cookie': 'Hm_lvt_fa835457efbc11dfb88752e70521d23b=1690184499; Hm_lpvt_fa835457efbc11dfb88752e70521d23b=1690184499; SF_cookie_1=98184645; Hm_lvt_2b5618a441c142a90e1a75f4b226c252=1690189470; Hm_lpvt_2b5618a441c142a90e1a75f4b226c252=1690189470; zh_choose=n; wdcid=30ffdae06d11dbde; wdlast=1690189470; wdses=13ee59561f2fb725',
    'Host': 'www.sasac.gov.cn',
    'Pragma': 'no-cache',
    'Referer': 'https://www.baidu.com/link?url=CcQEFfXAeQsxu1IlLlxj8WHugAcJ7sBjOBqvZYDfN7WE6OZpSUM4prK6DiADOqTP&wd=&eqid=d507a037000987780000000364be37d4',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
}


def paserUrl(html, listurl):
    # soup = BeautifulSoup(html, 'html.parser')
    # collect all <a> and <img> tags
    links = html.find_all(['a', 'img'])
...
...
@@ -36,18 +51,19 @@ def paserUrl(html,listurl):
def
save_data
(
dic_news
):
aaa_dic
=
{
'附件id'
:
dic_news
[
'attachmentIds'
],
'网址'
:
dic_news
[
'sourceAddress'
],
'tid'
:
dic_news
[
'labels'
][
0
][
'relationId'
],
'来源'
:
dic_news
[
'labels'
][
0
][
'relationName'
],
'创建时间'
:
dic_news
[
'createDate'
],
'附件id'
:
dic_news
[
'attachmentIds'
],
'网址'
:
dic_news
[
'sourceAddress'
],
'tid'
:
dic_news
[
'labels'
][
0
][
'relationId'
],
'来源'
:
dic_news
[
'labels'
][
0
][
'relationName'
],
'创建时间'
:
dic_news
[
'createDate'
],
'带标签内容'
:
dic_news
[
'contentWithTag'
][:
100
]
}
db_storage
.
insert_one
(
aaa_dic
)
def
sendKafka
(
dic_news
):
start_time
=
time
.
time
()
try
:
#
114.116.116.241
try
:
#
114.116.116.241
producer
=
KafkaProducer
(
bootstrap_servers
=
[
'114.115.159.144:9092'
])
kafka_result
=
producer
.
send
(
"policy"
,
json
.
dumps
(
dic_news
,
ensure_ascii
=
False
)
.
encode
(
'utf8'
))
...
...
@@ -78,215 +94,233 @@ def sendKafka(dic_news):
state
=
0
takeTime
=
baseCore
.
getTimeCost
(
start_time
,
time
.
time
())
def
work
(
href_type
,
ting_type
,
relationId
):
ip
=
baseCore
.
get_proxy
()
log
.
info
(
f
'
\n
================厅局类别==={ting_type}========================'
)
if
'http'
in
href_type
:
url_type
=
href_type
else
:
url_type
=
'http://www.sasac.gov.cn/'
+
href_type
.
replace
(
'../'
,
''
)
# print(url_type)
i_res
=
requests
.
get
(
url
=
url_type
,
headers
=
headers
,
proxies
=
ip
)
i_soup
=
BeautifulSoup
(
i_res
.
content
,
'html.parser'
)
time
.
sleep
(
2
)
news_list
=
i_soup
.
find
(
'div'
,
class_
=
'tjywBottom'
)
.
find_all
(
'li'
)
# 文章列表
# print('================新闻列表==================')
for
news
in
tqdm
(
news_list
):
try
:
news_href
=
news
.
find
(
'a'
)[
'href'
]
except
:
continue
if
'http'
in
news_href
:
news_url
=
news_href
else
:
news_url
=
'http://www.sasac.gov.cn/'
+
news_href
.
replace
(
'../'
,
''
)
# 判断是否已经爬取过
is_href
=
db_storage
.
find_one
({
'网址'
:
news_url
})
if
is_href
:
log
.
info
(
'已采集----------跳过'
)
continue
news_title
=
news
.
find
(
'a'
)
.
text
.
split
(
'['
)[
0
]
log
.
info
(
f
'
\n
----正在采集: {news_title}-------'
)
pub_time
=
news
.
find
(
'span'
)
.
text
.
replace
(
'['
,
''
)
.
replace
(
']'
,
''
)
# 文章信息
header
=
{
'Accept'
:
'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7'
,
'Accept-Encoding'
:
'gzip, deflate'
,
'Accept-Language'
:
'zh-CN,zh;q=0.9'
,
'Cache-Control'
:
'no-cache'
,
'Cookie'
:
'wdcid=30ffdae06d11dbde; __jsluid_h=e623973ba12a5f48b086f8c5cee6fffa; SF_cookie_1=67313298; Hm_lvt_fa835457efbc11dfb88752e70521d23b=1693808034; zh_choose=n; Hm_lvt_2b5618a441c142a90e1a75f4b226c252=1694078708; wdses=381c6ab86ce01570; wdlast=1694163647; Hm_lpvt_fa835457efbc11dfb88752e70521d23b=1694163647; Hm_lpvt_2b5618a441c142a90e1a75f4b226c252=1694165617'
,
'Host'
:
'www.sasac.gov.cn'
,
'Pragma'
:
'no-cache'
,
'Proxy-Connection'
:
'keep-alive'
,
'Referer'
:
'http://www.sasac.gov.cn/n2588020/n2588072/n2590818/n2590820/c28651762/content.html'
,
'Upgrade-Insecure-Requests'
:
'1'
,
'User-Agent'
:
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
}
# news_url = 'http://www.sasac.gov.cn/n2588020/n2588072/n2590818/n2590820/c28102228/content.html'
ii_res
=
requests
.
get
(
url
=
news_url
,
headers
=
header
,
proxies
=
ip
)
ii_soup
=
BeautifulSoup
(
ii_res
.
content
,
'html.parser'
)
# todo:相对路径转化为绝对路径
ii_soup
=
paserUrl
(
ii_soup
,
news_url
)
# 去掉扫一扫
try
:
ii_soup
.
find
(
'div'
,
id
=
'qr_container'
)
.
decompose
()
except
:
pass
# 去掉style标签
for
styleTag
in
ii_soup
.
find_all
(
'style'
):
styleTag
.
extract
()
time
.
sleep
(
2
)
try
:
news_info
=
ii_soup
.
find
(
'div'
,
class_
=
'zsy_cotitle'
)
except
Exception
as
e
:
log
.
error
(
e
)
news_info
=
''
if
news_info
:
# 国资委_内设机构
def
gzw_nsjg
():
# 获取页面数据
def
get_page_nsjg
(
href
,
ting_type
,
relationId
,
page
):
start_time
=
time
.
time
()
num
=
0
for
pageNo
in
range
(
1
,
page
+
1
):
if
pageNo
!=
1
:
href
=
href
.
replace
(
f
'_{pageNo - 1}.html'
,
f
'_{pageNo}.html'
)
if
pageNo
==
page
:
tag
=
href
.
split
(
'/'
)[
-
1
]
href
=
href
.
replace
(
tag
,
'index.html'
)
try
:
# origin
pub_source
=
news_info
.
find
(
'p'
)
.
text
.
split
(
'文章来源:'
)[
1
]
.
split
(
'发布时间'
)[
0
]
.
strip
()
req
=
requests
.
get
(
url
=
href
,
headers
=
headers
,
verify
=
False
)
req_text
=
req
.
text
.
encode
(
"ISO-8859-1"
)
req_text
=
req_text
.
decode
(
"utf-8"
)
soup
=
BeautifulSoup
(
req_text
,
'html.parser'
)
soup
=
paserUrl
(
soup
,
href
)
li_list
=
soup
.
find
(
'ul'
,
attrs
=
{
'class'
:
'ld-tjywList'
})
.
find_all
(
'li'
)
except
:
pub_source
=
''
try
:
contentWithTag
=
ii_soup
.
find
(
'div'
,
'zsy_comain'
)
content
=
contentWithTag
.
text
.
strip
()
except
:
content
=
''
contentWithTag
=
''
if
len
(
content
)
>
100
:
pass
else
:
continue
time_now
=
time
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
,
time
.
localtime
())
dic_news
=
{
'attachmentIds'
:
[],
'author'
:
''
,
'content'
:
content
,
'contentWithTag'
:
str
(
contentWithTag
),
'createDate'
:
time_now
,
'deleteFlag'
:
0
,
'id'
:
''
,
'labels'
:
[{
'relationId'
:
relationId
,
'relationName'
:
ting_type
,
'labelMark'
:
"policy"
}],
'origin'
:
pub_source
,
'organ'
:
''
,
'topicClassification'
:
''
,
'issuedNumber'
:
''
,
'publishDate'
:
pub_time
,
'writtenDate'
:
''
,
'sid'
:
'1697458829758697473'
,
'sourceAddress'
:
news_url
,
'summary'
:
''
,
'title'
:
news_title
}
sendKafka
(
dic_news
)
save_data
(
dic_news
)
log
.
info
(
f
'{ting_type}-----{news_title}----发送成功'
,
)
else
:
dic_error
=
{
'标题'
:
news_title
,
'原文链接'
:
news_url
,
'厅局类别'
:
ting_type
}
log
.
error
(
dic_error
)
req
=
requests
.
get
(
url
=
href
,
headers
=
headers
,
verify
=
False
)
req_text
=
req
.
text
.
encode
(
"ISO-8859-1"
)
req_text
=
req_text
.
decode
(
"utf-8"
)
soup
=
BeautifulSoup
(
req_text
,
'html.parser'
)
soup
=
paserUrl
(
soup
,
href
)
li_list
=
soup
.
find_all
(
'li'
)
for
li
in
li_list
:
try
:
real_href
=
li
.
find
(
'a'
)
.
get
(
'href'
)
except
:
continue
is_href
=
db_storage
.
find_one
({
'网址'
:
real_href
})
if
is_href
:
log
.
info
(
'已采集----------跳过'
)
continue
try
:
try
:
try
:
req_
=
requests
.
get
(
url
=
real_href
,
headers
=
headers
,
verify
=
False
)
req_
.
encoding
=
req_
.
apparent_encoding
soup_
=
BeautifulSoup
(
req_
.
text
,
'html.parser'
)
div_content
=
soup_
.
find
(
'div'
,
attrs
=
{
'class'
:
'zsy_content'
})
pub_result
=
div_content
.
find
(
'div'
,
attrs
=
{
'class'
:
'zsy_cotitle'
})
try
:
title
=
str
(
pub_result
.
text
)
.
split
(
'文章来源:'
)[
0
]
.
replace
(
'
\n
'
,
''
)
.
replace
(
'
\r
'
,
''
)
.
lstrip
()
.
strip
()
publishDate
=
str
(
pub_result
.
text
)
.
split
(
'发布时间:'
)[
1
]
.
strip
()
.
lstrip
()
pub_source
=
str
(
pub_result
.
text
)
.
split
(
'文章来源:'
)[
1
]
.
split
(
'发布时间:'
)[
0
]
.
lstrip
()
.
strip
()
except
:
title
=
str
(
pub_result
.
text
)
.
split
(
'发布时间:'
)[
0
]
.
replace
(
'
\n
'
,
''
)
.
replace
(
'
\r
'
,
''
)
.
lstrip
()
.
strip
()
publishDate
=
str
(
pub_result
.
text
)
.
split
(
'发布时间:'
)[
1
]
.
strip
()
.
lstrip
()
except
:
req_
=
requests
.
get
(
url
=
real_href
,
headers
=
headers
,
verify
=
False
)
req_
.
encoding
=
req_
.
apparent_encoding
soup_
=
BeautifulSoup
(
req_
.
text
,
'html.parser'
)
pub_result
=
soup_
.
find
(
'div'
,
attrs
=
{
'class'
:
'zsy_cotitle'
})
real_href
=
str
(
pub_result
.
text
)
.
split
(
'location.href="'
)[
1
]
.
split
(
'";'
)[
0
]
.
lstrip
()
.
strip
()
req_
.
close
()
req_
=
requests
.
get
(
url
=
real_href
,
headers
=
headers
,
verify
=
False
)
req_
.
encoding
=
req_
.
apparent_encoding
soup_
=
BeautifulSoup
(
req_
.
text
,
'html.parser'
)
div_content
=
soup_
.
find
(
'div'
,
attrs
=
{
'class'
:
'zsy_content'
})
pub_result
=
div_content
.
find
(
'div'
,
attrs
=
{
'class'
:
'zsy_cotitle'
})
try
:
title
=
str
(
pub_result
.
text
)
.
split
(
'文章来源:'
)[
0
]
.
replace
(
'
\n
'
,
''
)
.
replace
(
'
\r
'
,
''
)
.
lstrip
()
.
strip
()
publishDate
=
str
(
pub_result
.
text
)
.
split
(
'发布时间:'
)[
1
]
.
strip
()
.
lstrip
()
pub_source
=
str
(
pub_result
.
text
)
.
split
(
'文章来源:'
)[
1
]
.
split
(
'发布时间:'
)[
0
]
.
lstrip
()
.
strip
()
except
:
title
=
str
(
pub_result
.
text
)
.
split
(
'发布时间:'
)[
0
]
.
replace
(
'
\n
'
,
''
)
.
replace
(
'
\r
'
,
''
)
.
lstrip
()
.
strip
()
publishDate
=
str
(
pub_result
.
text
)
.
split
(
'发布时间:'
)[
1
]
.
strip
()
.
lstrip
()
req_
.
close
()
except
:
req_
=
requests
.
get
(
url
=
real_href
,
headers
=
headers
,
verify
=
False
)
req_
.
encoding
=
req_
.
apparent_encoding
soup_
=
BeautifulSoup
(
req_
.
text
,
'html.parser'
)
yaoqiu_list
=
soup_
.
find
(
'div'
,
attrs
=
{
'class'
:
'yaoqiu_list'
})
li_list_
=
yaoqiu_list
.
find_all
(
'li'
)
for
li_
in
li_list_
:
href_
=
li_
.
find
(
'a'
)
.
get
(
'href'
)
real_href
=
href_
.
replace
(
'../../../'
,
'http://www.sasac.gov.cn/'
)
req_
=
requests
.
get
(
url
=
real_href
,
headers
=
headers
,
verify
=
False
)
req_
.
encoding
=
req_
.
apparent_encoding
soup_
=
BeautifulSoup
(
req_
.
text
,
'html.parser'
)
div_content
=
soup_
.
find
(
'div'
,
attrs
=
{
'class'
:
'zsy_content'
})
pub_result
=
div_content
.
find
(
'div'
,
attrs
=
{
'class'
:
'zsy_cotitle'
})
try
:
title
=
str
(
pub_result
.
text
)
.
split
(
'文章来源:'
)[
0
]
.
replace
(
'
\n
'
,
''
)
.
replace
(
'
\r
'
,
''
)
.
lstrip
()
.
strip
()
publishDate
=
str
(
pub_result
.
text
)
.
split
(
'发布时间:'
)[
1
]
.
strip
()
.
lstrip
()
pub_source
=
str
(
pub_result
.
text
)
.
split
(
'文章来源:'
)[
1
]
.
split
(
'发布时间:'
)[
0
]
.
lstrip
()
.
strip
()
except
:
title
=
str
(
pub_result
.
text
)
.
split
(
'发布时间:'
)[
0
]
.
replace
(
'
\n
'
,
''
)
.
replace
(
'
\r
'
,
''
)
.
lstrip
()
.
strip
()
publishDate
=
str
(
pub_result
.
text
)
.
split
(
'发布时间:'
)[
1
]
.
strip
()
.
lstrip
()
pub_source
=
''
if
'location.href'
in
title
:
continue
if
'404 Ba'
in
str
(
div_content
):
continue
contentWithTag
=
div_content
.
find
(
'div'
,
class_
=
'zsy_comain'
)
try
:
contentWithTag
.
find
(
'div'
,
id
=
'qr_container'
)
.
decompose
()
except
:
pass
# 去掉style标签
for
styleTag
in
contentWithTag
.
find_all
(
'style'
):
styleTag
.
extract
()
content
=
contentWithTag
.
text
if
content
==
''
:
log
.
error
(
f
'{real_href}===获取正文失败'
)
continue
time_now
=
time
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
,
time
.
localtime
())
dic_news
=
{
'attachmentIds'
:
[],
'author'
:
''
,
'content'
:
content
,
'contentWithTag'
:
str
(
contentWithTag
),
'createDate'
:
time_now
,
'deleteFlag'
:
0
,
'id'
:
''
,
'labels'
:
[{
'relationId'
:
relationId
,
'relationName'
:
ting_type
,
'labelMark'
:
"policy"
}],
'origin'
:
pub_source
,
'organ'
:
''
,
'topicClassification'
:
''
,
'issuedNumber'
:
''
,
'publishDate'
:
publishDate
,
'writtenDate'
:
None
,
'sid'
:
'1697458829758697473'
,
'sourceAddress'
:
real_href
,
'summary'
:
''
,
'title'
:
title
}
#print(content)
#print(contentWithTag)
sendKafka
(
dic_news
)
save_data
(
dic_news
)
log
.
info
(
f
'{ting_type}-----{title}----发送成功'
,
)
num
+=
1
except
Exception
as
e
:
pass
req
.
close
()
end_time
=
time
.
time
()
print
(
f
'抓取{num}条数据,共耗时{end_time - start_time}'
)
# 获取页面列表
def
get_page_nsjg_list
(
href
,
institution
,
tid
):
href_list
=
{
'办公厅(党委办公厅)'
:
[
'http://www.sasac.gov.cn/n2588020/n2588072/n2590818/n2590820/index_2642999_1.html'
,
9
],
'综合研究局'
:
[
'http://www.sasac.gov.cn/n2588020/n2588072/n2591482/n2591484/index_2656923_1.html'
,
5
],
'政策法规局'
:
[
'http://www.sasac.gov.cn/n2588020/n2588072/n2590860/n2590862/index_2644230_1.html'
,
21
],
'规划发展局'
:
[
'http://www.sasac.gov.cn/n2588020/n2588072/n2590902/n2590904/index_2646556_1.html'
,
9
],
'财务监管与运行评价局'
:
[
'http://www.sasac.gov.cn/n2588020/n2588072/n2590944/n2590946/index_2647546_1.html'
,
9
],
'产权管理局'
:
[
'http://www.sasac.gov.cn/n2588020/n2588072/n2591020/n2591022/index_2648251_1.html'
,
7
],
'企业改革局'
:
[
'http://www.sasac.gov.cn/n2588020/n2588072/n2591064/n2591066/index_2648748_1.html'
,
15
],
'考核分配局'
:
[
'http://www.sasac.gov.cn/n2588020/n2588072/n2591106/n2591108/index_2649149_1.html'
,
6
],
'资本运营与收益管理局'
:
[
'http://www.sasac.gov.cn/n2588020/n2588072/n2591192/n2591194/index_2649585_1.html'
,
3
],
'科技创新局'
:
[
'http://www.sasac.gov.cn/n2588020/n2588072/n2591148/n2591150/index_2650085_1.html'
,
14
],
'社会责任局'
:
[
'http://www.sasac.gov.cn/n2588020/n2588072/n23746822/n23746853/index_23747054_.html'
,
10
],
'综合监督局'
:
[
'http://www.sasac.gov.cn/n2588020/n2588072/n2591284/n2591286/index.html'
,
1
],
'监督追责局'
:
[
'http://www.sasac.gov.cn/n2588020/n2588072/n2591266/n2591268/index_2654822_1.html'
,
2
],
'企业领导人员管理一局(董事会工作局)'
:
[
'http://www.sasac.gov.cn/n2588020/n2588072/n2591302/n2591304/index_2657539_1.html'
,
4
],
'企业领导人员管理二局'
:
[
'http://www.sasac.gov.cn/n2588020/n2588072/n2591344/n2591346/index_2657636_1.html'
,
4
],
'党建工作局(党委组织部、党委统战部)'
:
[
'http://www.sasac.gov.cn/n2588020/n2588072/n2591386/n2591388/index_2656630_1.html'
,
14
],
'宣传工作局(党委宣传部)'
:
[
'http://www.sasac.gov.cn/n2588020/n2588072/n2591426/n2591428/index_2656835_1.html'
,
21
],
'国际合作局'
:
[
'http://www.sasac.gov.cn/n2588020/n2588072/n2591548/n2591550/index_2657011_1.html'
,
28
],
'人事局'
:
[
'http://www.sasac.gov.cn/n2588020/n2588072/n2591586/n2591588/index_2656275_1.html'
,
7
],
'行业协会商会党建工作局(行业协会商会工作局)'
:
[
'http://www.sasac.gov.cn/n2588020/n2588072/n2591626/n2591628/index_2656076_1.html'
,
4
],
'机关服务管理局(离退休干部管理局)'
:
[
'http://www.sasac.gov.cn/n2588020/n2588072/n2591644/n2591646/index_2655780_1.html'
,
9
],
'机关党委'
:
[
'http://www.sasac.gov.cn/n2588020/n2588072/n2591684/n2591686/index_2655222_1.html'
,
33
],
'党委巡视工作办公室、国资委巡视组'
:
[
'http://www.sasac.gov.cn/n2588020/n2588072/n2591770/n2591772/index_2655029_1.html'
,
8
],
'中央纪委国家监委驻国资委纪检监察组'
:
[
'http://www.sasac.gov.cn/n2588020/n2877928/n2878219/index_2879099_1.html'
,
18
]}
href_
=
href_list
[
institution
][
0
]
page
=
href_list
[
institution
][
1
]
get_page_nsjg
(
href_
,
institution
,
tid
,
page
)
#中央纪委国家监委驻国资委纪检监察组
def
job1
(
a_type
):
href
=
a_type
[
'href'
]
ting_type
=
a_type
.
text
return
href
,
ting_type
# 开始
def
gzw_nsjg_start
():
url
=
'http://www.sasac.gov.cn/n2588020/index.html'
req
=
requests
.
get
(
url
=
url
,
headers
=
headers
,
verify
=
False
)
req_text
=
req
.
text
.
encode
(
"ISO-8859-1"
)
req_text
=
req_text
.
decode
(
"utf-8"
)
all_institution
=
[]
tree
=
etree
.
HTML
(
req_text
)
institution
=
tree
.
xpath
(
'/html/body/div[4]/div[2]/div/dl[1]/dt/a/text()'
)[
0
]
.
replace
(
'
\n
'
,
''
)
.
replace
(
'
\r
'
,
''
)
institution_href
=
tree
.
xpath
(
'/html/body/div[4]/div[2]/div/dl[1]/dt/a/@href'
)[
0
]
.
replace
(
'../'
,
'http://www.sasac.gov.cn/'
)
all_institution
.
append
([
institution
,
institution_href
])
dd_list
=
tree
.
xpath
(
'/html/body/div[4]/div[2]/div/dl[2]/dd'
)
for
dd
in
dd_list
:
institution
=
dd
.
xpath
(
'./a/text()'
)[
0
]
.
replace
(
'
\n
'
,
''
)
.
replace
(
'
\r
'
,
''
)
institution_href
=
dd
.
xpath
(
'./a/@href'
)[
0
]
.
replace
(
'../'
,
'http://www.sasac.gov.cn/'
)
all_institution
.
append
([
institution
,
institution_href
])
def
job
():
url
=
'http://www.sasac.gov.cn/n2588020/index.html'
ip
=
baseCore
.
get_proxy
()
res
=
requests
.
get
(
url
=
url
,
headers
=
headers
,
proxies
=
ip
)
soup
=
BeautifulSoup
(
res
.
content
,
'html.parser'
)
time
.
sleep
(
2
)
# 厅局列表
list_type
=
soup
.
find
(
'div'
,
class_
=
'l-jgkk-right column'
)
.
find_all
(
'dd'
)[:
22
]
a_soup
=
soup
.
find
(
'div'
,
class_
=
'l-jgkk-right column'
)
.
find_all
(
'dt'
)[
0
]
a_type
=
a_soup
.
text
.
strip
()
a_href
=
a_soup
.
find
(
'a'
)[
'href'
]
a_id
=
'1874'
list_error
=
[]
num
=
0
start_time
=
time
.
time
()
work
(
a_href
,
a_type
,
a_id
)
for
type
in
tqdm
(
list_type
):
list_news
=
[]
href_type
=
type
.
find
(
'a'
)[
'href'
]
ting_type
=
type
.
find
(
'a'
)
.
text
try
:
relationId
=
mapId_dic
[
ting_type
]
except
:
continue
work
(
href_type
,
ting_type
,
relationId
)
num
+=
1
end_time
=
time
.
time
()
log
.
info
(
f
'共抓取{num}条数据,共耗时{end_time - start_time}'
)
time
.
sleep
(
1
)
# writer.save()
# df_error = pd.DataFrame(list_error)
# df_error.to_excel('未采到文章.xlsx',index=False)
tids
=
{
'办公厅(党委办公厅)'
:
1643
,
'综合研究局'
:
1644
,
'政策法规局'
:
1645
,
'规划发展局'
:
1646
,
'财务监管与运行评价局'
:
1647
,
'产权管理局'
:
1648
,
'企业改革局'
:
1649
,
'考核分配局'
:
1650
,
'资本运营与收益管理局'
:
1651
,
'科技创新局'
:
1652
,
'社会责任局'
:
2064
,
'综合监督局'
:
1653
,
'监督追责局'
:
1654
,
'企业领导人员管理一局(董事会工作局)'
:
1655
,
'企业领导人员管理二局'
:
1656
,
'党建工作局(党委组织部、党委统战部)'
:
1657
,
'宣传工作局(党委宣传部)'
:
1658
,
'国际合作局'
:
1659
,
'人事局'
:
1660
,
'行业协会商会党建工作局(行业协会商会工作局)'
:
1661
,
'机关服务管理局(离退休干部管理局)'
:
1662
,
'机关党委'
:
1663
,
'党委巡视工作办公室、国资委巡视组'
:
1664
,
'中央纪委国家监委驻国资委纪检监察组'
:
1874
}
for
a
in
all_institution
:
institution
=
a
[
0
]
href
=
a
[
1
]
tid
=
tids
[
institution
]
log
.
info
(
f
'
\n
================厅局类别==={institution}========================'
)
get_page_nsjg_list
(
href
,
institution
,
tid
)
gzw_nsjg_start
()
if
__name__
==
'__main__'
:
mapId_dic
=
{
'办公厅(党委办公厅)'
:
'1643'
,
'综合研究局'
:
'1644'
,
'政策法规局'
:
'1645'
,
'规划发展局'
:
'1646'
,
'财务监管与运行评价局'
:
'1647'
,
'产权管理局'
:
'1648'
,
'企业改革局'
:
'1649'
,
'考核分配局'
:
'1650'
,
'资本运营与收益管理局'
:
'1651'
,
'科技创新局'
:
'1652'
,
'综合监督局'
:
'1653'
,
'监督追责局'
:
'1654'
,
'企业领导人员管理一局(董事会工作局)'
:
'1655'
,
'企业领导人员管理二局'
:
'1656'
,
'党建工作局(党委组织部、党委统战部)'
:
'1657'
,
'宣传工作局(党委宣传部)'
:
'1658'
,
'国际合作局'
:
'1659'
,
'人事局'
:
'1660'
,
'机关服务管理局(离退休干部管理局)'
:
'1662'
,
'机关党委'
:
'1663'
,
'党委巡视工作办公室、国资委巡视组'
:
'1664'
,
}
headers
=
{
'Accept'
:
'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7'
,
'Accept-Encoding'
:
'gzip, deflate'
,
'Accept-Language'
:
'zh-CN,zh;q=0.9'
,
'Cache-Control'
:
'no-cache'
,
'Connection'
:
'keep-alive'
,
'Cookie'
:
'Hm_lvt_fa835457efbc11dfb88752e70521d23b=1690184499; Hm_lpvt_fa835457efbc11dfb88752e70521d23b=1690184499; SF_cookie_1=98184645; Hm_lvt_2b5618a441c142a90e1a75f4b226c252=1690189470; Hm_lpvt_2b5618a441c142a90e1a75f4b226c252=1690189470; zh_choose=n; wdcid=30ffdae06d11dbde; wdlast=1690189470; wdses=13ee59561f2fb725'
,
'Host'
:
'www.sasac.gov.cn'
,
'Pragma'
:
'no-cache'
,
'Referer'
:
'https://www.baidu.com/link?url=CcQEFfXAeQsxu1IlLlxj8WHugAcJ7sBjOBqvZYDfN7WE6OZpSUM4prK6DiADOqTP&wd=&eqid=d507a037000987780000000364be37d4'
,
'Upgrade-Insecure-Requests'
:
'1'
,
'User-Agent'
:
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
}
if
__name__
==
'__main__'
:
try
:
job
()
gzw_nsjg
()
except
Exception
as
e
:
print
(
e
)
current_time
=
datetime
.
datetime
.
now
()
midnight_time
=
current_time
.
replace
(
hour
=
0
,
minute
=
0
,
second
=
0
,
microsecond
=
0
)
+
datetime
.
timedelta
(
days
=
1
)
sleep_seconds
=
(
midnight_time
-
current_time
)
.
total_seconds
()
time
.
sleep
(
sleep_seconds
)
log
.
error
(
e
)
#
current_time = datetime.datetime.now()
#
midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
#
sleep_seconds = (midnight_time - current_time).total_seconds()
#
time.sleep(sleep_seconds)
# 创建一个ExcelWriter对象
# writer = pd.ExcelWriter('国务院厅局.xlsx')
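tingtype.py avoids re-collecting an article by looking its source URL up in MongoDB before fetching the detail page (the `is_href = db_storage.find_one({'网址': news_url})` checks above). A minimal sketch of that dedup probe, assuming the same collection layout shown in this diff; the `already_collected` helper name is hypothetical:

import pymongo

client = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988')
db_storage = client.caiji['国务院_国资委_copy1']

def already_collected(news_url):
    # save_data() stores each article's source address under the '网址' key,
    # so a non-None result means the URL was crawled on an earlier run
    return db_storage.find_one({'网址': news_url}) is not None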