Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
王景浩
zzsn_spider
Commits
4d6ca3e2
提交
4d6ca3e2
authored
10月 21, 2023
作者:
LiuLiYuan
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
政策法规采集 10/21
上级
aa593218
显示空白字符变更
内嵌
并排
正在显示
2 个修改的文件
包含
405 行增加
和
361 行删除
+405
-361
policy.py
comData/policylaw/policy.py
+194
-184
tingtype.py
comData/policylaw/tingtype.py
+211
-177
没有找到文件。
comData/policylaw/policy.py
浏览文件 @
4d6ca3e2
# _*_ coding:utf-8 _*_
# _*_ coding:utf-8 _*_
"""数据全量跑一遍,不做判重逻辑"""
"""数据全量跑一遍,不做判重逻辑"""
import
datetime
import
json
import
json
import
os
import
re
import
re
import
time
import
time
import
datetime
import
fitz
import
fitz
import
pymongo
import
pymongo
import
requests
import
requests
from
bs4
import
BeautifulSoup
from
kafka
import
KafkaProducer
from
kafka
import
KafkaProducer
from
pyquery
import
PyQuery
as
pq
from
pyquery
import
PyQuery
as
pq
from
requests.packages
import
urllib3
from
requests.packages
import
urllib3
from
requests.adapters
import
HTTPAdapter
from
urllib.parse
import
urljoin
from
BaseCore
import
BaseCore
from
BaseCore
import
BaseCore
baseCore
=
BaseCore
()
baseCore
=
BaseCore
()
urllib3
.
disable_warnings
()
urllib3
.
disable_warnings
()
...
@@ -24,8 +22,8 @@ from selenium.webdriver.chrome.service import Service
...
@@ -24,8 +22,8 @@ from selenium.webdriver.chrome.service import Service
from
selenium.webdriver.common.by
import
By
from
selenium.webdriver.common.by
import
By
from
lxml
import
etree
from
lxml
import
etree
from
random
import
choice
from
random
import
choice
from
bs4
import
BeautifulSoup
from
requests.adapters
import
HTTPAdapter
from
urllib.parse
import
urljoin
log
=
baseCore
.
getLogger
()
log
=
baseCore
.
getLogger
()
taskType
=
'政策法规'
taskType
=
'政策法规'
...
@@ -36,11 +34,10 @@ taskType = '政策法规'
...
@@ -36,11 +34,10 @@ taskType = '政策法规'
各地方国资委
各地方国资委
"""
"""
db_storage
=
pymongo
.
MongoClient
(
'mongodb://114.115.221.202:27017'
,
username
=
'admin'
,
password
=
'zzsn@9988'
)
.
caiji
[
db_storage
=
pymongo
.
MongoClient
(
'mongodb://114.115.221.202:27017'
,
username
=
'admin'
,
password
=
'zzsn@9988'
)
.
caiji
[
'国务院_国资委_copy1'
]
'国务院_国资委_copy1'
]
driver_path
=
r'F:\spider
\cmd100\chromedriver.exe'
driver_path
=
r'D:
\cmd100\chromedriver.exe'
chromr_bin
=
r'F:\spider
\Google\Chrome\Application\chrome.exe'
chromr_bin
=
r'D:
\Google\Chrome\Application\chrome.exe'
headers
=
{
headers
=
{
'User-Agent'
:
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
,
'User-Agent'
:
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
,
...
@@ -64,9 +61,10 @@ def paserUrl(html, listurl):
...
@@ -64,9 +61,10 @@ def paserUrl(html, listurl):
def
getDriver
():
def
getDriver
():
service
=
Service
(
driver_path
)
service
=
Service
(
driver_path
)
chrome_options
=
webdriver
.
ChromeOptions
()
chrome_options
=
webdriver
.
ChromeOptions
()
chrome_options
.
add_argument
(
'--headless'
)
#
chrome_options.add_argument('--headless')
chrome_options
.
add_argument
(
'--disable-gpu'
)
chrome_options
.
add_argument
(
'--disable-gpu'
)
# chrome_options.add_argument('--no-sandbox')
# chrome_options.add_argument('--no-sandbox')
chrome_options
.
add_argument
(
'log-level=3'
)
chrome_options
.
add_argument
(
'--disable-dev-shm-usage'
)
chrome_options
.
add_argument
(
'--disable-dev-shm-usage'
)
chrome_options
.
add_experimental_option
(
'excludeSwitches'
,
[
'enable-automation'
])
# 屏蔽chrome自动化受控提示
chrome_options
.
add_experimental_option
(
'excludeSwitches'
,
[
'enable-automation'
])
# 屏蔽chrome自动化受控提示
chrome_options
.
add_argument
(
"--disable-blink-features=AutomationControlled"
)
# 禁用启用Blink运行时的功能去掉webdriver痕迹
chrome_options
.
add_argument
(
"--disable-blink-features=AutomationControlled"
)
# 禁用启用Blink运行时的功能去掉webdriver痕迹
...
@@ -77,6 +75,12 @@ def getDriver():
...
@@ -77,6 +75,12 @@ def getDriver():
'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36'
)
'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36'
)
# bro = webdriver.Chrome(chrome_options=chrome_options, service=service)
# bro = webdriver.Chrome(chrome_options=chrome_options, service=service)
bro
=
webdriver
.
Chrome
(
chrome_options
=
chrome_options
,
executable_path
=
driver_path
)
bro
=
webdriver
.
Chrome
(
chrome_options
=
chrome_options
,
executable_path
=
driver_path
)
# with open('stealth.min.js') as f:
# js = f.read()
#
# bro.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
# "source": js
# })
return
bro
return
bro
def
save_data
(
dic_news
):
def
save_data
(
dic_news
):
...
@@ -203,8 +207,9 @@ def get_content1():
...
@@ -203,8 +207,9 @@ def get_content1():
s
.
keep_alive
=
False
s
.
keep_alive
=
False
pcodeJiguan
=
a_list
[
0
]
pcodeJiguan
=
a_list
[
0
]
try
:
try
:
pageCount
=
getPageConunt
(
a_list
,
url
,
headers
,
s
)
#pageCount = getPageConunt(a_list, url, headers, s)
for
pageNo
in
range
(
1
,
pageCount
+
1
):
#for pageNo in range(1, pageCount + 1):
pageNo
=
1
try
:
try
:
try
:
try
:
page_list
=
getList
(
a_list
,
url
,
headers
,
pageNo
,
s
)
page_list
=
getList
(
a_list
,
url
,
headers
,
pageNo
,
s
)
...
@@ -224,6 +229,7 @@ def get_content1():
...
@@ -224,6 +229,7 @@ def get_content1():
if
is_href
:
if
is_href
:
num
+=
1
num
+=
1
log
.
info
(
'已采集----------跳过'
)
log
.
info
(
'已采集----------跳过'
)
time
.
sleep
(
0.5
)
continue
continue
try
:
try
:
resp_href
=
requests
.
get
(
url
=
href
,
headers
=
headers_
,
verify
=
False
)
resp_href
=
requests
.
get
(
url
=
href
,
headers
=
headers_
,
verify
=
False
)
...
@@ -305,7 +311,7 @@ def get_content1():
...
@@ -305,7 +311,7 @@ def get_content1():
log
.
error
(
f
'{pcodeJiguan}...获取总数失败'
)
log
.
error
(
f
'{pcodeJiguan}...获取总数失败'
)
continue
continue
end_time
=
time
.
time
()
end_time
=
time
.
time
()
log
.
info
(
f
'共抓取国务院文件{num}条数据,共耗时{start_time - end
_time}'
)
log
.
info
(
f
'共抓取国务院文件{num}条数据,共耗时{end_time-start
_time}'
)
# 国务院部门文件
# 国务院部门文件
def
get_content2
():
def
get_content2
():
...
@@ -355,9 +361,11 @@ def get_content2():
...
@@ -355,9 +361,11 @@ def get_content2():
'国家知识产权局'
,
'国家档案局'
,
'国家保密局'
,
'国家密码管理局'
,
'国家宗教事务局'
,
'国务院台湾事务办公室'
,
'国家乡村振兴局'
,
'国家电影局'
]
'国家知识产权局'
,
'国家档案局'
,
'国家保密局'
,
'国家密码管理局'
,
'国家宗教事务局'
,
'国务院台湾事务办公室'
,
'国家乡村振兴局'
,
'国家电影局'
]
for
bmfl
in
result_list
:
for
bmfl
in
result_list
:
try
:
#try:
totalpage
=
getTotalpage
(
bmfl
,
headers
,
session
)
#totalpage = getTotalpage(bmfl,headers,session)
for
pageNo
in
range
(
1
,
totalpage
+
1
):
#for pageNo in range(1,totalpage+1):
#for pageNo in range(1,6):
pageNo
=
1
try
:
try
:
try
:
try
:
content_list
=
getContentList
(
bmfl
,
pageNo
,
headers
,
session
)
content_list
=
getContentList
(
bmfl
,
pageNo
,
headers
,
session
)
...
@@ -373,12 +381,12 @@ def get_content2():
...
@@ -373,12 +381,12 @@ def get_content2():
pub_time
=
int
(
content_dict
[
'pubtime'
]
/
1000
)
# 发布时间
pub_time
=
int
(
content_dict
[
'pubtime'
]
/
1000
)
# 发布时间
pub_time1
=
time
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
,
time
.
localtime
(
pub_time
))
pub_time1
=
time
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
,
time
.
localtime
(
pub_time
))
except
:
except
:
pub_time1
=
''
pub_time1
=
None
try
:
try
:
p_time
=
int
(
content_dict
[
'ptime'
]
/
1000
)
# 成文时间
p_time
=
int
(
content_dict
[
'ptime'
]
/
1000
)
# 成文时间
pub_time2
=
time
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
,
time
.
localtime
(
p_time
))
pub_time2
=
time
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
,
time
.
localtime
(
p_time
))
except
:
except
:
pub_time2
=
''
pub_time2
=
None
pub_org
=
content_dict
[
'puborg'
]
# 发文机关
pub_org
=
content_dict
[
'puborg'
]
# 发文机关
try
:
try
:
child_type
=
content_dict
[
'childtype'
]
# 主题分类
child_type
=
content_dict
[
'childtype'
]
# 主题分类
...
@@ -389,6 +397,7 @@ def get_content2():
...
@@ -389,6 +397,7 @@ def get_content2():
if
is_href
:
if
is_href
:
num
+=
1
num
+=
1
log
.
info
(
'已采集----------跳过'
)
log
.
info
(
'已采集----------跳过'
)
time
.
sleep
(
0.5
)
continue
continue
try
:
try
:
resp
=
requests
.
get
(
url
=
href
,
headers
=
headers
,
verify
=
False
)
resp
=
requests
.
get
(
url
=
href
,
headers
=
headers
,
verify
=
False
)
...
@@ -460,9 +469,9 @@ def get_content2():
...
@@ -460,9 +469,9 @@ def get_content2():
except
:
except
:
log
.
error
(
f
'{bmfl}...第{pageNo}页获取信息列表失败'
)
log
.
error
(
f
'{bmfl}...第{pageNo}页获取信息列表失败'
)
continue
continue
except
:
#
except:
log
.
error
(
f
'{bmfl}...获取页数失败'
)
#
log.error(f'{bmfl}...获取页数失败')
continue
#
continue
end_time
=
time
.
time
()
end_time
=
time
.
time
()
log
.
info
(
f
'共抓取国务院部门文件{count}条数据,耗时{end_time - start_time}'
)
log
.
info
(
f
'共抓取国务院部门文件{count}条数据,耗时{end_time - start_time}'
)
...
@@ -553,7 +562,7 @@ def get_content3():
...
@@ -553,7 +562,7 @@ def get_content3():
'topicClassification'
:
''
,
#政策文件分类
'topicClassification'
:
''
,
#政策文件分类
'issuedNumber'
:
pub_hao
,
#发文字号
'issuedNumber'
:
pub_hao
,
#发文字号
'publishDate'
:
pub_time
,
#发布时间
'publishDate'
:
pub_time
,
#发布时间
'writtenDate'
:
''
,
#成文时间
'writtenDate'
:
None
,
#成文时间
'sid'
:
'1697458829758697473'
,
#信息源id
'sid'
:
'1697458829758697473'
,
#信息源id
'sourceAddress'
:
href
,
#原文链接
'sourceAddress'
:
href
,
#原文链接
'summary'
:
''
,
#摘要
'summary'
:
''
,
#摘要
...
@@ -744,7 +753,7 @@ def bei_jing():
...
@@ -744,7 +753,7 @@ def bei_jing():
or
'.XLS'
in
file_href
or
'.ZIP'
in
file_href
or
'.RAR'
in
file_href
:
or
'.XLS'
in
file_href
or
'.ZIP'
in
file_href
or
'.RAR'
in
file_href
:
file_name
=
file
.
text
.
strip
()
file_name
=
file
.
text
.
strip
()
category
=
os
.
path
.
splitext
(
file_href
)[
1
]
category
=
os
.
path
.
splitext
(
file_href
)[
1
]
if
category
not
in
file_name
:
if
category
not
in
file_name
:
file_name
=
file_name
+
category
file_name
=
file_name
+
category
retData
=
baseCore
.
uptoOBS
(
file_href
,
'1667'
,
pathType
,
file_name
)
retData
=
baseCore
.
uptoOBS
(
file_href
,
'1667'
,
pathType
,
file_name
)
if
retData
[
'state'
]:
if
retData
[
'state'
]:
...
@@ -870,7 +879,7 @@ def nei_meng_gu():
...
@@ -870,7 +879,7 @@ def nei_meng_gu():
fu_jian_re
=
str
(
real_href
)
.
split
(
'/t'
)[
0
]
+
'/'
+
str
(
fu_jian_re
)
.
split
(
'./'
)[
1
]
fu_jian_re
=
str
(
real_href
)
.
split
(
'/t'
)[
0
]
+
'/'
+
str
(
fu_jian_re
)
.
split
(
'./'
)[
1
]
fu_jian_href
=
fu_jian_re
fu_jian_href
=
fu_jian_re
category
=
os
.
path
.
splitext
(
fu_jian_href
)[
1
]
category
=
os
.
path
.
splitext
(
fu_jian_href
)[
1
]
if
category
not
in
title
:
if
category
not
in
title
:
file_name
=
title
+
category
file_name
=
title
+
category
# print(fu_jian_href)
# print(fu_jian_href)
# todo:附件上传至文件服务器
# todo:附件上传至文件服务器
...
@@ -918,7 +927,7 @@ def nei_meng_gu():
...
@@ -918,7 +927,7 @@ def nei_meng_gu():
pass
pass
end
=
time
.
time
()
end
=
time
.
time
()
print
(
'共'
,
num
,
'条'
,
'...........'
,
'共耗时'
,
end
-
start
,
'秒'
)
log
.
info
(
'共'
,
num
,
'条'
,
'...........'
,
'共耗时'
,
end
-
start
,
'秒'
)
# 吉林
# 吉林
def
ji_lin
():
def
ji_lin
():
...
@@ -982,7 +991,7 @@ def ji_lin():
...
@@ -982,7 +991,7 @@ def ji_lin():
# print(pub_come)
# print(pub_come)
i_content
=
soup
.
find
(
class_
=
'zsy_comain'
)
i_content
=
soup
.
find
(
class_
=
'zsy_comain'
)
if
i_content
:
if
i_content
:
print
(
real_href
)
#
print(real_href)
# 去掉扫一扫
# 去掉扫一扫
try
:
try
:
soup
.
find
(
'div'
,
id
=
'qr_container'
)
.
decompose
()
soup
.
find
(
'div'
,
id
=
'qr_container'
)
.
decompose
()
...
@@ -1020,7 +1029,7 @@ def ji_lin():
...
@@ -1020,7 +1029,7 @@ def ji_lin():
or
'.XLS'
in
fu_jian_href
or
'.ZIP'
in
fu_jian_href
or
'.RAR'
in
fu_jian_href
:
or
'.XLS'
in
fu_jian_href
or
'.ZIP'
in
fu_jian_href
or
'.RAR'
in
fu_jian_href
:
file_name
=
fu_jian_href
.
text
.
strip
()
file_name
=
fu_jian_href
.
text
.
strip
()
category
=
os
.
path
.
splitext
(
fu_jian_href
)[
1
]
category
=
os
.
path
.
splitext
(
fu_jian_href
)[
1
]
if
category
not
in
file_name
:
if
category
not
in
file_name
:
file_name
=
file_name
+
category
file_name
=
file_name
+
category
# print(fu_jian_href)
# print(fu_jian_href)
retData
=
baseCore
.
uptoOBS
(
fu_jian_href
,
'1670'
,
pathType
,
file_name
)
retData
=
baseCore
.
uptoOBS
(
fu_jian_href
,
'1670'
,
pathType
,
file_name
)
...
@@ -1065,7 +1074,7 @@ def ji_lin():
...
@@ -1065,7 +1074,7 @@ def ji_lin():
or
'.XLS'
in
fj_href
or
'.ZIP'
in
fj_href
or
'.RAR'
in
fj_href
:
or
'.XLS'
in
fj_href
or
'.ZIP'
in
fj_href
or
'.RAR'
in
fj_href
:
# print(fj_href)
# print(fj_href)
category
=
os
.
path
.
splitext
(
fj_href
)[
1
]
category
=
os
.
path
.
splitext
(
fj_href
)[
1
]
if
category
not
in
file_name
:
if
category
not
in
file_name
:
file_name
=
file_name
+
category
file_name
=
file_name
+
category
retData
=
baseCore
.
uptoOBS
(
fj_href
,
'1670'
,
pathType
,
file_name
)
retData
=
baseCore
.
uptoOBS
(
fj_href
,
'1670'
,
pathType
,
file_name
)
if
retData
[
'state'
]:
if
retData
[
'state'
]:
...
@@ -1104,7 +1113,7 @@ def ji_lin():
...
@@ -1104,7 +1113,7 @@ def ji_lin():
'topicClassification'
:
''
,
'topicClassification'
:
''
,
'issuedNumber'
:
''
,
'issuedNumber'
:
''
,
'publishDate'
:
pub_time
,
'publishDate'
:
pub_time
,
'writtenDate'
:
''
,
'writtenDate'
:
None
,
'sid'
:
'1697458829758697473'
,
'sid'
:
'1697458829758697473'
,
'sourceAddress'
:
real_href
,
'sourceAddress'
:
real_href
,
'summary'
:
''
,
'summary'
:
''
,
...
@@ -1126,7 +1135,7 @@ def ji_lin():
...
@@ -1126,7 +1135,7 @@ def ji_lin():
except
:
except
:
pass
pass
end
=
time
.
time
()
end
=
time
.
time
()
print
(
'共'
,
count
,
'条'
,
'...........'
,
'共耗时'
,
end
-
start
,
'秒'
)
log
.
info
(
'共'
,
count
,
'条'
,
'...........'
,
'共耗时'
,
end
-
start
,
'秒'
)
# 上海
# 上海
def
shang_hai
():
def
shang_hai
():
...
@@ -1219,7 +1228,7 @@ def shang_hai():
...
@@ -1219,7 +1228,7 @@ def shang_hai():
or
'.rar'
in
fu_jian_href
or
'.ppt'
in
fu_jian_href
or
'.PDF'
in
fu_jian_href
or
'.DOC'
in
fu_jian_href
\
or
'.rar'
in
fu_jian_href
or
'.ppt'
in
fu_jian_href
or
'.PDF'
in
fu_jian_href
or
'.DOC'
in
fu_jian_href
\
or
'.XLS'
in
fu_jian_href
or
'.ZIP'
in
fu_jian_href
or
'.RAR'
in
fu_jian_href
:
or
'.XLS'
in
fu_jian_href
or
'.ZIP'
in
fu_jian_href
or
'.RAR'
in
fu_jian_href
:
category
=
os
.
path
.
splitext
(
fu_jian_href
)[
1
]
category
=
os
.
path
.
splitext
(
fu_jian_href
)[
1
]
if
category
not
in
file_name
:
if
category
not
in
file_name
:
file_name
=
file_name
+
category
file_name
=
file_name
+
category
retData
=
baseCore
.
uptoOBS
(
fu_jian_href
,
'1671'
,
pathType
,
file_name
)
retData
=
baseCore
.
uptoOBS
(
fu_jian_href
,
'1671'
,
pathType
,
file_name
)
if
retData
[
'state'
]:
if
retData
[
'state'
]:
...
@@ -1252,7 +1261,7 @@ def shang_hai():
...
@@ -1252,7 +1261,7 @@ def shang_hai():
'topicClassification'
:
''
,
'topicClassification'
:
''
,
'issuedNumber'
:
pub_hao
,
'issuedNumber'
:
pub_hao
,
'publishDate'
:
pub_time
,
'publishDate'
:
pub_time
,
'writtenDate'
:
''
,
'writtenDate'
:
None
,
'sid'
:
'1697458829758697473'
,
'sid'
:
'1697458829758697473'
,
'sourceAddress'
:
href
,
'sourceAddress'
:
href
,
'summary'
:
''
,
'summary'
:
''
,
...
@@ -1268,7 +1277,7 @@ def shang_hai():
...
@@ -1268,7 +1277,7 @@ def shang_hai():
except
:
except
:
pass
pass
end
=
time
.
time
()
end
=
time
.
time
()
print
(
'共'
,
count
,
'条'
,
'...........'
,
'共耗时'
,
end
-
start
,
'秒'
)
log
.
info
(
'共'
,
count
,
'条'
,
'...........'
,
'共耗时'
,
end
-
start
,
'秒'
)
# 浙江
# 浙江
def
zhe_jiang
():
def
zhe_jiang
():
...
@@ -1376,7 +1385,7 @@ def zhe_jiang():
...
@@ -1376,7 +1385,7 @@ def zhe_jiang():
'topicClassification'
:
''
,
'topicClassification'
:
''
,
'issuedNumber'
:
pub_hao
,
'issuedNumber'
:
pub_hao
,
'publishDate'
:
pub_time
,
'publishDate'
:
pub_time
,
'writtenDate'
:
''
,
'writtenDate'
:
None
,
'sid'
:
'1697458829758697473'
,
'sid'
:
'1697458829758697473'
,
'sourceAddress'
:
href
,
'sourceAddress'
:
href
,
'summary'
:
''
,
'summary'
:
''
,
...
@@ -1393,7 +1402,7 @@ def zhe_jiang():
...
@@ -1393,7 +1402,7 @@ def zhe_jiang():
except
:
except
:
pass
pass
end
=
time
.
time
()
end
=
time
.
time
()
print
(
'共'
,
count
,
'条'
,
'...........'
,
'共耗时'
,
end
-
start
,
'秒'
)
log
.
info
(
'共'
,
count
,
'条'
,
'...........'
,
'共耗时'
,
end
-
start
,
'秒'
)
# 福建
# 福建
def
fu_jian
():
def
fu_jian
():
...
@@ -1445,7 +1454,7 @@ def fu_jian():
...
@@ -1445,7 +1454,7 @@ def fu_jian():
i_soup
=
BeautifulSoup
(
i_html
,
'html.parser'
)
i_soup
=
BeautifulSoup
(
i_html
,
'html.parser'
)
real_href
=
href
real_href
=
href
# real_href = 'http://gzw.fujian.gov.cn/zwgk/zcfg/201806/t20180619_3065065.htm'
# real_href = 'http://gzw.fujian.gov.cn/zwgk/zcfg/201806/t20180619_3065065.htm'
print
(
real_href
)
#
print(real_href)
is_href
=
db_storage
.
find_one
({
'网址'
:
real_href
})
is_href
=
db_storage
.
find_one
({
'网址'
:
real_href
})
if
is_href
:
if
is_href
:
num
+=
1
num
+=
1
...
@@ -1460,7 +1469,7 @@ def fu_jian():
...
@@ -1460,7 +1469,7 @@ def fu_jian():
content
=
baseCore
.
pdf_content
(
resp_content
)
content
=
baseCore
.
pdf_content
(
resp_content
)
contentwithtag
=
''
contentwithtag
=
''
category
=
os
.
path
.
splitext
(
real_href
)[
1
]
category
=
os
.
path
.
splitext
(
real_href
)[
1
]
if
category
not
in
title
:
if
category
not
in
title
:
file_name
=
title
+
category
file_name
=
title
+
category
# 文件上传至服务器
# 文件上传至服务器
retData
=
baseCore
.
uptoOBS
(
real_href
,
'1673'
,
pathType
,
file_name
)
retData
=
baseCore
.
uptoOBS
(
real_href
,
'1673'
,
pathType
,
file_name
)
...
@@ -1471,7 +1480,7 @@ def fu_jian():
...
@@ -1471,7 +1480,7 @@ def fu_jian():
att_id
,
full_path
=
baseCore
.
tableUpdate
(
retData
,
'福建省国资委'
,
file_name
,
num
,
''
)
att_id
,
full_path
=
baseCore
.
tableUpdate
(
retData
,
'福建省国资委'
,
file_name
,
num
,
''
)
id_list
.
append
(
att_id
)
id_list
.
append
(
att_id
)
pub_hao
=
''
pub_hao
=
''
pub_time
=
''
pub_time
=
None
pub_source
=
''
pub_source
=
''
else
:
else
:
...
@@ -1508,7 +1517,7 @@ def fu_jian():
...
@@ -1508,7 +1517,7 @@ def fu_jian():
or
'.rar'
in
fj_href
or
'.ppt'
in
fj_href
or
'.PDF'
in
fj_href
or
'.DOC'
in
fj_href
\
or
'.rar'
in
fj_href
or
'.ppt'
in
fj_href
or
'.PDF'
in
fj_href
or
'.DOC'
in
fj_href
\
or
'.XLS'
in
fj_href
or
'.ZIP'
in
fj_href
or
'.RAR'
in
fj_href
:
or
'.XLS'
in
fj_href
or
'.ZIP'
in
fj_href
or
'.RAR'
in
fj_href
:
category
=
os
.
path
.
splitext
(
fj_href
)[
1
]
category
=
os
.
path
.
splitext
(
fj_href
)[
1
]
if
category
not
in
file_name
:
if
category
not
in
file_name
:
file_name
=
file_name
+
category
file_name
=
file_name
+
category
print
(
fj_href
)
print
(
fj_href
)
# 找到附件后 上传至文件服务器
# 找到附件后 上传至文件服务器
...
@@ -1524,7 +1533,7 @@ def fu_jian():
...
@@ -1524,7 +1533,7 @@ def fu_jian():
except
:
except
:
pub_source
=
''
pub_source
=
''
pub_time
=
''
pub_time
=
None
contentwithtag
=
i_soup
.
find
(
'tabs tab_base_01 rules_con1'
)
contentwithtag
=
i_soup
.
find
(
'tabs tab_base_01 rules_con1'
)
content
=
contentwithtag
.
text
.
strip
()
content
=
contentwithtag
.
text
.
strip
()
if
content
==
''
or
content
==
None
:
if
content
==
''
or
content
==
None
:
...
@@ -1548,7 +1557,7 @@ def fu_jian():
...
@@ -1548,7 +1557,7 @@ def fu_jian():
'topicClassification'
:
''
,
'topicClassification'
:
''
,
'issuedNumber'
:
pub_hao
,
'issuedNumber'
:
pub_hao
,
'publishDate'
:
pub_time
,
'publishDate'
:
pub_time
,
'writtenDate'
:
''
,
'writtenDate'
:
None
,
'sid'
:
'1697458829758697473'
,
'sid'
:
'1697458829758697473'
,
'sourceAddress'
:
real_href
,
'sourceAddress'
:
real_href
,
'summary'
:
''
,
'summary'
:
''
,
...
@@ -1566,7 +1575,7 @@ def fu_jian():
...
@@ -1566,7 +1575,7 @@ def fu_jian():
except
:
except
:
pass
pass
end_time
=
time
.
time
()
end_time
=
time
.
time
()
print
(
f
'共抓取{count}条数据,共耗时{end_time - start_time}'
)
log
.
info
(
f
'共抓取{count}条数据,共耗时{end_time - start_time}'
)
# 山东
# 山东
def
shan_dong
():
def
shan_dong
():
...
@@ -1633,7 +1642,7 @@ def shan_dong():
...
@@ -1633,7 +1642,7 @@ def shan_dong():
for
h1
in
h1_list
:
for
h1
in
h1_list
:
title
=
title
+
str
(
h1
.
text
)
title
=
title
+
str
(
h1
.
text
)
title
.
strip
()
.
lstrip
()
title
.
strip
()
.
lstrip
()
pub_time
=
''
pub_time
=
None
span_list
=
source
.
find_all
(
'span'
)
span_list
=
source
.
find_all
(
'span'
)
i
=
0
i
=
0
for
span
in
span_list
:
for
span
in
span_list
:
...
@@ -1683,7 +1692,7 @@ def shan_dong():
...
@@ -1683,7 +1692,7 @@ def shan_dong():
except
:
except
:
pass
pass
end
=
time
.
time
()
end
=
time
.
time
()
print
(
'共'
,
count
,
'条'
,
'...........'
,
'共耗时'
,
end
-
start
,
'秒'
)
log
.
info
(
'共'
,
count
,
'条'
,
'...........'
,
'共耗时'
,
end
-
start
,
'秒'
)
# 广东
# 广东
def
guang_dong
():
def
guang_dong
():
...
@@ -1745,7 +1754,7 @@ def guang_dong():
...
@@ -1745,7 +1754,7 @@ def guang_dong():
or
'.rar'
in
fj_href
or
'.ppt'
in
fj_href
or
'.PDF'
in
fj_href
or
'.DOC'
in
fj_href
\
or
'.rar'
in
fj_href
or
'.ppt'
in
fj_href
or
'.PDF'
in
fj_href
or
'.DOC'
in
fj_href
\
or
'.xlsx'
in
fj_href
or
'.ZIP'
in
fj_href
or
'.RAR'
in
fj_href
:
or
'.xlsx'
in
fj_href
or
'.ZIP'
in
fj_href
or
'.RAR'
in
fj_href
:
category
=
os
.
path
.
splitext
(
fj_href
)[
1
]
category
=
os
.
path
.
splitext
(
fj_href
)[
1
]
if
category
not
in
file_name
:
if
category
not
in
file_name
:
file_name
=
file_name
+
category
file_name
=
file_name
+
category
# 附件上传至文件服务器
# 附件上传至文件服务器
retData
=
baseCore
.
uptoOBS
(
fj_href
,
'1676'
,
pathType
,
file_name
)
retData
=
baseCore
.
uptoOBS
(
fj_href
,
'1676'
,
pathType
,
file_name
)
...
@@ -1774,7 +1783,7 @@ def guang_dong():
...
@@ -1774,7 +1783,7 @@ def guang_dong():
'topicClassification'
:
''
,
'topicClassification'
:
''
,
'issuedNumber'
:
''
,
'issuedNumber'
:
''
,
'publishDate'
:
pub_time
,
'publishDate'
:
pub_time
,
'writtenDate'
:
''
,
'writtenDate'
:
None
,
'sid'
:
'1697458829758697473'
,
'sid'
:
'1697458829758697473'
,
'sourceAddress'
:
href
,
'sourceAddress'
:
href
,
'summary'
:
''
,
'summary'
:
''
,
...
@@ -1792,7 +1801,7 @@ def guang_dong():
...
@@ -1792,7 +1801,7 @@ def guang_dong():
except
:
except
:
pass
pass
end
=
time
.
time
()
end
=
time
.
time
()
print
(
'共'
,
count
,
'条'
,
'...........'
,
'共耗时'
,
end
-
start
,
'秒'
)
log
.
info
(
'共'
,
count
,
'条'
,
'...........'
,
'共耗时'
,
end
-
start
,
'秒'
)
# 海南
# 海南
def
hai_nan
():
def
hai_nan
():
...
@@ -1869,7 +1878,7 @@ def hai_nan():
...
@@ -1869,7 +1878,7 @@ def hai_nan():
or
'.rar'
in
fu_jian_href
or
'.ppt'
in
fu_jian_href
or
'.PDF'
in
fu_jian_href
or
'.DOC'
in
fu_jian_href
\
or
'.rar'
in
fu_jian_href
or
'.ppt'
in
fu_jian_href
or
'.PDF'
in
fu_jian_href
or
'.DOC'
in
fu_jian_href
\
or
'.XLS'
in
fu_jian_href
or
'.ZIP'
in
fu_jian_href
or
'.RAR'
in
fu_jian_href
:
or
'.XLS'
in
fu_jian_href
or
'.ZIP'
in
fu_jian_href
or
'.RAR'
in
fu_jian_href
:
category
=
os
.
path
.
splitext
(
fu_jian_href
)[
1
]
category
=
os
.
path
.
splitext
(
fu_jian_href
)[
1
]
if
category
not
in
file_name
:
if
category
not
in
file_name
:
file_name
=
file_name
+
category
file_name
=
file_name
+
category
# 上传至文件服务器
# 上传至文件服务器
retData
=
baseCore
.
uptoOBS
(
fu_jian_href
,
'1677'
,
pathType
,
file_name
)
retData
=
baseCore
.
uptoOBS
(
fu_jian_href
,
'1677'
,
pathType
,
file_name
)
...
@@ -1916,7 +1925,7 @@ def hai_nan():
...
@@ -1916,7 +1925,7 @@ def hai_nan():
or
'.rar'
in
fu_jian_href
or
'.ppt'
in
fu_jian_href
or
'.PDF'
in
fu_jian_href
or
'.DOC'
in
fu_jian_href
\
or
'.rar'
in
fu_jian_href
or
'.ppt'
in
fu_jian_href
or
'.PDF'
in
fu_jian_href
or
'.DOC'
in
fu_jian_href
\
or
'.XLS'
in
fu_jian_href
or
'.ZIP'
in
fu_jian_href
or
'.RAR'
in
fu_jian_href
:
or
'.XLS'
in
fu_jian_href
or
'.ZIP'
in
fu_jian_href
or
'.RAR'
in
fu_jian_href
:
category
=
os
.
path
.
splitext
(
fu_jian_href
)[
1
]
category
=
os
.
path
.
splitext
(
fu_jian_href
)[
1
]
if
category
not
in
file_name
:
if
category
not
in
file_name
:
file_name
=
file_name
+
category
file_name
=
file_name
+
category
# print(f'----附件:{fu_jian_href}-----filename:{file_name}')
# print(f'----附件:{fu_jian_href}-----filename:{file_name}')
# 附件上传至文件服务器
# 附件上传至文件服务器
...
@@ -1995,7 +2004,7 @@ def hai_nan():
...
@@ -1995,7 +2004,7 @@ def hai_nan():
except
:
except
:
pass
pass
end_time
=
time
.
time
()
end_time
=
time
.
time
()
print
(
f
'共抓取{count}条数据,共耗时{end_time - start_time}'
)
log
.
info
(
f
'共抓取{count}条数据,共耗时{end_time - start_time}'
)
def
hai_nan2
():
def
hai_nan2
():
def
hai_nan_sw
(
page_href
):
def
hai_nan_sw
(
page_href
):
...
@@ -2126,7 +2135,7 @@ def hai_nan():
...
@@ -2126,7 +2135,7 @@ def hai_nan():
pub_source
=
''
pub_source
=
''
pub_time
=
str
(
pub_result
.
text
)
.
split
(
'来源:'
)[
0
]
.
lstrip
()
.
strip
()
pub_time
=
str
(
pub_result
.
text
)
.
split
(
'来源:'
)[
0
]
.
lstrip
()
.
strip
()
pub_hao
=
''
pub_hao
=
''
writtenDate
=
''
writtenDate
=
None
,
contentWithTag
=
doc_href
.
find
(
'div'
,
attrs
=
{
'class'
:
'xxgk_content_content'
})
contentWithTag
=
doc_href
.
find
(
'div'
,
attrs
=
{
'class'
:
'xxgk_content_content'
})
content
=
contentWithTag
.
text
content
=
contentWithTag
.
text
if
content
==
''
or
content
==
None
:
if
content
==
''
or
content
==
None
:
...
@@ -2143,7 +2152,7 @@ def hai_nan():
...
@@ -2143,7 +2152,7 @@ def hai_nan():
or
'.rar'
in
fu_jian_href
or
'.ppt'
in
fu_jian_href
or
'.PDF'
in
fu_jian_href
or
'.DOC'
in
fu_jian_href
\
or
'.rar'
in
fu_jian_href
or
'.ppt'
in
fu_jian_href
or
'.PDF'
in
fu_jian_href
or
'.DOC'
in
fu_jian_href
\
or
'.XLS'
in
fu_jian_href
or
'.ZIP'
in
fu_jian_href
or
'.RAR'
in
fu_jian_href
:
or
'.XLS'
in
fu_jian_href
or
'.ZIP'
in
fu_jian_href
or
'.RAR'
in
fu_jian_href
:
category
=
os
.
path
.
splitext
(
fu_jian_href
)[
1
]
category
=
os
.
path
.
splitext
(
fu_jian_href
)[
1
]
if
category
not
in
file_name
:
if
category
not
in
file_name
:
file_name
=
file_name
+
category
file_name
=
file_name
+
category
# 上传至文件服务器
# 上传至文件服务器
retData
=
baseCore
.
uptoOBS
(
fu_jian_href
,
'1677'
,
pathType
,
file_name
)
retData
=
baseCore
.
uptoOBS
(
fu_jian_href
,
'1677'
,
pathType
,
file_name
)
...
@@ -2241,7 +2250,7 @@ def hai_nan():
...
@@ -2241,7 +2250,7 @@ def hai_nan():
pub_time
=
str
(
pub_result
.
text
)
.
split
(
'来源:'
)[
0
]
.
lstrip
()
.
strip
()
pub_time
=
str
(
pub_result
.
text
)
.
split
(
'来源:'
)[
0
]
.
lstrip
()
.
strip
()
pub_hao
=
''
pub_hao
=
''
pub_source
=
''
pub_source
=
''
writtenDate
=
''
writtenDate
=
None
,
contentWithTag
=
doc_href
.
find
(
'div'
,
attrs
=
{
'class'
:
'xxgk_content_content'
})
contentWithTag
=
doc_href
.
find
(
'div'
,
attrs
=
{
'class'
:
'xxgk_content_content'
})
content
=
contentWithTag
.
text
content
=
contentWithTag
.
text
if
content
==
''
or
content
==
None
:
if
content
==
''
or
content
==
None
:
...
@@ -2259,7 +2268,7 @@ def hai_nan():
...
@@ -2259,7 +2268,7 @@ def hai_nan():
or
'.rar'
in
fu_jian_href
or
'.ppt'
in
fu_jian_href
or
'.PDF'
in
fu_jian_href
or
'.DOC'
in
fu_jian_href
\
or
'.rar'
in
fu_jian_href
or
'.ppt'
in
fu_jian_href
or
'.PDF'
in
fu_jian_href
or
'.DOC'
in
fu_jian_href
\
or
'.XLS'
in
fu_jian_href
or
'.ZIP'
in
fu_jian_href
or
'.RAR'
in
fu_jian_href
:
or
'.XLS'
in
fu_jian_href
or
'.ZIP'
in
fu_jian_href
or
'.RAR'
in
fu_jian_href
:
category
=
os
.
path
.
splitext
(
fu_jian_href
)[
1
]
category
=
os
.
path
.
splitext
(
fu_jian_href
)[
1
]
if
category
not
in
file_name
:
if
category
not
in
file_name
:
file_name
=
file_name
+
category
file_name
=
file_name
+
category
# 上传至文件服务器
# 上传至文件服务器
retData
=
baseCore
.
uptoOBS
(
fu_jian_href
,
'1677'
,
pathType
,
file_name
)
retData
=
baseCore
.
uptoOBS
(
fu_jian_href
,
'1677'
,
pathType
,
file_name
)
...
@@ -2360,7 +2369,7 @@ def hai_nan():
...
@@ -2360,7 +2369,7 @@ def hai_nan():
0
]
.
strip
()
0
]
.
strip
()
except
:
except
:
pub_source
=
''
pub_source
=
''
pub_time
=
''
pub_time
=
None
pub_hao
=
''
pub_hao
=
''
contentWithTag
=
doc_href
.
find
(
class_
=
'pages_content'
)
contentWithTag
=
doc_href
.
find
(
class_
=
'pages_content'
)
content
=
contentWithTag
.
text
content
=
contentWithTag
.
text
...
@@ -2383,7 +2392,7 @@ def hai_nan():
...
@@ -2383,7 +2392,7 @@ def hai_nan():
'topicClassification'
:
''
,
'topicClassification'
:
''
,
'issuedNumber'
:
pub_hao
,
'issuedNumber'
:
pub_hao
,
'publishDate'
:
pub_time
,
'publishDate'
:
pub_time
,
'writtenDate'
:
''
,
'writtenDate'
:
None
,
'sid'
:
'1697458829758697473'
,
'sid'
:
'1697458829758697473'
,
'sourceAddress'
:
i_href
,
'sourceAddress'
:
i_href
,
'summary'
:
''
,
'summary'
:
''
,
...
@@ -2479,7 +2488,7 @@ def hai_nan():
...
@@ -2479,7 +2488,7 @@ def hai_nan():
except
:
except
:
pass
pass
end_time
=
time
.
time
()
end_time
=
time
.
time
()
print
(
f
'共抓取{count}条数据,共耗时{end_time - start_time}'
)
log
.
info
(
f
'共抓取{count}条数据,共耗时{end_time - start_time}'
)
start
()
start
()
hai_nan1
()
hai_nan1
()
...
@@ -2538,7 +2547,7 @@ def si_chuan():
...
@@ -2538,7 +2547,7 @@ def si_chuan():
or
'.rar'
in
fu_jian_href
or
'.ppt'
in
fu_jian_href
or
'.PDF'
in
fu_jian_href
or
'.DOC'
in
fu_jian_href
\
or
'.rar'
in
fu_jian_href
or
'.ppt'
in
fu_jian_href
or
'.PDF'
in
fu_jian_href
or
'.DOC'
in
fu_jian_href
\
or
'.XLS'
in
fu_jian_href
or
'.ZIP'
in
fu_jian_href
or
'.RAR'
in
fu_jian_href
:
or
'.XLS'
in
fu_jian_href
or
'.ZIP'
in
fu_jian_href
or
'.RAR'
in
fu_jian_href
:
category
=
os
.
path
.
splitext
(
fu_jian_href
)[
1
]
category
=
os
.
path
.
splitext
(
fu_jian_href
)[
1
]
if
category
not
in
file_name
:
if
category
not
in
file_name
:
file_name
=
file_name
+
category
file_name
=
file_name
+
category
# 对附件上传至文件服务器
# 对附件上传至文件服务器
retData
=
baseCore
.
uptoOBS
(
fu_jian_href
,
'1678'
,
pathType
,
file_name
)
retData
=
baseCore
.
uptoOBS
(
fu_jian_href
,
'1678'
,
pathType
,
file_name
)
...
@@ -2567,7 +2576,7 @@ def si_chuan():
...
@@ -2567,7 +2576,7 @@ def si_chuan():
'topicClassification'
:
''
,
'topicClassification'
:
''
,
'issuedNumber'
:
''
,
'issuedNumber'
:
''
,
'publishDate'
:
pub_time
,
'publishDate'
:
pub_time
,
'writtenDate'
:
''
,
'writtenDate'
:
None
,
'sid'
:
'1697458829758697473'
,
'sid'
:
'1697458829758697473'
,
'sourceAddress'
:
href
,
'sourceAddress'
:
href
,
'summary'
:
''
,
'summary'
:
''
,
...
@@ -2585,7 +2594,7 @@ def si_chuan():
...
@@ -2585,7 +2594,7 @@ def si_chuan():
except
:
except
:
pass
pass
end_time
=
time
.
time
()
end_time
=
time
.
time
()
print
(
f
'共抓取{count}条数据,共耗时{end_time - start_time}'
)
log
.
info
(
f
'共抓取{count}条数据,共耗时{end_time - start_time}'
)
# 广西
# 广西
def
guang_xi
():
def
guang_xi
():
...
@@ -2671,7 +2680,7 @@ def guang_xi():
...
@@ -2671,7 +2680,7 @@ def guang_xi():
or
'.rar'
in
fu_jian_href
or
'.ppt'
in
fu_jian_href
or
'.PDF'
in
fu_jian_href
or
'.DOC'
in
fu_jian_href
\
or
'.rar'
in
fu_jian_href
or
'.ppt'
in
fu_jian_href
or
'.PDF'
in
fu_jian_href
or
'.DOC'
in
fu_jian_href
\
or
'.XLS'
in
fu_jian_href
or
'.ZIP'
in
fu_jian_href
or
'.RAR'
in
fu_jian_href
:
or
'.XLS'
in
fu_jian_href
or
'.ZIP'
in
fu_jian_href
or
'.RAR'
in
fu_jian_href
:
category
=
os
.
path
.
splitext
(
fu_jian_href
)[
1
]
category
=
os
.
path
.
splitext
(
fu_jian_href
)[
1
]
if
category
not
in
file_name
:
if
category
not
in
file_name
:
file_name
=
file_name
+
category
file_name
=
file_name
+
category
# 附件上传至文件服务器
# 附件上传至文件服务器
retData
=
baseCore
.
uptoOBS
(
fu_jian_href
,
'1692'
,
pathType
,
file_name
)
retData
=
baseCore
.
uptoOBS
(
fu_jian_href
,
'1692'
,
pathType
,
file_name
)
...
@@ -2701,7 +2710,7 @@ def guang_xi():
...
@@ -2701,7 +2710,7 @@ def guang_xi():
'topicClassification'
:
''
,
'topicClassification'
:
''
,
'issuedNumber'
:
pub_hao
,
'issuedNumber'
:
pub_hao
,
'publishDate'
:
pub_time
,
'publishDate'
:
pub_time
,
'writtenDate'
:
''
,
'writtenDate'
:
None
,
'sid'
:
'1697458829758697473'
,
'sid'
:
'1697458829758697473'
,
'sourceAddress'
:
href
,
'sourceAddress'
:
href
,
'summary'
:
''
,
'summary'
:
''
,
...
@@ -2718,7 +2727,7 @@ def guang_xi():
...
@@ -2718,7 +2727,7 @@ def guang_xi():
except
:
except
:
pass
pass
end_time
=
time
.
time
()
end_time
=
time
.
time
()
print
(
f
'共抓取{count}条数据,共耗时{end_time - start_time}'
)
log
.
info
(
f
'共抓取{count}条数据,共耗时{end_time - start_time}'
)
# 贵州
# 贵州
def
gui_zhou
():
def
gui_zhou
():
...
@@ -2788,7 +2797,7 @@ def gui_zhou():
...
@@ -2788,7 +2797,7 @@ def gui_zhou():
or
'.rar'
in
fu_jian_href
or
'.ppt'
in
fu_jian_href
or
'.PDF'
in
fu_jian_href
or
'.DOC'
in
fu_jian_href
\
or
'.rar'
in
fu_jian_href
or
'.ppt'
in
fu_jian_href
or
'.PDF'
in
fu_jian_href
or
'.DOC'
in
fu_jian_href
\
or
'.XLS'
in
fu_jian_href
or
'.ZIP'
in
fu_jian_href
or
'.RAR'
in
fu_jian_href
:
or
'.XLS'
in
fu_jian_href
or
'.ZIP'
in
fu_jian_href
or
'.RAR'
in
fu_jian_href
:
category
=
os
.
path
.
splitext
(
fu_jian_href
)[
1
]
category
=
os
.
path
.
splitext
(
fu_jian_href
)[
1
]
if
category
not
in
file_name
:
if
category
not
in
file_name
:
file_name
=
file_name
+
category
file_name
=
file_name
+
category
# 附件上传至文件服务器
# 附件上传至文件服务器
retData
=
baseCore
.
uptoOBS
(
fu_jian_href
,
'1694'
,
pathType
,
file_name
)
retData
=
baseCore
.
uptoOBS
(
fu_jian_href
,
'1694'
,
pathType
,
file_name
)
...
@@ -2818,7 +2827,7 @@ def gui_zhou():
...
@@ -2818,7 +2827,7 @@ def gui_zhou():
'topicClassification'
:
''
,
'topicClassification'
:
''
,
'issuedNumber'
:
pub_hao
,
'issuedNumber'
:
pub_hao
,
'publishDate'
:
pub_time
,
'publishDate'
:
pub_time
,
'writtenDate'
:
''
,
'writtenDate'
:
None
,
'sid'
:
'1697458829758697473'
,
'sid'
:
'1697458829758697473'
,
'sourceAddress'
:
href
,
'sourceAddress'
:
href
,
'summary'
:
''
,
'summary'
:
''
,
...
@@ -2836,7 +2845,7 @@ def gui_zhou():
...
@@ -2836,7 +2845,7 @@ def gui_zhou():
except
:
except
:
pass
pass
end_time
=
time
.
time
()
end_time
=
time
.
time
()
print
(
f
'共抓取{num}条数据,共耗时{end_time - start_time}'
)
log
.
info
(
f
'共抓取{num}条数据,共耗时{end_time - start_time}'
)
# 云南
# 云南
def
yun_nan
():
def
yun_nan
():
...
@@ -2870,7 +2879,7 @@ def yun_nan():
...
@@ -2870,7 +2879,7 @@ def yun_nan():
continue
continue
try
:
try
:
fu_jian_href_list
=
[]
fu_jian_href_list
=
[]
print
(
href
)
#
print(href)
if
'.shtml'
in
href
:
if
'.shtml'
in
href
:
href_resp
=
requests
.
get
(
url
=
href
,
headers
=
headers
,
verify
=
False
)
href_resp
=
requests
.
get
(
url
=
href
,
headers
=
headers
,
verify
=
False
)
href_resp
.
encoding
=
href_resp
.
apparent_encoding
href_resp
.
encoding
=
href_resp
.
apparent_encoding
...
@@ -2901,7 +2910,7 @@ def yun_nan():
...
@@ -2901,7 +2910,7 @@ def yun_nan():
or
'.XLS'
in
fu_jian_href
or
'.ZIP'
in
fu_jian_href
or
'.RAR'
in
fu_jian_href
:
or
'.XLS'
in
fu_jian_href
or
'.ZIP'
in
fu_jian_href
or
'.RAR'
in
fu_jian_href
:
try
:
try
:
category
=
os
.
path
.
splitext
(
fu_jian_href
)[
1
]
category
=
os
.
path
.
splitext
(
fu_jian_href
)[
1
]
if
category
not
in
file_name
:
if
category
not
in
file_name
:
file_name
=
file_name
+
category
file_name
=
file_name
+
category
# 附件上传至文件服务器
# 附件上传至文件服务器
retData
=
baseCore
.
uptoOBS
(
fu_jian_href
,
'1679'
,
pathType
,
file_name
)
retData
=
baseCore
.
uptoOBS
(
fu_jian_href
,
'1679'
,
pathType
,
file_name
)
...
@@ -2939,8 +2948,8 @@ def yun_nan():
...
@@ -2939,8 +2948,8 @@ def yun_nan():
'organ'
:
''
,
'organ'
:
''
,
'topicClassification'
:
''
,
'topicClassification'
:
''
,
'issuedNumber'
:
pub_hao
,
'issuedNumber'
:
pub_hao
,
'publishDate'
:
''
,
'publishDate'
:
None
,
'writtenDate'
:
''
,
'writtenDate'
:
None
,
'sid'
:
'1697458829758697473'
,
'sid'
:
'1697458829758697473'
,
'sourceAddress'
:
href
,
'sourceAddress'
:
href
,
'summary'
:
''
,
'summary'
:
''
,
...
@@ -2959,7 +2968,7 @@ def yun_nan():
...
@@ -2959,7 +2968,7 @@ def yun_nan():
except
:
except
:
pass
pass
end_time
=
time
.
time
()
end_time
=
time
.
time
()
print
(
f
'共抓取{count}条数据,共耗时{end_time - start_time}'
)
log
.
info
(
f
'共抓取{count}条数据,共耗时{end_time - start_time}'
)
def
yun_nan2
():
def
yun_nan2
():
num
=
0
num
=
0
...
@@ -3022,7 +3031,7 @@ def yun_nan():
...
@@ -3022,7 +3031,7 @@ def yun_nan():
# print(fu_jian_href)
# print(fu_jian_href)
try
:
try
:
category
=
os
.
path
.
splitext
(
fu_jian_href
)[
1
]
category
=
os
.
path
.
splitext
(
fu_jian_href
)[
1
]
if
category
not
in
file_name
:
if
category
not
in
file_name
:
file_name
=
file_name
+
category
file_name
=
file_name
+
category
# 附件上传至文件服务器
# 附件上传至文件服务器
retData
=
baseCore
.
uptoOBS
(
fu_jian_href
,
'1679'
,
pathType
,
file_name
)
retData
=
baseCore
.
uptoOBS
(
fu_jian_href
,
'1679'
,
pathType
,
file_name
)
...
@@ -3060,7 +3069,7 @@ def yun_nan():
...
@@ -3060,7 +3069,7 @@ def yun_nan():
'topicClassification'
:
''
,
'topicClassification'
:
''
,
'issuedNumber'
:
pub_hao
,
'issuedNumber'
:
pub_hao
,
'publishDate'
:
pub_time
,
'publishDate'
:
pub_time
,
'writtenDate'
:
''
,
'writtenDate'
:
None
,
'sid'
:
'1697458829758697473'
,
'sid'
:
'1697458829758697473'
,
'sourceAddress'
:
href
,
'sourceAddress'
:
href
,
'summary'
:
''
,
'summary'
:
''
,
...
@@ -3079,7 +3088,7 @@ def yun_nan():
...
@@ -3079,7 +3088,7 @@ def yun_nan():
except
:
except
:
pass
pass
end_time
=
time
.
time
()
end_time
=
time
.
time
()
print
(
f
'共抓取{count}条数据,共耗时{end_time - start_time}'
)
log
.
info
(
f
'共抓取{count}条数据,共耗时{end_time - start_time}'
)
yun_nan1
()
yun_nan1
()
yun_nan2
()
yun_nan2
()
...
@@ -3148,8 +3157,8 @@ def chong_qing():
...
@@ -3148,8 +3157,8 @@ def chong_qing():
except
:
except
:
origin
=
''
origin
=
''
topicClassification
=
''
topicClassification
=
''
pub_time
=
''
pub_time
=
None
writtenDate
=
''
writtenDate
=
None
pub_hao
=
''
pub_hao
=
''
contentWithTag
=
doc_href
.
find
(
'div'
,
class_
=
'zwxl-content'
)
contentWithTag
=
doc_href
.
find
(
'div'
,
class_
=
'zwxl-content'
)
content
=
contentWithTag
.
text
content
=
contentWithTag
.
text
...
@@ -3169,7 +3178,7 @@ def chong_qing():
...
@@ -3169,7 +3178,7 @@ def chong_qing():
or
'.XLS'
in
fu_jian_href
or
'.ZIP'
in
fu_jian_href
or
'.RAR'
in
fu_jian_href
:
or
'.XLS'
in
fu_jian_href
or
'.ZIP'
in
fu_jian_href
or
'.RAR'
in
fu_jian_href
:
try
:
try
:
category
=
os
.
path
.
splitext
(
fu_jian_href
)[
1
]
category
=
os
.
path
.
splitext
(
fu_jian_href
)[
1
]
if
category
not
in
file_name
:
if
category
not
in
file_name
:
file_name
=
file_name
+
category
file_name
=
file_name
+
category
# 附件上传至文件服务器
# 附件上传至文件服务器
retData
=
baseCore
.
uptoOBS
(
fu_jian_href
,
'1693'
,
pathType
,
file_name
)
retData
=
baseCore
.
uptoOBS
(
fu_jian_href
,
'1693'
,
pathType
,
file_name
)
...
@@ -3219,7 +3228,7 @@ def chong_qing():
...
@@ -3219,7 +3228,7 @@ def chong_qing():
except
:
except
:
pass
pass
end_time
=
time
.
time
()
end_time
=
time
.
time
()
print
(
f
'共抓取{count}条数据,共耗时{end_time - start_time}'
)
log
.
info
(
f
'共抓取{count}条数据,共耗时{end_time - start_time}'
)
# 天津
# 天津
def
tian_jin
():
def
tian_jin
():
...
@@ -3282,7 +3291,7 @@ def tian_jin():
...
@@ -3282,7 +3291,7 @@ def tian_jin():
rmtag2
.
remove
()
rmtag2
.
remove
()
contentWithTag
=
doc_href
(
'div[id="zoom"]'
)
contentWithTag
=
doc_href
(
'div[id="zoom"]'
)
if
len
(
writtenDate
)
<
1
:
if
len
(
writtenDate
)
<
1
:
writtenDate
=
''
writtenDate
=
None
if
len
(
publishDate
)
<
1
:
if
len
(
publishDate
)
<
1
:
publishDate
=
doc_href
(
'meta[name="PubDate"]'
)
.
attr
(
'content'
)
publishDate
=
doc_href
(
'meta[name="PubDate"]'
)
.
attr
(
'content'
)
soup
=
paserUrl
(
str
(
contentWithTag
),
href
)
soup
=
paserUrl
(
str
(
contentWithTag
),
href
)
...
@@ -3298,7 +3307,7 @@ def tian_jin():
...
@@ -3298,7 +3307,7 @@ def tian_jin():
or
'.XLS'
in
file_href
or
'.ZIP'
in
file_href
or
'.RAR'
in
file_href
:
or
'.XLS'
in
file_href
or
'.ZIP'
in
file_href
or
'.RAR'
in
file_href
:
file_name
=
file
.
text
.
strip
()
file_name
=
file
.
text
.
strip
()
category
=
os
.
path
.
splitext
(
file_href
)[
1
]
category
=
os
.
path
.
splitext
(
file_href
)[
1
]
if
category
not
in
file_name
:
if
category
not
in
file_name
:
file_name
=
file_name
+
category
file_name
=
file_name
+
category
retData
=
baseCore
.
uptoOBS
(
file_href
,
'1683'
,
pathType
,
file_name
)
retData
=
baseCore
.
uptoOBS
(
file_href
,
'1683'
,
pathType
,
file_name
)
if
retData
[
'state'
]:
if
retData
[
'state'
]:
...
@@ -3351,7 +3360,7 @@ def tian_jin():
...
@@ -3351,7 +3360,7 @@ def tian_jin():
except
:
except
:
pass
pass
end_time
=
time
.
time
()
end_time
=
time
.
time
()
print
(
f
'共抓取{count}条数据,共耗时{end_time - start_time}'
)
log
.
info
(
f
'共抓取{count}条数据,共耗时{end_time - start_time}'
)
def
tian_jin2
():
def
tian_jin2
():
"""
"""
...
@@ -3413,7 +3422,7 @@ def tian_jin():
...
@@ -3413,7 +3422,7 @@ def tian_jin():
rmtag2
.
remove
()
rmtag2
.
remove
()
contentWithTag
=
doc_href
(
'div[id="zoom"]'
)
contentWithTag
=
doc_href
(
'div[id="zoom"]'
)
if
len
(
writtenDate
)
<
1
:
if
len
(
writtenDate
)
<
1
:
writtenDate
=
''
writtenDate
=
None
if
len
(
publishDate
)
<
1
:
if
len
(
publishDate
)
<
1
:
publishDate
=
doc_href
(
'meta[name="PubDate"]'
)
.
attr
(
'content'
)
publishDate
=
doc_href
(
'meta[name="PubDate"]'
)
.
attr
(
'content'
)
soup
=
paserUrl
(
str
(
contentWithTag
),
href
)
soup
=
paserUrl
(
str
(
contentWithTag
),
href
)
...
@@ -3429,7 +3438,7 @@ def tian_jin():
...
@@ -3429,7 +3438,7 @@ def tian_jin():
or
'.XLS'
in
file_href
or
'.ZIP'
in
file_href
or
'.RAR'
in
file_href
:
or
'.XLS'
in
file_href
or
'.ZIP'
in
file_href
or
'.RAR'
in
file_href
:
file_name
=
file
.
text
.
strip
()
file_name
=
file
.
text
.
strip
()
category
=
os
.
path
.
splitext
(
file_href
)[
1
]
category
=
os
.
path
.
splitext
(
file_href
)[
1
]
if
category
not
in
file_name
:
if
category
not
in
file_name
:
file_name
=
file_name
+
category
file_name
=
file_name
+
category
retData
=
baseCore
.
uptoOBS
(
file_href
,
'1683'
,
pathType
,
file_name
)
retData
=
baseCore
.
uptoOBS
(
file_href
,
'1683'
,
pathType
,
file_name
)
if
retData
[
'state'
]:
if
retData
[
'state'
]:
...
@@ -3482,7 +3491,7 @@ def tian_jin():
...
@@ -3482,7 +3491,7 @@ def tian_jin():
except
:
except
:
pass
pass
end_time
=
time
.
time
()
end_time
=
time
.
time
()
print
(
f
'共抓取{count}条数据,共耗时{end_time - start_time}'
)
log
.
info
(
f
'共抓取{count}条数据,共耗时{end_time - start_time}'
)
def
tian_jin3
():
def
tian_jin3
():
num
=
0
num
=
0
...
@@ -3507,7 +3516,7 @@ def tian_jin():
...
@@ -3507,7 +3516,7 @@ def tian_jin():
try
:
try
:
publishDate
=
li
.
find
(
'div'
,
attrs
=
{
'class'
:
'other'
})
.
text
publishDate
=
li
.
find
(
'div'
,
attrs
=
{
'class'
:
'other'
})
.
text
except
:
except
:
publishDate
=
''
publishDate
=
None
if
'http'
not
in
href
:
if
'http'
not
in
href
:
if
'../../../'
in
href
:
if
'../../../'
in
href
:
href
=
href
.
replace
(
'../../../'
,
'https://sasac.tj.gov.cn/'
)
href
=
href
.
replace
(
'../../../'
,
'https://sasac.tj.gov.cn/'
)
...
@@ -3548,7 +3557,7 @@ def tian_jin():
...
@@ -3548,7 +3557,7 @@ def tian_jin():
rmtag2
.
remove
()
rmtag2
.
remove
()
contentWithTag
=
doc_href
(
'div[id="zoom"]'
)
contentWithTag
=
doc_href
(
'div[id="zoom"]'
)
if
len
(
writtenDate
)
<
1
:
if
len
(
writtenDate
)
<
1
:
writtenDate
=
''
writtenDate
=
None
if
len
(
publishDate
)
<
1
:
if
len
(
publishDate
)
<
1
:
publishDate
=
doc_href
(
'meta[name="PubDate"]'
)
.
attr
(
'content'
)
publishDate
=
doc_href
(
'meta[name="PubDate"]'
)
.
attr
(
'content'
)
soup
=
paserUrl
(
str
(
contentWithTag
),
href
)
soup
=
paserUrl
(
str
(
contentWithTag
),
href
)
...
@@ -3564,7 +3573,7 @@ def tian_jin():
...
@@ -3564,7 +3573,7 @@ def tian_jin():
or
'.XLS'
in
file_href
or
'.ZIP'
in
file_href
or
'.RAR'
in
file_href
:
or
'.XLS'
in
file_href
or
'.ZIP'
in
file_href
or
'.RAR'
in
file_href
:
file_name
=
file
.
text
.
strip
()
file_name
=
file
.
text
.
strip
()
category
=
os
.
path
.
splitext
(
file_href
)[
1
]
category
=
os
.
path
.
splitext
(
file_href
)[
1
]
if
category
not
in
file_name
:
if
category
not
in
file_name
:
file_name
=
file_name
+
category
file_name
=
file_name
+
category
retData
=
baseCore
.
uptoOBS
(
file_href
,
'1683'
,
pathType
,
file_name
)
retData
=
baseCore
.
uptoOBS
(
file_href
,
'1683'
,
pathType
,
file_name
)
if
retData
[
'state'
]:
if
retData
[
'state'
]:
...
@@ -3617,7 +3626,7 @@ def tian_jin():
...
@@ -3617,7 +3626,7 @@ def tian_jin():
except
:
except
:
pass
pass
end_time
=
time
.
time
()
end_time
=
time
.
time
()
print
(
f
'共抓取{count}条数据,共耗时{end_time - start_time}'
)
log
.
info
(
f
'共抓取{count}条数据,共耗时{end_time - start_time}'
)
tian_jin1
()
tian_jin1
()
tian_jin2
()
tian_jin2
()
...
@@ -3673,7 +3682,7 @@ def xin_jiang():
...
@@ -3673,7 +3682,7 @@ def xin_jiang():
or
'.XLS'
in
file_href
or
'.ZIP'
in
file_href
or
'.RAR'
in
file_href
:
or
'.XLS'
in
file_href
or
'.ZIP'
in
file_href
or
'.RAR'
in
file_href
:
file_name
=
file
.
text
.
strip
()
file_name
=
file
.
text
.
strip
()
category
=
os
.
path
.
splitext
(
file_href
)[
1
]
category
=
os
.
path
.
splitext
(
file_href
)[
1
]
if
category
not
in
file_name
:
if
category
not
in
file_name
:
file_name
=
file_name
+
category
file_name
=
file_name
+
category
retData
=
baseCore
.
uptoOBS
(
file_href
,
'1682'
,
pathType
,
file_name
)
retData
=
baseCore
.
uptoOBS
(
file_href
,
'1682'
,
pathType
,
file_name
)
if
retData
[
'state'
]:
if
retData
[
'state'
]:
...
@@ -3717,7 +3726,7 @@ def xin_jiang():
...
@@ -3717,7 +3726,7 @@ def xin_jiang():
'topicClassification'
:
""
,
'topicClassification'
:
""
,
'issuedNumber'
:
issuedNumber
,
'issuedNumber'
:
issuedNumber
,
'publishDate'
:
publishDate
,
'publishDate'
:
publishDate
,
'writtenDate'
:
""
,
'writtenDate'
:
None
,
'sid'
:
'1697458829758697473'
,
'sid'
:
'1697458829758697473'
,
'sourceAddress'
:
href
,
'sourceAddress'
:
href
,
'summary'
:
''
,
'summary'
:
''
,
...
@@ -3734,7 +3743,7 @@ def xin_jiang():
...
@@ -3734,7 +3743,7 @@ def xin_jiang():
except
:
except
:
pass
pass
end_time
=
time
.
time
()
end_time
=
time
.
time
()
print
(
f
'共抓取{count}条数据,共耗时{end_time - start_time}'
)
log
.
info
(
f
'共抓取{count}条数据,共耗时{end_time - start_time}'
)
def
xin_jiang_jsbt
():
def
xin_jiang_jsbt
():
num
=
0
num
=
0
...
@@ -3780,7 +3789,7 @@ def xin_jiang():
...
@@ -3780,7 +3789,7 @@ def xin_jiang():
or
'.XLS'
in
file_href
or
'.ZIP'
in
file_href
or
'.RAR'
in
file_href
:
or
'.XLS'
in
file_href
or
'.ZIP'
in
file_href
or
'.RAR'
in
file_href
:
file_name
=
file
.
text
.
strip
()
file_name
=
file
.
text
.
strip
()
category
=
os
.
path
.
splitext
(
file_href
)[
1
]
category
=
os
.
path
.
splitext
(
file_href
)[
1
]
if
category
not
in
file_name
:
if
category
not
in
file_name
:
file_name
=
file_name
+
category
file_name
=
file_name
+
category
retData
=
baseCore
.
uptoOBS
(
file_href
,
'1682'
,
pathType
,
file_name
)
retData
=
baseCore
.
uptoOBS
(
file_href
,
'1682'
,
pathType
,
file_name
)
if
retData
[
'state'
]:
if
retData
[
'state'
]:
...
@@ -3824,7 +3833,7 @@ def xin_jiang():
...
@@ -3824,7 +3833,7 @@ def xin_jiang():
'topicClassification'
:
""
,
'topicClassification'
:
""
,
'issuedNumber'
:
issuedNumber
,
'issuedNumber'
:
issuedNumber
,
'publishDate'
:
publishDate
,
'publishDate'
:
publishDate
,
'writtenDate'
:
""
,
'writtenDate'
:
None
,
'sid'
:
'1697458829758697473'
,
'sid'
:
'1697458829758697473'
,
'sourceAddress'
:
href
,
'sourceAddress'
:
href
,
'summary'
:
''
,
'summary'
:
''
,
...
@@ -3843,7 +3852,7 @@ def xin_jiang():
...
@@ -3843,7 +3852,7 @@ def xin_jiang():
except
:
except
:
pass
pass
end_time
=
time
.
time
()
end_time
=
time
.
time
()
print
(
f
'共抓取{count}条数据,共耗时{end_time - start_time}'
)
log
.
info
(
f
'共抓取{count}条数据,共耗时{end_time - start_time}'
)
xin_jiang1
()
xin_jiang1
()
xin_jiang_jsbt
()
xin_jiang_jsbt
()
...
@@ -3881,7 +3890,7 @@ def shan_xi():
...
@@ -3881,7 +3890,7 @@ def shan_xi():
try
:
try
:
if
".pdf"
in
href
:
if
".pdf"
in
href
:
content
=
''
content
=
''
publishDate
=
''
publishDate
=
None
origin
=
''
origin
=
''
fu_jian_soup
=
[
href
]
fu_jian_soup
=
[
href
]
contentWithTag
=
''
contentWithTag
=
''
...
@@ -3908,7 +3917,7 @@ def shan_xi():
...
@@ -3908,7 +3917,7 @@ def shan_xi():
or
'.XLS'
in
file_href
or
'.ZIP'
in
file_href
or
'.RAR'
in
file_href
:
or
'.XLS'
in
file_href
or
'.ZIP'
in
file_href
or
'.RAR'
in
file_href
:
file_name
=
file
.
text
.
strip
()
file_name
=
file
.
text
.
strip
()
category
=
os
.
path
.
splitext
(
file_href
)[
1
]
category
=
os
.
path
.
splitext
(
file_href
)[
1
]
if
category
not
in
file_name
:
if
category
not
in
file_name
:
file_name
=
file_name
+
category
file_name
=
file_name
+
category
retData
=
baseCore
.
uptoOBS
(
file_href
,
'1684'
,
pathType
,
file_name
)
retData
=
baseCore
.
uptoOBS
(
file_href
,
'1684'
,
pathType
,
file_name
)
if
retData
[
'state'
]:
if
retData
[
'state'
]:
...
@@ -3952,7 +3961,7 @@ def shan_xi():
...
@@ -3952,7 +3961,7 @@ def shan_xi():
'topicClassification'
:
""
,
'topicClassification'
:
""
,
'issuedNumber'
:
issuedNumber
,
'issuedNumber'
:
issuedNumber
,
'publishDate'
:
publishDate
,
'publishDate'
:
publishDate
,
'writtenDate'
:
""
,
'writtenDate'
:
None
,
'sid'
:
'1697458829758697473'
,
'sid'
:
'1697458829758697473'
,
'sourceAddress'
:
href
,
'sourceAddress'
:
href
,
'summary'
:
''
,
'summary'
:
''
,
...
@@ -3969,7 +3978,7 @@ def shan_xi():
...
@@ -3969,7 +3978,7 @@ def shan_xi():
except
:
except
:
pass
pass
end_time
=
time
.
time
()
end_time
=
time
.
time
()
print
(
f
'共抓取{count}条数据,共耗时{end_time - start_time}'
)
log
.
info
(
f
'共抓取{count}条数据,共耗时{end_time - start_time}'
)
# 辽宁
# 辽宁
def
liao_ning
():
def
liao_ning
():
...
@@ -4028,7 +4037,7 @@ def liao_ning():
...
@@ -4028,7 +4037,7 @@ def liao_ning():
or
'.XLS'
in
file_href
or
'.ZIP'
in
file_href
or
'.RAR'
in
file_href
:
or
'.XLS'
in
file_href
or
'.ZIP'
in
file_href
or
'.RAR'
in
file_href
:
file_name
=
file
.
text
.
strip
()
file_name
=
file
.
text
.
strip
()
category
=
os
.
path
.
splitext
(
file_href
)[
1
]
category
=
os
.
path
.
splitext
(
file_href
)[
1
]
if
category
not
in
file_name
:
if
category
not
in
file_name
:
file_name
=
file_name
+
category
file_name
=
file_name
+
category
retData
=
baseCore
.
uptoOBS
(
file_href
,
'1685'
,
pathType
,
file_name
)
retData
=
baseCore
.
uptoOBS
(
file_href
,
'1685'
,
pathType
,
file_name
)
if
retData
[
'state'
]:
if
retData
[
'state'
]:
...
@@ -4071,7 +4080,7 @@ def liao_ning():
...
@@ -4071,7 +4080,7 @@ def liao_ning():
'topicClassification'
:
""
,
'topicClassification'
:
""
,
'issuedNumber'
:
issuedNumber
,
'issuedNumber'
:
issuedNumber
,
'publishDate'
:
publishDate
,
'publishDate'
:
publishDate
,
'writtenDate'
:
""
,
'writtenDate'
:
None
,
'sid'
:
'1697458829758697473'
,
'sid'
:
'1697458829758697473'
,
'sourceAddress'
:
href
,
'sourceAddress'
:
href
,
'summary'
:
''
,
'summary'
:
''
,
...
@@ -4088,7 +4097,7 @@ def liao_ning():
...
@@ -4088,7 +4097,7 @@ def liao_ning():
except
:
except
:
pass
pass
end_time
=
time
.
time
()
end_time
=
time
.
time
()
print
(
f
'共抓取{num}条数据,共耗时{end_time - start_time}'
)
log
.
info
(
f
'共抓取{num}条数据,共耗时{end_time - start_time}'
)
# 黑龙江
# 黑龙江
def
hei_long_jiang
():
def
hei_long_jiang
():
...
@@ -4141,7 +4150,7 @@ def hei_long_jiang():
...
@@ -4141,7 +4150,7 @@ def hei_long_jiang():
or
'.XLS'
in
file_href
or
'.ZIP'
in
file_href
or
'.RAR'
in
file_href
:
or
'.XLS'
in
file_href
or
'.ZIP'
in
file_href
or
'.RAR'
in
file_href
:
file_name
=
file
.
text
.
strip
()
file_name
=
file
.
text
.
strip
()
category
=
os
.
path
.
splitext
(
file_href
)[
1
]
category
=
os
.
path
.
splitext
(
file_href
)[
1
]
if
category
not
in
file_name
:
if
category
not
in
file_name
:
file_name
=
file_name
+
category
file_name
=
file_name
+
category
retData
=
baseCore
.
uptoOBS
(
file_href
,
'1687'
,
pathType
,
file_name
)
retData
=
baseCore
.
uptoOBS
(
file_href
,
'1687'
,
pathType
,
file_name
)
if
retData
[
'state'
]:
if
retData
[
'state'
]:
...
@@ -4174,7 +4183,7 @@ def hei_long_jiang():
...
@@ -4174,7 +4183,7 @@ def hei_long_jiang():
'topicClassification'
:
''
,
'topicClassification'
:
''
,
'issuedNumber'
:
pub_hao
,
'issuedNumber'
:
pub_hao
,
'publishDate'
:
publishDate
,
'publishDate'
:
publishDate
,
'writtenDate'
:
''
,
'writtenDate'
:
None
,
'sid'
:
'1697458829758697473'
,
'sid'
:
'1697458829758697473'
,
'sourceAddress'
:
href
,
'sourceAddress'
:
href
,
'summary'
:
''
,
'summary'
:
''
,
...
@@ -4193,7 +4202,7 @@ def hei_long_jiang():
...
@@ -4193,7 +4202,7 @@ def hei_long_jiang():
except
:
except
:
pass
pass
end_time
=
time
.
time
()
end_time
=
time
.
time
()
print
(
f
'共抓取{count}条数据,共耗时{end_time - start_time}'
)
log
.
info
(
f
'共抓取{count}条数据,共耗时{end_time - start_time}'
)
# 江苏
# 江苏
def
jiang_su
():
def
jiang_su
():
...
@@ -4257,7 +4266,7 @@ def jiang_su():
...
@@ -4257,7 +4266,7 @@ def jiang_su():
or
'.XLS'
in
file_href
or
'.ZIP'
in
file_href
or
'.RAR'
in
file_href
:
or
'.XLS'
in
file_href
or
'.ZIP'
in
file_href
or
'.RAR'
in
file_href
:
file_name
=
file
.
text
.
strip
()
file_name
=
file
.
text
.
strip
()
category
=
os
.
path
.
splitext
(
file_href
)[
1
]
category
=
os
.
path
.
splitext
(
file_href
)[
1
]
if
category
not
in
file_name
:
if
category
not
in
file_name
:
file_name
=
file_name
+
category
file_name
=
file_name
+
category
retData
=
baseCore
.
uptoOBS
(
file_href
,
'1687'
,
pathType
,
file_name
)
retData
=
baseCore
.
uptoOBS
(
file_href
,
'1687'
,
pathType
,
file_name
)
if
retData
[
'state'
]:
if
retData
[
'state'
]:
...
@@ -4314,7 +4323,7 @@ def jiang_su():
...
@@ -4314,7 +4323,7 @@ def jiang_su():
except
:
except
:
pass
pass
end_time
=
time
.
time
()
end_time
=
time
.
time
()
print
(
f
'共抓取{count}条数据,共耗时{end_time - start_time}'
)
log
.
info
(
f
'共抓取{count}条数据,共耗时{end_time - start_time}'
)
# 安徽
# 安徽
def
an_hui
():
def
an_hui
():
...
@@ -4368,7 +4377,7 @@ def an_hui():
...
@@ -4368,7 +4377,7 @@ def an_hui():
or
'.XLS'
in
file_href
or
'.ZIP'
in
file_href
or
'.RAR'
in
file_href
:
or
'.XLS'
in
file_href
or
'.ZIP'
in
file_href
or
'.RAR'
in
file_href
:
file_name
=
file
.
text
.
strip
()
file_name
=
file
.
text
.
strip
()
category
=
os
.
path
.
splitext
(
file_href
)[
1
]
category
=
os
.
path
.
splitext
(
file_href
)[
1
]
if
category
not
in
file_name
:
if
category
not
in
file_name
:
file_name
=
file_name
+
category
file_name
=
file_name
+
category
retData
=
baseCore
.
uptoOBS
(
file_href
,
'1688'
,
pathType
,
file_name
)
retData
=
baseCore
.
uptoOBS
(
file_href
,
'1688'
,
pathType
,
file_name
)
if
retData
[
'state'
]:
if
retData
[
'state'
]:
...
@@ -4418,7 +4427,7 @@ def an_hui():
...
@@ -4418,7 +4427,7 @@ def an_hui():
except
:
except
:
pass
pass
end_time
=
time
.
time
()
end_time
=
time
.
time
()
print
(
f
'共抓取{count}条数据,共耗时{end_time - start_time}'
)
log
.
info
(
f
'共抓取{count}条数据,共耗时{end_time - start_time}'
)
def
an_hui2
():
def
an_hui2
():
num
=
0
num
=
0
...
@@ -4472,7 +4481,7 @@ def an_hui():
...
@@ -4472,7 +4481,7 @@ def an_hui():
or
'.XLS'
in
file_href
or
'.ZIP'
in
file_href
or
'.RAR'
in
file_href
:
or
'.XLS'
in
file_href
or
'.ZIP'
in
file_href
or
'.RAR'
in
file_href
:
file_name
=
file
.
text
.
strip
()
file_name
=
file
.
text
.
strip
()
category
=
os
.
path
.
splitext
(
file_href
)[
1
]
category
=
os
.
path
.
splitext
(
file_href
)[
1
]
if
category
not
in
file_name
:
if
category
not
in
file_name
:
file_name
=
file_name
+
category
file_name
=
file_name
+
category
retData
=
baseCore
.
uptoOBS
(
file_href
,
'1688'
,
pathType
,
file_name
)
retData
=
baseCore
.
uptoOBS
(
file_href
,
'1688'
,
pathType
,
file_name
)
if
retData
[
'state'
]:
if
retData
[
'state'
]:
...
@@ -4524,7 +4533,7 @@ def an_hui():
...
@@ -4524,7 +4533,7 @@ def an_hui():
except
:
except
:
pass
pass
end_time
=
time
.
time
()
end_time
=
time
.
time
()
print
(
f
'共抓取{count}条数据,共耗时{end_time - start_time}'
)
log
.
info
(
f
'共抓取{count}条数据,共耗时{end_time - start_time}'
)
an_hui1
()
an_hui1
()
an_hui2
()
an_hui2
()
...
@@ -4607,7 +4616,7 @@ def jiang_xi():
...
@@ -4607,7 +4616,7 @@ def jiang_xi():
or
'.XLS'
in
file_href
or
'.ZIP'
in
file_href
or
'.RAR'
in
file_href
:
or
'.XLS'
in
file_href
or
'.ZIP'
in
file_href
or
'.RAR'
in
file_href
:
file_name
=
file
.
text
.
strip
()
file_name
=
file
.
text
.
strip
()
category
=
os
.
path
.
splitext
(
file_href
)[
1
]
category
=
os
.
path
.
splitext
(
file_href
)[
1
]
if
category
not
in
file_name
:
if
category
not
in
file_name
:
file_name
=
file_name
+
category
file_name
=
file_name
+
category
retData
=
baseCore
.
uptoOBS
(
file_href
,
'1689'
,
pathType
,
file_name
)
retData
=
baseCore
.
uptoOBS
(
file_href
,
'1689'
,
pathType
,
file_name
)
if
retData
[
'state'
]:
if
retData
[
'state'
]:
...
@@ -4647,7 +4656,7 @@ def jiang_xi():
...
@@ -4647,7 +4656,7 @@ def jiang_xi():
'organ'
:
organ
,
'organ'
:
organ
,
'topicClassification'
:
topicClassification
,
'topicClassification'
:
topicClassification
,
'issuedNumber'
:
pub_hao
,
'issuedNumber'
:
pub_hao
,
'publishDate'
:
''
,
'publishDate'
:
None
,
'writtenDate'
:
writtenDate
,
'writtenDate'
:
writtenDate
,
'sid'
:
'1697458829758697473'
,
'sid'
:
'1697458829758697473'
,
'sourceAddress'
:
href
,
'sourceAddress'
:
href
,
@@ -4665,7 +4674,7 @@ def jiang_xi():
             except:
                 pass
     end_time = time.time()
-    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+    log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
 
 # 河南
 def he_nan():
@@ -4711,7 +4720,7 @@ def he_nan():
                     or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                 file_name = file.text.strip()
                 category = os.path.splitext(file_href)[1]
                 if category not in file_name:
                     file_name = file_name + category
                 retData = baseCore.uptoOBS(file_href, '1690', pathType, file_name)
                 if retData['state']:
@@ -4750,7 +4759,7 @@ def he_nan():
             'topicClassification': '',
             'issuedNumber': issuedNumber,
             'publishDate': publishDate,
-            'writtenDate': '',
+            'writtenDate': None,
             'sid': '1697458829758697473',
             'sourceAddress': href,
             'summary': '',
@@ -4767,7 +4776,7 @@ def he_nan():
             except:
                 pass
     end_time = time.time()
-    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+    log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
 
 # 湖南
 def hu_nan():
@@ -4828,7 +4837,7 @@ def hu_nan():
                     or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                 file_name = file.text.strip()
                 category = os.path.splitext(file_href)[1]
                 if category not in file_name:
                     file_name = file_name + category
                 retData = baseCore.uptoOBS(file_href, '1691', pathType, file_name)
                 if retData['state']:
@@ -4878,7 +4887,7 @@ def hu_nan():
             except:
                 pass
     end_time = time.time()
-    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+    log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
 
 # 甘肃
 def gan_su():
@@ -4963,7 +4972,7 @@ def gan_su():
                     or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                 file_name = file.text.strip()
                 category = os.path.splitext(file_href)[1]
                 if category not in file_name:
                     file_name = file_name + category
                 retData = baseCore.uptoOBS(file_href, '1696', file_name)
                 if retData['state']:
@@ -5015,7 +5024,7 @@ def gan_su():
                 pass
         bro.quit()
         end_time = time.time()
-        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+        log.info(f'共抓取{num}条数据,共耗时{end_time - start_time}')
 
     def gan_su2():
         num = 0
@@ -5097,7 +5106,7 @@ def gan_su():
             origin = doc('div[class="links_tab"]>table>tbody>tr:nth-child(2)>td:nth-child(2)').text()
             pub_hao = doc('div[class="links_tab"]>table>tbody>tr:nth-child(5)>td:nth-child(2)').text()
             contentWithTag = doc('div[id="content"]')
-            print(title)
+            # print(title)
             soup = paserUrl(str(contentWithTag), href)
             try:
@@ -5119,7 +5128,7 @@ def gan_su():
                     or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                 file_name = file.text.strip()
                 category = os.path.splitext(file_href)[1]
                 if category not in file_name:
                     file_name = file_name + category
                 log.info(f'{file_name}---{href}--')
                 retData = baseCore.uptoOBS(file_href, '1696', file_name)
@@ -5176,7 +5185,7 @@ def gan_su():
                 pass
         bro.quit()
         end_time = time.time()
-        print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+        log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
 
     def gan_su3():
         num = 0
@@ -5260,13 +5269,13 @@ def gan_su():
             origin = doc('div[class="links_tab"]>table>tbody>tr:nth-child(2)>td:nth-child(2)').text()
             pub_hao = doc('div[class="links_tab"]>table>tbody>tr:nth-child(5)>td:nth-child(2)').text()
             contentWithTag = doc('div[id="content"]')
-            print(title)
+            # print(title)
             if len(title) == 0 or contentWithTag.text() == '':
                 title = doc('div[class="main"]>h1').text().lstrip().strip()
                 writtenDate = doc('div[class="main"]>div[class="clearbox"]>p:nth-child(1)').text().split('日期:')[0].split(' ')[0].lstrip().strip()
                 origin = doc('div[class="main"]>div[class="clearbox"]>p:nth-child(1)').text().split('来源:')[0].lstrip().strip()
                 contentWithTag = doc('div[class="detailContent"]')
-            print(title)
+            # print(title)
             soup = paserUrl(str(contentWithTag), href)
             try:
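gan_su3 first tries the standard detail template and, when the title or body comes back empty, re-reads the same metadata from an alternate layout. A reduced sketch of that fallback with pyquery; the sample HTML, the primary title selector, and the use of index [1] after splitting on '日期:' are illustrative choices (the hunk itself keeps index [0]):

# --- sketch, not part of the commit ---
from pyquery import PyQuery as pq

html = '''
<div class="main">
  <h1> 某政策文件标题 </h1>
  <div class="clearbox"><p>日期: 2023-10-21 来源: 甘肃省国资委</p></div>
  <div class="detailContent"><p>正文内容……</p></div>
</div>
'''
doc = pq(html)

# Primary template: metadata table plus div#content (absent on this page).
title = doc('div[class="links_tab"] h2').text()
contentWithTag = doc('div[id="content"]')

# Fallback template, following the shape of the hunk above.
if len(title) == 0 or contentWithTag.text() == '':
    title = doc('div[class="main"]>h1').text().lstrip().strip()
    meta = doc('div[class="main"]>div[class="clearbox"]>p:nth-child(1)').text()
    writtenDate = meta.split('日期:')[1].strip().split(' ')[0]
    contentWithTag = doc('div[class="detailContent"]')

print(title, writtenDate, contentWithTag.text())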
@@ -5288,7 +5297,7 @@ def gan_su():
                     or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                 file_name = file.text.strip()
                 category = os.path.splitext(file_href)[1]
                 if category not in file_name:
                     file_name = file_name + category
                 retData = baseCore.uptoOBS(file_href, '1696', file_name)
                 if retData['state']:
@@ -5304,7 +5313,7 @@ def gan_su():
             content = soup.text
             if content == '' or content == None:
                 log.info(f'-----{href}----{title}----内容为空-----')
-                print(bro.page_source)
+                # print(bro.page_source)
                 continue
             if len(content) < 2:
                 continue
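The guard above drops records whose extracted body is empty or trivially short. The same test can be folded into one predicate (a sketch; the names mirror the loop variables):

# --- sketch, not part of the commit ---
def is_valid_content(content) -> bool:
    # Falsy covers both '' and None, the two cases the hunk tests
    # separately; the length check additionally drops one-character noise.
    return bool(content) and len(content) >= 2

assert not is_valid_content(None)
assert not is_valid_content('')
assert not is_valid_content('x')
assert is_valid_content('正文内容')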
@@ -5345,7 +5354,7 @@ def gan_su():
                 pass
         bro.quit()
         end_time = time.time()
-        print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+        log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
 
     gan_su1()
     gan_su2()
@@ -5401,7 +5410,7 @@ def ning_xia():
                     or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                 file_name = file.text.strip()
                 category = os.path.splitext(file_href)[1]
                 if category not in file_name:
                     file_name = file_name + category
                 retData = baseCore.uptoOBS(file_href, '1697', pathType, file_name)
                 if retData['state']:
@@ -5453,7 +5462,7 @@ def ning_xia():
             except:
                 pass
     end_time = time.time()
-    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+    log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
 
 # 陕西
 def shanxi():
@@ -5511,7 +5520,7 @@ def shanxi():
                     or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                 file_name = file.text.strip()
                 category = os.path.splitext(file_href)[1]
                 if category not in file_name:
                     file_name = file_name + category
                 retData = baseCore.uptoOBS(file_href, '1680', pathType, file_name)
                 if retData['state']:
@@ -5544,7 +5553,7 @@ def shanxi():
             'topicClassification': "",
             'issuedNumber': "",
             'publishDate': publishDate,
-            'writtenDate': "",
+            'writtenDate': None,
             'sid': '1697458829758697473',
             'sourceAddress': href,
             'summary': '',
@@ -5563,7 +5572,7 @@ def shanxi():
             except:
                 pass
     end_time = time.time()
-    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+    log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
 
 # 西藏
 def xi_zang():
@@ -5617,7 +5626,7 @@ def xi_zang():
                     or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                 file_name = file.text.strip()
                 category = os.path.splitext(file_href)[1]
                 if category not in file_name:
                     file_name = file_name + category
                 retData = baseCore.uptoOBS(file_href, '1695', pathType, file_name)
                 if retData['state']:
@@ -5647,7 +5656,7 @@ def xi_zang():
             'topicClassification': "",
             'issuedNumber': "",
             'publishDate': publishDate,
-            'writtenDate': "",
+            'writtenDate': None,
             'sid': '1697458829758697473',
             'sourceAddress': href,
             'summary': '',
@@ -5664,7 +5673,7 @@ def xi_zang():
             except:
                 pass
     end_time = time.time()
-    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+    log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
 
 # 青海
 def qing_hai():
@@ -5722,7 +5731,7 @@ def qing_hai():
                     or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                 file_name = file.text.strip()
                 category = os.path.splitext(file_href)[1]
                 if category not in file_name:
                     file_name = file_name + category
                 retData = baseCore.uptoOBS(file_href, '1681', pathType, file_name)
                 if retData['state']:
@@ -5771,7 +5780,7 @@ def qing_hai():
             except:
                 pass
         end_time = time.time()
-        print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+        log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
 
     def qing_hai2():
         num = 0
@@ -5849,7 +5858,7 @@ def qing_hai():
                     or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                 file_name = file.text.strip()
                 category = os.path.splitext(file_href)[1]
                 if category not in file_name:
                     file_name = file_name + category
                 retData = baseCore.uptoOBS(file_href, '1681', pathType, file_name)
                 if retData['state']:
@@ -5899,7 +5908,7 @@ def qing_hai():
             except:
                 pass
         end_time = time.time()
-        print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+        log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
 
     qing_hai1()
     qing_hai2()
@@ -5943,7 +5952,7 @@ def he_bei():
                     or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                 file_name = file.text.strip()
                 category = os.path.splitext(file_href)[1]
                 if category not in file_name:
                     file_name = file_name + category
                 retData = baseCore.uptoOBS(file_href, '1668', pathType, file_name)
                 if retData['state']:
@@ -5987,7 +5996,7 @@ def he_bei():
             'topicClassification': "",
             'issuedNumber': issuedNumber,
             'publishDate': publishDate,
-            'writtenDate': "",
+            'writtenDate': None,
             'sid': '1697458829758697473',
             'sourceAddress': href,
             'summary': '',
@@ -6002,7 +6011,7 @@ def he_bei():
             except:
                 pass
     end_time = time.time()
-    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+    log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
 
 # 湖北
 def hu_bei():
@@ -6068,7 +6077,7 @@ def hu_bei():
                     or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                 file_name = file.text.strip()
                 category = os.path.splitext(file_href)[1]
                 if category not in file_name:
                     file_name = file_name + category
                 retData = baseCore.uptoOBS(file_href, '1675', pathType, file_name)
                 if retData['state']:
@@ -6120,44 +6129,45 @@ def hu_bei():
             pass
     driver.close()
     end_time = time.time()
-    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
+    log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
 
 if __name__ == '__main__':
-    # get_content1()
+    get_content1()
-    # get_content2()
+    get_content2()
-    # get_content3()
+    get_content3()
-    # bei_jing()
+    bei_jing()
-    # nei_meng_gu()
+    nei_meng_gu()
-    # ji_lin()
+    ji_lin()
-    # shang_hai()
+    shang_hai()
-    # zhe_jiang()
+    zhe_jiang()
-    # fu_jian()
+    fu_jian()
-    # shan_dong()
+    shan_dong()
-    # guang_dong()
+    guang_dong()
-    # hai_nan()
+    hai_nan()
-    # si_chuan()
+    si_chuan()
-    # guang_xi()
+    guang_xi()
-    # gui_zhou()
+    gui_zhou()
-    # yun_nan()
+    yun_nan()
-    # chong_qing()
+    chong_qing()
-    # tian_jin()
+    tian_jin()
-    # xin_jiang()
+    xin_jiang()
-    # shan_xi()
+    shan_xi()
-    # liao_ning()
+    liao_ning()
-    # hei_long_jiang()
+    hei_long_jiang()
-    # jiang_su()
+    jiang_su()
-    # an_hui()
+    an_hui()
-    # jiang_xi()
+    jiang_xi()
-    # he_nan()
+    he_nan()
-    # hu_nan()
+    hu_nan()
     gan_su()
-    # ning_xia()
+    ning_xia()
-    # xi_zang()
+    xi_zang()
-    # shanxi()
+    shanxi()
-    # qing_hai()
+    qing_hai()
-    # he_bei()
+    he_bei()
-    # qing_hai()
+    qing_hai()
-    # current_time = datetime.datetime.now()
+    current_time = datetime.datetime.now()
-    # midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
+    midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
-    # sleep_seconds = (midnight_time - current_time).total_seconds()
+    sleep_seconds = (midnight_time - current_time).total_seconds()
-    # time.sleep(sleep_seconds)
+    time.sleep(sleep_seconds)
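The freshly uncommented tail computes how long to sleep until the next midnight so the whole battery of collectors reruns daily. The computation in isolation:

# --- sketch, not part of the commit ---
import datetime
import time

current_time = datetime.datetime.now()
# Next midnight: today at 00:00 plus one day.
midnight_time = current_time.replace(hour=0, minute=0, second=0,
                                     microsecond=0) + datetime.timedelta(days=1)
sleep_seconds = (midnight_time - current_time).total_seconds()
print(f'sleeping {sleep_seconds:.0f}s until {midnight_time}')
# time.sleep(sleep_seconds)  # would block until the next daily run

Note that this is a one-shot delay: as written, the process must be restarted (or the block wrapped in a loop) for the schedule to repeat.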
comData/policylaw/tingtype.py (view file @ 4d6ca3e2)
-import datetime
 import json
 import random
 import time
 from urllib.parse import urljoin
+import datetime
 import pymongo
 from kafka import KafkaProducer
 from tqdm import tqdm
@@ -12,15 +11,31 @@ import pymysql
 import requests
 from bs4 import BeautifulSoup
 import urllib3
-from base.BaseCore import BaseCore
+from lxml import etree
+from BaseCore import BaseCore
 
 baseCore = BaseCore()
 log = baseCore.getLogger()
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 cnx = baseCore.cnx
 cursor = baseCore.cursor
 db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').caiji['国务院_国资委_copy1']
+headers = {
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+    'Accept-Encoding': 'gzip, deflate',
+    'Accept-Language': 'zh-CN,zh;q=0.9',
+    'Cache-Control': 'no-cache',
+    'Connection': 'keep-alive',
+    'Cookie': 'Hm_lvt_fa835457efbc11dfb88752e70521d23b=1690184499; Hm_lpvt_fa835457efbc11dfb88752e70521d23b=1690184499; SF_cookie_1=98184645; Hm_lvt_2b5618a441c142a90e1a75f4b226c252=1690189470; Hm_lpvt_2b5618a441c142a90e1a75f4b226c252=1690189470; zh_choose=n; wdcid=30ffdae06d11dbde; wdlast=1690189470; wdses=13ee59561f2fb725',
+    'Host': 'www.sasac.gov.cn',
+    'Pragma': 'no-cache',
+    'Referer': 'https://www.baidu.com/link?url=CcQEFfXAeQsxu1IlLlxj8WHugAcJ7sBjOBqvZYDfN7WE6OZpSUM4prK6DiADOqTP&wd=&eqid=d507a037000987780000000364be37d4',
+    'Upgrade-Insecure-Requests': '1',
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
+}
 
 def paserUrl(html, listurl):
     # soup = BeautifulSoup(html, 'html.parser')
     # 获取所有的<a>标签和<img>标签
     links = html.find_all(['a', 'img'])
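paserUrl rewrites every relative href and src on a detail page into an absolute URL, so stored content keeps working outside the source site. A sketch of the idea with BeautifulSoup and urljoin; only the find_all(['a', 'img']) line is visible in this hunk, so the attribute handling below is an assumption:

# --- sketch, not part of the commit ---
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def paser_url(html: str, listurl: str) -> BeautifulSoup:
    soup = BeautifulSoup(html, 'html.parser')
    # Rewrite both hyperlinks and images in one pass.
    for tag in soup.find_all(['a', 'img']):
        attr = 'href' if tag.name == 'a' else 'src'
        if tag.get(attr):
            tag[attr] = urljoin(listurl, tag[attr])
    return soup

soup = paser_url('<a href="../c123/content.html">附件</a>',
                 'http://www.sasac.gov.cn/n2588020/index.html')
print(soup.a['href'])  # http://www.sasac.gov.cn/c123/content.html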
@@ -36,18 +51,19 @@ def paserUrl(html,listurl):
 
 def save_data(dic_news):
     aaa_dic = {
         '附件id': dic_news['attachmentIds'],
         '网址': dic_news['sourceAddress'],
         'tid': dic_news['labels'][0]['relationId'],
         '来源': dic_news['labels'][0]['relationName'],
         '创建时间': dic_news['createDate'],
         '带标签内容': dic_news['contentWithTag'][:100]
     }
     db_storage.insert_one(aaa_dic)
 
 def sendKafka(dic_news):
     start_time = time.time()
     try:  # 114.116.116.241
         producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
         kafka_result = producer.send("policy",
                                      json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
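sendKafka serializes the whole record as UTF-8 JSON and publishes it on the policy topic. A minimal reproduction with kafka-python, using the broker address from the diff, so it only runs where that broker is reachable:

# --- sketch, not part of the commit ---
import json
from kafka import KafkaProducer

def send_policy(dic_news: dict) -> None:
    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
    # ensure_ascii=False keeps the Chinese text readable in the payload.
    payload = json.dumps(dic_news, ensure_ascii=False).encode('utf8')
    future = producer.send('policy', payload)
    record_metadata = future.get(timeout=10)  # block until the broker acks
    print(record_metadata.topic, record_metadata.partition, record_metadata.offset)

send_policy({'title': '测试政策', 'publishDate': None})

Creating one producer per message, as the source does, works but is costly; reusing a single producer for the life of the process is the usual choice.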
@@ -78,90 +94,121 @@ def sendKafka(dic_news):
         state = 0
         takeTime = baseCore.getTimeCost(start_time, time.time())
 
-def work(href_type, ting_type, relationId):
-    ip = baseCore.get_proxy()
-    log.info(f'\n================厅局类别==={ting_type}========================')
-    if 'http' in href_type:
-        url_type = href_type
-    else:
-        url_type = 'http://www.sasac.gov.cn/' + href_type.replace('../', '')
-    # print(url_type)
-    i_res = requests.get(url=url_type, headers=headers, proxies=ip)
-    i_soup = BeautifulSoup(i_res.content, 'html.parser')
-    time.sleep(2)
-    news_list = i_soup.find('div', class_='tjywBottom').find_all('li')
-    # 文章列表
-    # print('================新闻列表==================')
-    for news in tqdm(news_list):
-        try:
-            news_href = news.find('a')['href']
-        except:
-            continue
-        if 'http' in news_href:
-            news_url = news_href
-        else:
-            news_url = 'http://www.sasac.gov.cn/' + news_href.replace('../', '')
-        # 判断是否已经爬取过
-        is_href = db_storage.find_one({'网址': news_url})
-        if is_href:
-            log.info('已采集----------跳过')
-            continue
-        news_title = news.find('a').text.split('[')[0]
-        log.info(f'\n----正在采集: {news_title}-------')
-        pub_time = news.find('span').text.replace('[', '').replace(']', '')
-        # 文章信息
-        header = {
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
-            'Accept-Encoding': 'gzip, deflate',
-            'Accept-Language': 'zh-CN,zh;q=0.9',
-            'Cache-Control': 'no-cache',
-            'Cookie': 'wdcid=30ffdae06d11dbde; __jsluid_h=e623973ba12a5f48b086f8c5cee6fffa; SF_cookie_1=67313298; Hm_lvt_fa835457efbc11dfb88752e70521d23b=1693808034; zh_choose=n; Hm_lvt_2b5618a441c142a90e1a75f4b226c252=1694078708; wdses=381c6ab86ce01570; wdlast=1694163647; Hm_lpvt_fa835457efbc11dfb88752e70521d23b=1694163647; Hm_lpvt_2b5618a441c142a90e1a75f4b226c252=1694165617',
-            'Host': 'www.sasac.gov.cn',
-            'Pragma': 'no-cache',
-            'Proxy-Connection': 'keep-alive',
-            'Referer': 'http://www.sasac.gov.cn/n2588020/n2588072/n2590818/n2590820/c28651762/content.html',
-            'Upgrade-Insecure-Requests': '1',
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
-        }
-        # news_url = 'http://www.sasac.gov.cn/n2588020/n2588072/n2590818/n2590820/c28102228/content.html'
-        ii_res = requests.get(url=news_url, headers=header, proxies=ip)
-        ii_soup = BeautifulSoup(ii_res.content, 'html.parser')
-        # todo:相对路径转化为绝对路径
-        ii_soup = paserUrl(ii_soup, news_url)
-        # 去掉扫一扫
-        try:
-            ii_soup.find('div', id='qr_container').decompose()
-        except:
-            pass
-        # 去掉style标签
-        for styleTag in ii_soup.find_all('style'):
-            styleTag.extract()
-        try:
-            news_info = ii_soup.find('div', class_='zsy_cotitle')
-        except Exception as e:
-            log.error(e)
-            news_info = ''
-        if news_info:
-            try:
-                pub_source = news_info.find('p').text.split('文章来源:')[1].split('发布时间')[0].strip()
-            except:
-                pub_source = ''
-            try:
-                contentWithTag = ii_soup.find('div', 'zsy_comain')
-                content = contentWithTag.text.strip()
-            except:
-                content = ''
-                contentWithTag = ''
-            if len(content) > 100:
-                pass
-            else:
-                continue
+# 国资委_内设机构
+def gzw_nsjg():
+    # 获取页面数据
+    def get_page_nsjg(href, ting_type, relationId, page):
+        start_time = time.time()
+        num = 0
+        for pageNo in range(1, page + 1):
+            if pageNo != 1:
+                href = href.replace(f'_{pageNo - 1}.html', f'_{pageNo}.html')
+            if pageNo == page:
+                tag = href.split('/')[-1]
+                href = href.replace(tag, 'index.html')
+            try:
+                req = requests.get(url=href, headers=headers, verify=False)
+                req_text = req.text.encode("ISO-8859-1")
+                req_text = req_text.decode("utf-8")
+                soup = BeautifulSoup(req_text, 'html.parser')
+                soup = paserUrl(soup, href)
+                li_list = soup.find('ul', attrs={'class': 'ld-tjywList'}).find_all('li')
+            except:
+                req = requests.get(url=href, headers=headers, verify=False)
+                req_text = req.text.encode("ISO-8859-1")
+                req_text = req_text.decode("utf-8")
+                soup = BeautifulSoup(req_text, 'html.parser')
+                soup = paserUrl(soup, href)
+                li_list = soup.find_all('li')
+            for li in li_list:
+                try:
+                    real_href = li.find('a').get('href')
+                except:
+                    continue
+                is_href = db_storage.find_one({'网址': real_href})
+                if is_href:
+                    log.info('已采集----------跳过')
+                    continue
+                try:
+                    try:
+                        req_ = requests.get(url=real_href, headers=headers, verify=False)
+                        req_.encoding = req_.apparent_encoding
+                        soup_ = BeautifulSoup(req_.text, 'html.parser')
+                        div_content = soup_.find('div', attrs={'class': 'zsy_content'})
+                        pub_result = div_content.find('div', attrs={'class': 'zsy_cotitle'})
+                        try:
+                            title = str(pub_result.text).split('文章来源:')[0].replace('\n', '').replace('\r', '').lstrip().strip()
+                            publishDate = str(pub_result.text).split('发布时间:')[1].strip().lstrip()
+                            pub_source = str(pub_result.text).split('文章来源:')[1].split('发布时间:')[0].lstrip().strip()
+                        except:
+                            title = str(pub_result.text).split('发布时间:')[0].replace('\n', '').replace('\r', '').lstrip().strip()
+                            publishDate = str(pub_result.text).split('发布时间:')[1].strip().lstrip()
+                            pub_source = ''
+                        req_.close()
+                    except:
+                        try:
+                            req_ = requests.get(url=real_href, headers=headers, verify=False)
+                            time.sleep(2)
+                            req_.encoding = req_.apparent_encoding
+                            soup_ = BeautifulSoup(req_.text, 'html.parser')
+                            pub_result = soup_.find('div', attrs={'class': 'zsy_cotitle'})
+                            real_href = str(pub_result.text).split('location.href="')[1].split('";')[0].lstrip().strip()
+                            req_.close()
+                            req_ = requests.get(url=real_href, headers=headers, verify=False)
+                            req_.encoding = req_.apparent_encoding
+                            soup_ = BeautifulSoup(req_.text, 'html.parser')
+                            div_content = soup_.find('div', attrs={'class': 'zsy_content'})
+                            pub_result = div_content.find('div', attrs={'class': 'zsy_cotitle'})
+                            try:
+                                title = str(pub_result.text).split('文章来源:')[0].replace('\n', '').replace('\r', '').lstrip().strip()
+                                publishDate = str(pub_result.text).split('发布时间:')[1].strip().lstrip()
+                                pub_source = str(pub_result.text).split('文章来源:')[1].split('发布时间:')[0].lstrip().strip()
+                            except:
+                                title = str(pub_result.text).split('发布时间:')[0].replace('\n', '').replace('\r', '').lstrip().strip()
+                                publishDate = str(pub_result.text).split('发布时间:')[1].strip().lstrip()
+                                pub_source = ''
+                        except:
+                            req_ = requests.get(url=real_href, headers=headers, verify=False)
+                            req_.encoding = req_.apparent_encoding
+                            soup_ = BeautifulSoup(req_.text, 'html.parser')
+                            yaoqiu_list = soup_.find('div', attrs={'class': 'yaoqiu_list'})
+                            li_list_ = yaoqiu_list.find_all('li')
+                            for li_ in li_list_:
+                                href_ = li_.find('a').get('href')
+                                real_href = href_.replace('../../../', 'http://www.sasac.gov.cn/')
+                                req_ = requests.get(url=real_href, headers=headers, verify=False)
+                                req_.encoding = req_.apparent_encoding
+                                soup_ = BeautifulSoup(req_.text, 'html.parser')
+                                div_content = soup_.find('div', attrs={'class': 'zsy_content'})
+                                pub_result = div_content.find('div', attrs={'class': 'zsy_cotitle'})
+                                try:
+                                    title = str(pub_result.text).split('文章来源:')[0].replace('\n', '').replace('\r', '').lstrip().strip()
+                                    publishDate = str(pub_result.text).split('发布时间:')[1].strip().lstrip()
+                                    pub_source = str(pub_result.text).split('文章来源:')[1].split('发布时间:')[0].lstrip().strip()
+                                except:
+                                    title = str(pub_result.text).split('发布时间:')[0].replace('\n', '').replace('\r', '').lstrip().strip()
+                                    publishDate = str(pub_result.text).split('发布时间:')[1].strip().lstrip()
+                                    pub_source = ''
+                    if 'location.href' in title:
+                        continue
+                    if '404 Ba' in str(div_content):
+                        continue
+                    contentWithTag = div_content.find('div', class_='zsy_comain')
+                    try:
+                        contentWithTag.find('div', id='qr_container').decompose()
+                    except:
+                        pass
+                    # 去掉style标签
+                    for styleTag in contentWithTag.find_all('style'):
+                        styleTag.extract()
+                    content = contentWithTag.text
+                    if content == '':
+                        log.error(f'{real_href}===获取正文失败')
+                        continue
             time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
             dic_news = {
                 'attachmentIds': [],
                 'author': '',
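get_page_nsjg walks a listing by rewriting the _{n}.html suffix of the index URL once per page, falling back to the bare index.html on the final pass, and skips any article whose URL is already stored in MongoDB. The URL arithmetic in isolation; the sample URL is the 办公厅 entry from href_list below, and the Mongo lookup is shown as a comment since it needs a live connection:

# --- sketch, not part of the commit ---
def page_urls(first_page: str, pages: int):
    """Yield listing URLs the way get_page_nsjg rewrites them."""
    href = first_page
    for pageNo in range(1, pages + 1):
        if pageNo != 1:
            href = href.replace(f'_{pageNo - 1}.html', f'_{pageNo}.html')
        if pageNo == pages:
            tag = href.split('/')[-1]
            href = href.replace(tag, 'index.html')
        yield href

for url in page_urls('http://www.sasac.gov.cn/n2588020/n2588072/n2590818/n2590820/index_2642999_1.html', 3):
    print(url)
# index_2642999_1.html, index_2642999_2.html, then .../index.html

# Dedup mirrors the hunk: one lookup keyed on the stored source URL.
# is_href = db_storage.find_one({'网址': real_href})
# if is_href: continue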
@@ -175,118 +222,105 @@ def work(href_type,ting_type,relationId):
                 'organ': '',
                 'topicClassification': '',
                 'issuedNumber': '',
-                'publishDate': pub_time,
+                'publishDate': publishDate,
-                'writtenDate': '',
+                'writtenDate': None,
                 'sid': '1697458829758697473',
-                'sourceAddress': news_url,
+                'sourceAddress': real_href,
                 'summary': '',
-                'title': news_title
+                'title': title
             }
+            #print(content)
+            #print(contentWithTag)
             sendKafka(dic_news)
             save_data(dic_news)
-            log.info(f'{ting_type}-----{news_title}----发送成功',)
-        else:
-            dic_error = {
-                '标题': news_title,
-                '原文链接': news_url,
-                '厅局类别': ting_type
-            }
-            log.error(dic_error)
+            log.info(f'{ting_type}-----{title}----发送成功',)
 
-#中央纪委国家监委驻国资委纪检监察组
-def job1(a_type):
-    href = a_type['href']
-    ting_type = a_type.text
-    return href, ting_type
-
-def job():
-    url = 'http://www.sasac.gov.cn/n2588020/index.html'
-    ip = baseCore.get_proxy()
-    res = requests.get(url=url, headers=headers, proxies=ip)
-    soup = BeautifulSoup(res.content, 'html.parser')
-    time.sleep(2)
-    # 厅局列表
-    list_type = soup.find('div', class_='l-jgkk-right column').find_all('dd')[:22]
-    a_soup = soup.find('div', class_='l-jgkk-right column').find_all('dt')[0]
-    a_type = a_soup.text.strip()
-    a_href = a_soup.find('a')['href']
-    a_id = '1874'
-    list_error = []
-    num = 0
-    start_time = time.time()
-    work(a_href, a_type, a_id)
-    for type in tqdm(list_type):
-        list_news = []
-        href_type = type.find('a')['href']
-        ting_type = type.find('a').text
-        try:
-            relationId = mapId_dic[ting_type]
-        except:
-            continue
-        work(href_type, ting_type, relationId)
         num += 1
+                except Exception as e:
+                    pass
+            req.close()
     end_time = time.time()
-    log.info(f'共抓取{num}条数据,共耗时{end_time - start_time}')
-    time.sleep(1)
-    # writer.save()
-    # df_error = pd.DataFrame(list_error)
-    # df_error.to_excel('未采到文章.xlsx',index=False)
+        print(f'抓取{num}条数据,共耗时{end_time - start_time}')
+
+    # 获取页面列表
+    def get_page_nsjg_list(href, institution, tid):
+        href_list = {
+            '办公厅(党委办公厅)': ['http://www.sasac.gov.cn/n2588020/n2588072/n2590818/n2590820/index_2642999_1.html', 9],
+            '综合研究局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591482/n2591484/index_2656923_1.html', 5],
+            '政策法规局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2590860/n2590862/index_2644230_1.html', 21],
+            '规划发展局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2590902/n2590904/index_2646556_1.html', 9],
+            '财务监管与运行评价局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2590944/n2590946/index_2647546_1.html', 9],
+            '产权管理局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591020/n2591022/index_2648251_1.html', 7],
+            '企业改革局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591064/n2591066/index_2648748_1.html', 15],
+            '考核分配局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591106/n2591108/index_2649149_1.html', 6],
+            '资本运营与收益管理局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591192/n2591194/index_2649585_1.html', 3],
+            '科技创新局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591148/n2591150/index_2650085_1.html', 14],
+            '社会责任局': ['http://www.sasac.gov.cn/n2588020/n2588072/n23746822/n23746853/index_23747054_.html', 10],
+            '综合监督局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591284/n2591286/index.html', 1],
+            '监督追责局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591266/n2591268/index_2654822_1.html', 2],
+            '企业领导人员管理一局(董事会工作局)': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591302/n2591304/index_2657539_1.html', 4],
+            '企业领导人员管理二局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591344/n2591346/index_2657636_1.html', 4],
+            '党建工作局(党委组织部、党委统战部)': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591386/n2591388/index_2656630_1.html', 14],
+            '宣传工作局(党委宣传部)': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591426/n2591428/index_2656835_1.html', 21],
+            '国际合作局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591548/n2591550/index_2657011_1.html', 28],
+            '人事局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591586/n2591588/index_2656275_1.html', 7],
+            '行业协会商会党建工作局(行业协会商会工作局)': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591626/n2591628/index_2656076_1.html', 4],
+            '机关服务管理局(离退休干部管理局)': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591644/n2591646/index_2655780_1.html', 9],
+            '机关党委': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591684/n2591686/index_2655222_1.html', 33],
+            '党委巡视工作办公室、国资委巡视组': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591770/n2591772/index_2655029_1.html', 8],
+            '中央纪委国家监委驻国资委纪检监察组': ['http://www.sasac.gov.cn/n2588020/n2877928/n2878219/index_2879099_1.html', 18]}
+        href_ = href_list[institution][0]
+        page = href_list[institution][1]
+        get_page_nsjg(href_, institution, tid, page)
+
+    # 开始
+    def gzw_nsjg_start():
+        url = 'http://www.sasac.gov.cn/n2588020/index.html'
+        req = requests.get(url=url, headers=headers, verify=False)
+        req_text = req.text.encode("ISO-8859-1")
+        req_text = req_text.decode("utf-8")
+        all_institution = []
+        tree = etree.HTML(req_text)
+        institution = tree.xpath('/html/body/div[4]/div[2]/div/dl[1]/dt/a/text()')[0].replace('\n', '').replace('\r', '')
+        institution_href = tree.xpath('/html/body/div[4]/div[2]/div/dl[1]/dt/a/@href')[0].replace('../', 'http://www.sasac.gov.cn/')
+        all_institution.append([institution, institution_href])
+        dd_list = tree.xpath('/html/body/div[4]/div[2]/div/dl[2]/dd')
+        for dd in dd_list:
+            institution = dd.xpath('./a/text()')[0].replace('\n', '').replace('\r', '')
+            institution_href = dd.xpath('./a/@href')[0].replace('../', 'http://www.sasac.gov.cn/')
+            all_institution.append([institution, institution_href])
+        tids = {'办公厅(党委办公厅)': 1643, '综合研究局': 1644, '政策法规局': 1645, '规划发展局': 1646,
+                '财务监管与运行评价局': 1647, '产权管理局': 1648, '企业改革局': 1649, '考核分配局': 1650,
+                '资本运营与收益管理局': 1651, '科技创新局': 1652, '社会责任局': 2064, '综合监督局': 1653,
+                '监督追责局': 1654, '企业领导人员管理一局(董事会工作局)': 1655, '企业领导人员管理二局': 1656,
+                '党建工作局(党委组织部、党委统战部)': 1657, '宣传工作局(党委宣传部)': 1658, '国际合作局': 1659,
+                '人事局': 1660, '行业协会商会党建工作局(行业协会商会工作局)': 1661,
+                '机关服务管理局(离退休干部管理局)': 1662, '机关党委': 1663,
+                '党委巡视工作办公室、国资委巡视组': 1664, '中央纪委国家监委驻国资委纪检监察组': 1874}
+        for a in all_institution:
+            institution = a[0]
+            href = a[1]
+            tid = tids[institution]
+            log.info(f'\n================厅局类别==={institution}========================')
+            get_page_nsjg_list(href, institution, tid)
+
+    gzw_nsjg_start()
 
 if __name__ == '__main__':
-    mapId_dic = {
-        '办公厅(党委办公厅)': '1643', '综合研究局': '1644', '政策法规局': '1645', '规划发展局': '1646',
-        '财务监管与运行评价局': '1647', '产权管理局': '1648', '企业改革局': '1649', '考核分配局': '1650',
-        '资本运营与收益管理局': '1651', '科技创新局': '1652', '综合监督局': '1653', '监督追责局': '1654',
-        '企业领导人员管理一局(董事会工作局)': '1655', '企业领导人员管理二局': '1656',
-        '党建工作局(党委组织部、党委统战部)': '1657', '宣传工作局(党委宣传部)': '1658', '国际合作局': '1659',
-        '人事局': '1660', '机关服务管理局(离退休干部管理局)': '1662', '机关党委': '1663',
-        '党委巡视工作办公室、国资委巡视组': '1664',
-    }
-    headers = {
-        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
-        'Accept-Encoding': 'gzip, deflate',
-        'Accept-Language': 'zh-CN,zh;q=0.9',
-        'Cache-Control': 'no-cache',
-        'Connection': 'keep-alive',
-        'Cookie': 'Hm_lvt_fa835457efbc11dfb88752e70521d23b=1690184499; Hm_lpvt_fa835457efbc11dfb88752e70521d23b=1690184499; SF_cookie_1=98184645; Hm_lvt_2b5618a441c142a90e1a75f4b226c252=1690189470; Hm_lpvt_2b5618a441c142a90e1a75f4b226c252=1690189470; zh_choose=n; wdcid=30ffdae06d11dbde; wdlast=1690189470; wdses=13ee59561f2fb725',
-        'Host': 'www.sasac.gov.cn',
-        'Pragma': 'no-cache',
-        'Referer': 'https://www.baidu.com/link?url=CcQEFfXAeQsxu1IlLlxj8WHugAcJ7sBjOBqvZYDfN7WE6OZpSUM4prK6DiADOqTP&wd=&eqid=d507a037000987780000000364be37d4',
-        'Upgrade-Insecure-Requests': '1',
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
-    }
     try:
-        job()
+        gzw_nsjg()
     except Exception as e:
-        print(e)
+        log.error(e)
-    current_time = datetime.datetime.now()
+    # current_time = datetime.datetime.now()
-    midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
+    # midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
-    sleep_seconds = (midnight_time - current_time).total_seconds()
+    # sleep_seconds = (midnight_time - current_time).total_seconds()
-    time.sleep(sleep_seconds)
+    # time.sleep(sleep_seconds)
 # 创建一个ExcelWriter对象
 # writer = pd.ExcelWriter('国务院厅局.xlsx')
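gzw_nsjg_start pulls the institution menu apart with XPath over the decoded page. A reduced reproduction with lxml against static HTML; the real code uses absolute paths like /html/body/div[4]/... on the live page, so the relative expressions here are illustrative:

# --- sketch, not part of the commit ---
from lxml import etree

html = '''
<dl><dt><a href="../n2877928/index.html">
中央纪委国家监委驻国资委纪检监察组</a></dt></dl>
<dl><dd><a href="../n2590818/index.html">办公厅(党委办公厅)</a></dd></dl>
'''
tree = etree.HTML(html)
all_institution = []
for a in tree.xpath('//dl/dt/a | //dl/dd/a'):
    # Same cleanup as the hunk: strip embedded newlines, absolutize the href.
    institution = a.xpath('./text()')[0].replace('\n', '').replace('\r', '')
    href = a.xpath('./@href')[0].replace('../', 'http://www.sasac.gov.cn/')
    all_institution.append([institution, href])
print(all_institution)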