丁双波 / zzsn_spider · Commits

Commit c9546130, authored Sep 09, 2023 by 薛凌堃
Commit message: 政策法规最终版 (policy and regulations, final version)
Parent: eeb41ef7

Showing 2 changed files with 230 additions and 62 deletions:
  comData/policylaw/2.py      +0    −0
  comData/policylaw/厅局.py   +230  −62
comData/policylaw/2.py (view file @ c9546130)
This source diff could not be displayed because it is too large. You can view the blob instead.
comData/policylaw/厅局.py (view file @ c9546130)
 import json
 import random
 import time
+from urllib.parse import urljoin
+import pymongo
+from kafka import KafkaProducer
 from tqdm import tqdm
 import pandas as pd
 import pymysql
...
@@ -12,47 +17,80 @@ log = baseCore.getLogger()
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 cnx = baseCore.cnx
 cursor = baseCore.cursor
-headers = {
-    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
-    'Accept-Encoding': 'gzip, deflate',
-    'Accept-Language': 'zh-CN,zh;q=0.9',
-    'Cache-Control': 'no-cache',
-    'Connection': 'keep-alive',
-    'Cookie': 'Hm_lvt_fa835457efbc11dfb88752e70521d23b=1690184499; Hm_lpvt_fa835457efbc11dfb88752e70521d23b=1690184499; SF_cookie_1=98184645; Hm_lvt_2b5618a441c142a90e1a75f4b226c252=1690189470; Hm_lpvt_2b5618a441c142a90e1a75f4b226c252=1690189470; zh_choose=n; wdcid=30ffdae06d11dbde; wdlast=1690189470; wdses=13ee59561f2fb725',
-    'Host': 'www.sasac.gov.cn',
-    'Pragma': 'no-cache',
-    'Referer': 'https://www.baidu.com/link?url=CcQEFfXAeQsxu1IlLlxj8WHugAcJ7sBjOBqvZYDfN7WE6OZpSUM4prK6DiADOqTP&wd=&eqid=d507a037000987780000000364be37d4',
-    'Upgrade-Insecure-Requests': '1',
-    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
-}
-# create an ExcelWriter object
-writer = pd.ExcelWriter('国务院厅局.xlsx')
-url = 'http://www.sasac.gov.cn/n2588020/index.html'
-ip = baseCore.get_proxy()
-res = requests.get(url, headers, proxies=ip)
-soup = BeautifulSoup(res.content, 'html.parser')
-time.sleep(2)
-# department list
-list_type = soup.find('div', class_='l-jgkk-right column').find_all('dd')
-list_error = []
-for type in tqdm(list_type[:2]):
-    list_news = []
-    href_type = type.find('a')['href']
-    ting_type = type.find('a').text
-    print(f'\n================厅局类别==={ting_type}========================')
+db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').caiji['国务院_国资委_copy1']
+def paserUrl(html, listurl):
+    # soup = BeautifulSoup(html, 'html.parser')
+    # collect all <a> and <img> tags
+    links = html.find_all(['a', 'img'])
+    # walk the tags, converting relative addresses to absolute ones
+    for link in links:
+        if 'href' in link.attrs:
+            link['href'] = urljoin(listurl, link['href'])
+        elif 'src' in link.attrs:
+            link['src'] = urljoin(listurl, link['src'])
+    return html
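A minimal usage sketch of the paserUrl helper added above, showing how urljoin resolves a relative href against the listing URL. The HTML fragment and the target path are illustrative only; bs4 is assumed to be importable, as the surrounding code already relies on BeautifulSoup:

from urllib.parse import urljoin
from bs4 import BeautifulSoup

# illustrative fragment with a relative link, like those on the SASAC listing pages
page = BeautifulSoup('<a href="../n2588030/c123/content.html">demo</a>', 'html.parser')
fixed = paserUrl(page, 'http://www.sasac.gov.cn/n2588020/index.html')
print(fixed.find('a')['href'])
# -> http://www.sasac.gov.cn/n2588030/c123/content.html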
+def save_data(dic_news):
+    aaa_dic = {
+        '附件id': dic_news['attachmentIds'],
+        '网址': dic_news['sourceAddress'],
+        'tid': dic_news['labels'][0]['relationId'],
+        '来源': dic_news['labels'][0]['relationName'],
+        '创建时间': dic_news['createDate']
+    }
+    db_storage.insert_one(aaa_dic)
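save_data stores one record per pushed article, and work() later queries the same collection before crawling a URL again. A small sketch of that lookup against the collection named in this commit, plus an optional unique index that is an assumption of mine and not part of the commit:

import pymongo

# the collection this commit writes to
coll = pymongo.MongoClient('mongodb://114.115.221.202:27017',
                           username='admin', password='zzsn@9988').caiji['国务院_国资委_copy1']

news_url = 'http://www.sasac.gov.cn/n2588020/index.html'   # illustrative URL
if coll.find_one({'网址': news_url}):                        # the dedup check work() performs
    print('already collected, skip')

# optional hardening (assumption, not in the commit): enforce uniqueness on the URL field
coll.create_index([('网址', pymongo.ASCENDING)], unique=True)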
+def sendKafka(dic_news):
+    start_time = time.time()
+    try:
+        # 114.116.116.241
+        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
+        kafka_result = producer.send("policy", json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
+        print(kafka_result.get(timeout=10))
+        dic_result = {
+            'success': 'ture',
+            'message': '操作成功',
+            'code': '200',
+        }
+        log.info(dic_result)
+        # transfer succeeded, record it in the log
+        state = 1
+        takeTime = baseCore.getTimeCost(start_time, time.time())
+        # return True
+    except Exception as e:
+        dic_result = {
+            'success': 'false',
+            'message': '操作失败',
+            'code': '204',
+            'e': e
+        }
+        log.error(dic_result)
+        e = 'Kafka操作失败'
+        state = 0
+        takeTime = baseCore.getTimeCost(start_time, time.time())
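For context on the try/except in sendKafka: with kafka-python, producer.send() only returns a future, and it is the .get(timeout=...) call that raises if delivery fails, which is what pushes the function into its except branch. A minimal sketch; the broker address is the one in the commit, the topic payload is illustrative:

import json
from kafka import KafkaProducer
from kafka.errors import KafkaError

producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
future = producer.send('policy', json.dumps({'title': 'demo'}, ensure_ascii=False).encode('utf8'))
try:
    meta = future.get(timeout=10)       # blocks until the broker acknowledges the record
    print(meta.topic, meta.partition, meta.offset)
except KafkaError as err:               # delivery failed or timed out
    print('send failed:', err)
finally:
    producer.flush()                    # drain anything still buffered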
+def work(href_type, ting_type, relationId):
+    ip = baseCore.get_proxy()
+    log.info(f'\n================厅局类别==={ting_type}========================')
     if 'http' in href_type:
         url_type = href_type
     else:
         url_type = 'http://www.sasac.gov.cn/' + href_type.replace('../', '')
     # print(url_type)
-    i_res = requests.get(url_type, headers)
+    i_res = requests.get(url=url_type, headers=headers, proxies=ip)
     i_soup = BeautifulSoup(i_res.content, 'html.parser')
     time.sleep(2)
     news_list = i_soup.find('div', class_='tjywBottom').find_all('li')
     # article list
     # print('================新闻列表==================')
-    for news in tqdm(news_list[:2]):
+    for news in tqdm(news_list):
         try:
             news_href = news.find('a')['href']
         except:
...
@@ -60,55 +98,185 @@ for type in tqdm(list_type[:2]):
         if 'http' in news_href:
             news_url = news_href
         else:
             news_url = 'http://www.sasac.gov.cn/' + news_href.replace('../', '')
+        # check whether this URL has already been crawled
+        is_href = db_storage.find_one({'网址': news_url})
+        if is_href:
+            log.info('已采集----------跳过')
+            continue
         news_title = news.find('a').text.split('[')[0]
-        print(f'\n----正在采集: {news_title}-------')
+        log.info(f'\n----正在采集: {news_title}-------')
         pub_time = news.find('span').text.replace('[', '').replace(']', '')
         # article details
-        ii_res = requests.get(news_url, headers)
-        ii_soup = BeautifulSoup(ii_res.content, 'html.parser')
+        header = {
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+            'Accept-Encoding': 'gzip, deflate',
+            'Accept-Language': 'zh-CN,zh;q=0.9',
+            'Cache-Control': 'no-cache',
+            'Cookie': 'wdcid=30ffdae06d11dbde; __jsluid_h=e623973ba12a5f48b086f8c5cee6fffa; SF_cookie_1=67313298; Hm_lvt_fa835457efbc11dfb88752e70521d23b=1693808034; zh_choose=n; Hm_lvt_2b5618a441c142a90e1a75f4b226c252=1694078708; wdses=381c6ab86ce01570; wdlast=1694163647; Hm_lpvt_fa835457efbc11dfb88752e70521d23b=1694163647; Hm_lpvt_2b5618a441c142a90e1a75f4b226c252=1694165617',
+            'Host': 'www.sasac.gov.cn',
+            'Pragma': 'no-cache',
+            'Proxy-Connection': 'keep-alive',
+            'Referer': 'http://www.sasac.gov.cn/n2588020/n2588072/n2590818/n2590820/c28651762/content.html',
+            'Upgrade-Insecure-Requests': '1',
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
+        }
+        # news_url = 'http://www.sasac.gov.cn/n2588020/n2588072/n2590818/n2590820/c28102228/content.html'
+        ii_res = requests.get(url=news_url, headers=header, proxies=ip)
+        ii_soup = BeautifulSoup(ii_res.content, 'html.parser')
         # todo: convert relative paths to absolute ones
+        ii_soup = paserUrl(ii_soup, news_url)
+        # drop the "scan QR code" block
+        try:
+            ii_soup.find('div', id='qr_container').decompose()
+        except:
+            pass
+        # strip <style> tags
+        for styleTag in ii_soup.find_all('style'):
+            styleTag.extract()
         time.sleep(2)
         try:
             news_info = ii_soup.find('div', class_='zsy_cotitle')
         except Exception as e:
-            print(e)
+            log.error(e)
             news_info = ''
         if news_info:
             try:
-                pub_source = news_info.find('p').text.split('文章来源:')[1].split('发布时间')[0]
+                # origin
+                pub_source = news_info.find('p').text.split('文章来源:')[1].split('发布时间')[0].strip()
             except:
                 pub_source = ''
             try:
-                content = ii_soup.find('div', 'zsy_comain').text.replace('扫一扫在手机打开当前页', '').strip()
+                contentWithTag = ii_soup.find('div', 'zsy_comain')
+                content = contentWithTag.text.strip()
             except:
                 content = ''
-                # print(news_url)
+                contentWithTag = ''
+            if len(content) > 100:
+                pass
+            else:
+                continue
+            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
             dic_news = {
-                '标题': news_title,
-                '发布时间': pub_time,
-                '来源': pub_source,
-                '内容': content,
-                '原文链接': news_url
+                'attachmentIds': [],
+                'author': '',
+                # 'content': content,
+                # 'contentWithTag': str(contentWithTag),
+                'createDate': time_now,
+                'deleteFlag': 0,
+                'id': '',
+                'labels': [{'relationId': relationId, 'relationName': ting_type, 'labelMark': "policy"}],
+                'origin': pub_source,
+                'organ': '',
+                'topicClassification': '',
+                'issuedNumber': '',
+                'publishDate': pub_time,
+                'writtenDate': '',
+                'sid': '1697458829758697473',
+                'sourceAddress': news_url,
+                'summary': '',
+                'title': news_title
             }
-            list_news.append(dic_news)
+            sendKafka(dic_news)
+            save_data(dic_news)
+            log.info(f'{ting_type}-----{news_title}----发送成功',)
         else:
             dic_error = {
                 '标题': news_title,
                 '原文链接': news_url,
                 '厅局类别': ting_type
             }
-            list_error.append(dic_error)
+            log.error(dic_error)
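To make the wiring between the pieces explicit: each department heading scraped from the page is mapped to a relation id, placed into dic_news['labels'][0] by work(), and re-read by save_data() as the Mongo 'tid' field. A self-contained sketch; the two-entry dict is just an excerpt of the full mapId_dic defined under __main__ further down:

mapId_dic = {'办公厅(党委办公厅)': '1643', '政策法规局': '1645'}   # excerpt, for illustration

ting_type = '政策法规局'                       # heading text scraped from the listing page
relationId = mapId_dic[ting_type]              # '1645'
label = {'relationId': relationId, 'relationName': ting_type, 'labelMark': 'policy'}
# save_data() stores label['relationId'] as 'tid' and label['relationName'] as '来源'
print(label)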
+# 中央纪委国家监委驻国资委纪检监察组 (CCDI/NSC discipline inspection group stationed at SASAC)
+def job1(a_type):
+    href = a_type['href']
+    ting_type = a_type.text
+    return href, ting_type
+
+def job():
+    url = 'http://www.sasac.gov.cn/n2588020/index.html'
+    ip = baseCore.get_proxy()
+    res = requests.get(url=url, headers=headers, proxies=ip)
+    soup = BeautifulSoup(res.content, 'html.parser')
+    time.sleep(2)
+    # department list
+    list_type = soup.find('div', class_='l-jgkk-right column').find_all('dd')[:22]
+    a_soup = soup.find('div', class_='l-jgkk-right column').find_all('dt')[0]
+    a_type = a_soup.text.strip()
+    a_href = a_soup.find('a')['href']
+    a_id = '1874'
+    list_error = []
+    num = 0
+    start_time = time.time()
+    work(a_href, a_type, a_id)
+    for type in tqdm(list_type):
+        list_news = []
+        href_type = type.find('a')['href']
+        ting_type = type.find('a').text
+        relationId = mapId_dic[ting_type]
+        work(href_type, ting_type, relationId)
-        df = pd.DataFrame(list_news)
-        # write the data to a separate sheet per department
-        df.to_excel(writer, sheet_name=ting_type, index=False)
-        print(f'=============当前sheet页{ting_type}---数据总数:{len(df)}================')
+        num += 1
+        end_time = time.time()
+        log.info(f'共抓取{num}条数据,共耗时{end_time - start_time}')
         time.sleep(1)
-    writer.save()
+    # writer.save()
-    df_error = pd.DataFrame(list_error)
+    # df_error = pd.DataFrame(list_error)
-    df_error.to_excel('未采到文章.xlsx', index=False)
+    # df_error.to_excel('未采到文章.xlsx',index=False)
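The old version's Excel export relied on writer.save(), which newer pandas releases have deprecated in favour of close() or a context manager; if the per-department workbook were still wanted, the context-manager form is the safer pattern. A sketch under that assumption, not part of this commit:

import pandas as pd

list_news = [{'title': 'demo', 'publishDate': '2023-09-09'}]        # illustrative rows
with pd.ExcelWriter('国务院厅局.xlsx') as writer:                    # saves and closes on exit
    pd.DataFrame(list_news).to_excel(writer, sheet_name='政策法规局', index=False)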
+if __name__ == '__main__':
+    mapId_dic = {
+        '办公厅(党委办公厅)': '1643',
+        '综合研究局': '1644',
+        '政策法规局': '1645',
+        '规划发展局': '1646',
+        '财务监管与运行评价局': '1647',
+        '产权管理局': '1648',
+        '企业改革局': '1649',
+        '考核分配局': '1650',
+        '资本运营与收益管理局': '1651',
+        '科技创新和社会责任局': '1652',
+        '综合监督局': '1653',
+        '监督追责局': '1654',
+        '企业领导人员管理一局(董事会工作局)': '1655',
+        '企业领导人员管理二局': '1656',
+        '党建工作局(党委组织部、党委统战部)': '1657',
+        '宣传工作局(党委宣传部)': '1658',
+        '国际合作局': '1659',
+        '人事局': '1660',
+        '机关服务管理局(离退休干部管理局)': '1662',
+        '机关党委': '1663',
+        '党委巡视工作办公室、国资委巡视组': '1664',
+    }
+    headers = {
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+        'Accept-Encoding': 'gzip, deflate',
+        'Accept-Language': 'zh-CN,zh;q=0.9',
+        'Cache-Control': 'no-cache',
+        'Connection': 'keep-alive',
+        'Cookie': 'Hm_lvt_fa835457efbc11dfb88752e70521d23b=1690184499; Hm_lpvt_fa835457efbc11dfb88752e70521d23b=1690184499; SF_cookie_1=98184645; Hm_lvt_2b5618a441c142a90e1a75f4b226c252=1690189470; Hm_lpvt_2b5618a441c142a90e1a75f4b226c252=1690189470; zh_choose=n; wdcid=30ffdae06d11dbde; wdlast=1690189470; wdses=13ee59561f2fb725',
+        'Host': 'www.sasac.gov.cn',
+        'Pragma': 'no-cache',
+        'Referer': 'https://www.baidu.com/link?url=CcQEFfXAeQsxu1IlLlxj8WHugAcJ7sBjOBqvZYDfN7WE6OZpSUM4prK6DiADOqTP&wd=&eqid=d507a037000987780000000364be37d4',
+        'Upgrade-Insecure-Requests': '1',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
+    }
+    try:
+        job()
+    except Exception as e:
+        print(e)
+    # create an ExcelWriter object
+    # writer = pd.ExcelWriter('国务院厅局.xlsx')
...