Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
王景浩
zzsn_spider
Commits
cf9c7394
提交
cf9c7394
authored
9月 04, 2023
作者:
刘伟刚
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'master' of
http://114.115.159.144:8090/DingShuangBo/zzsn_spider
上级
7fc3194e
a9ad4913
隐藏空白字符变更
内嵌
并排
正在显示
1 个修改的文件
包含
95 行增加
和
139 行删除
+95
-139
2.py
comData/policylaw/2.py
+95
-139
没有找到文件。
comData/policylaw/2.py
浏览文件 @
cf9c7394
...
@@ -33,7 +33,7 @@ taskType = '政策法规'
...
@@ -33,7 +33,7 @@ taskType = '政策法规'
各地方国资委
各地方国资委
"""
"""
db_storage
=
pymongo
.
MongoClient
(
'mongodb://114.115.221.202:27017'
,
username
=
'admin'
,
password
=
'zzsn@9988'
)
.
caiji
[
'国务院_国资委'
]
db_storage
=
pymongo
.
MongoClient
(
'mongodb://114.115.221.202:27017'
,
username
=
'admin'
,
password
=
'zzsn@9988'
)
.
caiji
[
'国务院_国资委
_copy1
'
]
headers
=
{
headers
=
{
'User-Agent'
:
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
,
'User-Agent'
:
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
,
...
@@ -52,86 +52,17 @@ def paserUrl(html,listurl):
...
@@ -52,86 +52,17 @@ def paserUrl(html,listurl):
link
[
'src'
]
=
urljoin
(
listurl
,
link
[
'src'
])
link
[
'src'
]
=
urljoin
(
listurl
,
link
[
'src'
])
return
html
return
html
def
replaceUrl
(
hostUrl
,
src
):
if
'../'
in
src
:
src
=
src
.
strip
(
'../'
)
if
'./'
in
src
:
src
=
src
.
strip
(
'.'
)
finnal_href
=
hostUrl
+
src
return
finnal_href
def
save_data
(
result_dict
):
def
save_data
(
dic_news
):
try
:
aaa_dic
=
{
aa
=
result_dict
[
'信息来源'
]
a_dict
=
result_dict
'附件id'
:
dic_news
[
'attachmentIds'
],
except
:
'网址'
:
dic_news
[
'sourceAddress'
],
try
:
'tid'
:
dic_news
[
'labels'
][
0
][
'relationId'
],
tid
=
result_dict
[
'tid'
]
'来源'
:
dic_news
[
'labels'
][
0
][
'relationName'
],
except
:
'创建时间'
:
dic_news
[
'createDate'
]
tid
=
'1666'
}
pass
db_storage
.
insert_one
(
aaa_dic
)
a_dict
=
{
'标题'
:
result_dict
[
'标题'
],
'来源'
:
result_dict
[
'来源'
],
'发文机关'
:
''
,
'发文字号'
:
result_dict
[
'号'
],
'内容-未去标签'
:
result_dict
[
'内容'
],
'附件网址'
:
result_dict
[
'附件网址'
],
'发布时间'
:
result_dict
[
'发布时间'
],
'成文时间'
:
''
,
'主题分类'
:
''
,
'网址'
:
result_dict
[
'网址'
],
'归属'
:
result_dict
[
'归属'
],
'信息来源'
:
'地方国资委'
,
'tid'
:
tid
,
}
# a_dict['内容-未去标签'] = a_dict['内容-未去标签'].split('扫一扫在手机打开')[0]
#
if
a_dict
[
'标题'
]:
pass
else
:
return
try
:
post_url
=
'http://39.105.62.235:1820/ExtarctLawInfo'
headers_
=
{
'Content-Type'
:
'application/json'
}
resp
=
requests
.
post
(
post_url
,
headers
=
headers_
,
verify
=
False
,
data
=
json
.
dumps
(
a_dict
))
if
resp
.
status_code
==
500
:
try
:
tid
=
result_dict
[
'tid'
]
except
:
tid
=
'1666'
a_dict
=
{
'标题'
:
result_dict
[
'标题'
],
'来源'
:
result_dict
[
'来源'
],
'发文机关'
:
''
,
'发文字号'
:
result_dict
[
'号'
],
'内容-未去标签'
:
'--'
,
'附件网址'
:
result_dict
[
'附件网址'
],
'发布时间'
:
result_dict
[
'发布时间'
],
'成文时间'
:
''
,
'主题分类'
:
''
,
'网址'
:
result_dict
[
'网址'
],
'归属'
:
result_dict
[
'归属'
],
'信息来源'
:
'地方国资委'
,
'tid'
:
tid
,
}
resp
=
requests
.
post
(
post_url
,
headers
=
headers_
,
verify
=
False
,
data
=
json
.
dumps
(
a_dict
))
print
(
'推送:'
,
resp
.
status_code
)
if
resp
.
status_code
!=
200
:
print
(
'推送失败!'
)
time
.
sleep
(
10
)
a_dict
[
'is_send'
]
=
''
db_storage
.
insert_one
(
a_dict
)
return
except
:
print
(
'推送失败!'
)
time
.
sleep
(
10
)
a_dict
[
'is_send'
]
=
''
db_storage
.
insert_one
(
a_dict
)
return
db_storage
.
insert_one
(
a_dict
)
def
sendKafka
(
dic_news
):
def
sendKafka
(
dic_news
):
start_time
=
time
.
time
()
start_time
=
time
.
time
()
...
@@ -475,7 +406,7 @@ from urllib.parse import urljoin
...
@@ -475,7 +406,7 @@ from urllib.parse import urljoin
# 北京
# 北京
def
bei_jing
():
def
bei_jing
():
id_list
=
[]
num
=
0
num
=
0
start_time
=
time
.
time
()
start_time
=
time
.
time
()
# 有反爬需要使用selenium
# 有反爬需要使用selenium
...
@@ -521,37 +452,56 @@ def bei_jing():
...
@@ -521,37 +452,56 @@ def bei_jing():
break
break
updown
.
click
()
updown
.
click
()
time
.
sleep
(
2
)
time
.
sleep
(
2
)
for
href
in
hrefs
[
4
:
6
]:
log
.
info
(
f
'------{len(hrefs)}条数据-------------'
)
num
=
0
for
href
in
hrefs
:
id_list
=
[]
title
=
href
[
1
]
title
=
href
[
1
]
#todo:测试需要 注释掉判重
#todo:测试需要 注释掉判重
# 判断是否已经爬取过
# 判断是否已经爬取过
# is_href = db_storage.find_one({'网址': href[0]})
is_href
=
db_storage
.
find_one
({
'网址'
:
href
[
0
]})
# if is_href:
if
is_href
:
# continue
log
.
info
(
'已采集----------跳过'
)
continue
# 对获取信息页面发送请求
# 对获取信息页面发送请求
bro
.
get
(
href
[
0
])
bro
.
get
(
href
[
0
])
time
.
sleep
(
1
)
time
.
sleep
(
1
)
# 获取所要信息
# 获取所要信息
pub
=
bro
.
find_element
(
By
.
CLASS_NAME
,
'doc-info'
)
pub
=
bro
.
find_element
(
By
.
CLASS_NAME
,
'doc-info'
)
topic
=
str
(
pub
.
text
)
.
split
(
'[主题分类] '
)[
1
]
.
split
(
'
\n
'
)[
0
]
.
strip
()
#发文机构
organ
=
str
(
pub
.
text
)
.
split
(
'[发文机构] '
)[
1
]
.
split
(
'
\n
'
)[
0
]
.
strip
()
pub_time
=
str
(
pub
.
text
)
.
split
(
'[发布日期] '
)[
1
]
.
split
(
'[有效性] '
)[
0
]
.
strip
()
.
lstrip
()
pub_time
=
str
(
pub
.
text
)
.
split
(
'[发布日期] '
)[
1
]
.
split
(
'[有效性] '
)[
0
]
.
strip
()
.
lstrip
()
pub_source
=
str
(
pub
.
text
)
.
split
(
'[发文机构] '
)[
1
]
.
split
(
'[联合发文单位] '
)[
0
]
.
split
(
'[实施日期] '
)[
0
]
.
strip
()
.
lstrip
()
writtenDate
=
str
(
pub
.
text
)
.
split
(
'[成文日期] '
)[
1
]
.
split
(
'
\n
'
)[
0
]
.
strip
()
# pub_source = str(pub.text).split('[发文机构] ')[1].split('[联合发文单位] ')[0].split('[实施日期] ')[0].strip().lstrip()
pub_hao
=
pub
.
find_element
(
By
.
CLASS_NAME
,
'fwzh'
)
.
text
.
replace
(
'[发文字号] '
,
''
)
.
lstrip
()
.
strip
()
pub_hao
=
pub
.
find_element
(
By
.
CLASS_NAME
,
'fwzh'
)
.
text
.
replace
(
'[发文字号] '
,
''
)
.
lstrip
()
.
strip
()
try
:
pub_list
=
bro
.
find_elements
(
By
.
CLASS_NAME
,
'article-info'
)
for
source
in
pub_list
:
if
'来源'
in
source
.
text
:
pub_source
=
source
.
text
.
split
(
'来源:'
)[
1
]
.
split
(
'
\n
'
)[
0
]
# print(pub_source)
except
:
pub_source
=
''
#.split('来源:')[1]
if
'号'
not
in
pub_hao
:
if
'号'
not
in
pub_hao
:
pub_hao
=
''
pub_hao
=
''
cont
=
bro
.
find_element
(
By
.
ID
,
'div_zhengwen'
)
.
get_attribute
(
'innerHTML'
)
cont
=
bro
.
find_element
(
By
.
ID
,
'div_zhengwen'
)
.
get_attribute
(
'innerHTML'
)
soup_cont
=
BeautifulSoup
(
cont
,
'lxml'
)
soup_cont
=
BeautifulSoup
(
cont
,
'lxml'
)
soup
=
paserUrl
(
soup_cont
,
href
[
0
])
soup
=
paserUrl
(
soup_cont
,
href
[
0
])
text
=
str
(
soup
.
prettify
())
text
=
str
(
soup
.
prettify
())
print
(
text
)
#todo:去掉扫一扫
soup
.
find
(
'div'
,
id
=
'div_div'
)
.
decompose
()
# print(title)
# print(title)
num
=
0
fu_jian_soup
=
soup
.
find_all
(
'a'
)
fu_jian_soup
=
soup
.
find_all
(
'a'
)
for
file
in
fu_jian_soup
:
for
file
in
fu_jian_soup
:
num
+=
1
try
:
file_href
=
file
[
'href'
]
file_href
=
file
[
'href'
]
except
Exception
as
e
:
log
.
info
(
f
'---{href[0]}--------{e}-------'
)
continue
if
'.pdf'
in
file_href
or
'.docx'
in
file_href
or
'.doc'
in
file_href
or
'xls'
in
file_href
or
'.zip'
in
file_href
\
if
'.pdf'
in
file_href
or
'.docx'
in
file_href
or
'.doc'
in
file_href
or
'xls'
in
file_href
or
'.zip'
in
file_href
\
or
'.rar'
in
file_href
or
'.ppt'
in
file_href
or
'.PDF'
in
file_href
or
'.DOC'
in
file_href
\
or
'.rar'
in
file_href
or
'.ppt'
in
file_href
or
'.PDF'
in
file_href
or
'.DOC'
in
file_href
\
or
'.XLS'
in
file_href
or
'.ZIP'
in
file_href
or
'.RAR'
in
file_href
:
or
'.XLS'
in
file_href
or
'.ZIP'
in
file_href
or
'.RAR'
in
file_href
:
...
@@ -567,45 +517,46 @@ def bei_jing():
...
@@ -567,45 +517,46 @@ def bei_jing():
#todo:将返回的地址更新到soup
#todo:将返回的地址更新到soup
file
[
'href'
]
=
'http://114.115.215.96/'
+
full_path
file
[
'href'
]
=
'http://114.115.215.96/'
+
full_path
id_
=
redefid
(
id_list
)
#
id_ = redefid(id_list)
#todo:替换完成之后,将附件上传至文件服务器
#todo:替换完成之后,将附件上传至文件服务器
time_now
=
time
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
,
time
.
localtime
())
time_now
=
time
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
,
time
.
localtime
())
# todo:传kafka字段
# todo:传kafka字段
dic_news
=
{
dic_news
=
{
'attachmentIds'
:
id_
,
'attachmentIds'
:
id_
list
,
'author'
:
''
,
'author'
:
''
,
'content'
:
str
(
soup
_cont
.
text
),
'content'
:
str
(
soup
.
text
),
'contentWithTag'
:
str
(
soup
_cont
),
'contentWithTag'
:
str
(
soup
),
'createDate'
:
time_now
,
'createDate'
:
time_now
,
'deleteFlag'
:
0
,
'deleteFlag'
:
0
,
'id'
:
''
,
'id'
:
''
,
'labels'
:
[{
'relationId'
:
"1667"
,
'relationName'
:
"北京市国资委"
,
'labelMark'
:
"policy"
}],
'labels'
:
[{
'relationId'
:
"1667"
,
'relationName'
:
"北京市国资委"
,
'labelMark'
:
"policy"
}],
'origin'
:
pub_source
,
'origin'
:
pub_source
,
'organ'
:
pub_hao
,
'organ'
:
organ
,
'topicClassification'
:
''
,
'topicClassification'
:
topic
,
'issuedNumber'
:
pub_hao
,
'issuedNumber'
:
pub_hao
,
'publishDate'
:
pub_time
,
'publishDate'
:
pub_time
,
'writtenDate'
:
pub_tim
e
,
'writtenDate'
:
writtenDat
e
,
'sid'
:
'1697458829758697473'
,
'sid'
:
'1697458829758697473'
,
'sourceAddress'
:
''
,
'sourceAddress'
:
href
[
0
]
,
'summary'
:
''
,
'summary'
:
''
,
'title'
:
title
'title'
:
title
}
}
print
(
dic_news
)
# print(dic_news)
# sendKafka(dic_news)
sendKafka
(
dic_news
)
save_data
(
dic_news
)
# print(id)
# print(id)
# id_list.append(id)
# id_list.append(id)
num
+=
1
num
+=
1
end_time
=
time
.
time
()
print
(
f
'共抓取{num}条数据,共耗时{end_time - start_time}'
)
bro
.
quit
()
bro
.
quit
()
except
Exception
as
e
:
except
Exception
as
e
:
print
(
e
)
log
.
info
(
e
)
pass
pass
end_time
=
time
.
time
()
print
(
f
'共抓取{num}条数据,共耗时{end_time - start_time}'
)
# 内蒙古
# 内蒙古
def
nei_meng_gu
():
def
nei_meng_gu
():
id_list
=
[]
start
=
time
.
time
()
start
=
time
.
time
()
num
=
0
num
=
0
url
=
'http://gzw.nmg.gov.cn/zfxxgk/zcfg/index.html'
url
=
'http://gzw.nmg.gov.cn/zfxxgk/zcfg/index.html'
...
@@ -617,6 +568,7 @@ def nei_meng_gu():
...
@@ -617,6 +568,7 @@ def nei_meng_gu():
result
=
soup
.
find
(
class_
=
'right_two'
)
result
=
soup
.
find
(
class_
=
'right_two'
)
li_list
=
result
.
find_all
(
class_
=
'font14wr'
)
li_list
=
result
.
find_all
(
class_
=
'font14wr'
)
for
a
in
li_list
[:
1
]:
for
a
in
li_list
[:
1
]:
id_list
=
[]
a_text
=
str
(
a
)
a_text
=
str
(
a
)
real_href
=
'https://gzw.nmg.gov.cn/zfxxgk'
+
a_text
.
split
(
'href="..'
)[
-
1
]
.
split
(
'" target="_blank'
)[
0
]
real_href
=
'https://gzw.nmg.gov.cn/zfxxgk'
+
a_text
.
split
(
'href="..'
)[
-
1
]
.
split
(
'" target="_blank'
)[
0
]
# # 判断是否已经爬取过
# # 判断是否已经爬取过
...
@@ -631,13 +583,19 @@ def nei_meng_gu():
...
@@ -631,13 +583,19 @@ def nei_meng_gu():
href_text
.
encoding
=
'utf-8'
href_text
.
encoding
=
'utf-8'
i_html
=
href_text
.
text
i_html
=
href_text
.
text
i_soup
=
BeautifulSoup
(
i_html
,
'html.parser'
)
i_soup
=
BeautifulSoup
(
i_html
,
'html.parser'
)
#todo:将html中的a标签相对路径改为绝对路径
i_soup
=
paserUrl
(
i_soup
,
real_href
)
i_result
=
i_soup
.
find
(
'div'
,
id
=
'd_laiyuan'
)
i_result
=
i_soup
.
find
(
'div'
,
id
=
'd_laiyuan'
)
time_
=
i_result
.
find_all
(
'span'
)[
0
]
time_
=
i_result
.
find_all
(
'span'
)[
0
]
time_
=
str
(
time_
)
time_
=
str
(
time_
)
pub_time
=
time_
.
split
(
'<span>'
)[
1
]
.
split
(
'</span>'
)[
0
]
.
replace
(
'发布时间:'
,
''
)
pub_time
=
time_
.
split
(
'<span>'
)[
1
]
.
split
(
'</span>'
)[
0
]
.
replace
(
'发布时间:'
,
''
)
source
=
i_result
.
find_all
(
'span'
)[
1
]
#发布机关
source
=
str
(
source
)
origin
=
i_result
.
find_all
(
'span'
)[
1
]
pub_source
=
source
.
split
(
'<span>'
)[
1
]
.
split
(
'</span>'
)[
0
]
.
replace
(
'来源:'
,
''
)
origin
=
str
(
origin
)
pub_source
=
origin
.
split
(
'<span>'
)[
1
]
.
split
(
'</span>'
)[
0
]
.
replace
(
'来源:'
,
''
)
#发文机关
organ
=
origin
fwzh
=
i_soup
.
find_all
(
'td'
)[
7
]
fwzh
=
i_soup
.
find_all
(
'td'
)[
7
]
pub_hao_result
=
re
.
findall
(
'〔(.*?)〕'
,
str
(
fwzh
))
pub_hao_result
=
re
.
findall
(
'〔(.*?)〕'
,
str
(
fwzh
))
if
len
(
pub_hao_result
)
==
0
:
if
len
(
pub_hao_result
)
==
0
:
...
@@ -647,16 +605,19 @@ def nei_meng_gu():
...
@@ -647,16 +605,19 @@ def nei_meng_gu():
pub_hao
=
str
(
fwzh
)
.
split
(
'<td>'
)[
1
]
.
split
(
'</td>'
)[
0
]
pub_hao
=
str
(
fwzh
)
.
split
(
'<td>'
)[
1
]
.
split
(
'</td>'
)[
0
]
else
:
else
:
pub_hao
=
''
pub_hao
=
''
#成文时间
writtenDate
=
i_soup
.
find_all
(
'td'
)[
9
]
.
text
topicClassification
=
i_soup
.
find_all
(
'td'
)[
3
]
.
text
i_content
=
str
(
i_soup
.
find
(
class_
=
'd_show'
))
i_content
=
str
(
i_soup
.
find
(
class_
=
'd_show'
))
if
i_content
:
if
i_content
:
content
=
i_content
content
=
i_content
else
:
else
:
i_content
=
str
(
i_soup
.
find
(
class_
=
'view TRS_UEDITOR trs_paper_default'
))
i_content
=
str
(
i_soup
.
find
(
class_
=
'view TRS_UEDITOR trs_paper_default'
))
content
=
i_content
content
=
i_content
#todo:内蒙古市的附件不在正文中,异步加载出来,替换不了标签,附件可上传att表中
fujian
=
i_soup
.
find
_all
(
class_
=
'ql_detailbro_right_qztp
'
)
fujian
=
i_soup
.
find
(
class_
=
'xy_zcwjxl_downloadPC_list
'
)
fu_jian_result
=
re
.
findall
(
'href="(.*?)"'
,
str
(
fujian
))
fu_jian_result
=
re
.
findall
(
'href="(.*?)"'
,
str
(
fujian
))
fu_jian_href_list
=
[
]
# fu_jian_result = fujian.find('a')['href'
]
if
len
(
fu_jian_result
)
>
0
:
if
len
(
fu_jian_result
)
>
0
:
for
fu_jian_re
in
fu_jian_result
:
for
fu_jian_re
in
fu_jian_result
:
if
'.doc'
in
fu_jian_re
or
'.pdf'
in
fu_jian_re
or
'.xls'
in
fu_jian_re
or
'.zip'
in
fu_jian_re
\
if
'.doc'
in
fu_jian_re
or
'.pdf'
in
fu_jian_re
or
'.xls'
in
fu_jian_re
or
'.zip'
in
fu_jian_re
\
...
@@ -664,58 +625,53 @@ def nei_meng_gu():
...
@@ -664,58 +625,53 @@ def nei_meng_gu():
or
'.XLS'
in
fu_jian_re
or
'.ZIP'
in
fu_jian_re
or
'.RAR'
in
fu_jian_re
:
or
'.XLS'
in
fu_jian_re
or
'.ZIP'
in
fu_jian_re
or
'.RAR'
in
fu_jian_re
:
fu_jian_re
=
str
(
real_href
)
.
split
(
'/t'
)[
0
]
+
'/'
+
str
(
fu_jian_re
)
.
split
(
'./'
)[
1
]
fu_jian_re
=
str
(
real_href
)
.
split
(
'/t'
)[
0
]
+
'/'
+
str
(
fu_jian_re
)
.
split
(
'./'
)[
1
]
fu_jian_href
=
fu_jian_re
fu_jian_href
=
fu_jian_re
fu_jian_href_list
.
append
(
fu_jian_href
)
#todo:附件上传至文件服务器
#todo:附件需要上传文件服务器 type_id:7
retData
=
baseCore
.
uploadToserver
(
fu_jian_href
,
'1669'
)
if
retData
[
'state'
]:
result_dict
=
{
pass
'标题'
:
title
,
else
:
'来源'
:
pub_source
,
continue
'号'
:
pub_hao
,
att_id
,
full_path
=
baseCore
.
tableUpdate
(
retData
,
'内蒙古自治区国资委'
,
title
,
num
)
'内容'
:
content
,
id_list
.
append
(
att_id
)
'附件网址'
:
fu_jian_href_list
,
# # todo:将返回的地址更新到soup
'发布时间'
:
pub_time
,
# fu_jian_link['href'] = 'http://114.115.215.96/' + full_path
'网址'
:
real_href
,
'归属'
:
'内蒙古自治区国资委'
,
}
print
(
title
)
print
(
title
)
time_now
=
time
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
,
time
.
localtime
())
time_now
=
time
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
,
time
.
localtime
())
id
=
baseCore
.
getNextSeq
()
id
=
baseCore
.
getNextSeq
()
# todo:传kafka字段
# todo:传kafka字段
dic_news
=
{
dic_news
=
{
'attachmentIds'
:
"14,15,16"
,
'attachmentIds'
:
id_list
,
'author'
:
''
,
'author'
:
''
,
'content'
:
content
,
'content'
:
content
,
'contentWithTag'
:
content
,
'contentWithTag'
:
content
,
'createDate'
:
time_now
,
'createDate'
:
time_now
,
'deleteFlag'
:
0
,
'deleteFlag'
:
0
,
'id'
:
id
,
'id'
:
''
,
'labels'
:[{
'relationId'
:
"1669"
,
'relationName'
:
"内蒙古自治区国资委"
,
'labelMark'
:
"policy"
}],
'labels'
:[{
'relationId'
:
"1669"
,
'relationName'
:
"内蒙古自治区国资委"
,
'labelMark'
:
"policy"
}],
'origin'
:
pub_source
,
'origin'
:
origin
,
'organ'
:
pub_hao
,
'organ'
:
organ
,
'topicClassification'
:
''
,
'topicClassification'
:
topicClassification
,
'issuedNumber'
:
pub_hao
,
'issuedNumber'
:
pub_hao
,
'publishDate'
:
pub_time
,
'publishDate'
:
pub_time
,
'writtenDate'
:
pub_tim
e
,
'writtenDate'
:
writtenDat
e
,
'sid'
:
'
0987654321
'
,
'sid'
:
'
1697458829758697473
'
,
'sourceAddress'
:
''
,
'sourceAddress'
:
real_href
,
'summary'
:
''
,
'summary'
:
''
,
'title'
:
title
'title'
:
title
}
}
sendKafka
(
dic_news
)
sendKafka
(
dic_news
)
print
(
id
)
id_list
.
append
(
id
)
save_data
(
dic_news
)
# save_data(result_dict)
num
=
num
+
1
num
=
num
+
1
break
break
except
:
except
:
pass
pass
except
:
except
:
pass
pass
print
(
id_list
)
end
=
time
.
time
()
end
=
time
.
time
()
print
(
'共'
,
num
,
'条'
,
'...........'
,
'共耗时'
,
end
-
start
,
'秒'
)
print
(
'共'
,
num
,
'条'
,
'...........'
,
'共耗时'
,
end
-
start
,
'秒'
)
# 吉林
# 吉林
def
ji_lin
():
def
ji_lin
():
start
=
time
.
time
()
start
=
time
.
time
()
...
@@ -3950,9 +3906,9 @@ if __name__ == '__main__':
...
@@ -3950,9 +3906,9 @@ if __name__ == '__main__':
# get_content1()
# get_content1()
# get_content2()
# get_content2()
# get_content3()
# get_content3()
bei_jing
()
#
bei_jing()
# nei_meng_gu()
# nei_meng_gu()
#
ji_lin()
ji_lin
()
# shang_hai()
# shang_hai()
# zhe_jiang()
# zhe_jiang()
# fu_jian()
# fu_jian()
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论