王景浩 / zzsn_spider · Commits · d7b3c3cf

Commit d7b3c3cf, authored Dec 02, 2023 by LiuLiYuan

REITs专题 12/02

Parent: 7ef6f432
Showing 3 changed files with 597 additions and 14 deletions (+597 -14):

REITs专题数据/LawRules-2-shenzhen.py    +55   -0
REITs专题数据/RuleGuide-shenzhen.py     +108  -0
REITs专题数据/policy-zhejiang.py        +434  -14
REITs专题数据/LawRules-2-shenzhen.py (new file, 0 → 100644)
import re
import requests
from bs4 import BeautifulSoup
from base import BaseCore
from retry import retry

baseCore = BaseCore.BaseCore()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
}


def getContentA(url):
    pass


def getContentB(url):
    req = requests.get(url, headers=headers)
    req.encoding = req.apparent_encoding
    soup = BeautifulSoup(req.text, 'html.parser')
    contentWithTag = soup.find('div', attrs={'id': 'desContent'})
    a_list = contentWithTag.find_all('a')
    for a in a_list:
        href = a.get('href')
        file_name = a.text.strip()
    content = contentWithTag.text.strip()


def doJob():
    urls = [
        'http://reits.szse.cn/lawrule/laws/index.html',
        'http://reits.szse.cn/lawrule/regulations/csrcorder/index.html',
        'http://reits.szse.cn/lawrule/regulations/csrcannoun/index.html',
    ]
    for url in urls:
        req = requests.get(url, headers=headers)
        req.encoding = req.apparent_encoding
        soup = BeautifulSoup(req.text, 'lxml')
        li_list = soup.find('ul', class_='newslist').find_all('li')
        for li in li_list:
            info = str(li.find('script'))
            href = re.findall('curHref = \'(.*?)\';', info)[0].replace('./', 'http://reits.szse.cn/lawrule/laws/')
            title = re.findall('curTitle = \'(.*?)\';', info)[0]
            publishDate = li.find('span', class_='time').text.strip()
            if '.html' in href:
                getContentA(href)
            else:
                getContentB(href)


if __name__ == '__main__':
    doJob()
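
The szse.cn list pages embed each item's link and title in an inline <script> block rather than in the anchor itself, which is why doJob() runs re.findall over the script text and then normalizes the relative path. A minimal sketch of that extraction; the sample script string below is invented to match the shape the regexes expect, not copied from the live site:

import re

# Hypothetical list-item script content in the shape the regexes above expect.
info = "<script>var curHref = './t20231201_0001.html'; var curTitle = '示例规则标题';</script>"

href = re.findall('curHref = \'(.*?)\';', info)[0].replace('./', 'http://reits.szse.cn/lawrule/laws/')
title = re.findall('curTitle = \'(.*?)\';', info)[0]
print(href)   # http://reits.szse.cn/lawrule/laws/t20231201_0001.html
print(title)  # 示例规则标题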
REITs专题数据/RuleGuide-shenzhen.py (new file, 0 → 100644)
import os
import re
import time

import requests
from bs4 import BeautifulSoup

import BaseCore
from reits import Policy

policy = Policy()
topic = 'policy'
webname = '深圳证券交易所REITs'
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
}


def getContent(url, publishDate, num, id_list):
    req = requests.get(url, headers=headers)
    req.encoding = req.apparent_encoding
    soup = BeautifulSoup(req.text, 'html.parser')
    contentWithTag = soup.find('div', attrs={'id': 'desContent'})
    pub_hao = contentWithTag.find('p').text.strip()
    if pub_hao == '':
        pub_hao = contentWithTag.find_all('p')[1].text.strip()
    if '号' not in pub_hao:
        pub_hao = ''
    a_list = contentWithTag.find_all('a')
    for a in a_list:
        fj_href = a.get('href')
        if not fj_href:
            continue
        fj_title = a.text.strip()
        category = os.path.splitext(fj_href)[1]
        if '.' not in category or '.cn' in category:
            continue
        if category not in fj_title:
            fj_title = fj_title + category
        # Upload the attachment to OBS
        att_id, full_path = policy.attuributefile(fj_title, fj_href, num, publishDate)
        if att_id:
            id_list.append(att_id)
            a['href'] = full_path
    content = contentWithTag.text.strip()
    return pub_hao, content, id_list, str(contentWithTag)


def doJob():
    urls = [
        'http://reits.szse.cn/lawrule/bussrules/latest/index.html',
        'http://reits.szse.cn/lawrule/bussrules/supervise/index.html',
    ]
    num = 1
    for url in urls:
        req = requests.get(url, headers=headers)
        req.encoding = req.apparent_encoding
        soup = BeautifulSoup(req.text, 'lxml')
        li_list = soup.find('ul', class_='newslist').find_all('li')
        for li in li_list:
            id_list = []
            info = str(li.find('script'))
            href = re.findall('curHref = \'(.*?)\';', info)[0].replace('./', url.replace(url.split('/')[-1], ''))
            title = re.findall('curTitle = \'(.*?)\';', info)[0]
            publishDate = li.find('span', class_='time').text.strip()
            # Deduplicate by link
            is_member = baseCore.r.sismember('REITs::' + webname, href)
            if is_member:
                log.info(f'{title}===已采集')
                continue
            origin = '深圳证券交易所'
            writtenDate = publishDate
            organ = '深圳证券交易所'
            summary = ''
            pub_hao, content, id_list, contentWithTag = getContent(href, publishDate, num, id_list)
            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            dic_info = {
                'attachmentIds': id_list,
                'author': '',
                'content': content,
                'contentWithTag': str(contentWithTag),
                'deleteFlag': 0,
                'id': '',
                'title': title,
                'publishDate': publishDate,
                'origin': origin,
                'sourceAddress': href,
                'writtenDate': writtenDate,
                'organ': organ,
                'topicClassification': '',
                'issuedNumber': pub_hao,
                'summary': summary,
                'createDate': time_now,
                'sid': '1730508406971613186',
            }
            try:
                baseCore.sendkafka(dic_info, topic)
                baseCore.r.sadd('REITs::' + webname, href)
                log.info(f'采集成功--{title}--{href}')
            except:
                for att_id in id_list:
                    baseCore.deliteATT(att_id)
            num += 1
            time.sleep(3)


if __name__ == '__main__':
    doJob()
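
RuleGuide-shenzhen.py avoids re-collecting pages by keeping one Redis set per source ('REITs::' + webname): it checks sismember before fetching and only sadd-s the URL after the Kafka send succeeds, so a failed send is retried on the next run. A minimal sketch of that pattern using redis-py directly; the connection details and the send_to_kafka stub are assumptions, since BaseCore wraps both clients in this repo:

import redis

r = redis.Redis(host='127.0.0.1', port=6379, decode_responses=True)  # assumed connection details
dedupe_key = 'REITs::深圳证券交易所REITs'

def send_to_kafka(record):
    # stand-in for baseCore.sendkafka(...); assumed to raise on failure
    pass

def process(href, record):
    if r.sismember(dedupe_key, href):
        return  # already collected, skip
    send_to_kafka(record)
    r.sadd(dedupe_key, href)  # mark as collected only after a successful send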
REITs专题数据/policy-zhejiang.py (modified)
import os
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

@@ -14,8 +16,116 @@ headers = {
    'X-Requested-With': 'XMLHttpRequest',
}
topic = 'policy'
webname = '浙江省人民政府'


class Policy():
    def getrequest_soup(self, headers, url):
        req = requests.get(headers=headers, url=url)
        result = BeautifulSoup(req.content, 'html.parser')
        return result

    def getrequest_json(self, headers, url):
        req = requests.get(headers=headers, url=url)
        result = req.json()
        return result

    def requestPost(self, headers, url, payload):
        req = requests.post(headers=headers, url=url, data=payload)
        data_json = req.json()
        return data_json

    def requestPost_html(self, headers, url, payload):
        req = requests.post(headers=headers, url=url, data=payload)
        result = BeautifulSoup(req.content, 'html.parser')
        return result

    def deletep(self, soup, i, tag, attribute_to_delete, value_to_delete):
        # Find tags carrying the given attribute/value and delete them
        tags = soup.find_all(tag, {attribute_to_delete: value_to_delete})
        for tag in tags[:i]:
            tag.decompose()

    def deletespan(self, td):
        # Remove span tags
        spans = td.find_all('span')
        for span in spans:
            span.extract()

    def deletetag(self, td, tag):
        # Remove every occurrence of the given tag
        tags = td.find_all(tag)
        for tag_ in tags:
            tag_.extract()

    def deletetext(self, soup, tag, text):
        # Delete tags whose text contains the given string
        tags = soup.find_all(tag)[:10]
        for tag_ in tags:
            text_ = tag_.text
            if text in text_:
                tag_.extract()

    def deletek(self, soup):
        # Delete blank tags (e.g. <p></p>, <p><br></p>), excluding img, video, hr
        for i in soup.find_all(lambda tag: len(tag.get_text()) == 0 and tag.name not in ["img", "video", "br"] and tag.name != "br" or tag.get_text() == ' '):
            for j in i.descendants:
                if j.name in ["img", "video", "br"]:
                    break
            else:
                i.decompose()

    def paserUrl(self, html, listurl):
        # Collect all <a> and <img> tags
        if isinstance(html, str):
            html = BeautifulSoup(html, 'html.parser')
        links = html.find_all(['a', 'img'])
        # Rewrite relative addresses to absolute addresses
        for link in links:
            if 'href' in link.attrs:
                link['href'] = urljoin(listurl, link['href'])
            elif 'src' in link.attrs:
                link['src'] = urljoin(listurl, link['src'])
        return html

    def attuributefile(self, file_name, file_href, num, publishDate):
        # Download the attachment locally and upload it to the file server
        if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
                or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
            category = os.path.splitext(file_href)[1]
            if category not in file_name:
                file_name = file_name + category
            retData = baseCore.uptoOBS(file_href, '', file_name)
            if retData['state']:
                pass
            else:
                return '', ''
            att_id, full_path = baseCore.tableUpdate(retData, 'RETIs文件', file_name, num, publishDate)
            return att_id, full_path
        else:
            return '', ''
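
attuributefile() only uploads links whose extension is on the document whitelist above, and it appends the extension to the display title when the title lacks it, so the stored filename keeps a usable suffix. A quick illustration of that naming step with made-up values:

import os

file_href = 'https://www.zj.gov.cn/attach/0/demo_notice.pdf'  # hypothetical attachment URL
file_name = '示例通知'

category = os.path.splitext(file_href)[1]  # '.pdf'
if category not in file_name:
    file_name = file_name + category       # '示例通知.pdf'
print(file_name)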
policy = Policy()


def paserUrl(html, listurl):
    # Collect all <a> and <img> tags
    if isinstance(html, str):
        html = BeautifulSoup(html, 'html.parser')
    links = html.find_all(['a', 'img'])
    # Rewrite relative addresses to absolute addresses
    for link in links:
        if 'href' in link.attrs:
            link['href'] = urljoin(listurl, link['href'])
        elif 'src' in link.attrs:
            link['src'] = urljoin(listurl, link['src'])
    return html
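
The module-level paserUrl duplicates the Policy method so getContent can rewrite every relative href/src against the article URL before attachments are uploaded and contentWithTag is serialized. A small usage sketch; the sample HTML is made up:

sample = '<div><a href="./art_123.html">通知</a><img src="/images/seal.png"></div>'
fixed = paserUrl(sample, 'https://www.zj.gov.cn/col/col123/index.html')  # paserUrl defined above
print(fixed.a['href'])   # https://www.zj.gov.cn/col/col123/art_123.html
print(fixed.img['src'])  # https://www.zj.gov.cn/images/seal.png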
@retry(tries=3, delay=10)
def getPageSize():
    ip = baseCore.get_proxy()
    url = 'https://search.zj.gov.cn/jsearchfront/interfaces/cateSearch.do'

@@ -42,9 +152,10 @@ def getPageSize():
    req.close()
    return pageSize


@retry(tries=3, delay=10)
def getDataJson(page):
    # ip = baseCore.get_proxy()
    url = 'https://search.zj.gov.cn/jsearchfront/interfaces/cateSearch.do'
    data_post = {
        'websiteid': '330000000000000',

@@ -59,18 +170,300 @@ def getDataJson(page):
        'pos': 'content,filenumber',
        'sortType': '1',
    }
    req = requests.post(url, headers=headers, data=data_post)
    req.encoding = req.apparent_encoding
    data_json = req.json()['result']
    req.close()
    return data_json
def getContent(url, publishDate, num):
    id_list = []
    req = requests.get(url, headers=headers)
    if 'weixin' in url:
        req.encoding = 'utf-8'
    else:
        req.encoding = req.apparent_encoding
    soup = BeautifulSoup(req.text, 'lxml')
    soup = paserUrl(soup, url)
    contentWithTag = soup.find('div', class_='box_wzy_ys')
    if not contentWithTag:
        contentWithTag = soup.find('div', class_='oh_main_cont_flbox_show_cont')
    if not contentWithTag:
        contentWithTag = soup.find('div', class_='article-content')
    try:
        contentWithTag.find('table', class_='xxgk_table').decompose()
    except:
        pass
    if not contentWithTag:
        contentWithTag = soup.find('div', attrs={'id': 'zoom'})
    try:
        contentWithTag.find('div', class_='audioBox').decompose()
    except:
        pass
    try:
        contentWithTag.find('div', class_='zcbox').decompose()
        div_list = soup.find_all('div', class_='yybb')
        for div in div_list:
            div.decompose()
    except:
        pass
    try:
        contentWithTag.find('div', class_='fz_xx').decompose()
    except:
        pass
    try:
        contentWithTag.find('a', class_='zcjdlj').decompose()
    except:
        pass
    try:
        contentWithTag.find('div', class_='fenxiang').decompose()
    except:
        pass
    try:
        contentWithTag.find('div', class_='Interpretation').decompose()
    except:
        pass
    try:
        contentWithTag.find('a', class_='bmjd').decompose()
    except:
        pass
    try:
        contentWithTag.find('a', class_='tjlj').decompose()
    except:
        pass
    if not contentWithTag:
        contentWithTag = soup.find('div', class_='g_content')
    if not contentWithTag:
        contentWithTag = soup.find('span', class_='zcjdlink')
    if not contentWithTag:
        contentWithTag = soup.find('div', class_='main_section')
        try:
            contentWithTag = contentWithTag.find('div', class_='main_section')
        except:
            pass
    if not contentWithTag:
        contentWithTag = soup.find('div', class_='zoomnr')
    if not contentWithTag:
        contentWithTag = soup.find('div', attrs={'id': 'mainText'})
    if not contentWithTag:
        contentWithTag = soup.find('div', class_='text')
    if not contentWithTag:
        contentWithTag = soup.find('div', class_='wz')
    if not contentWithTag:
        contentWithTag = soup.find('div', class_='news_content')
    try:
        contentWithTag.find('div', class_='ywlj').decompose()
    except:
        pass
    try:
        contentWithTag.find('div', class_='zcjd').decompose()
    except:
        pass
    try:
        contentWithTag.find('div', class_='tpjd').decompose()
    except:
        pass
    try:
        contentWithTag.find('div', class_='spjd').decompose()
    except:
        pass
    try:
        contentWithTag.find('div', class_='jgfzr').decompose()
    except:
        pass
    try:
        contentWithTag.find('div', class_='fzr').decompose()
    except:
        pass
    try:
        contentWithTag.find('div', class_='jgdz').decompose()
    except:
        pass
    try:
        contentWithTag.find('div', class_='lxfs').decompose()
    except:
        pass
    try:
        contentWithTag.find('div', class_='gkdh').decompose()
    except:
        pass
    try:
        contentWithTag.find('div', class_='zipcode').decompose()
    except:
        pass
    try:
        contentWithTag.find('div', class_='fax').decompose()
    except:
        pass
    try:
        contentWithTag.find('div', class_='mail').decompose()
    except:
        pass
    try:
        contentWithTag.find('div', class_='bgsj').decompose()
    except:
        pass
    if not contentWithTag:
        try:
            contentWithTag = soup.find('div', class_='mian').find('div', class_='article_text')
        except:
            contentWithTag = None
    if not contentWithTag:
        contentWithTag = soup.find('div', class_='wenz')
    if not contentWithTag:
        # contentWithTag = soup.find('table', attrs={'id': 'word'})
        contentWithTag = soup.find('table', attrs={'id': 'inside'})
    if not contentWithTag:
        contentWithTag = soup.find('div', class_='ewb-content')
    if not contentWithTag:
        contentWithTag = soup.find('div', class_='content-info-content')
    if not contentWithTag:
        contentWithTag = soup.find('div', class_='main-txt')
    if not contentWithTag:
        contentWithTag = soup.find('div', class_='zoom')
    if not contentWithTag:
        contentWithTag = soup.find('div', class_='showPage')
    if not contentWithTag:
        try:
            contentWithTag = soup.find_all('div', class_='content')[1]
            try:
                contentWithTag.find('div', class_='linke').decompose()
            except:
                contentWithTag = None
        except:
            pass
    if not contentWithTag:
        contentWithTag = soup.find('div', class_='article')
    if not contentWithTag:
        contentWithTag = soup.find('div', class_='content')
    try:
        contentWithTag.find('div', class_='dy').decompose()
    except:
        pass
    try:
        contentWithTag.find('div', class_='con_top').decompose()
        contentWithTag.find('div', class_='flex_between').decompose()
    except:
        pass
    try:
        contentWithTag.find('div', class_='dqwz').decompose()
        contentWithTag.find('div', class_='top').decompose()
    except:
        pass
    try:
        contentWithTag.find('h4', class_='fr').decompose()
    except:
        pass
    try:
        contentWithTag.find('ul', class_='Fileclass').decompose()
        contentWithTag.find('h4').decompose()
    except:
        pass
    if not contentWithTag:
        contentWithTag = soup.find('div', class_='main-body')
    if not contentWithTag:
        contentWithTag = soup.find('div', class_='articlePage_content')
    if not contentWithTag:
        contentWithTag = soup.find('div', class_='Gbc_Cm')
    if not contentWithTag:
        contentWithTag = soup.find('div', class_='zhengw')
    if not contentWithTag:
        contentWithTag = soup.find('div', attrs={'id': 'zhengw'})
    if not contentWithTag:
        contentWithTag = soup.find('div', class_='xy-detail')
    if not contentWithTag:
        contentWithTag = soup.find('td', class_='bt_content')
    if not contentWithTag:
        contentWithTag = soup.find('div', class_='xy-detail')
    if not contentWithTag:
        contentWithTag = soup.find('div', attrs={'id': 'js_content'})
    if not contentWithTag:
        contentWithTag = soup.find('div', attrs={'id': 'cr'})
    if not contentWithTag:
        contentWithTag = soup.find('div', attrs={'id': 'art_c'})
    if not contentWithTag:
        contentWithTag = soup.find('article', class_='content_main')
    if not contentWithTag:
        contentWithTag = soup.find('div', attrs={'id': 'ivs_content'})
    if not contentWithTag:
        contentWithTag = soup.find('div', class_='con_con')
    try:
        div_list = contentWithTag.find_all('div', class_='yybb')
        for div in div_list:
            div.decompose()
    except:
        pass
    if not contentWithTag:
        contentWithTag = soup.find('div', class_='pic')
    if not contentWithTag:
        contentWithTag = soup.find('td', class_='bt_content')
    if not contentWithTag:
        contentWithTag = soup.find('div', class_='rich_media_content')
    if not contentWithTag:
        contentWithTag = soup.find('div', class_='xl_main_con')
    if not contentWithTag:
        contentWithTag = soup.find('div', class_='jh_xl_m2')
    try:
        contentWithTag.find('span', class_='jiedu-link-box').decompose()
    except:
        pass
    if not contentWithTag:
        contentWithTag = soup.find('div', class_='nrEmit')
    if not contentWithTag:
        contentWithTag = soup.find('div', class_='details-content')
    if not contentWithTag:
        contentWithTag = soup.find('div', class_='zf-jd-nr')
    if not contentWithTag:
        contentWithTag = soup.find('div', class_='article-conter')
    if not contentWithTag:
        contentWithTag = soup.find('div', class_='rich_media_area_primary')
    if not contentWithTag:
        contentWithTag = soup.select_one('body > div:nth-of-type(2) > div:nth-of-type(3) > div:nth-of-type(3)')
    if not contentWithTag:
        contentWithTag = soup.find('div', class_='detail-pic')
    try:
        contentWithTag.find('video').decompose()
        contentWithTag = None
    except:
        pass
    try:
        scripts = contentWithTag.find_all('script')
        for script in scripts:
            script.decompose()
    except:
        pass
    try:
        styles = contentWithTag.find_all('style')
        for style in styles:
            style.decompose()
    except:
        pass
    a_list = contentWithTag.find_all('a')
    for a in a_list:
        href = a.get('href')
        fj_title = a.text.strip().lstrip()
        category = os.path.splitext(href)[1]
        if category not in fj_title:
            fj_title = fj_title + category
        att_id, full_path = policy.attuributefile(fj_title, href, num, publishDate)
        if att_id:
            id_list.append(att_id)
            a['href'] = full_path
    content = contentWithTag.text
    return str(contentWithTag), content, id_list
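
getContent walks a long "if not contentWithTag:" chain because each site indexed by the provincial search uses a different container class or id for the article body. One way to keep that list maintainable would be a priority-ordered candidate table; this is only a sketch of an alternative shape, not part of this commit, and it omits the per-site decompose() cleanups:

# Sketch only: the same selector priority expressed as data (not what the commit does).
CANDIDATES = [
    ('div', {'class': 'box_wzy_ys'}),
    ('div', {'class': 'oh_main_cont_flbox_show_cont'}),
    ('div', {'class': 'article-content'}),
    ('div', {'id': 'zoom'}),
    # ... remaining selectors in the same order as above
]

def find_content(soup):
    for name, attrs in CANDIDATES:
        node = soup.find(name, attrs=attrs)
        if node:
            return node
    return None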
def getDatas(page):
    data_json = getDataJson(page)
    num = 1
    for data_ in data_json:
        soup = BeautifulSoup(data_, 'lxml')
        title = soup.find('div', class_='titleWrapper').find('a').text.lstrip().strip().replace(' ', '').replace('\r\n', ' ')
        href = soup.find('div', class_='titleWrapper').find('a').get('href')
        href = href.split('url=')[1].split('.html')[0].replace('%3A', ':').replace('%2F', '/') + '.html'
        try:
            info = soup.find('table', class_='fgwj_table_list').text
            organ = info.split('发布机构:')[1].split('成文日期:')[0].lstrip().strip()

@@ -78,21 +471,48 @@ def getDatas(page):
        except:
            organ = ''
        writtenDate = None
        origin = soup.find('div', class_='sourceTime').text.split('来源:')[1].split('时间:')[0].lstrip().strip().replace(' ', '').replace(' ', '').replace('\r\n', '')
        publishDate = soup.find('div', class_='sourceTime').text.split('时间:')[1].lstrip().strip()
        log.info(origin)
        contentWithTag, content, id_list = getContent(href, publishDate, num)
        num += 1
        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        dic_info = {
            'attachmentIds': id_list,
            'author': '',
            'content': content,
            'contentWithTag': contentWithTag,
            'deleteFlag': 0,
            'id': '',
            'title': title,
            'publishDate': publishDate,
            'origin': origin,
            'sourceAddress': href,
            'writtenDate': writtenDate,
            'organ': organ,
            'topicClassification': '',
            'issuedNumber': '',
            'summary': '',
            'createDate': time_now,
            'sid': '1729041791539326977',
        }
        try:
            baseCore.sendkafka(dic_info, topic)
            baseCore.r.sadd('REITs::' + webname, href)
            log.info(f'{title}===完成')
        except:
            for att_id in id_list:
                baseCore.deliteATT(att_id)
            log.error(f'第{page}页==={title}===失败')
        time.sleep(5)


def doJob():
    pageSize = getPageSize()
    for page in range(1, pageSize + 1):
        getDatas(page)


if __name__ == '__main__':
    doJob()
    # url = 'http%3A%2F%2Fwww.zj.gov.cn%2Fart%2F2022%2F4%2F18%2Fart_1229630461_2401403.html'
    # req = requests.get(url,headers=headers)
    # req.encoding = req.apparent_encoding
    baseCore.close()
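
The Zhejiang search API returns redirect-style hrefs with the real article address percent-encoded after url= (the commented-out address above shows the encoded form). getDatas() rebuilds it by splitting on 'url=' and undoing %3A/%2F by hand; urllib.parse.unquote produces the same result for links of that shape. A small check, with the query-string wrapper around the sample URL assumed:

from urllib.parse import unquote

raw = 'jump.do?url=http%3A%2F%2Fwww.zj.gov.cn%2Fart%2F2022%2F4%2F18%2Fart_1229630461_2401403.html&q=REITs'
decoded = raw.split('url=')[1].split('.html')[0].replace('%3A', ':').replace('%2F', '/') + '.html'
print(decoded)                                      # http://www.zj.gov.cn/art/2022/4/18/art_1229630461_2401403.html
print(unquote(raw.split('url=')[1].split('&')[0]))  # same URL via unquote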