王景浩 / zzsn_spider · Commits · fa46345c

Commit fa46345c
Authored Nov 29, 2023 by 薛凌堃
Commit message: 11.29
Parent: 7e42c8e8

Showing 1 changed file with 131 additions and 114 deletions.

REITs专题数据/reits.py  (+131 / −114)

REITs专题数据/reits.py @ fa46345c
import os
...
@@ -508,52 +508,67 @@ def sse():
is_member = baseCore.r.sismember('REITs::' + webname, newsUrl)
if is_member:
    continue
try:
    if '.pdf' in newsUrl:
        # pass
        content = ''
        response = requests.get(newsUrl, timeout=20)
        with fitz.open(stream=response.content, filetype='pdf') as doc:
            for page in doc.pages():
                content += page.get_text()
        file_href = newsUrl
        file_name = title
        att_id, full_path = policy.attuributefile(title, newsUrl, num, publishDate)
        if att_id:
            id_list.append(att_id)
        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    if '.pdf' in newsUrl:
        content = ''
        response = requests.get(newsUrl, timeout=20)
        with fitz.open(stream=response.content, filetype='pdf') as doc:
            for page in doc.pages():
                content += page.get_text()
        file_href = newsUrl
        file_name = title
        policy.attuributefile(title, newsUrl, num, publishDate)
        dic_info = {
            '序号': num,
            '标题': title,
            '发布时间': publishDate,
            '来源': source,
            '原文链接': newsUrl,
            '发文时间': '',
            '发文机构': '',
            '发文字号': '',
            '摘要': summary,
            '正文': content,
            '附件名称': fu_jian_name,
            '附件链接': fu_jian_href,
        }
        DataList.append(dic_info)
    else:
        newssoup = policy.getrequest_soup(header, newsUrl)
        # print(newssoup)
        content_ = newssoup.find('div', class_='allZoom')
        # print(content_)
        # Replace relative links with absolute URLs
        contentWithTag = policy.paserUrl(content_, newsUrl)
        try:
            pubHao = contentWithTag.find('p', style='text-align: center;').text.strip(' ')
            if '〔' in pubHao:
                pass
            else:
                dic_info = {
                    'attachmentIds': id_list,
                    'author': '',
                    'content': content,
                    'contentWithTag': '',
                    'deleteFlag': 0,
                    'id': '',
                    'title': title,
                    'publishDate': publishDate,
                    'origin': source,
                    'sourceAddress': newsUrl,
                    'writtenDate': None,
                    'organ': '',
                    'topicClassification': '',
                    'issuedNumber': '',
                    'summary': summary,
                    'createDate': time_now,
                    'sid': '1729035244826374145',
                }
                # DataList.append(dic_info)
                try:
                    baseCore.sendkafka(dic_info, topic)
                    baseCore.r.sadd('REITs::' + webname, newsUrl)
                    log.info(f'采集成功--{title}--{newsUrl}')
                except:
                    for att_id in id_list:
                        baseCore.deliteATT(att_id)
    else:
        newssoup = policy.getrequest_soup(header, newsUrl)
        # print(newssoup)
        policy.paserUrl(newssoup, newsUrl)
        content_ = newssoup.find('div', class_='allZoom')
        # print(content_)
        # Replace relative links with absolute URLs
        contentWithTag = policy.paserUrl(content_, newsUrl)
        try:
            pubHao = contentWithTag.find('p', style='text-align: center;').text.strip(' ')
            if '〔' in pubHao:
                pass
            else:
                pubHao = ''
        except:
            pubHao = ''
        except:
            pubHao = ''
        # print(contentWithTag)
        content = contentWithTag.text
        # print(contentWithTag)
        content = contentWithTag.text
        fujian_list = contentWithTag.find_all('a')
...
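For reference, a minimal standalone sketch of the PDF-to-text pattern this hunk relies on: the file is fetched with requests and read in memory with PyMuPDF (fitz), without a temporary file. The function name and URL handling below are illustrative only, not part of the crawler's own helpers.

import requests
import fitz  # PyMuPDF


def pdf_text(url: str, timeout: int = 20) -> str:
    # Download the PDF and extract its plain text page by page.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    text = ''
    with fitz.open(stream=response.content, filetype='pdf') as doc:
        for page in doc:
            text += page.get_text()
    return text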
@@ -753,78 +768,78 @@ def hebei():
else:
    continue
writeDate_ = p.text
pattern = r"\d{4}年\d{1,2}月\d{1,2}日"
match = re.search(pattern, writeDate_)
if match:
    writeDate = match.group(0)
    break
writeDate_ = p.text
pattern = r"\d{4}年\d{1,2}月\d{1,2}日"
match = re.search(pattern, writeDate_)
if match:
    writeDate1 = match.group(0)
    date2 = datetime.strptime(writeDate1, "%Y年%m月%d日")
    writeDate = date2.strftime("%Y-%m-%d")
    break
else:
    continue
except:
    try:
        contentWithTag = news_soup.find('div', class_='xxgk_gfxwjk_xqy-wznr')
        content = news_soup.find('div', class_='xxgk_gfxwjk_xqy-wznr').text
        info = news_soup.find('div', class_='xxgk_gfxwjk-xqy-touxx')
        policy.deletespan(info)
        pub_hao = info.find('p', class_='xxgk_gfxwjk-xqy-touxx4').text
        pub_origin = info.find('p', class_='xxgk_gfxwjk-xqy-touxx3').text
        writeDate = info.find('p', class_='xxgk_gfxwjk-xqy-touxx5').text
    except:
        pass
# Attachments:
try:
    fujian_href = contentWithTag.find_all('a')
    for file_href_ in fujian_href:
        file_href = file_href_['href']
        file_name = file_href_.text
        category = os.path.splitext(file_href)[1]
        if category in file_name:
            pass
        else:
            continue
    except:
        file_name = file_name + category
        att_id, full_path = policy.attuributefile(file_name, file_href, num, publishDate)
        if att_id:
            id_list.append(att_id)
        file_href_['href'] = full_path
    contentWithTag_str = str(contentWithTag)
except Exception as e:
    contentWithTag_str = str(contentWithTag)
if content == '':
    continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_info = {
    'attachmentIds': id_list,
    'author': '',
    'content': content,
    'contentWithTag': contentWithTag_str,
    'title': title.replace('\n', ''),
    'publishDate': publishDate,
    'origin': source,
    'sourceAddress': news_href,
    'writtenDate': writeDate,
    'organ': pub_origin,
    'issuedNumber': pub_hao,
    'summary': summary.replace('\n', ''),
    'createDate': time_now,
    'sid': '1729041576348274689',
}
# print(dic_info)
try:
    contentWithTag = news_soup.find('div', class_='xxgk_gfxwjk_xqy-wznr')
    content = news_soup.find('div', class_='xxgk_gfxwjk_xqy-wznr').text
    info = news_soup.find('div', class_='xxgk_gfxwjk-xqy-touxx')
    policy.deletespan(info)
    pub_hao = info.find('p', class_='xxgk_gfxwjk-xqy-touxx4').text
    pub_origin = info.find('p', class_='xxgk_gfxwjk-xqy-touxx3').text
    writeDate = info.find('p', class_='xxgk_gfxwjk-xqy-touxx5').text
    baseCore.sendkafka(dic_info, topic)
    baseCore.r.sadd('REITs::' + webname, news_href)
    log.info(f'采集成功--{title}--{news_href}')
except:
    pass
# Attachments:
fu_jian_name = ''
fu_jian_href = ''
try:
    fujian_href = contentWithTag.find_all('a')
    policy.paserUrl(contentWithTag, news_href)
    for file_href_ in fujian_href:
        file_href = file_href_['href']
        file_name = file_href_.text
        category = os.path.splitext(file_href)[1]
        if category in file_name:
            pass
        else:
            file_name = file_name + category
        rename_file = f'{str(num)}_{publishDate}_{file_name}'
        fu_jian_name += rename_file + '\n'
        fu_jian_href += file_href + '\n'
        policy.downloadfile(file_href, f'{path}/{rename_file}')
    for att_id in id_list:
        baseCore.deliteATT(att_id)
except Exception as e:
    pass
if content == '':
    continue
dic_info = {
    '序号': num,
    '标题': title.replace('\n', ''),
    '发布时间': publishDate,
    '来源': source,
    '原文链接': news_href,
    '发文时间': writeDate,
    '发文机构': pub_origin,
    '发文字号': pub_hao,
    '摘要': summary.replace('\n', ''),
    '正文': content,
    '附件名称': fu_jian_name,
    '附件链接': fu_jian_href,
}
print(dic_info)
DataList.append(dic_info)
sheet_name = appName
if sheet_name in wb.sheetnames:
    log.info(f"{sheet_name}工作表已存在!")
else:
    # Create a new worksheet
    wb.create_sheet(sheet_name)
    print(f"{sheet_name}新工作表创建完成!")
# Save the Excel file
wb.save(file_path)
baseCore.writerToExcel(DataList, file_path, sheet_name)
break
log.info(f"error!!!{news_href}")
log.info(e)
log.info(f'====第{page}页====处理结束,已采集{num}条数据=================')

# 贵州省人民政府 (Guizhou Provincial People's Government)
def guizhou():
...
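The main behavioral change in the hebei() hunk above is that the extracted issue date is no longer kept in its raw Chinese form but normalized to ISO style before it is stored. A minimal sketch of that conversion, assuming the same regular expression and formats as the diff; the function name is illustrative.

import re
from datetime import datetime
from typing import Optional


def normalize_write_date(text: str) -> Optional[str]:
    # Find a date such as "2023年11月29日" and re-emit it as "2023-11-29".
    match = re.search(r"\d{4}年\d{1,2}月\d{1,2}日", text)
    if not match:
        return None
    return datetime.strptime(match.group(0), "%Y年%m月%d日").strftime("%Y-%m-%d")

# normalize_write_date("发文日期:2023年11月29日") would return "2023-11-29".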
@@ -948,6 +963,7 @@ if __name__=="__main__":
    # shenzhen()
    # zhengquanqihuo()
    # sse()
    # hebei()
    hebei()
    # guizhou()
    # zhengquanqihuo()
\ No newline at end of file
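Both hunks gate collection on a Redis set keyed by 'REITs::' + webname: a URL already in the set is skipped, and it is added only after the record has been pushed downstream. A hedged sketch of that pattern with redis-py; the connection details and the send_record callable are illustrative stand-ins, not the project's baseCore API.

import redis


def collect_once(r: redis.Redis, webname: str, news_url: str, record: dict, send_record) -> bool:
    # Skip URLs that were already collected for this site.
    key = 'REITs::' + webname
    if r.sismember(key, news_url):
        return False
    send_record(record)        # e.g. produce to Kafka; may raise on failure
    r.sadd(key, news_url)      # mark as collected only after a successful send
    return True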