王景浩 / zzsn_spider / Commits

Commit 5d788bc9, authored Nov 28, 2023 by 薛凌堃

Reits专题

Parent: c702fb7b

Showing 2 changed files with 155 additions and 209 deletions (+155 −209):

  BaseCore.py  (REITs专题数据/BaseCore.py)  +19  −26
  reits.py     (REITs专题数据/reits.py)     +136 −183
REITs专题数据/BaseCore.py @ 5d788bc9

 # REITs专题核心工具包
 ...
@@ -5,6 +5,7 @@ import random
 import socket
 import sys
 import time
+import uuid
 import fitz
 import logbook
@@ -252,7 +253,7 @@ class BaseCore:
                               charset='utf8mb4')
         self.cursor_ = self.cnx_.cursor()
         # 连接到Redis
-        self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
+        self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=0)
         self.pool_caiji = PooledDB(
             creator=pymysql,
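The move from db=6 to db=0 is not cosmetic: the rewritten reits.py below dedupes collected article URLs against this same self.r handle. A minimal sketch of the pattern, assuming only the connection parameters shown above (the already_collected helper is illustrative, not part of BaseCore):

    import redis

    r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=0)

    def already_collected(webname, url):
        # reits.py keys its dedup sets as 'REITs::<site name>' and checks
        # membership before crawling each article.
        return r.sismember('REITs::' + webname, url)

In the spiders themselves, r.sadd() is deferred until the record has been pushed to Kafka successfully, so a failed article stays eligible for the next run.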
@@ -451,6 +452,7 @@ class BaseCore:
     # def doc_page(self,file_path):
     #     doc = Document(file_path)
     #     return len(doc.sections)

     def deliteATT(self, id):
         delitesql = f"delete from clb_sys_attachment where id = '{id}' "
         self.cursor_.execute(delitesql)

@@ -492,6 +494,9 @@ class BaseCore:
             id = selects[0]
             return id, full_path

+    def getuuid(self):
+        get_timestamp_uuid = uuid.uuid1()  # 根据时间戳生成 uuid,保证全球唯一
+        return get_timestamp_uuid
+
     # 获取文件大小
     def convert_size(self, size_bytes):
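The new getuuid() underpins the OBS object naming in the uptoOBS hunk below: uuid.uuid1() derives its value from the host MAC address plus a 100-nanosecond timestamp, so generated names should not repeat across runs or machines. A quick illustration (the '.pdf' suffix stands in for the real extension, which uptoOBS takes from category):

    import uuid

    object_name = str(uuid.uuid1()) + '.pdf'
    # e.g. '8f1d2c3a-8d5b-11ee-b9d1-0242ac120002.pdf'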
@@ -520,37 +525,25 @@ class BaseCore:
             except:
                 time.sleep(3)
                 continue
+        page_size = 0
         for i in range(0, 3):
             try:
-                # name = file_name
-                if category in file_name:
-                    pass
-                else:
-                    file_name = file_name + category
-                result = obsClient.putContent('zzsn', 'PolicyDocuments/' + file_name, content=response.content)
+                file_name = str(self.getuuid()) + category
+                result = obsClient.putContent('zzsn', 'PolicyDocument/' + file_name, content=response.content)
                 break
             except:
                 time.sleep(3)
                 continue
-        try:
-            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-            retData['state'] = True
-            retData['path'] = result['body']['objectUrl'].split('.com')[1]
-            retData['full_path'] = unquote(result['body']['objectUrl'])
-            retData['file_size'] = self.convert_size(file_size)
-            retData['create_time'] = time_now
-        except Exception as e:
-            print(f'error:{e}')
-            return retData
+        if page_size < 1:
+            # pdf解析失败
+            # print(f'======pdf解析失败=====')
+            return retData
+        else:
+            try:
+                time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+                retData['state'] = True
+                retData['path'] = result['body']['objectUrl'].split('.com')[1]
+                retData['full_path'] = unquote(result['body']['objectUrl'])
+                retData['file_size'] = self.convert_size(file_size)
+                retData['create_time'] = time_now
+            except Exception as e:
+                print(f'error:{e}')
+                return retData
         return retData

     def sendkafka(self, post_data, topic):
         try:
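Net effect of the BaseCore changes: uptoOBS now names every upload with a fresh uuid under PolicyDocument/ and returns a retData dict that reits.py consumes. A sketch of the call site as the new reits.py uses it (the field comments are one reading of the reconstructed logic above, not documented API):

    retData = baseCore.uptoOBS(file_href, '', file_name)  # second argument no longer '9999'
    if retData['state']:
        path = retData['path']            # object key, e.g. '/PolicyDocument/<uuid>.pdf'
        full_path = retData['full_path']  # unquoted objectUrl returned by OBS
    else:
        pass                              # upload failed after 3 retries; skip the attachment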
REITs专题数据/reits.py @ 5d788bc9

 import os
 ...
@@ -107,7 +107,7 @@ class Policy():
         category = os.path.splitext(file_href)[1]
         if category not in file_name:
             file_name = file_name + category
-        retData = baseCore.uptoOBS(file_href, '9999', file_name)
+        retData = baseCore.uptoOBS(file_href, '', file_name)
         if retData['state']:
             pass
         else:
@@ -136,7 +136,7 @@ class Policy():
 policy = Policy()

 #国家发展和改革委员会 https://www.ndrc.gov.cn/xxgk/wjk/index.html?tab=all&qt=
-def reform(wb, file_path):
+def reform():
     headers = {
         'Accept': 'application/json, text/javascript, */*; q=0.01',
         'Accept-Encoding': 'gzip, deflate, br',
@@ -153,22 +153,30 @@ def reform(wb,file_path):
         'sec-ch-ua-mobile': '?0',
         'sec-ch-ua-platform': '"Windows"'
     }
-    DataList = []
+    # DataList = []
     num = 0
-    path = 'data/国家改革发展委员会'
-    if not os.path.exists(path):
-        os.makedirs(path)
+    webname = '中华人民共和国国家发展和改革委员会'
+    # path = 'data/国家改革发展委员会'
+    # if not os.path.exists(path):
+    #     os.makedirs(path)
     for page in range(1, 3):
         url = f'https://fwfx.ndrc.gov.cn/api/query?qt=REITs&tab=all&page={page}&pageSize=20&siteCode=bm04000fgk&key=CAB549A94CF659904A7D6B0E8FC8A7E9&startDateStr=&endDateStr=&timeOption=0&sort=dateDesc'
         result = policy.getrequest_json(headers, url)
         data_list = result['data']['resultList']
         for info in data_list:
             num += 1
+            id_list = []
             # info = data_list[1]
             publishDate_ = info['docDate']
             title = info['title']
             summary = info['summary'].replace('<em>', '').replace('</em>', '')
             newsUrl = info['url']
+            # 根据链接判重
+            is_member = baseCore.r.sismember('REITs::' + webname, newsUrl)
+            if is_member:
+                continue
             header = {
                 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
                 'Accept-Encoding': 'gzip, deflate, br',
@@ -190,6 +198,7 @@ def reform(wb,file_path):
             }
             newssoup = policy.getrequest_soup(header, newsUrl)
             # print(newssoup)
+            policy.paserUrl(newssoup, newsUrl)
             try:
                 pubHao = ''
                 source = ''
@@ -229,20 +238,19 @@ def reform(wb,file_path):
             pattern = r"\d{4}年\d{1,2}月\d{1,2}日"
             match = re.match(pattern, publishDate)
             if match:
-                date1 = datetime.strptime(publishDate, "%Y年%m月%d日")
-                publishDate = date1.strftime("%Y-%m-%d")
                 pass
             else:
                 publishDate = ''
             policy.deletep(contentWithTag, 3, 'div', 'style', 'text-align: center;')
             policy.deletek(contentWithTag)
             content = contentWithTag.text
             try:
                 policy.paserUrl(newssoup, newsUrl)
                 att = newssoup.find('div', class_='attachment_r')
+                fu_jian_name = ''
+                fu_jian_href = ''
             except:
-                fu_jian_name = ''
-                fu_jian_href = ''
                 att = ''
             if att:
                 for a in att.find_all('a'):
@@ -255,49 +263,61 @@ def reform(wb,file_path):
                         pass
                     else:
                         file_name = file_name + category
-                        rename_file = f'{str(num)}_{publishDate}_{file_name}'
-                        fu_jian_name += rename_file + '\n'
-                        fu_jian_href += file_href + '\n'
-                        policy.downloadfile(file_href, f'{path}/{rename_file}')
+                        att_id, full_path = policy.attuributefile(file_name, file_href, num, publishDate_)
+                        if att_id:
+                            id_list.append(att_id)
+                            a['href'] = full_path
+                contentWithTag_str = str(contentWithTag) + str(newssoup.find('div', class_='attachment'))
+            else:
+                contentWithTag_str = str(contentWithTag)
+            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
             dic_info = {
-                '序号': num,
-                '标题': title,
-                '发布时间': publishDate_,
-                '来源': source,
-                '原文链接': newsUrl,
-                '发文时间': publishDate,
-                '发文机构': '',
-                '发文字号': pubHao,
-                '摘要': summary,
-                '正文': content,
-                '附件名称': fu_jian_name,
-                '附件链接': fu_jian_href,
+                'attachmentIds': id_list,
+                'author': '',
+                'content': content,
+                'contentWithTag': contentWithTag_str,
+                'deleteFlag': 0,
+                'id': '',
+                'title': title,
+                'publishDate': publishDate_,
+                'origin': source,
+                'sourceAddress': newsUrl,
+                'writtenDate': publishDate,
+                'organ': '',
+                'topicClassification': '',
+                'issuedNumber': pubHao,
+                'summary': summary,
+                'createDate': time_now,
+                'sid': '1729029275400646658',
             }
-            DataList.append(dic_info)
+            # DataList.append(dic_info)
             try:
                 baseCore.sendkafka(dic_info, topic)
+                baseCore.r.sadd('REITs::' + webname, newsUrl)
+                log.info(f'采集成功--{title}--{newsUrl}')
             except:
-                sheet_name = "国家发展和改革委员会"
-                if sheet_name in wb.sheetnames:
-                    log.info(f"{sheet_name}工作表已存在!")
-                else:
-                    # 创建新工作表
-                    wb.create_sheet(sheet_name)
-                    print(f"{sheet_name}新工作表创建完成!")
-                # 保存Excel文件
-                wb.save(file_path)
-                baseCore.writerToExcel(DataList, file_path, sheet_name)
+                for att_id in id_list:
+                    baseCore.deliteATT(att_id)
+            # sheet_name = "国家发展和改革委员会"
+            # if sheet_name in wb.sheetnames:
+            #     log.info(f"{sheet_name}工作表已存在!")
+            # else:
+            #     # 创建新工作表
+            #     wb.create_sheet(sheet_name)
+            #     print(f"{sheet_name}新工作表创建完成!")
+            # # 保存Excel文件
+            # wb.save(file_path)
+            #
+            # baseCore.writerToExcel(DataList, file_path, sheet_name)
         except Exception as e:
             log.info(f"error!!!{newsUrl}")
             log.info({e})
     log.info(f'====第{page}页====处理结束,已采集{num}条数据=================')
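This hunk is the heart of the commit: reform() stops writing an Excel workbook and instead publishes each record to Kafka, marks the URL in Redis, and rolls back attachments on failure. The new control flow, reduced to a skeleton (every call here appears in the diff above; topic and log come from the surrounding module):

    try:
        baseCore.sendkafka(dic_info, topic)            # publish the record
        baseCore.r.sadd('REITs::' + webname, newsUrl)  # only now mark the URL as collected
        log.info(f'采集成功--{title}--{newsUrl}')
    except:
        for att_id in id_list:
            baseCore.deliteATT(att_id)  # delete the clb_sys_attachment rows just written

Because sadd runs only after a successful send, a failed publish leaves the URL out of the dedup set and the article is retried in full on the next run; deleting the attachment rows keeps that retry from creating orphans. zhengquanqihuo() below gets the same treatment.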
 #证券期货 https://neris.csrc.gov.cn/falvfagui/multipleFindController/indexJsp
-def zhengquanqihuo(wb, file_path):
+def zhengquanqihuo():
     headers = {
         'Accept': 'application/json, text/javascript, */*; q=0.01',
         'Accept-Encoding': 'gzip, deflate, br',
@@ -337,11 +357,12 @@ def zhengquanqihuo(wb,file_path):
     total = pageUtil['rowCount']
     page_size = pageUtil['pageSize']
     Max_page = int(total / page_size)
-    DataList = []
+    # DataList = []
     num = 0
-    path = 'data/证监会'
-    if not os.path.exists(path):
-        os.makedirs(path)
+    webname = '证券期货法规数据库系统'
+    # path = 'data/证监会'
+    # if not os.path.exists(path):
+    #     os.makedirs(path)
     for page in range(0, Max_page + 1):
         payload_page = {
             'pageNo': page + 1,
@@ -359,6 +380,7 @@ def zhengquanqihuo(wb,file_path):
         data_page = policy.requestPost(headers, url, payload_page)
         info_list = data_page['pageUtil']['pageList']
         for info in info_list:
+            id_list = []
             num += 1
             try:
                 title = info['secFutrsLawName']
@@ -369,41 +391,63 @@ def zhengquanqihuo(wb,file_path):
                 # print(publishDate)
                 secFutrsLawId = info['secFutrsLawId']
                 newsUrl = f'https://neris.csrc.gov.cn/falvfagui/rdqsHeader/mainbody?navbarId=3&secFutrsLawId={secFutrsLawId}&body=REITs'
+                # 根据链接判重
+                is_member = baseCore.r.sismember('REITs::' + webname, newsUrl)
+                if is_member:
+                    continue
                 browser = policy.createDriver()
                 browser.get(newsUrl)
                 time.sleep(1)
                 page_source = browser.page_source
                 newssoup = BeautifulSoup(page_source, 'html.parser')
+                policy.paserUrl(newssoup, newsUrl)
                 # print(newssoup)
                 contentWithTag = newssoup.find('div', class_='law_text mainBody catalog')
                 content = contentWithTag.text.replace('显示注释', '')
                 # print(content)
+                contentWithTag_str = str(contentWithTag)
+                time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                 dic_info = {
-                    '序号': num,
-                    '标题': title,
-                    '发布时间': publishDate,
-                    '来源': source,
-                    '原文链接': newsUrl,
-                    '发文时间': publishDate,
-                    '发文机构': source,
-                    '发文字号': pubHao,
-                    '摘要': '',
-                    '正文': content,
-                    '附件名称': '',
-                    '附件链接': '',
+                    'attachmentIds': id_list,
+                    'author': '',
+                    'content': content,
+                    'contentWithTag': contentWithTag_str,
+                    'deleteFlag': 0,
+                    'id': '',
+                    'title': title,
+                    'publishDate': publishDate,
+                    'origin': source,
+                    'sourceAddress': newsUrl,
+                    'writtenDate': publishDate,
+                    'organ': source,
+                    'issuedNumber': pubHao,
+                    'summary': '',
+                    'topicClassification': '',
+                    'createDate': time_now,
+                    'sid': '1729030277461815298',
                 }
-                DataList.append(dic_info)
-                sheet_name = "证监会"
-                if sheet_name in wb.sheetnames:
-                    log.info(f"{sheet_name}工作表已存在!")
-                else:
-                    # 创建新工作表
-                    wb.create_sheet(sheet_name)
-                    print(f"{sheet_name}新工作表创建完成!")
-                # 保存Excel文件
-                wb.save(file_path)
-                baseCore.writerToExcel(DataList, file_path, sheet_name)
+                try:
+                    baseCore.sendkafka(dic_info, topic)
+                    baseCore.r.sadd('REITs::' + webname, newsUrl)
+                    log.info(f'采集成功--{title}--{newsUrl}')
+                except:
+                    for att_id in id_list:
+                        baseCore.deliteATT(att_id)
+                # DataList.append(dic_info)
+                # sheet_name = "证监会"
+                # if sheet_name in wb.sheetnames:
+                #     log.info(f"{sheet_name}工作表已存在!")
+                # else:
+                #     # 创建新工作表
+                #     wb.create_sheet(sheet_name)
+                #     print(f"{sheet_name}新工作表创建完成!")
+                # # 保存Excel文件
+                # wb.save(file_path)
+                #
+                # baseCore.writerToExcel(DataList, file_path, sheet_name)
             except Exception as e:
                 log.info(f"error!!!{num}")
                 log.info({e})
@@ -428,9 +472,10 @@ def sse(wb,file_path):
     total_page = result['data']['totalPage']
     DataList = []
     num = 0
-    path = 'data/上海交易所'
-    if not os.path.exists(path):
-        os.makedirs(path)
+    webname = '上海证券交易所'
+    # path = 'data/上海交易所'
+    # if not os.path.exists(path):
+    #     os.makedirs(path)
     for page in range(0, int(total_page)):
         url_page = f'http://query.sse.com.cn/search/getESSearchDoc.do?page={page}&limit=10&publishTimeEnd=&publishTimeStart=&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode=8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C8685%2C9348%2C12632%2C12768%2C12769%2C12770%2C12771%2C12772%2C12773%2C12774%2C12775%2C12776%2C12777%2C12778%2C12779%2C12780%2C12781%2C12782%2C12783%2C12784%2C12785%2C12786%2C12787%2C12788%2C12789%2C12790%2C12791%2C12792%2C12793%2C12794%2C12795%2C12796%2C12797%2C12798%2C12799%2C12800%2C12801%2C12802%2C12803%2C12804%2C12805%2C12806%2C12807%2C12808%2C12809%2C12810%2C12811%2C12812%2C13061%2C13282%2C13283%2C13284%2C13285%2C13286%2C13287%2C13288%2C13289%2C13294%2C13364%2C13365%2C13366%2C13367%2C14595%2C14596%2C14597%2C14598%2C14599%2C14600%2C14601%2C14602%2C14603%2C14604%2C14605%2C14606&trackId=50619067167713018335655119683810&_=1699508921761'
         data = policy.getrequest_json(headers, url_page)
@@ -456,9 +501,14 @@ def sse(wb,file_path):
                 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
             }
             newsUrl = 'http://www.sse.com.cn' + news['extend'][4]['value']
+            # 根据链接判重
+            is_member = baseCore.r.sismember('REITs::' + webname, newsUrl)
+            if is_member:
+                continue
             if '.pdf' in newsUrl:
+                fu_jian_name = ''
+                fu_jian_href = ''
                 content = ''
                 response = requests.get(newsUrl, timeout=20)
                 with fitz.open(stream=response.content, filetype='pdf') as doc:
@@ -466,10 +516,10 @@ def sse(wb,file_path):
                         content += page.get_text()
                 file_href = newsUrl
                 file_name = title
-                rename_file = f'{str(num)}_{publishDate}_{file_name}'
-                fu_jian_name += rename_file + '\n'
-                policy.downloadfile(file_href, f'{path}/{rename_file}')
+                policy.attuributefile(title, newsUrl, num, publishDate)
+                fu_jian_href += file_href + '\n'
                 dic_info = {
                     '序号': num,
                     '标题': title,
@@ -553,100 +603,6 @@ def sse(wb,file_path):
         baseCore.writerToExcel(DataList, file_path, sheet_name)

-#北京市人民政府 https://www.beijing.gov.cn/so/s?siteCode=1100000088&tab=zcfg&qt=REITs
-def beijing():
-    url = 'https://www.beijing.gov.cn/so/ss/query/s'
-    payload = {
-        'siteCode': '1100000088',
-        'tab': 'zcfg',
-        'qt': 'REITs',
-        'sort': 'relevance',
-        'keyPlace': '0',
-        'locationCode': '110000000000',
-        'page': '1',
-        'pageSize': '20',
-        'ie': '89b5e964-dc3c-4a5b-8d80-f6c408769b4a'
-    }
-    headers = {
-        'Accept': 'application/json, text/javascript, */*; q=0.01',
-        'Accept-Encoding': 'gzip, deflate, br',
-        'Accept-Language': 'zh-CN,zh;q=0.9',
-        'Connection': 'keep-alive',
-        'Content-Length': '148',
-        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
-        'Cookie': 'Path=/; Path=/; __jsluid_s=91bdb0d83098fd2e8a8455a9085a22e2; JSESSIONID=M2FmNDczYzYtMmNkYS00N2I0LThhNDgtYWJiMTdhOTIyZDI4; _va_ref=%5B%22%22%2C%22%22%2C1699515166%2C%22https%3A%2F%2Fdocs.qq.com%2F%22%5D; _va_ses=*; JSESSIONID=CD61DA650DB33324962A3BF2527672D0; arialoadData=false; _va_id=c7a63e4b2199befd.1699358536.2.1699515273.1699515166.; CPS_SESSION=2FEFDC54444B24762D057AD6BDE3C7BF',
-        'Host': 'www.beijing.gov.cn',
-        'Origin': 'https://www.beijing.gov.cn',
-        'Referer': 'https://www.beijing.gov.cn/so/s?siteCode=1100000088&tab=zcfg&qt=REITs',
-        'Sec-Fetch-Dest': 'empty',
-        'Sec-Fetch-Mode': 'cors',
-        'Sec-Fetch-Site': 'same-origin',
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
-        'X-Requested-With': 'XMLHttpRequest',
-        'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
-        'sec-ch-ua-mobile': '?0',
-        'sec-ch-ua-platform': '"Windows"'
-    }
-    result = policy.requestPost(headers, url, payload)
-    total = result['totalHits']
-    page_size = result['currentHits']
-    Max_page = int(total / page_size)
-    for page in range(0, Max_page):
-        payload_page = {
-            'siteCode': '1100000088',
-            'tab': 'zcfg',
-            'qt': 'REITs',
-            'sort': 'relevance',
-            'keyPlace': '0',
-            'locationCode': '110000000000',
-            'page': page + 1,
-            'pageSize': '20',
-            'ie': '89b5e964-dc3c-4a5b-8d80-f6c408769b4a'
-        }
-        data = policy.requestPost(headers, url, payload_page)
-        info_list = data['resultDocs']
-        # print(info_list)
-        for info_ in info_list:
-            info = info_['data']
-            title = info['titleO']
-            titleLabel = info['titleLabel']['value']
-            publishDate = info['docDate']
-            # source = info['siteLabel']['value']
-            newsUrl = info['url']
-            if titleLabel == '政策解读':
-                newssoup = policy.getrequest_soup(headers, newsUrl)
-                print(newssoup)
-                contentWithTag = newssoup.find('div', id='mainText')
-                content = contentWithTag.text
-                source = newssoup.select('p[class="fl"]>span')[1].replace('来源:', '')
-                formatRows = info['formatRows']
-                num = 1
-                for row in formatRows:
-                    for col in row['col']:
-                        name = col['text']
-                        if name == '相关附件':
-                            value = col['value']
-                            file_href = value.keys()
-                            file_name = value.values()
-                            # 附件上传
-                            policy.attuributefile(file_name, file_href, num, publishDate)
-                            num += 1
-                        value = col['value'][0]
-                        dic_info[name] = value
-                dic_info = {
-                    'title': title,
-                    'publishDate': publishDate,
-                    'source': source,
-                    'newsUrl': newsUrl,
-                    'file_href': file_href
-                }
-                # print(dic_info)
-                # break

 # 河北省人民政府
 def hebei():
     path = 'data/河北省人民政府'
@@ -851,10 +807,6 @@ def hebei():
             baseCore.writerToExcel(DataList, file_path, sheet_name)
             break

-# 广东省人民政府
-def guangdong():
-    pass
-
 # 贵州省人民政府
 def guizhou():

@@ -963,12 +915,12 @@ def guizhou():

 if __name__ == "__main__":
-    file_path = f'data/REITs贵州省人民政府.xlsx'
-    wb = policy.createfile(file_path)
-    # reform(wb,file_path)
+    # file_path = f'data/REITs贵州省人民政府.xlsx'
+    # wb = policy.createfile(file_path)
+    # reform()
     # shenzhen()
-    # zhengquanqihuo(wb,file_path)
-    # sse(wb,file_path)
+    zhengquanqihuo()
+    # sse()
     # hebei()
-    guizhou()
+    # guizhou()
     # zhengquanqihuo()
\ No newline at end of file