王景浩 / zzsn_spider · Commits

Commit 6173700f
Authored Nov 16, 2023 by 薛凌堃

    11/16

Parent: d3fd7612

Showing 2 changed files with 288 additions and 19 deletions (+288 -19):

    REITs专题数据/reits.py             +279  -10
    comData/noticeReport/公告补采3.py     +9   -9
REITs专题数据/reits.py
 import os
 ...
@@ -36,6 +36,11 @@ class Policy():
         data_json = req.json()
         return data_json
 
+    def requestPost_html(self, headers, url, payload):
+        req = requests.post(headers=headers, url=url, data=payload)
+        result = BeautifulSoup(req.content, 'html.parser')
+        return result
+
     def createDriver(self):
         chrome_driver = r'D:\cmd100\chromedriver.exe'
         path = Service(chrome_driver)
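The new requestPost_html helper wraps requests.post and parses the body with BeautifulSoup. A minimal standalone sketch of the same pattern; the timeout, the status check, and the httpbin URL are additions for this sketch, not part of the commit:

    import requests
    from bs4 import BeautifulSoup

    def requestPost_html(headers, url, payload):
        # POST the form payload and parse the response, as the new method does;
        # timeout and raise_for_status are defensive additions in this sketch
        req = requests.post(url=url, headers=headers, data=payload, timeout=30)
        req.raise_for_status()
        return BeautifulSoup(req.content, 'html.parser')

    soup = requestPost_html({'User-Agent': 'Mozilla/5.0'},
                            'https://httpbin.org/post', {'q': 'REITs'})
    print(soup.get_text()[:120])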
...
@@ -48,11 +53,22 @@ class Policy():
         return driver
 
     def deletep(self, soup, i, tag, attribute_to_delete, value_to_delete):
-        # 查找带有指定属性的P标签并删除
+        # 查找带有指定属性的标签并删除
        tags = soup.find_all(tag, {attribute_to_delete: value_to_delete})
         for tag in tags[:i]:
             tag.decompose()
 
+    def deletespan(self, td):
+        spans = td.find_all('span')
+        for span in spans:
+            span.extract()  # 删除span标签
+
+    def deletetag(self, td, tag):
+        tags = td.find_all(tag)
+        for tag_ in tags:
+            tag_.extract()  # 删除指定标签
+
     def deletek(self, soup):
         # 删除空白标签(例如<p></p>、<p><br></p>, img、video、hr除外)
         for i in soup.find_all(lambda tag: len(tag.get_text()) == 0 and tag.name not in ["img", "video", "br"] and tag.name != "br" or tag.get_text() == ' '):
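Both new helpers rely on BeautifulSoup's extract(), which detaches a node from the tree in place. A self-contained check of that behavior with made-up HTML follows. (Note that in deletek's predicate above, Python operator precedence parses the condition as "(len(...) == 0 and name not in [...]) or text == ' '", so any tag whose text is a single space is removed regardless of its name.)

    from bs4 import BeautifulSoup

    html = '<td>公告<span>2023-11-16</span><strong>置顶</strong></td>'
    td = BeautifulSoup(html, 'html.parser').td

    for span in td.find_all('span'):    # same loop as deletespan
        span.extract()
    for tag_ in td.find_all('strong'):  # same loop as deletetag with tag='strong'
        tag_.extract()

    print(td)  # <td>公告</td>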
...
@@ -386,7 +402,6 @@ def zhengquanqihuo(wb,file_path):
 
 #深圳交易所 http://www.szse.cn/lawrules/index.html
 #上海交易所 http://www.sse.com.cn/home/search/index.shtml?webswd=REITs
 def sse(wb,file_path):
     url = 'http://query.sse.com.cn/search/getESSearchDoc.do?page=0&limit=10&publishTimeEnd=&publishTimeStart=&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode=8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C8685%2C9348%2C12632%2C12768%2C12769%2C12770%2C12771%2C12772%2C12773%2C12774%2C12775%2C12776%2C12777%2C12778%2C12779%2C12780%2C12781%2C12782%2C12783%2C12784%2C12785%2C12786%2C12787%2C12788%2C12789%2C12790%2C12791%2C12792%2C12793%2C12794%2C12795%2C12796%2C12797%2C12798%2C12799%2C12800%2C12801%2C12802%2C12803%2C12804%2C12805%2C12806%2C12807%2C12808%2C12809%2C12810%2C12811%2C12812%2C13061%2C13282%2C13283%2C13284%2C13285%2C13286%2C13287%2C13288%2C13289%2C13294%2C13364%2C13365%2C13366%2C13367%2C14595%2C14596%2C14597%2C14598%2C14599%2C14600%2C14601%2C14602%2C14603%2C14604%2C14605%2C14606&trackId=50619067167713018335655119683810&_=1699508921761'
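The hand-encoded %2C separators in this query string are fragile to edit. A sketch, not part of the commit, that assembles the same URL with urllib.parse.urlencode (the channel code list is abbreviated here); urlencode percent-encodes the commas to %2C automatically:

    from urllib.parse import urlencode

    channel_codes = ['8640', '8641', '8685', '9348']  # abbreviated; the real list has ~90 codes
    params = {
        'page': 0,
        'limit': 10,
        'keyword': 'REITs',
        'siteName': 'sse',
        'keywordPosition': 'title,paper_content',
        'channelId': 10001,
        'channelCode': ','.join(channel_codes),
    }
    url = 'http://query.sse.com.cn/search/getESSearchDoc.do?' + urlencode(params)
    print(url)  # commas come out as %2C, matching the literal above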
...
@@ -483,20 +498,24 @@ def sse(wb,file_path):
             fu_jian_name = ''
             fu_jian_href = ''
             for fujian in fujian_list:
-                file_href = fujian['href']
+                try:
+                    file_href = fujian['href']
+                except:
+                    continue
                 file_name = fujian.text.strip(' ')
                 category = os.path.splitext(file_href)[1]
                 if category in file_name:
                     pass
                 else:
                     file_name = file_name + category
-                rename_file = f'{str(num)}_{publishDate}_{file_name}'.replace('\\','').replace('/','').replace('|','').replace('>','').replace('<','').replace('*','').replace(':','').replace('?','').replace('—','')
+                rename_file = f'{str(num)}_{publishDate[:10]}_{file_name}'.replace('\\','').replace('/','').replace('|','').replace('>','').replace('<','').replace('*','').replace(':','').replace('?','').replace('—','').replace('-','')
                 fu_jian_name += rename_file + '\n'
                 fu_jian_href += file_href + '\n'
                 try:
                     policy.downloadfile(file_href, f'{path}/{rename_file}')
                 except:
                     log.info(f'--{page}-{num}======{newsUrl}')
+                    continue
             dic_info = {
                 '序号': num,
                 '标题': title,
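The new rename_file line extends an already long .replace() chain. A hypothetical helper, not in the commit, that strips the same characters with one regex, including the '-' the new version adds:

    import re

    def sanitize_filename(name):
        # Removes the same characters as the chained .replace() calls above:
        # \ / | > < * : ? — and -
        return re.sub(r'[\\/|><*:?—-]', '', name)

    print(sanitize_filename('12_2023-11-16_公告:REITs?年报.pdf'))
    # -> 12_20231116_公告REITs年报.pdf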
...
@@ -525,10 +544,7 @@ def sse(wb,file_path):
         baseCore.writerToExcel(DataList, file_path, sheet_name)
 
 #北京市人民政府 https://www.beijing.gov.cn/so/s?siteCode=1100000088&tab=zcfg&qt=REITs
 def beijing():
     url = 'https://www.beijing.gov.cn/so/ss/query/s'
     payload = {
@@ -622,12 +638,264 @@ def beijing():
...
@@ -622,12 +638,264 @@ def beijing():
# print(dic_info)
# print(dic_info)
# break
# break
+# 河北省人民政府
+def hebei():
+    path = 'data/河北省人民政府'
+    if not os.path.exists(path):
+        os.makedirs(path)
+    num = 0
+    url = "https://www.hebei.gov.cn/search/pcRender?pageId=b97a38833f7343cebc31dec44544f684"
+    appNames = ['热点专题']
+    for appName in appNames:
+        payload = {
+            'qAnd': ' ',
+            'qOr': ' ',
+            'qAll': ' ',
+            'qNot': ' ',
+            'startTime': ' ',
+            'endTime': ' ',
+            'advSearch': ' ',
+            'originalSearchUrl': ' /search/pcRender?pageId=b97a38833f7343cebc31dec44544f684',
+            'originalSearch': ' ',
+            'app': ' 20c723b3a36e4906b0d91e6950d3dc29,8b157f193fb54ea7837d6380a37bb84a,0ad7369c794e4b2fbd6a4e76f9b84e9c,47fb4bc5c08d49d3b937c56c7960a909,9f54f8001d8747e4826d542fedcc6abc,b42baf238f43435ea7f796bec4ef7592,c943f166fb9042d288743397b12978fc,4b2050e6bb5d48dc9b200385dd99b4e3,7b5b083a6d254960ab34e34009e7e8d7,aa9d0848dcb84e8b919fd02b2da090b4,54e1a38a0e2846a4bc60258af5ced450,b88b6ee476494a16b66ea9cacc0456ee,4d0e00783a2e4037a6d3bdcd1fe98fb1,a8cb58e7494e4ae4a682b0e79df63dc6,f70c53427500439cbdeee467c5a185a6,d3f6aaca16c54e7b8626993314ad27b7,4d63955d8ec441018e8fddc6131997b0',
+            'searchArea': ' ',
+            'appName': appName,
+            'sr': ' score desc',
+            'advtime': ' ',
+            'advrange': ' ',
+            'articleType': ' ',
+            'siteId': ' ',
+            'siteName': ' ',
+            'ext': ' ',
+            'pNo': ' 1',
+            'deviceType': ' pc',
+            'q2': ' ',
+            'q': ' REITs'
+        }
+        headers = {
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+            'Accept-Encoding': 'gzip, deflate, br',
+            'Accept-Language': 'zh-CN,zh;q=0.9',
+            'Cache-Control': 'max-age=0',
+            'Connection': 'keep-alive',
+            'Content-Length': '907',
+            'Content-Type': 'application/x-www-form-urlencoded',
+            'Cookie': 'aisearchbehavior=42b33c1f2d22475bb571093346193219; JSESSIONID=251311215A6447AE509141936F4569D4; arialoadData=true',
+            'Host': 'www.hebei.gov.cn',
+            'Origin': 'https://www.hebei.gov.cn',
+            'Referer': 'https://www.hebei.gov.cn/search/pcRender?pageId=b97a38833f7343cebc31dec44544f684',
+            'Sec-Fetch-Dest': 'document',
+            'Sec-Fetch-Mode': 'navigate',
+            'Sec-Fetch-Site': 'same-origin',
+            'Sec-Fetch-User': '?1',
+            'Upgrade-Insecure-Requests': '1',
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
+            'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
+            'sec-ch-ua-mobile': '?0',
+            'sec-ch-ua-platform': '"Windows"'
+        }
+        soup_ = policy.requestPost_html(headers, url, payload)
+        # 第一次请求获取页数
+        pages = int(soup_.find('span', class_='default-result-tolal-records').find('span').text)
+        DataList = []
+        for page in range(1, pages + 1):
+            payload_page = {
+                'qAnd': ' ',
+                'qOr': ' ',
+                'qAll': ' ',
+                'qNot': ' ',
+                'startTime': ' ',
+                'endTime': ' ',
+                'advSearch': ' ',
+                'originalSearchUrl': ' /search/pcRender?pageId=b97a38833f7343cebc31dec44544f684',
+                'originalSearch': ' ',
+                'app': ' 20c723b3a36e4906b0d91e6950d3dc29,8b157f193fb54ea7837d6380a37bb84a,0ad7369c794e4b2fbd6a4e76f9b84e9c,47fb4bc5c08d49d3b937c56c7960a909,9f54f8001d8747e4826d542fedcc6abc,b42baf238f43435ea7f796bec4ef7592,c943f166fb9042d288743397b12978fc,4b2050e6bb5d48dc9b200385dd99b4e3,7b5b083a6d254960ab34e34009e7e8d7,aa9d0848dcb84e8b919fd02b2da090b4,54e1a38a0e2846a4bc60258af5ced450,b88b6ee476494a16b66ea9cacc0456ee,4d0e00783a2e4037a6d3bdcd1fe98fb1,a8cb58e7494e4ae4a682b0e79df63dc6,f70c53427500439cbdeee467c5a185a6,d3f6aaca16c54e7b8626993314ad27b7,4d63955d8ec441018e8fddc6131997b0',
+                'searchArea': ' ',
+                'appName': appName,
+                'sr': ' score desc',
+                'advtime': ' ',
+                'advrange': ' ',
+                'articleType': ' ',
+                'siteId': ' ',
+                'siteName': ' ',
+                'ext': ' ',
+                'pNo': str(page),
+                'deviceType': ' pc',
+                'q2': ' ',
+                'q': ' REITs'
+            }
+            soup = policy.requestPost_html(headers, url, payload_page)
+            list_news = soup.find_all('div', class_='szf-data-tpl1-item')
+            for news in list_news:
+                num += 1
+                title = news.find('h3').text
+                summary = news.find('div').find('p', class_='txtCon').text
+                publishDate = news.find('div').find('p', class_='dates').text.replace('发布日期:', '').replace('\n', '')
+                news_href = news.find('div').find('p', class_='txtCon').find('a')['href']
+                # news_href = 'http://info.hebei.gov.cn//hbszfxxgk/6898876/7026469/7026511/7026506/7033297/index.html'
+                news_req = requests.get(news_href, headers)
+                news_soup = BeautifulSoup(news_req.content, 'html.parser')
+                writeDate = ''
+                pub_hao = ''
+                source = ''
+                content = ''
+                pub_origin = ''
+                try:
+                    content = news_soup.find('div', id='zoom').text
+                    contentWithTag = news_soup.find('div', id='zoom')
+                    try:
+                        source = news_soup.find('div', class_='article_tit').find('li', class_='xl_laiyuan').text
+                    except:
+                        source = ''
+                    try:
+                        info_ = news_soup.find('div', class_='xxgk_bmxl')
+                        policy.deletetag(info_, 'strong')
+                        policy.deletek(info_)
+                        info_list = info_.find_all('td')
+                        pub_origin = info_list[1].text
+                        pub_hao = info_list[2].text
+                    except:
+                        # 处理空标签
+                        policy.deletek(news_soup)
+                        p_list = news_soup.find_all('p')
+                        for p in p_list:
+                            text_pubhao = p.text
+                            if '号' in text_pubhao and '〔' in text_pubhao:
+                                pattern = r"冀政办字〔\d+〕\d+号"
+                                match = re.search(pattern, text_pubhao)
+                                if match:
+                                    pub_hao = match.group(0)
+                                    break
+                                else:
+                                    continue
+                            writeDate_ = p.text
+                            pattern = r"\d{4}年\d{1,2}月\d{1,2}日"
+                            match = re.search(pattern, writeDate_)
+                            if match:
+                                writeDate = match.group(0)
+                                break
+                            else:
+                                continue
+                except:
+                    try:
+                        contentWithTag = news_soup.find('div', class_='xxgk_gfxwjk_xqy-wznr')
+                        content = news_soup.find('div', class_='xxgk_gfxwjk_xqy-wznr').text
+                        info = news_soup.find('div', class_='xxgk_gfxwjk-xqy-touxx')
+                        policy.deletespan(info)
+                        pub_hao = info.find('p', class_='xxgk_gfxwjk-xqy-touxx4').text
+                        pub_origin = info.find('p', class_='xxgk_gfxwjk-xqy-touxx3').text
+                        writeDate = info.find('p', class_='xxgk_gfxwjk-xqy-touxx5').text
+                    except:
+                        pass
+                # 附件:
+                fu_jian_name = ''
+                fu_jian_href = ''
+                try:
+                    fujian_href = contentWithTag.find_all('a')
+                    policy.paserUrl(contentWithTag, news_href)
+                    for file_href_ in fujian_href:
+                        file_href = file_href_['href']
+                        file_name = file_href_.text
+                        category = os.path.splitext(file_href)[1]
+                        if category in file_name:
+                            pass
+                        else:
+                            file_name = file_name + category
+                        rename_file = f'{str(num)}_{publishDate}_{file_name}'
+                        fu_jian_name += rename_file + '\n'
+                        fu_jian_href += file_href + '\n'
+                        policy.downloadfile(file_href, f'{path}/{rename_file}')
+                except Exception as e:
+                    pass
+                if content == '':
+                    continue
+                dic_info = {
+                    '序号': num,
+                    '标题': title.replace('\n', ''),
+                    '发布时间': publishDate,
+                    '来源': source,
+                    '原文链接': news_href,
+                    '发文时间': writeDate,
+                    '发文机构': pub_origin,
+                    '发文字号': pub_hao,
+                    '摘要': summary.replace('\n', ''),
+                    '正文': content,
+                    '附件名称': fu_jian_name,
+                    '附件链接': fu_jian_href,
+                }
+                print(dic_info)
+                DataList.append(dic_info)
+        sheet_name = appName
+        if sheet_name in wb.sheetnames:
+            log.info(f"{sheet_name}工作表已存在!")
+        else:
+            # 创建新工作表
+            wb.create_sheet(sheet_name)
+            print(f"{sheet_name}新工作表创建完成!")
+            # 保存Excel文件
+            wb.save(file_path)
+        baseCore.writerToExcel(DataList, file_path, sheet_name)
+        break
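The fallback branch in hebei() pulls the 发文字号 and the write date out of free paragraph text with two regexes. A standalone check of those exact patterns (the sample string is made up). Note also that requests.get(news_href, headers) above passes headers positionally as params; requests.get(news_href, headers=headers) is presumably intended.

    import re

    text = '冀政办字〔2023〕15号 印发日期:2023年11月16日'
    hao = re.search(r"冀政办字〔\d+〕\d+号", text)
    date = re.search(r"\d{4}年\d{1,2}月\d{1,2}日", text)
    print(hao.group(0), date.group(0))  # 冀政办字〔2023〕15号 2023年11月16日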
+# 广东省人民政府
+def guangdong():
+    pass
+# 贵州省人民政府
+def guizhou():
+    url = "https://www.guizhou.gov.cn/irs/front/search"
+    payload = "{\"tenantId\":\"186\",\"configTenantId\":\"\",\"tenantIds\":\"\",\"searchWord\":\"REITs\",\"historySearchWords\":[\"REITs\"],\"dataTypeId\":\"965\",\"orderBy\":\"related\",\"searchBy\":\"all\",\"appendixType\":\"\",\"granularity\":\"ALL\",\"beginDateTime\":\"\",\"endDateTime\":\"\",\"isSearchForced\":0,\"filters\":[],\"pageNo\":1,\"pageSize\":9}"
+    headers = {
+        'Accept': 'application/json, text/javascript, */*; q=0.01',
+        'Accept-Encoding': 'gzip, deflate, br',
+        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
+        'Connection': 'keep-alive',
+        'Content-Length': '291',
+        'Content-Type': 'application/json',
+        'Cookie': 'SESSION=MGY2NWQ3NjctZTNhZC00OTJhLWIzNGQtMDI1MmQ5MWVlZmNm; _trs_uv=lp15qktj_367_a56u; _trs_ua_s_1=lp15qktj_367_lac; yfx_c_g_u_id_10000921=_ck23111620182819813554574558557; yfx_f_l_v_t_10000921=f_t_1700137108976__r_t_1700137108976__v_t_1700137108976__r_c_0; arialoadData=false',
+        'Host': 'www.guizhou.gov.cn',
+        'Origin': 'https://www.guizhou.gov.cn',
+        'Referer': 'https://www.guizhou.gov.cn/so/search.shtml?tenantId=186&tenantIds=&configTenantId=&searchWord=REITs&dataTypeId=965&sign=6bd8592c-2e19-4f22-ae6d-f129f729e795',
+        'Sec-Fetch-Dest': 'empty',
+        'Sec-Fetch-Mode': 'cors',
+        'Sec-Fetch-Site': 'same-origin',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
+        'X-Requested-With': 'XMLHttpRequest',
+        'sec-ch-ua': '"Microsoft Edge";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
+        'sec-ch-ua-mobile': '?0',
+        'sec-ch-ua-platform': '"Windows"'
+    }
+    jsonData = policy.requestPost(headers, url, payload)
+    result_list = jsonData['data']['middle']["list"]
+    for datainfo in result_list:
+        title = datainfo['title']
+        publishData = datainfo['time']
+        source = datainfo['source']
+        summary = datainfo['content']
+        newsUrl = datainfo['url']
+        soup = policy.getrequest_soup(headers, newsUrl)
+        # print(soup)
+        pub_hao = soup.find('head').find('title')
+        print(pub_hao)
+        pass
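guizhou() posts the search body as a hand-escaped JSON string with a fixed Content-Length header, which silently breaks if the payload ever changes length. A sketch (an assumption, not the committed code) of the same request using requests' json= parameter, which serializes the dict and sets both Content-Type and Content-Length automatically; the ['data']['middle']['list'] response shape is taken from the code above:

    import requests

    body = {
        "tenantId": "186",
        "searchWord": "REITs",
        "historySearchWords": ["REITs"],
        "dataTypeId": "965",
        "orderBy": "related",
        "searchBy": "all",
        "granularity": "ALL",
        "isSearchForced": 0,
        "filters": [],
        "pageNo": 1,
        "pageSize": 9,
    }
    resp = requests.post("https://www.guizhou.gov.cn/irs/front/search", json=body, timeout=30)
    print(resp.json()['data']['middle']['list'][:1])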
 if __name__ == "__main__":
-    file_path = f'data/REITs国家改革发展委员会.xlsx'
+    file_path = f'data/REITs深圳交易所.xlsx'
     wb = policy.createfile(file_path)
     # reform(wb,file_path)
+    # shenzhen()
     # zhengquanqihuo(wb,file_path)
-    sse(wb,file_path)
+    # sse(wb,file_path)
+    # hebei()
+    guizhou()
     # zhengquanqihuo()
\ No newline at end of file
comData/noticeReport/公告补采3.py

 """
...
@@ -291,18 +291,18 @@ def run_threads(num_threads,esMethod):
         thread.join()
 
 if __name__ == '__main__':
-    # while True:
+    for i in range(0, 5):
         esMethod = EsMethod()
         p = 0
-        # result = esMethod.queryatt(index_name=esMethod.index_name, pnum=p)
-        # total = result['hits']['total']['value']
-        # if total == 0:
-        #     log.info('++++已没有数据+++++')
-        #     break
+        result = esMethod.queryatt(index_name=esMethod.index_name, pnum=p)
+        total = result['hits']['total']['value']
+        if total == 0:
+            log.info('++++已没有数据+++++')
+            break
         start = time.time()
-        num_threads = 8
+        num_threads = 10
         run_threads(num_threads, esMethod)
-        log.info(f'8线程 每个处理200条数据 总耗时{time.time()-start}秒')
+        log.info(f'10线程 每个处理200条数据 总耗时{time.time()-start}秒')
...
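The log string hardcodes the thread count ('8', now '10') separately from num_threads, so the two can drift apart on the next change. A small sketch of deriving the message from the variable instead:

    import time

    num_threads = 10
    start = time.time()
    # run_threads(num_threads, esMethod)  # as in the script
    # Interpolating num_threads keeps the log accurate when the count changes again
    print(f'{num_threads}线程 每个处理200条数据 总耗时{time.time() - start}秒')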