Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
王景浩
zzsn_spider
Commits
3bed36dc
提交
3bed36dc
authored
11月 10, 2023
作者:
薛凌堃
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
reits专题数据
上级
3154b028
隐藏空白字符变更
内嵌
并排
正在显示
2 个修改的文件
包含
1014 行增加
和
0 行删除
+1014
-0
BaseCore.py
REITs专题数据/BaseCore.py
+556
-0
reits.py
REITs专题数据/reits.py
+458
-0
没有找到文件。
REITs专题数据/BaseCore.py
0 → 100644
浏览文件 @
3bed36dc
# 核心工具包
# 核心工具包
import
os
import
random
import
socket
import
sys
import
time
import
fitz
import
logbook
import
logbook.more
import
pandas
as
pd
import
requests
import
zhconv
import
redis
import
langid
#创建连接池
import
pymysql
from
DBUtils.PooledDB
import
PooledDB
# import sys
# sys.path.append('D://zzsn_spider//base//fdfs_client')
from
obs
import
ObsClient
import
fitz
from
urllib.parse
import
unquote
# Shared Huawei Cloud OBS client used by BaseCore.uptoOBS.
#
# SECURITY NOTE(review): the access key / secret key used to be available only
# as literals committed to the repository.  They can now be supplied through
# the OBS_ACCESS_KEY_ID / OBS_SECRET_ACCESS_KEY / OBS_SERVER environment
# variables; the previous literals remain as fallbacks purely so existing
# deployments keep working.  The committed key pair should be rotated and the
# fallbacks removed.
obsClient = ObsClient(
    # Huawei Cloud access key (AK)
    access_key_id=os.environ.get('OBS_ACCESS_KEY_ID', 'VEHN7D0TJ9316H8AHCAV'),
    # Huawei Cloud secret key (SK)
    secret_access_key=os.environ.get(
        'OBS_SECRET_ACCESS_KEY', 'heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY'
    ),
    # Endpoint of the OBS region that hosts the bucket
    server=os.environ.get('OBS_SERVER', 'https://obs.cn-north-1.myhuaweicloud.com'),
)
# 注意 程序退出前 调用BaseCore.close() 关闭相关资源
class BaseCore:
    """Shared helper for the crawlers: DB/Redis connections, logging, misc utilities.

    Constructing an instance opens two MySQL connections, a Redis client and a
    pooled MySQL connection factory.  Call ``close()`` before the program exits
    to release them.
    """

    # Rolling sequence counter used by getNextSeq / getNextXydm.
    __seq = 0
    # Connection / cursor for the "caiji" database.
    cnx = None
    cursor = None
    # Connection / cursor for the "clb_project" database.
    cnx_ = None
    cursor_ = None
    # Redis client.
    r = None

    # Desktop user-agent pool.
    __USER_AGENT_LIST = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.29 Safari/525.13',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/531.4 (KHTML, like Gecko) Chrome/3.0.194.0 Safari/531.4',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.11 Safari/534.16',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.50 Safari/525.19',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.7 Safari/532.0',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; Lunascape 5.0 alpha2)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.7 Safari/532.2',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; ru-RU) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.11 Safari/534.16',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.10 Safari/532.0',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; Maxthon;',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.169.0 Safari/530.1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP; rv:1.7) Gecko/20040614 Firefox/0.9',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.810.0 Safari/535.1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.0 Safari/532.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.6 (KHTML, like Gecko) Chrome/7.0.500.0 Safari/534.6',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; TencentTraveler)',
        'Mozilla/5.0 (Windows NT 6.0; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.4 (KHTML, like Gecko) Chrome/6.0.481.0 Safari/534.4',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.370.0 Safari/533.4',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.7.5) Gecko/20041107 Firefox/1.0',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.4.154.31 Safari/525.19',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.1.17) Gecko/20110123 (like Firefox/3.x) SeaMonkey/2.0.12',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB) AppleWebKit/534.1 (KHTML, like Gecko) Chrome/6.0.428.0 Safari/534.1',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; de-DE) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.540.0 Safari/534.10',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; de-DE) Chrome/4.0.223.3 Safari/532.2',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/12.0.702.0 Safari/534.24',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.42 Safari/525.19',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.3 (KHTML, like Gecko) Chrome/4.0.227.0 Safari/532.3',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.8 (KHTML, like Gecko) Chrome/16.0.912.63 Safari/535.8',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.460.0 Safari/534.3',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.463.0 Safari/534.3',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.9 (KHTML, like Gecko) Chrome/2.0.157.0 Safari/528.9',
        'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.794.0 Safari/535.1',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.694.0 Safari/534.24',
        'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5',
        'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20120427 Firefox/15.0a1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.7.5) Gecko/20041107 Firefox/1.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Maxthon; .NET CLR 1.1.4322)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.4 Safari/532.2',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.65 Safari/535.11',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.21 (KHTML, like Gecko) Chrome/11.0.682.0 Safari/534.21',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.0 (KHTML, like Gecko) Chrome/2.0.182.0 Safari/531.0',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.9 (KHTML, like Gecko) Chrome/7.0.531.0 Safari/534.9',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; WOW64; Trident/6.0)',
        'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.811.0 Safari/535.1',
        # Was 'ozilla/5.0 ...' (leading "M" missing) in the original list.
        'Mozilla/5.0 (Windows; U; Windows NT 5.0; de-DE; rv:1.7.5) Gecko/20041108 Firefox/1.0',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.127 Safari/533.4',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E) QQBrowser/6.9.11079.201',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10',
        'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; zh-cn) Opera 8.50',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/7.0.0 Safari/700.13',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.4 Safari/532.0',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.53 Safari/525.19',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.1 Safari/532.0',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.5) Gecko/20041107 Firefox/0.9.2 StumbleUpon/1.994',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.7.5) Gecko/20041110 Firefox/1.0',
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; en) Opera 8.0',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b4pre) Gecko/20100815 Minefield/4.0b4pre',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.6 Safari/530.5',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.0.3705)',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.21 Safari/532.0',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.792.0 Safari/535.1',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.168.0 Safari/530.1',
        'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3) Gecko/20040913 Firefox/0.10',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.8 (KHTML, like Gecko) Chrome/2.0.177.1 Safari/530.8',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.40 Safari/530.5',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.24 Safari/532.0',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.10 (KHTML, like Gecko) Chrome/2.0.157.2 Safari/528.10',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.2 Safari/532.2',
        'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.75 Safari/535.7',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; T312461)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.461.0 Safari/534.3',
        'Mozilla/5.0 (Windows; U; Windows NT 5.0; rv:1.7.3) Gecko/20041001 Firefox/0.10.1',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)',
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2; de-DE) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.2 Safari/532.0',
        'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0) Gecko/16.0 Firefox/16.0',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/531.3 (KHTML, like Gecko) Chrome/3.0.193.2 Safari/531.3',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; .NET CLR 1',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.864.0 Safari/535.2',
        'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.813.0 Safari/535.1',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0',
        'Mozilla/5.0 (Windows NT 5.1; rv:2.1.1) Gecko/20110415 Firefox/4.0.2pre Fennec/4.0.1',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.801.0 Safari/535.1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.212.0 Safari/532.0',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.697.0 Safari/534.24',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.548.0 Safari/534.10',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.17 (KHTML, like Gecko) Chrome/11.0.652.0 Safari/534.17',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.224 Safari/534.10 ChromePlus/1.5.2.0',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.0 Safari/532.1',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.7 Safari/532.0',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.2 (KHTML, like Gecko) Chrome/5.0.342.2 Safari/533.2',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.4 Safari/532.1',
        'Mozilla/5.0 (Windows NT 6.0; rv:2.1.1) Gecko/20110415 Firefox/4.0.2pre Fennec/4.0.1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.2.153.0 Safari/525.19',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; sv-SE; rv:1.7.5) Gecko/20041108 Firefox/1.0',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.462.0 Safari/534.3',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; de-DE; rv:1.7.5) Gecko/20041122 Firefox/1.0',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; uZardWeb/1.0; Server_JP)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; HCI0449; .NET CLR 1.0.3705)',
        'Mozilla/4.0 (compatible; MSIE 5.0; Windows 98; DigExt); Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1);',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.23 Safari/530.5',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.208.0 Safari/532.0',
        'Mozilla/5.0 (Windows NT 6.0; rv:14.0) Gecko/20100101 Firefox/14.0.1',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.7 (KHTML, like Gecko) Chrome/2.0.176.0 Safari/530.7',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.21 (KHTML, like Gecko) Chrome/11.0.678.0 Safari/534.21',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.21 Safari/532.0',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; InfoPath.1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.55 Safari/525.19',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0a1) Gecko/20110623 Firefox/7.0a1 Fennec/7.0a1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.724.100 Safari/534.30',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.33 Safari/534.3 SE 2.X MetaSr 1.0',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; WOW64; SV1; uZardWeb/1.0; Server_HK)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
        'Mozilla/5.0 (Windows NT 6.0) yi; AppleWebKit/345667.12221 (KHTML, like Gecko) Chrome/23.0.1271.26 Safari/453667.1221',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.2 (KHTML, like Gecko) Chrome/3.0.191.3 Safari/531.2',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.39 Safari/530.5',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.1 Safari/532.0',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.38 Safari/532.0',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.27 Safari/532.0',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8b) Gecko/20050118 Firefox/1.0+',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP; rv:1.7) Gecko/20040707 Firefox/0.9.2',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.0 Safari/532.0',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.4 (KHTML, like Gecko) Chrome/2.0.171.0 Safari/530.4',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648)',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; nl-NL; rv:1.7.5) Gecko/20041202 Firefox/1.0',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.204.0 Safari/532.0',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.6 Safari/532.2',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/528.8 (KHTML, like Gecko) Chrome/1.0.156.0 Safari/528.8',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/6.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 1.0.3705; .NET CLR 2.0.50727; .NET CLR 1.1.4322)',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.517.43 Safari/534.7',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.15 Safari/534.13',
        'Mozilla/5.0 (ipad Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.6 (KHTML, like Gecko) Chrome/7.0.498.0 Safari/534.6',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.43 Safari/530.5',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.208.0 Safari/532.0',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.19 (KHTML, like Gecko) Chrome/11.0.661.0 Safari/534.19',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-CA) AppleWebKit/534.13 (KHTML like Gecko) Chrome/9.0.597.98 Safari/534.13',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.2 Safari/532.0',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.201.1 Safari/532.0',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.201.1 Safari/532.0',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.213.1 Safari/532.1',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.6 (KHTML, like Gecko) Chrome/2.0.174.0 Safari/530.6',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.3.154.6 Safari/525.19',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.599.0 Safari/534.13',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.8 (KHTML, like Gecko) Chrome/7.0.521.0 Safari/534.8',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1b2pre) Gecko/20081015 Fennec/1.0a1',
        'Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5',
    ]

    # Android user-agent pool.
    __USER_PHONE_AGENT_LIST = [
        'Mozilla/5.0 (Linux; Android 7.1.1; OPPO R9sk) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.111 Mobile Safari/537.36',
    ]

    def __init__(self):
        """Open the MySQL connections, the Redis client and the MySQL pool.

        SECURITY NOTE(review): the passwords below were previously only
        available as literals in source.  Each may now be overridden through
        an environment variable; the literals are kept as fallbacks so current
        deployments behave exactly as before.  They should be rotated and
        removed from the repository.
        """
        caiji_host = '114.115.159.144'
        caiji_user = 'caiji'
        caiji_password = os.environ.get('ZZSN_CAIJI_DB_PASSWORD', 'zzsn9988')

        self.cnx = pymysql.connect(host=caiji_host, user=caiji_user, password=caiji_password,
                                   db='caiji', charset='utf8mb4')
        self.cursor = self.cnx.cursor()

        # The "11" database (clb_project).
        self.cnx_ = pymysql.connect(host='114.116.44.11', user='caiji',
                                    password=os.environ.get('ZZSN_CLB_DB_PASSWORD', 'f7s0&7qqtK'),
                                    db='clb_project', charset='utf8mb4')
        self.cursor_ = self.cnx_.cursor()

        # Redis connection.
        self.r = redis.Redis(host="114.115.236.206", port=6379,
                             password=os.environ.get('ZZSN_REDIS_PASSWORD', 'clbzzsn'), db=6)

        # Connection pool against the same "caiji" database as self.cnx.
        self.pool_caiji = PooledDB(
            creator=pymysql,
            maxconnections=5,
            mincached=2,
            maxcached=5,
            blocking=True,
            host=caiji_host,
            port=3306,
            user=caiji_user,
            password=caiji_password,
            database='caiji',
            charset='utf8mb4'
        )

    def close(self):
        """Release every resource opened by ``__init__`` (best effort).

        The original implementation closed only ``cursor``/``cnx`` and stopped
        at the first failure; the second MySQL connection, the Redis client and
        the pool were never released.  Each resource is now closed
        independently so one failure does not prevent the others from closing.
        """
        resources = (
            self.cursor,
            self.cnx,
            self.cursor_,
            self.cnx_,
            self.r,
            getattr(self, 'pool_caiji', None),
        )
        for resource in resources:
            if resource is None:
                continue
            try:
                resource.close()
            except Exception:
                # Shutdown path: a resource that is already closed or broken
                # must not stop the remaining ones from being released.
                pass

    def getTimeCost(self, start, end):
        """Format the elapsed time between two ``time.time()``-style stamps.

        Returns hours/minutes/seconds text, dropping leading zero units, or a
        millisecond figure when less than one whole second elapsed.
        """
        seconds = int(end - start)
        m, s = divmod(seconds, 60)
        h, m = divmod(m, 60)
        if h > 0:
            return "%d小时%d分钟%d秒" % (h, m, s)
        if m > 0:
            return "%d分钟%d秒" % (m, s)
        if seconds > 0:
            return "%d秒" % s
        return "%d毫秒" % int((end - start) * 1000)

    def getNowTime(self, type):
        """Return the current local time in one of three representations.

        ``type`` selects the format:
          1 -> ``"%Y-%m-%d %H:%M:%S"`` string, e.g. ``2001-01-01 12:00:00``
          2 -> ``"%y%m%d%H%M%S"`` string, e.g. ``010101120000``
          3 -> integer Unix timestamp in **milliseconds**
        Any other value yields an empty string.
        """
        if type == 1:
            return time.strftime("%Y-%m-%d %H:%M:%S")
        if type == 2:
            return time.strftime("%y%m%d%H%M%S")
        if type == 3:
            return int(time.time() * 1000)
        return ""

    def _next_seq(self):
        """Advance the rolling counter and return it as a 3-digit string.

        The counter cycles 1..999, 0, 1, ...  The original code wrapped only
        after exceeding 1000, so the value 1000 produced a 4-digit suffix.
        """
        self.__seq = self.__seq + 1 if self.__seq < 999 else 0
        return str(self.__seq).zfill(3)

    def getNextSeq(self):
        """Return a serial number: ``yymmddHHMMSS`` followed by a 3-digit counter."""
        suffix = self._next_seq()
        return self.getNowTime(2) + suffix

    def getNextXydm(self):
        """Return a synthetic credit code: ``"ZZSN"`` + ``yymmddHHMMSS`` + 3-digit counter."""
        suffix = self._next_seq()
        return "ZZSN" + self.getNowTime(2) + suffix

    def logFormate(self, record, handler):
        """logbook formatter: ``[time] [level] [file] [function] [line] message``."""
        formate = "[{date}] [{level}] [{filename}] [{func_name}] [{lineno}] {msg}".format(
            date=record.time,  # log time
            level=record.level_name,  # log level
            filename=os.path.split(record.filename)[-1],  # file name only
            func_name=record.func_name,  # function name
            lineno=record.lineno,  # line number
            msg=record.message  # log message
        )
        return formate

    def getLogger(self, fileLogFlag=True, stdOutFlag=True):
        """Build a logbook logger named after the running script.

        Log files are written to a ``logs`` directory next to ``sys.argv[0]``.

        :param fileLogFlag: attach a daily-rotating file handler.
        :param stdOutFlag: attach a colourised stderr handler.
        :return: the configured ``logbook.Logger``.
        """
        dirname, filename = os.path.split(os.path.abspath(sys.argv[0]))
        dirname = os.path.join(dirname, "logs")
        filename = filename.replace(".py", "") + ".log"
        # makedirs(exist_ok=True) avoids the check-then-create race of the
        # original exists()/mkdir() pair.
        os.makedirs(dirname, exist_ok=True)
        logbook.set_datetime_format('local')
        logger = logbook.Logger(filename)
        logger.handlers = []
        if fileLogFlag:
            # Log to file.
            logFile = logbook.TimedRotatingFileHandler(os.path.join(dirname, filename),
                                                       date_format='%Y-%m-%d', bubble=True,
                                                       encoding='utf-8')
            logFile.formatter = self.logFormate
            logger.handlers.append(logFile)
        if stdOutFlag:
            # Log to the console.
            logStd = logbook.more.ColorizedStderrHandler(bubble=True)
            logStd.formatter = self.logFormate
            logger.handlers.append(logStd)
        return logger

    def getRandomUserAgent(self):
        """Return a random desktop user-agent string from the built-in pool."""
        return random.choice(self.__USER_AGENT_LIST)

    def get_proxy(self):
        """Return one random proxy mapping built from the ``clb_proxy`` table.

        Each row is expected to hold a ``host-port`` string in its first
        column; rows without a ``-`` separator are skipped.

        :return: ``{"HTTP": url, "HTTPS": url}`` for one randomly chosen row.
        :raises IndexError: when the table yields no usable proxy.

        NOTE(review): the keys are upper-case; ``requests`` looks proxies up by
        lower-case scheme, so confirm callers really get proxied traffic.  The
        keys are left untouched because callers may index them directly.
        """
        self.cursor.execute("select proxy from clb_proxy")
        proxy_list = []
        for row in self.cursor.fetchall():
            parts = str(row[0]).split('-')
            if len(parts) < 2:
                continue
            proxyMeta = "http://%(host)s:%(port)s" % {
                "host": parts[0],
                "port": parts[1],
            }
            proxy_list.append({"HTTP": proxyMeta, "HTTPS": proxyMeta})
        # The original picked index randint(0, 3): it crashed with fewer than
        # four proxies and never used any beyond the fourth.
        return random.choice(proxy_list)

    def getSubStr(self, str, beginStr, endStr):
        """Cut ``str`` from the last ``beginStr`` up to the last ``endStr``.

        The result starts at the last occurrence of ``beginStr`` (or at the
        start when it is empty or absent) and ends one character past the
        start of the last ``endStr`` found in the remainder (unchanged when
        ``endStr`` is empty or absent).
        """
        if beginStr != '':
            begin = str.rfind(beginStr)
            if begin == -1:
                begin = 0
            str = str[begin:]
        if endStr != '':
            end = str.rfind(endStr)
            if end != -1:
                str = str[0:end + 1]
        return str

    def hant_2_hans(self, hant_str: str):
        """Convert Traditional Chinese text to Simplified Chinese."""
        return zhconv.convert(hant_str, 'zh-hans')

    def str_have_num(self, str_num):
        """Return True when ``str_num`` contains at least one digit character."""
        return any(ch.isdigit() for ch in str_num)

    def detect_language(self, text):
        """Return the language code ``langid`` assigns to ``text`` (``'cn'`` if empty)."""
        result = langid.classify(text)
        if not result or not result[0]:
            return 'cn'
        return result[0]

    def writerToExcel(self, detailList, filename):
        """Append ``detailList`` rows to the existing xlsx file ``filename``.

        The file must already exist; it is read with every column as ``str``,
        extended, and written back without the index column.
        """
        existing_data = pd.read_excel(filename, engine='openpyxl', dtype=str)
        new_data = pd.DataFrame(data=detailList)
        # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
        combined_data = pd.concat([existing_data, new_data], ignore_index=True)
        combined_data.to_excel(filename, index=False)

    def secrchATT(self, item_id, file_name, type_id, order_by):
        """Look up an attachment row id in ``clb_sys_attachment``.

        :return: the first matching row as returned by ``fetchone()`` or ``None``.
        """
        sel_sql = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s and order_by=%s '''
        self.cursor_.execute(sel_sql, (item_id, file_name, type_id, order_by))
        return self.cursor_.fetchone()

    def tableUpdate(self, retData, com_name, file_name, num, pub_time):
        """Insert an attachment row and return ``(attachment_id, full_path)``.

        :param retData: upload result dict as produced by ``uptoOBS``.
        :param com_name: unused; kept for interface compatibility.
        :param file_name: stored as the attachment name.
        :param num: stored as ``order_by``.
        :param pub_time: stored as ``publish_time``.
        """
        item_id = retData['item_id']
        type_id = retData['type_id']
        full_path = retData['full_path']
        order_by = num

        Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,object_key,bucket_name,publish_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
        values = (
            file_name, type_id, item_id, retData['group_name'], retData['path'], full_path,
            retData['category'], retData['file_size'], order_by, retData['status'],
            retData['create_by'], retData['create_time'],
            retData['path'],  # object_key reuses the stored path
            'zzsn',  # bucket_name
            pub_time,
        )

        self.cursor_.execute(Upsql, values)  # insert
        self.cnx_.commit()  # commit
        self.getLogger().info("更新完成:{}".format(Upsql))

        # Read the id back by the same key used for the insert.
        selects = self.secrchATT(item_id, file_name, type_id, order_by)
        att_id = selects[0]
        return att_id, full_path

    def convert_size(self, size_bytes):
        """Render a byte count with two decimals in bytes/KB/MB/GB/TB."""
        units = ['bytes', 'KB', 'MB', 'GB', 'TB']
        i = 0
        while size_bytes >= 1024 and i < len(units) - 1:
            size_bytes /= 1024
            i += 1
        return f"{size_bytes:.2f} {units[i]}"

    def _fetch_attachment(self, file_href, headers, attempts=3, delay=3):
        """Download ``file_href`` with retries; return the response or ``None``."""
        for _ in range(attempts):
            try:
                # verify=False is kept from the original behaviour.
                return requests.get(file_href, headers=headers, verify=False, timeout=20)
            except requests.RequestException:
                time.sleep(delay)
        return None

    def _store_in_obs(self, object_key, content, attempts=3, delay=3):
        """Upload ``content`` to the ``zzsn`` bucket with retries; result or ``None``."""
        for _ in range(attempts):
            try:
                return obsClient.putContent('zzsn', object_key, content=content)
            except Exception:
                time.sleep(delay)
        return None

    def uptoOBS(self, file_href, item_id, file_name):
        """Download a file and upload it to OBS under ``PolicyDocuments/``.

        :param file_href: URL of the file to fetch.
        :param item_id: stored in the result as ``item_id``.
        :param file_name: object name; the URL's extension is appended when it
            is not already part of the name.
        :return: result dict; ``state`` is True only when both the download and
            the upload succeeded and the object URL could be read back.

        Fixes relative to the original:
          * a leftover ``page_size = 0`` / ``if page_size < 1`` gate made the
            success branch unreachable, so ``state`` was always False;
          * the extension already contains its dot, so ``'.' + category``
            produced ``name..ext``;
          * a missing ``Content-Length`` header no longer counts as a failed
            download (the body length is used instead);
          * ``state`` is set only after the result has been parsed.
        """
        category = os.path.splitext(file_href)[1]
        retData = {'state': False, 'type_id': 7, 'item_id': item_id, 'group_name': '', 'path': '',
                   'full_path': '', 'category': category, 'file_size': '', 'status': 1,
                   'create_by': 'XueLingKun', 'create_time': '', 'page_size': '', 'content': ''}
        headers = {'User-Agent': self.getRandomUserAgent()}

        response = self._fetch_attachment(file_href, headers)
        if response is None:
            return retData

        try:
            file_size = int(response.headers.get('Content-Length'))
        except (TypeError, ValueError):
            file_size = len(response.content)

        if category not in file_name:
            file_name = file_name + category

        result = self._store_in_obs('PolicyDocuments/' + file_name, response.content)
        if result is None:
            return retData

        try:
            object_url = result['body']['objectUrl']
            path = object_url.split('.com')[1]
            full_path = unquote(object_url)
        except Exception as e:
            print(f'error:{e}')
            return retData

        retData['path'] = path
        retData['full_path'] = full_path
        retData['file_size'] = self.convert_size(file_size)
        retData['create_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        retData['state'] = True
        return retData
REITs专题数据/reits.py
0 → 100644
浏览文件 @
3bed36dc
import
os
import
os
import
requests
from
bs4
import
BeautifulSoup
from
datetime
import
datetime
import
time
from
selenium
import
webdriver
from
selenium.webdriver.chrome.service
import
Service
from
urllib.parse
import
urljoin
import
BaseCore
# Shared BaseCore helper for this script.  Constructing it runs
# BaseCore.__init__, which opens the MySQL and Redis connections.
baseCore = BaseCore.BaseCore()
# Relative "data/" directory; presumably where downloaded files are written
# (its use is not visible in this part of the file) -- TODO confirm.
filepath = "data/"
class Policy():
    """HTTP, HTML-cleanup and attachment helpers shared by the site crawlers."""

    def getrequest_soup(self, headers, url):
        """GET ``url`` and return the response body parsed with ``html.parser``."""
        req = requests.get(headers=headers, url=url)
        return BeautifulSoup(req.content, 'html.parser')

    def getrequest_json(self, headers, url):
        """GET ``url`` and return the decoded JSON body."""
        req = requests.get(headers=headers, url=url)
        return req.json()

    def requestPost(self, headers, url, payload):
        """POST ``payload`` as form data to ``url`` and return the decoded JSON body."""
        req = requests.post(headers=headers, url=url, data=payload)
        return req.json()

    def createDriver(self):
        """Start a Chrome WebDriver using the hard-coded local driver/browser paths.

        The original passed ``chrome_options=``, a keyword that newer Selenium 4
        releases no longer accept; ``options=`` is the supported spelling and
        pairs with the ``service=`` argument already used here.
        """
        chrome_driver = r'D:\cmd100\chromedriver.exe'
        service = Service(chrome_driver)
        chrome_options = webdriver.ChromeOptions()
        chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
        # To route through a proxy:
        # chrome_options.add_argument('--proxy-server=http://' + "127.0.0.1:8080")
        return webdriver.Chrome(service=service, options=chrome_options)

    def deletep(self, soup, i, tag, attribute_to_delete, value_to_delete):
        """Remove the first ``i`` ``tag`` elements whose attribute equals the given value."""
        matches = soup.find_all(tag, {attribute_to_delete: value_to_delete})
        for element in matches[:i]:
            element.decompose()

    def deletek(self, soup):
        """Remove blank tags (e.g. ``<p></p>``, ``<p><br></p>``) from ``soup``.

        A tag is a candidate when its text is empty and it is not itself an
        img/video/br, or when its text is exactly one space.  A candidate is
        kept if any descendant is an img, video or br element.
        """
        media = ["img", "video", "br"]

        def is_blank(tag):
            # `tag.name != "br"` from the original was implied by the
            # membership test and has been dropped.
            return (len(tag.get_text()) == 0 and tag.name not in media) or tag.get_text() == ' '

        for candidate in soup.find_all(is_blank):
            if not any(child.name in media for child in candidate.descendants):
                candidate.decompose()

    def paserUrl(self, html, listurl):
        """Rewrite relative ``href``/``src`` values of ``<a>``/``<img>`` to absolute URLs.

        :param html: markup string or an already parsed soup.
        :param listurl: base URL the relative links are resolved against.
        :return: the (possibly newly parsed) soup with rewritten links.
        """
        if isinstance(html, str):
            html = BeautifulSoup(html, 'html.parser')
        for link in html.find_all(['a', 'img']):
            # A tag carrying both attributes only has its href rewritten.
            if 'href' in link.attrs:
                link['href'] = urljoin(listurl, link['href'])
            elif 'src' in link.attrs:
                link['src'] = urljoin(listurl, link['src'])
        return html

    def attuributefile(self, file_name, file_href, num, publishDate):
        """Upload an attachment to OBS and register it in the attachment table.

        :return: ``(att_id, full_path)`` on success, ``('', '')`` when the
            upload fails, and ``None`` when ``file_href`` does not look like a
            supported attachment.
            NOTE(review): the ``None`` case is kept for compatibility; callers
            that unpack the result should guard against it.
        """
        # Substring markers, matched anywhere in the URL ('xls' deliberately
        # has no leading dot, as in the original condition).
        markers = ('.pdf', '.docx', '.doc', 'xls', '.zip', '.rar', '.ppt',
                   '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')
        if not any(marker in file_href for marker in markers):
            return None

        category = os.path.splitext(file_href)[1]
        if category not in file_name:
            file_name = file_name + category
        retData = baseCore.uptoOBS(file_href, '9999', file_name)
        if not retData['state']:
            return '', ''
        att_id, full_path = baseCore.tableUpdate(retData, 'RETIs文件', file_name, num, publishDate)
        return att_id, full_path

    def downloadfile(self, file_name, file_href, path):
        """Download ``file_href`` and write the raw bytes to ``path``.

        ``file_name`` is unused; it is kept for interface compatibility.
        """
        response = requests.get(file_href)
        with open(path, "wb") as file:
            file.write(response.content)
# Module-level Policy helper instance used by the crawler functions below.
policy = Policy()
#国家发展和改革委员会 https://www.ndrc.gov.cn/xxgk/wjk/index.html?tab=all&qt=
def reform():
    """Crawl the first page of ndrc.gov.cn search results for "REITs" and print each article.

    For every hit the detail page is fetched, the article body is located,
    leading centred ``div`` blocks and blank tags are stripped, and a record
    (title, summary, publish date, source, document number, body) is printed.
    Hits whose body cannot be located are skipped; any other per-article
    failure prints the article URL and moves on.
    """
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Host': 'fwfx.ndrc.gov.cn',
        'Origin': 'https://www.ndrc.gov.cn',
        'Referer': 'https://www.ndrc.gov.cn/',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-site',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"'
    }
    # Headers for the article detail pages; identical for every hit, so built once.
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'Hm_lvt_6c8165462fd93121348afe212168341f=1699338341; yfx_c_g_u_id_10005970=_ck23110714254113251712738304141; http_waf_cookie=05e8486c-c47f-4927291823a10f5e24ceed45b1eaa3eb7354; SF_cookie_3=21321202; Hm_lpvt_6c8165462fd93121348afe212168341f=1699422316; yfx_f_l_v_t_10005970=f_t_1699338341317__r_t_1699412780356__v_t_1699422316031__r_c_1',
        'Host': 'www.ndrc.gov.cn',
        'Referer': 'https://www.ndrc.gov.cn/xxgk/wjk/index.html?tab=all&qt=',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"'
    }
    url = 'https://fwfx.ndrc.gov.cn/api/query?qt=REITs&tab=all&page=1&pageSize=20&siteCode=bm04000fgk&key=CAB549A94CF659904A7D6B0E8FC8A7E9&startDateStr=&endDateStr=&timeOption=0&sort=dateDesc'
    result = policy.getrequest_json(headers, url)
    data_list = result['data']['resultList']
    for info in data_list:
        publishDate = info['docDate']
        title = info['title']
        summary = info['summary']
        newsUrl = info['url']
        newssoup = policy.getrequest_soup(header, newsUrl)
        # ``except Exception`` (not a bare except) so Ctrl-C / SystemExit still stop the crawl.
        try:
            pubHao = ''
            source = ''
            try:
                # Two page layouts exist: body without a title block, or with one.
                contentWithTag = newssoup.select('div[class="article_con article_con_notitle"]')[0]
            except Exception:
                try:
                    contentWithTag = newssoup.select('div[class="article_con article_con_title"]')[0]
                except Exception:
                    # Neither layout matched; nothing to extract for this hit.
                    continue
            try:
                pubHao_ = newssoup.select('div[class="article_con article_con_notitle"]>span')[0].text
                # Only keep the span when it carries the bracket used in document numbers.
                if '〔' in pubHao_:
                    pubHao = pubHao_
            except Exception:
                pass
            policy.deletep(contentWithTag, 3, 'div', 'style', 'text-align: center;')
            policy.deletek(contentWithTag)
            content = contentWithTag.text
            try:
                source = newssoup.select('div[class="ly laiyuantext"]>span')[0].text
            except Exception:
                pass
            dic_info = {
                'title': title,
                'summary': summary,
                'publishDate': publishDate,
                'source': source,
                'pub_hao': pubHao,
                'contentWithTag': contentWithTag,
                'content': content
            }
            print(dic_info)
        except Exception:
            print(newsUrl)
#证券期货 https://neris.csrc.gov.cn/falvfagui/multipleFindController/indexJsp
def zhengquanqihuo():
    """Crawl neris.csrc.gov.cn law search results for "REITs" and print each document.

    The search endpoint is queried once to learn the row count and page size,
    then every result page is requested. For each hit the detail page is
    rendered in a browser obtained from ``policy.createDriver()``, the law
    text container is extracted, and a record is printed.
    """
    import math

    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Length': '140',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie': 'JSESSIONID=D18F2DF64366325AC0A50E09AA98EE84',
        'Host': 'neris.csrc.gov.cn',
        'Origin': 'https://neris.csrc.gov.cn',
        'Referer': 'https://neris.csrc.gov.cn/falvfagui/multipleFindController/indexJsp',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"'
    }
    url = 'https://neris.csrc.gov.cn/falvfagui/multipleFindController/solrSearch'
    payload = {
        'pageNo': '1',
        'secFutrsLawName': '',
        'body': '"REITs"',
        'lawPubOrgName': '',
        'titleQry': '',
        'keyQry': 'REITs、',
        'fileno': '',
        'pubDate_from': '',
        'pubDate_thru': '',
        'nbr': '1',
        'isLike': '0'
    }
    result = policy.requestPost(headers, url, payload)
    pageUtil = result['pageUtil']
    total = pageUtil['rowCount']
    page_size = pageUtil['pageSize']
    # Round up so a final, partially filled page is still fetched.
    Max_page = math.ceil(total / page_size)
    for page in range(0, Max_page):
        # Same query as the probe request; only the page number changes.
        payload_page = {**payload, 'pageNo': page + 1}
        data_page = policy.requestPost(headers, url, payload_page)
        info_list = data_page['pageUtil']['pageList']
        for info in info_list:
            title = info['secFutrsLawName']
            pubHao = info['fileno']
            source = info['lawPubOrgName']
            # The version field is a compact date (YYYYMMDD); reformat with dashes.
            publish_ = datetime.strptime(info['secFutrsLawVersion'], "%Y%m%d")
            publishDate = datetime.strftime(publish_, "%Y-%m-%d")
            secFutrsLawId = info['secFutrsLawId']
            newsUrl = f'https://neris.csrc.gov.cn/falvfagui/rdqsHeader/mainbody?navbarId=3&secFutrsLawId={secFutrsLawId}&body=REITs'
            browser = policy.createDriver()
            try:
                browser.get(newsUrl)
                # Brief pause before reading the rendered page.
                time.sleep(1)
                page_source = browser.page_source
            finally:
                # A driver is created per document; always release it.
                browser.quit()
            newssoup = BeautifulSoup(page_source, 'html.parser')
            contentWithTag = newssoup.find('div', class_='law_text mainBody catalog')
            content = contentWithTag.text
            print(content)
            dic_info = {
                'title': title,
                'publishDate': publishDate,
                'source': source,
                'pub_hao': pubHao,
                'contentWithTag': contentWithTag,
                'content': content
            }
            print(dic_info)
#深圳交易所 http://www.szse.cn/lawrules/index.html
#上海交易所 http://www.sse.com.cn/home/search/index.shtml?webswd=REITs
def sse():
    """Crawl sse.com.cn search results for "REITs" and upload their attachments.

    The search API is queried once (page 0) to learn the page count, then each
    result page is requested. For the first hit of every page the detail page
    is fetched, links are made absolute, the document number is extracted and
    every linked attachment is passed to ``policy.attuributefile``.
    """
    # One template for both the probe request and the per-page requests.
    url_template = (
        'http://query.sse.com.cn/search/getESSearchDoc.do?page={page}&limit=10&publishTimeEnd=&publishTimeStart='
        '&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse'
        '&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode='
        '8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653'
        '%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C8685%2C9348%2C12632'
        '%2C12768%2C12769%2C12770%2C12771%2C12772%2C12773%2C12774%2C12775%2C12776%2C12777%2C12778%2C12779'
        '%2C12780%2C12781%2C12782%2C12783%2C12784%2C12785%2C12786%2C12787%2C12788%2C12789%2C12790%2C12791'
        '%2C12792%2C12793%2C12794%2C12795%2C12796%2C12797%2C12798%2C12799%2C12800%2C12801%2C12802%2C12803'
        '%2C12804%2C12805%2C12806%2C12807%2C12808%2C12809%2C12810%2C12811%2C12812%2C13061'
        '%2C13282%2C13283%2C13284%2C13285%2C13286%2C13287%2C13288%2C13289%2C13294'
        '%2C13364%2C13365%2C13366%2C13367'
        '%2C14595%2C14596%2C14597%2C14598%2C14599%2C14600%2C14601%2C14602%2C14603%2C14604%2C14605%2C14606'
        '&trackId=50619067167713018335655119683810&_=1699508921761'
    )
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Cookie': 'ba17301551dcbaf9_gdp_user_key=; ba17301551dcbaf9_gdp_session_id=878c2669-93f0-43bd-91c1-cc30ca7136ef; gdp_user_id=gioenc-9a36dgc8%2C6b5d%2C5265%2Ccdc5%2C2ea193d9g222; ba17301551dcbaf9_gdp_session_id_878c2669-93f0-43bd-91c1-cc30ca7136ef=true; ba17301551dcbaf9_gdp_sequence_ids={%22globalKey%22:28%2C%22VISIT%22:2%2C%22PAGE%22:2%2C%22CUSTOM%22:17%2C%22VIEW_CLICK%22:10}',
        'Host': 'query.sse.com.cn',
        'Referer': 'http://www.sse.com.cn/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
    }
    # Headers for the article detail pages; identical for every hit, so built once.
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'ba17301551dcbaf9_gdp_user_key=; ba17301551dcbaf9_gdp_session_id=878c2669-93f0-43bd-91c1-cc30ca7136ef; gdp_user_id=gioenc-9a36dgc8%2C6b5d%2C5265%2Ccdc5%2C2ea193d9g222; ba17301551dcbaf9_gdp_session_id_878c2669-93f0-43bd-91c1-cc30ca7136ef=true; VISITED_MENU=%5B%228307%22%5D; seecookie=REITs; home-search-scroll=; ba17301551dcbaf9_gdp_sequence_ids={%22globalKey%22:33%2C%22VISIT%22:2%2C%22PAGE%22:3%2C%22CUSTOM%22:18%2C%22VIEW_CLICK%22:13}',
        'Host': 'www.sse.com.cn',
        'Referer': 'http://www.sse.com.cn/home/search/index.shtml?webswd=REITs',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
    }
    result = policy.getrequest_json(headers, url_template.format(page=0))
    total_page = result['data']['totalPage']
    for page in range(0, int(total_page)):
        data = policy.getrequest_json(headers, url_template.format(page=page))
        newslist = data['data']['knowledgeList']
        # Only the first hit of each result page is processed.
        for news in newslist[:1]:
            title = news['title']
            publishDate = news['createTime']
            newsUrl = 'http://www.sse.com.cn' + news['extend'][4]['value']
            summary = news['rtfContent']
            source = news['spaceName']
            newssoup = policy.getrequest_soup(header, newsUrl)
            content_ = newssoup.find('div', class_='allZoom')
            # Rewrite relative links to absolute URLs.
            contentWithTag = policy.paserUrl(content_, newsUrl)
            # A page without a centred paragraph simply has no document number.
            centered = contentWithTag.find('p', style='text-align: center;')
            pubHao = centered.text.strip(' ') if centered is not None else ''
            if '〔' not in pubHao:
                pubHao = ''
            content = contentWithTag.text
            fujian_list = contentWithTag.find_all('a')
            id_list = []
            # The ordinal starts once per article so attachments are numbered 1, 2, 3, ...
            num = 1
            for fujian in fujian_list:
                try:
                    file_href = fujian['href']
                    file_name = fujian.text.strip(' ')
                    # Upload the attachment to the file server and register it.
                    att_id, full_path = policy.attuributefile(file_name, file_href, num, publishDate)
                except Exception:
                    # A link that cannot be processed is skipped.
                    continue
                num += 1
                if att_id and full_path:
                    id_list.append(att_id)
            # Assembled once all attachments are known; not yet persisted or returned.
            dic_info = {
                'attachmentIds': id_list,
                'title': title,
                'summary': summary,
                'publishDate': publishDate,
                'source': source,
                'pub_hao': pubHao,
                'contentWithTag': contentWithTag,
                'content': content
            }
#北京市人民政府 https://www.beijing.gov.cn/so/s?siteCode=1100000088&tab=zcfg&qt=REITs
def beijing():
    """Query the beijing.gov.cn search endpoint for "REITs" policy documents.

    Posts the search once to read the hit counts, then posts once per result
    page and walks every returned document.

    NOTE(review): the indentation of this function was not recoverable from
    the source view; the nesting below is a best-effort reconstruction and
    should be confirmed against the repository. The function also looks
    unfinished (see the inline review notes).
    """
    url = 'https://www.beijing.gov.cn/so/ss/query/s'
    payload = {
        'siteCode': '1100000088',
        'tab': 'zcfg',
        'qt': 'REITs',
        'sort': 'relevance',
        'keyPlace': '0',
        'locationCode': '110000000000',
        'page': '1',
        'pageSize': '20',
        'ie': '89b5e964-dc3c-4a5b-8d80-f6c408769b4a'
    }
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Length': '148',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie': 'Path=/; Path=/; __jsluid_s=91bdb0d83098fd2e8a8455a9085a22e2; JSESSIONID=M2FmNDczYzYtMmNkYS00N2I0LThhNDgtYWJiMTdhOTIyZDI4; _va_ref=%5B%22%22%2C%22%22%2C1699515166%2C%22https%3A%2F%2Fdocs.qq.com%2F%22%5D; _va_ses=*; JSESSIONID=CD61DA650DB33324962A3BF2527672D0; arialoadData=false; _va_id=c7a63e4b2199befd.1699358536.2.1699515273.1699515166.; CPS_SESSION=2FEFDC54444B24762D057AD6BDE3C7BF',
        'Host': 'www.beijing.gov.cn',
        'Origin': 'https://www.beijing.gov.cn',
        'Referer': 'https://www.beijing.gov.cn/so/s?siteCode=1100000088&tab=zcfg&qt=REITs',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"'
    }
    result = policy.requestPost(headers, url, payload)
    total = result['totalHits']
    page_size = result['currentHits']
    # NOTE(review): int() truncates, so a final partial page is never requested,
    # and this divides by zero when currentHits is 0.
    Max_page = int(total / page_size)
    for page in range(0, Max_page):
        # Same query as the probe request; only 'page' changes (1-based).
        payload_page = {
            'siteCode': '1100000088',
            'tab': 'zcfg',
            'qt': 'REITs',
            'sort': 'relevance',
            'keyPlace': '0',
            'locationCode': '110000000000',
            'page': page + 1,
            'pageSize': '20',
            'ie': '89b5e964-dc3c-4a5b-8d80-f6c408769b4a'
        }
        data = policy.requestPost(headers, url, payload_page)
        info_list = data['resultDocs']
        # print(info_list)
        for info_ in info_list:
            info = info_['data']
            title = info['titleO']
            titleLabel = info['titleLabel']['value']
            publishDate = info['docDate']
            # source = info['siteLabel']['value']
            newsUrl = info['url']
            # Only documents labelled as policy interpretation are handled here.
            if titleLabel == '政策解读':
                # NOTE(review): the search-API headers are reused for the detail page request.
                newssoup = policy.getrequest_soup(headers, newsUrl)
                print(newssoup)
                contentWithTag = newssoup.find('div', id='mainText')
                content = contentWithTag.text
                # NOTE(review): .replace is called on the selected element itself, not on
                # its text; presumably `.text.replace(...)` was intended — verify.
                source = newssoup.select('p[class="fl"]>span')[1].replace('来源:', '')
                formatRows = info['formatRows']
                num = 1
                for row in formatRows:
                    for col in row['col']:
                        name = col['text']
                        # Column holding the related attachments.
                        if name == '相关附件':
                            value = col['value']
                            # NOTE(review): dict views, not strings, are passed on below —
                            # TODO confirm the intended shape of col['value'].
                            file_href = value.keys()
                            file_name = value.values()
                            # Upload the attachment
                            policy.attuributefile(file_name, file_href, num, publishDate)
                            num += 1
                        value = col['value'][0]
                        # NOTE(review): dic_info is only assigned after this loop. On the first
                        # pass it is still unbound here; on later passes this writes into the
                        # previous document's dict, which is then replaced below.
                        dic_info[name] = value
                dic_info = {
                    'title': title,
                    'publishDate': publishDate,
                    'source': source,
                    'newsUrl': newsUrl,
                    'file_href': file_href
                }
                # print(dic_info)
                # break
# reform()
# zhengquanqihuo()
\ No newline at end of file
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论