Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
1
合并请求
1
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
丁双波
zzsn_spider
Commits
b923d30f
提交
b923d30f
authored
10月 27, 2023
作者:
薛凌堃
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
中国100大企业基本信息
上级
1bb5b282
隐藏空白字符变更
内嵌
并排
正在显示
4 个修改的文件
包含
731 行增加
和
58 行删除
+731
-58
BaseCore.py
comData/newlist/china100/BaseCore.py
+611
-0
baseinfo_start.bat
comData/newlist/china100/baseinfo_start.bat
+6
-0
china100.py
comData/newlist/china100/china100.py
+74
-45
getQccId.py
comData/newlist/china100/getQccId.py
+40
-13
没有找到文件。
comData/newlist/china100/BaseCore.py
0 → 100644
浏览文件 @
b923d30f
# 核心工具包
import
os
import
random
import
socket
import
sys
import
time
import
fitz
import
logbook
import
logbook.more
import
pandas
as
pd
import
requests
import
zhconv
import
pymysql
import
redis
from
selenium
import
webdriver
from
selenium.webdriver.chrome.service
import
Service
from
openpyxl
import
Workbook
import
langid
#创建连接池
import
pymysql
from
pymysql
import
connections
from
DBUtils.PooledDB
import
PooledDB
# 注意 程序退出前 调用BaseCore.close() 关闭相关资源
# Shared toolkit for the spiders: database/Redis handles, logging, id and
# time helpers. Callers are expected to invoke close() before the program exits.
class
BaseCore
:
# Running sequence counter used by getNextSeq() / getNextXydm().
__seq
=
0
# Proxy-pool database connection (currently disabled).
# __cnx_proxy =None
# __cursor_proxy = None
# MySQL connection and cursor for the "caiji" schema; assigned in __init__.
cnx
=
None
cursor
=
None
# MySQL connection and cursor for the "clb_project" schema; assigned in __init__.
cnx_
=
None
cursor_
=
None
# Redis client; assigned in __init__.
r
=
None
# agent 池
__USER_AGENT_LIST
=
[
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.29 Safari/525.13'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/531.4 (KHTML, like Gecko) Chrome/3.0.194.0 Safari/531.4'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.11 Safari/534.16'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.50 Safari/525.19'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.7 Safari/532.0'
,
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; Lunascape 5.0 alpha2)'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.7 Safari/532.2'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.1; ru-RU) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.11 Safari/534.16'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.10 Safari/532.0'
,
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; Maxthon;'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.169.0 Safari/530.1'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP; rv:1.7) Gecko/20040614 Firefox/0.9'
,
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.810.0 Safari/535.1'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.0 Safari/532.0'
,
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.6 (KHTML, like Gecko) Chrome/7.0.500.0 Safari/534.6'
,
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; TencentTraveler)'
,
'Mozilla/5.0 (Windows NT 6.0; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.4 (KHTML, like Gecko) Chrome/6.0.481.0 Safari/534.4'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.370.0 Safari/533.4'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.7.5) Gecko/20041107 Firefox/1.0'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.4.154.31 Safari/525.19'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.1.17) Gecko/20110123 (like Firefox/3.x) SeaMonkey/2.0.12'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB) AppleWebKit/534.1 (KHTML, like Gecko) Chrome/6.0.428.0 Safari/534.1'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.1; de-DE) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.540.0 Safari/534.10'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; de-DE) Chrome/4.0.223.3 Safari/532.2'
,
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/12.0.702.0 Safari/534.24'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.42 Safari/525.19'
,
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.3 (KHTML, like Gecko) Chrome/4.0.227.0 Safari/532.3'
,
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.8 (KHTML, like Gecko) Chrome/16.0.912.63 Safari/535.8'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.460.0 Safari/534.3'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.463.0 Safari/534.3'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.9 (KHTML, like Gecko) Chrome/2.0.157.0 Safari/528.9'
,
'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.794.0 Safari/535.1'
,
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.694.0 Safari/534.24'
,
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5'
,
'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50'
,
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20120427 Firefox/15.0a1'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.7.5) Gecko/20041107 Firefox/1.0'
,
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6'
,
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Maxthon; .NET CLR 1.1.4322)'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.4 Safari/532.2'
,
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.65 Safari/535.11'
,
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.21 (KHTML, like Gecko) Chrome/11.0.682.0 Safari/534.21'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.0 (KHTML, like Gecko) Chrome/2.0.182.0 Safari/531.0'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.9 (KHTML, like Gecko) Chrome/7.0.531.0 Safari/534.9'
,
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; WOW64; Trident/6.0)'
,
'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'
,
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.811.0 Safari/535.1'
,
'ozilla/5.0 (Windows; U; Windows NT 5.0; de-DE; rv:1.7.5) Gecko/20041108 Firefox/1.0'
,
'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'
,
'Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.127 Safari/533.4'
,
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E) QQBrowser/6.9.11079.201'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10'
,
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2'
,
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; zh-cn) Opera 8.50'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/7.0.0 Safari/700.13'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.4 Safari/532.0'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.53 Safari/525.19'
,
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.1 Safari/532.0'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.5) Gecko/20041107 Firefox/0.9.2 StumbleUpon/1.994'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.7.5) Gecko/20041110 Firefox/1.0'
,
'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36'
,
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; en) Opera 8.0'
,
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201'
,
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2'
,
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'
,
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11'
,
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b4pre) Gecko/20100815 Minefield/4.0b4pre'
,
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.6 Safari/530.5'
,
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.0.3705)'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.21 Safari/532.0'
,
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.792.0 Safari/535.1'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.168.0 Safari/530.1'
,
'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3) Gecko/20040913 Firefox/0.10'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.8 (KHTML, like Gecko) Chrome/2.0.177.1 Safari/530.8'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8'
,
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.40 Safari/530.5'
,
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.24 Safari/532.0'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.10 (KHTML, like Gecko) Chrome/2.0.157.2 Safari/528.10'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.2 Safari/532.2'
,
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.75 Safari/535.7'
,
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; T312461)'
,
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.461.0 Safari/534.3'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.0; rv:1.7.3) Gecko/20041001 Firefox/0.10.1'
,
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)'
,
'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.2; de-DE) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.2 Safari/532.0'
,
'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0) Gecko/16.0 Firefox/16.0'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/531.3 (KHTML, like Gecko) Chrome/3.0.193.2 Safari/531.3'
,
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; .NET CLR 1'
,
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)'
,
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.864.0 Safari/535.2'
,
'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.813.0 Safari/535.1'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0'
,
'Mozilla/5.0 (Windows NT 5.1; rv:2.1.1) Gecko/20110415 Firefox/4.0.2pre Fennec/4.0.1'
,
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.801.0 Safari/535.1'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.212.0 Safari/532.0'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5'
,
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7'
,
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.697.0 Safari/534.24'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.548.0 Safari/534.10'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.17 (KHTML, like Gecko) Chrome/11.0.652.0 Safari/534.17'
,
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.224 Safari/534.10 ChromePlus/1.5.2.0'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.0 Safari/532.1'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.7 Safari/532.0'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.2 (KHTML, like Gecko) Chrome/5.0.342.2 Safari/533.2'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.4 Safari/532.1'
,
'Mozilla/5.0 (Windows NT 6.0; rv:2.1.1) Gecko/20110415 Firefox/4.0.2pre Fennec/4.0.1'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.2.153.0 Safari/525.19'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; sv-SE; rv:1.7.5) Gecko/20041108 Firefox/1.0'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.462.0 Safari/534.3'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; de-DE; rv:1.7.5) Gecko/20041122 Firefox/1.0'
,
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; uZardWeb/1.0; Server_JP)'
,
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; HCI0449; .NET CLR 1.0.3705)'
,
'Mozilla/4.0 (compatible; MSIE 5.0; Windows 98; DigExt); Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1);'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.23 Safari/530.5'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.208.0 Safari/532.0'
,
'Mozilla/5.0 (Windows NT 6.0; rv:14.0) Gecko/20100101 Firefox/14.0.1'
,
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36'
,
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.7 (KHTML, like Gecko) Chrome/2.0.176.0 Safari/530.7'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.21 (KHTML, like Gecko) Chrome/11.0.678.0 Safari/534.21'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.21 Safari/532.0'
,
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)'
,
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; InfoPath.1'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.55 Safari/525.19'
,
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0a1) Gecko/20110623 Firefox/7.0a1 Fennec/7.0a1'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.724.100 Safari/534.30'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.33 Safari/534.3 SE 2.X MetaSr 1.0'
,
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; WOW64; SV1; uZardWeb/1.0; Server_HK)'
,
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1'
,
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)'
,
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)'
,
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3'
,
'Mozilla/5.0 (Windows NT 6.0) yi; AppleWebKit/345667.12221 (KHTML, like Gecko) Chrome/23.0.1271.26 Safari/453667.1221'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.2 (KHTML, like Gecko) Chrome/3.0.191.3 Safari/531.2'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.39 Safari/530.5'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.1 Safari/532.0'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.38 Safari/532.0'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.27 Safari/532.0'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8b) Gecko/20050118 Firefox/1.0+'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP; rv:1.7) Gecko/20040707 Firefox/0.9.2'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.0 Safari/532.0'
,
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.4 (KHTML, like Gecko) Chrome/2.0.171.0 Safari/530.4'
,
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648)'
,
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; nl-NL; rv:1.7.5) Gecko/20041202 Firefox/1.0'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.204.0 Safari/532.0'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.6 Safari/532.2'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/528.8 (KHTML, like Gecko) Chrome/1.0.156.0 Safari/528.8'
,
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/6.0)'
,
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 1.0.3705; .NET CLR 2.0.50727; .NET CLR 1.1.4322)'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.517.43 Safari/534.7'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.15 Safari/534.13'
,
'Mozilla/5.0 (ipad Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.6 (KHTML, like Gecko) Chrome/7.0.498.0 Safari/534.6'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.43 Safari/530.5'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.208.0 Safari/532.0'
,
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.19 (KHTML, like Gecko) Chrome/11.0.661.0 Safari/534.19'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-CA) AppleWebKit/534.13 (KHTML like Gecko) Chrome/9.0.597.98 Safari/534.13'
,
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.2 Safari/532.0'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.201.1 Safari/532.0'
,
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.201.1 Safari/532.0'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.213.1 Safari/532.1'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.6 (KHTML, like Gecko) Chrome/2.0.174.0 Safari/530.6'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.3.154.6 Safari/525.19'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.599.0 Safari/534.13'
,
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.8 (KHTML, like Gecko) Chrome/7.0.521.0 Safari/534.8'
,
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1b2pre) Gecko/20081015 Fennec/1.0a1'
,
'Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'
]
#Android agent池
__USER_PHONE_AGENT_LIST
=
[
'Mozilla/5.0 (Linux; Android 7.1.1; OPPO R9sk) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.111 Mobile Safari/537.36'
]
def __init__(self):
    """Create the database, Redis and pool handles used by the helper methods.

    Populates ``cnx``/``cursor`` (MySQL, ``caiji`` schema), ``cnx_``/``cursor_``
    (MySQL, ``clb_project`` schema), ``r`` (Redis, db 6) and ``pool_caiji``
    (a ``PooledDB`` pool for the ``caiji`` schema).
    """
    # NOTE(review): hosts and passwords are hard-coded in source; consider
    # moving them to configuration or environment variables.
    caiji_settings = dict(
        host='114.115.159.144',
        user='caiji',
        password='zzsn9988',
        charset='utf8mb4',
    )

    # Direct connection to the "caiji" schema.
    self.cnx = pymysql.connect(db='caiji', **caiji_settings)
    self.cursor = self.cnx.cursor()

    # Direct connection to the "clb_project" schema on the second server.
    self.cnx_ = pymysql.connect(
        host='114.116.44.11',
        user='caiji',
        password='f7s0&7qqtK',
        db='clb_project',
        charset='utf8mb4',
    )
    self.cursor_ = self.cnx_.cursor()

    # Redis client.
    self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)

    # Connection pool for the "caiji" schema, used by the per-call DB helpers.
    self.pool_caiji = PooledDB(
        creator=pymysql,
        maxconnections=5,
        mincached=2,
        maxcached=5,
        blocking=True,
        port=3306,
        database='caiji',
        **caiji_settings,
    )
def close(self):
    """Release every resource opened by ``__init__`` (best effort).

    Closes, in order, ``cursor``, ``cnx``, ``cursor_``, ``cnx_``, the Redis
    client ``r`` and the pool ``pool_caiji``.  Each resource is closed
    independently, so a failure on one does not prevent the others from
    being released.  Missing or ``None`` attributes, and objects without a
    ``close`` method, are skipped.  Never raises.
    """
    for attr_name in ('cursor', 'cnx', 'cursor_', 'cnx_', 'r', 'pool_caiji'):
        resource = getattr(self, attr_name, None)
        closer = getattr(resource, 'close', None)
        if closer is None:
            continue
        try:
            closer()
        except Exception:
            # Shutdown is deliberately best-effort: an already-closed or
            # broken handle must not stop the remaining ones being released.
            pass
# 计算耗时
def getTimeCost(self, start, end):
    """Format the elapsed time between two timestamps as Chinese text.

    ``start`` and ``end`` are numeric timestamps in seconds.  The result
    uses the largest applicable unit: hours/minutes/seconds, then
    minutes/seconds, then seconds; durations under one second are reported
    in milliseconds.
    """
    elapsed = end - start
    whole_seconds = int(elapsed)
    minutes, secs = divmod(whole_seconds, 60)
    hours, minutes = divmod(minutes, 60)

    if hours > 0:
        return "%d小时%d分钟%d秒" % (hours, minutes, secs)
    if minutes > 0:
        return "%d分钟%d秒" % (minutes, secs)
    if whole_seconds > 0:
        return "%d秒" % (secs)
    # Less than one full second: fall back to milliseconds.
    return "%d毫秒" % (int(elapsed * 1000))
# 当前时间格式化
# 1 : 2001-01-01 12:00:00 %Y-%m-%d %H:%M:%S
# 2 : 010101120000 %y%m%d%H%M%S
# 时间戳 3:1690179526555 精确到秒
def getNowTime(self, type):
    """Return the current local time in the representation chosen by ``type``.

    * ``1`` -> ``"%Y-%m-%d %H:%M:%S"`` string, e.g. ``2001-01-01 12:00:00``
    * ``2`` -> ``"%y%m%d%H%M%S"`` string, e.g. ``010101120000``
    * ``3`` -> integer epoch timestamp in milliseconds

    Any other value yields an empty string.
    """
    if type == 1:
        return time.strftime("%Y-%m-%d %H:%M:%S")
    if type == 2:
        return time.strftime("%y%m%d%H%M%S")
    if type == 3:
        return int(time.time() * 1000)
    return ""
# 获取流水号
def getNextSeq(self):
    """Return the next serial number: a ``yymmddHHMMSS`` timestamp plus a 3-digit counter.

    The counter is advanced on every call and cycles through 000-999.  The
    previous implementation only wrapped after exceeding 1000, so it could
    emit the 4-digit suffix ``1000`` and break the fixed-width format that
    ``zfill(3)`` is meant to guarantee.
    """
    # Wrap with modulo so the suffix never exceeds three digits.
    self.__seq = (self.__seq + 1) % 1000
    return self.getNowTime(2) + str(self.__seq).zfill(3)
# 获取信用代码
def getNextXydm(self):
    """Return a generated credit code: ``ZZSN`` + ``yymmddHHMMSS`` timestamp + 3-digit counter.

    Shares the counter with ``getNextSeq``.  The counter cycles through
    000-999; the previous ``> 1000`` wrap allowed a 4-digit suffix
    (``1000``), breaking the fixed-width format implied by ``zfill(3)``.
    """
    # Wrap with modulo so the suffix never exceeds three digits.
    self.__seq = (self.__seq + 1) % 1000
    return "ZZSN" + self.getNowTime(2) + str(self.__seq).zfill(3)
# 日志格式
def logFormate(self, record, handler):
    """Render a log record as ``[time] [level] [file] [function] [line] message``.

    ``record`` supplies ``time``, ``level_name``, ``filename``, ``func_name``,
    ``lineno`` and ``message``; only the base name of ``filename`` is shown.
    ``handler`` is accepted to satisfy the formatter signature and is unused.
    """
    fields = {
        "date": record.time,                             # log time
        "level": record.level_name,                      # log level
        "filename": os.path.basename(record.filename),   # file name without directory
        "func_name": record.func_name,                   # function name
        "lineno": record.lineno,                         # line number
        "msg": record.message,                           # log content
    }
    return "[{date}] [{level}] [{filename}] [{func_name}] [{lineno}] {msg}".format(**fields)
# 获取logger
def getLogger(self, fileLogFlag=True, stdOutFlag=True):
    """Build a logbook logger named after the running script.

    The log file lives in a ``logs`` directory next to the script
    (``sys.argv[0]``) and is named after the script with ``.py`` removed and
    ``.log`` appended.

    Args:
        fileLogFlag: attach a ``TimedRotatingFileHandler`` writing to the log file.
        stdOutFlag: attach a ``ColorizedStderrHandler`` for console output.

    Returns:
        The configured ``logbook.Logger``; both handlers use ``self.logFormate``.
    """
    script_dir, script_name = os.path.split(os.path.abspath(sys.argv[0]))
    log_dir = os.path.join(script_dir, "logs")
    log_name = script_name.replace(".py", "") + ".log"

    if not os.path.exists(log_dir):
        # exist_ok closes the check-then-create race when several spider
        # processes start at the same time and all try to create "logs".
        os.makedirs(log_dir, exist_ok=True)

    logbook.set_datetime_format('local')
    logger = logbook.Logger(log_name)
    logger.handlers = []

    if fileLogFlag:
        # Log to file.
        file_handler = logbook.TimedRotatingFileHandler(
            os.path.join(log_dir, log_name),
            date_format='%Y-%m-%d',
            bubble=True,
            encoding='utf-8',
        )
        file_handler.formatter = self.logFormate
        logger.handlers.append(file_handler)

    if stdOutFlag:
        # Log to the console.
        console_handler = logbook.more.ColorizedStderrHandler(bubble=True)
        console_handler.formatter = self.logFormate
        logger.handlers.append(console_handler)

    return logger
# 获取随机的userAgent
def getRandomUserAgent(self):
    """Return one User-Agent string picked at random from the class-level pool."""
    agent_pool = self.__USER_AGENT_LIST
    return random.choice(agent_pool)
# 获取代理
def get_proxy(self):
    """Return one proxy mapping chosen at random from the ``clb_proxy`` table.

    Each row's ``proxy`` column is read as ``"<host>-<port>"`` and turned
    into ``{"HTTP": "http://host:port", "HTTPS": "http://host:port"}``.

    The previous implementation indexed with ``random.randint(0, 3)``, which
    raised ``IndexError`` whenever fewer than four proxies were stored and
    never used any proxy beyond the fourth; the choice now covers every row.

    Raises:
        IndexError: if the table is empty or a row lacks the ``-`` separator.
    """
    self.cursor.execute("select proxy from clb_proxy")
    rows = self.cursor.fetchall()

    proxy_list = []
    for row in rows:
        # Read the column value directly instead of string-mangling str(tuple).
        parts = str(row[0]).split('-')
        proxy_url = "http://%(host)s:%(port)s" % {"host": parts[0], "port": parts[1]}
        # NOTE(review): requests looks proxies up by lowercase scheme keys
        # ("http"/"https"); these uppercase keys are kept for compatibility
        # with existing callers - confirm they are actually honoured.
        proxy_list.append({"HTTP": proxy_url, "HTTPS": proxy_url})

    return random.choice(proxy_list)
#字符串截取
def getSubStr(self, str, beginStr, endStr):
    """Trim ``str`` to the span delimited by ``beginStr`` and ``endStr``.

    * If ``beginStr`` is non-empty, everything before its LAST occurrence is
      dropped (the marker itself is kept); when it is not found the text is
      left untouched.
    * If ``endStr`` is non-empty and found, the text is then cut just after
      the first character of its last occurrence.
    """
    if beginStr != '':
        # rfind returns -1 when missing; clamp to 0 so the slice is a no-op.
        start_at = max(str.rfind(beginStr), 0)
        str = str[start_at:]

    if endStr != '':
        stop_at = str.rfind(endStr)
        if stop_at != -1:
            str = str[:stop_at + 1]

    return str
# 繁体字转简体字
def hant_2_hans(self, hant_str: str):
    """Convert ``hant_str`` from Traditional to Simplified Chinese via ``zhconv``."""
    simplified = zhconv.convert(hant_str, 'zh-hans')
    return simplified
# 判断字符串里是否含数字
def str_have_num(self, str_num):
    """Return ``True`` if any character of ``str_num`` is a digit, else ``False``."""
    return any(char.isdigit() for char in str_num)
# # 从Redis的List中获取并移除一个元素
# def redicPullData(self,type,key):
# #1 表示国内 2 表示国外
# if type == 1:
# gn_item = self.r.lpop(key)
# return gn_item.decode() if gn_item else None
# if type == 2:
# gw_item = self.r.lpop(key)
# return gw_item.decode() if gw_item else None
# 从Redis的List中获取并移除一个元素
def redicPullData(self, key):
    """Pop the head element of the Redis list ``key`` and return it decoded.

    Returns ``None`` when the pop yields nothing (or an empty value).
    """
    popped = self.r.lpop(key)
    if not popped:
        return None
    return popped.decode()
# 获得脚本进程PID
def getPID(self):
    """Return the process id of the running script."""
    return os.getpid()
# 获取本机IP
def getIP(self):
    """Return the IP address that the local host name resolves to."""
    host_name = socket.gethostname()
    return socket.gethostbyname(host_name)
def mkPath(self, path):
    """Create directory ``path`` (including missing parents) if it does not exist.

    Nothing happens when ``path`` already exists.  ``exist_ok=True`` makes
    the call safe when another process creates the directory between the
    existence check and the ``makedirs`` call, which previously raised
    ``FileExistsError``.
    """
    if not os.path.exists(path):
        os.makedirs(path, exist_ok=True)
# 生成google模拟浏览器 必须传入值为googledriver位置信息
# headless用于决定是否为无头浏览器,初始默认为无头浏览器
# 正常浏览器可用于开始对页面解析使用或一些网站无头时无法正常采集
# 无头浏览器用于后续对信息采集时不会有浏览器一直弹出,
# Build a Chrome WebDriver.
#   path     - passed to selenium's Service(); per the original comment this
#              is the location of the chromedriver executable.
#   headless - when truthy, '--headless' is added to the Chrome options.
# Returns the created webdriver.Chrome instance.
def
buildDriver
(
self
,
path
,
headless
=
True
):
service
=
Service
(
path
)
chrome_options
=
webdriver
.
ChromeOptions
()
if
headless
:
chrome_options
.
add_argument
(
'--headless'
)
# NOTE(review): the flattened layout of this listing does not show whether
# '--disable-gpu' is added only in headless mode or always - confirm against
# the real file.
chrome_options
.
add_argument
(
'--disable-gpu'
)
# Exclude the "enable-automation" switch and turn off the automation extension.
chrome_options
.
add_experimental_option
(
"excludeSwitches"
,
[
"enable-automation"
])
chrome_options
.
add_experimental_option
(
'useAutomationExtension'
,
False
)
chrome_options
.
add_argument
(
'lang=zh-CN,zh,zh-TW,en-US,en'
)
# Use a random User-Agent from the class pool for this browser session.
chrome_options
.
add_argument
(
'user-agent='
+
self
.
getRandomUserAgent
())
# 'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
driver
=
webdriver
.
Chrome
(
options
=
chrome_options
,
service
=
service
)
# Disabled: inject stealth.min.js into every new document.
# with open(r'F:\zzsn\zzsn_spider\base\stealth.min.js') as f:
# js = f.read()
#
# driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
# "source": js
# })
return
driver
# 根据社会信用代码获取企业信息
def getInfomation(self, social_code):
    """Fetch the ``China100`` row for a social credit code.

    Args:
        social_code: value matched against the ``SocialCode`` column.

    Returns:
        The row's columns as a list; ``None`` when no row matches (this
        matches the value the previous implementation ended up returning);
        an empty list when the database operation fails.

    Changes from the previous version: the value is bound as a query
    parameter instead of being formatted into the SQL text (injection), and
    the pooled connection is always returned in ``finally`` - before, any
    failure (including the "no row" case) leaked a connection from a pool
    capped at five blocking connections.
    """
    data = []
    conn = None
    cursor = None
    try:
        conn = self.pool_caiji.connection()
        cursor = conn.cursor()
        cursor.execute("SELECT * FROM China100 WHERE SocialCode = %s", (social_code,))
        row = cursor.fetchone()
        conn.commit()
        data = list(row) if row is not None else None
    except Exception:
        log = self.getLogger()
        log.info('=========数据库操作失败========')
    finally:
        # Always hand the cursor and connection back, even after an error.
        for resource in (cursor, conn):
            if resource is None:
                continue
            try:
                resource.close()
            except Exception:
                pass
    return data
# 更新企业采集次数
def updateRun(self, social_code, runType, count):
    """Set one counter column of an ``EnterpriseInfo`` row.

    Args:
        social_code: value matched against the ``SocialCode`` column.
        runType: name of the column to update; must be a plain identifier.
        count: new value for that column.

    Failures (including an invalid ``runType``) are logged and swallowed,
    as before.  The values are now bound as query parameters rather than
    formatted into the SQL text, the column name is validated because
    identifiers cannot be bound, and the pooled connection is always
    released in ``finally`` instead of leaking when the update fails.
    """
    conn = None
    cursor = None
    try:
        if not (isinstance(runType, str) and runType.isidentifier()):
            raise ValueError(f"invalid column name: {runType!r}")
        sql_update = f"UPDATE EnterpriseInfo SET {runType} = %s WHERE SocialCode = %s"
        conn = self.pool_caiji.connection()
        cursor = conn.cursor()
        cursor.execute(sql_update, (count, social_code))
        conn.commit()
    except Exception:
        log = self.getLogger()
        log.info('======更新数据库失败======')
    finally:
        # Always hand the cursor and connection back, even after an error.
        for resource in (cursor, conn):
            if resource is None:
                continue
            try:
                resource.close()
            except Exception:
                pass
# 保存日志入库
def recordLog(self, xydm, taskType, state, takeTime, url, e):
    """Insert one task-execution record into ``LogTable``.

    Args:
        xydm: social credit code of the enterprise (``SocialCode``).
        taskType: task type label (``TaskType``).
        state: task state (``state``).
        takeTime: elapsed-time text (``TakeTime``).
        url: URL that was processed (``url``).
        e: exception text to store (``Exception``).

    The creation time, host IP and process id are added automatically.
    Failures are logged and swallowed, as before; the pooled connection is
    now always released in ``finally`` so a failed insert no longer leaks a
    connection from the pool.
    """
    sql = "INSERT INTO LogTable(SocialCode,TaskType,state,TakeTime,url,CreateTime,ProcessIp,PID,Exception) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    cnn = None
    cursor = None
    try:
        createTime = self.getNowTime(1)
        ip = self.getIP()
        pid = self.getPID()
        values = [xydm, taskType, state, takeTime, url, createTime, ip, pid, e]

        cnn = self.pool_caiji.connection()
        cursor = cnn.cursor()
        cursor.execute(sql, values)
        cnn.commit()
    except Exception:
        log = self.getLogger()
        log.info('======保存日志失败=====')
    finally:
        # Always hand the cursor and connection back, even after an error.
        for resource in (cursor, cnn):
            if resource is None:
                continue
            try:
                resource.close()
            except Exception:
                pass
#获取企查查token
def GetToken(self):
    """Return a random Qichacha token from the ``QCC_token`` table.

    Returns an empty string when the table holds no tokens.  The empty case
    is now checked explicitly instead of relying on a bare ``except`` that
    also hid unrelated errors.
    """
    query = "select token from QCC_token "
    self.cursor.execute(query)
    token_rows = self.cursor.fetchall()
    self.cnx.commit()

    if not token_rows:
        return ''
    return random.choice(token_rows)[0]
# 删除失效的token
def delete_token(self, token):
    """Delete an expired Qichacha token from the ``QCC_token`` table.

    The token is bound as a query parameter; it was previously formatted
    straight into the SQL text, so a token containing a quote could break
    or alter the statement.
    """
    self.cursor.execute("delete from QCC_token where token=%s", (token,))
    self.cnx.commit()
#获取天眼查token
def GetTYCToken(self):
    """Return the first Tianyancha token from the ``TYC_token`` table.

    Returns an empty string when the table is empty, matching ``GetToken``.
    Previously an empty table raised ``TypeError`` on ``None[0]`` and the
    commit was skipped.
    """
    query = 'select token from TYC_token'
    self.cursor.execute(query)
    row = self.cursor.fetchone()
    self.cnx.commit()
    return row[0] if row else ''
#检测语言
def detect_language(self, text):
    """Return the language code ``langid`` assigns to ``text``.

    Falls back to ``'cn'`` when the classification result, or its first
    element, is an empty string.
    """
    # Classify the text with langid.
    classification = langid.classify(text)
    if classification == '' or classification[0] == '':
        return 'cn'
    return classification[0]
#追加接入excel
def writerToExcel(self, detailList, filename):
    """Append ``detailList`` as new rows to the existing xlsx file ``filename``.

    Args:
        detailList: data accepted by ``pandas.DataFrame(data=...)``.
        filename: path of an existing xlsx workbook; it is read with every
            column as ``str`` and then overwritten with the combined rows.

    ``DataFrame.append`` was removed in pandas 2.0, so the rows are combined
    with ``pd.concat``, which works on both old and new pandas versions.
    """
    # Read the rows already stored in the workbook.
    existing_data = pd.read_excel(filename, engine='openpyxl', dtype=str)
    # Build a frame from the new rows.
    new_data = pd.DataFrame(data=detailList)
    # Put the new rows after the existing ones and renumber the index.
    combined_data = pd.concat([existing_data, new_data], ignore_index=True)
    # Write the combined result back to the same file.
    combined_data.to_excel(filename, index=False)
#对失败或者断掉的企业 重新放入redis
def rePutIntoR(self, key, item):
    """Push ``item`` back onto the tail of the Redis list ``key``.

    Used to re-queue an enterprise whose processing failed or was interrupted.
    """
    redis_client = self.r
    redis_client.rpush(key, item)
#增加计数器的值并返回增加后的值
def incrSet(self, key):
    """Increment the Redis counter at ``key`` and return the new value.

    The incremented value is also printed to stdout.
    """
    redis_client = self.r
    updated = redis_client.incr(key)
    print("增加后的值:", updated)
    return updated
#获取key剩余的过期时间
def getttl(self,key):
    """Print the remaining TTL of ``key`` and reset the key when it is negative.

    When ``self.r.ttl(key)`` is below zero, the key is set to 0, given a
    3600-second expiry, and the call then sleeps for 2 seconds.
    Nothing is returned.

    NOTE(review): the statements grouped under the ``if`` were reconstructed
    from a flattened listing -- confirm the nesting against the real file.
    """
    # Fetch the remaining time-to-live of the key
    ttl = self.r.ttl(key)
    print("剩余过期时间:", ttl)
    # Check whether the key has expired
    # NOTE(review): Redis also reports a negative TTL for keys that exist
    # without an expiry -- confirm that resetting those is intended.
    if ttl < 0:
        # Key has expired: reset its value to 0
        self.r.set(key, 0)
        self.r.expire(key, 3600)
        time.sleep(2)
comData/newlist/china100/baseinfo_start.bat
0 → 100644
浏览文件 @
b923d30f
title dujiaoshoubaseinfo
call activate
call conda activate zzsn@3.8.0
python baseinfo_dujiaoshou.py
pause
\ No newline at end of file
comData/
dfcfwGpdm/NQenterprise/NQbase_info
.py
→
comData/
newlist/china100/china100
.py
浏览文件 @
b923d30f
...
...
@@ -7,16 +7,17 @@ import requests
import
json
from
kafka
import
KafkaProducer
from
base.
BaseCore
import
BaseCore
from
BaseCore
import
BaseCore
from
getQccId
import
find_id_by_name
baseCore
=
BaseCore
()
cnx_
=
baseCore
.
cnx
cursor_
=
baseCore
.
cursor
log
=
baseCore
.
getLogger
()
import
urllib3
urllib3
.
disable_warnings
(
urllib3
.
exceptions
.
InsecureRequestWarning
)
# 通过企查查id获取企业基本信息
def
info_by_id
(
com_id
,
com_name
,
gpdm
):
def
info_by_id
(
com_id
,
com_name
):
aa_dict_list
=
[]
t
=
str
(
int
(
time
.
time
())
*
1000
)
...
...
@@ -31,7 +32,7 @@ def info_by_id(com_id,com_name,gpdm):
result_dict
=
resp_dict
[
'result'
][
'Company'
]
except
:
log
.
info
(
com_name
+
":获取失败===========重新放入redis"
)
baseCore
.
rePutIntoR
(
'
EnterpriseIpo:nq_gpdm'
,
gpdm
)
baseCore
.
rePutIntoR
(
'
china100:baseinfo'
,
com_name
)
return
aa_dict_list
company_name
=
result_dict
[
'Name'
]
...
...
@@ -306,12 +307,12 @@ def info_by_id(com_id,com_name,gpdm):
}
aa_dict_list
.
append
(
aa_dict
)
print
(
company_name
+
":爬取完成"
)
log
.
info
(
company_name
+
":爬取完成"
)
return
aa_dict_list
if
__name__
==
'__main__'
:
taskType
=
'基本信息/企查查'
taskType
=
'基本信息/企查查
/中国100强
'
headers
=
{
'Host'
:
'xcx.qcc.com'
,
'Connection'
:
'keep-alive'
,
...
...
@@ -323,65 +324,97 @@ if __name__ == '__main__':
'Referer'
:
'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html'
,
'Accept-Encoding'
:
'gzip, deflate, br,'
}
list_weicha
=
[]
name_list
=
[]
#从redis里拿数据
while
True
:
# TODO:需要隔两个小时左右抓包修改,token从数据库中获得
token
=
'b4eb43143abdcf395f1335f322ca29e5'
list_weicha
=
[]
list_all_info
=
[]
name_list
=
[]
token
=
baseCore
.
GetToken
()
dataList
=
[]
if
token
:
pass
else
:
log
.
info
(
'==========已无token=========='
)
time
.
sleep
(
30
)
continue
# list_all_info = []
start_time
=
time
.
time
()
# 获取企业信息
# com_code = baseCore.redicPullData('EnterpriseIpo:nq_gpdm')
com_code
=
'873349'
if
'.NQ'
in
com_code
:
com_code1
=
com_code
else
:
com_code1
=
com_code
+
'.NQ'
social_code
=
baseCore
.
redicPullData
(
'china100:baseinfo'
)
company_id
=
find_id_by_name
(
start_time
,
token
,
com_code
)
if
not
company_id
:
log
.
info
(
com_code
+
":企业ID获取失败===重新放入redis"
)
list_weicha
.
append
(
com_code
+
":企业ID获取失败"
)
baseCore
.
rePutIntoR
(
'EnterpriseIpo:nq_gpdm'
,
com_code
)
log
.
info
(
'-----已重新放入redis-----'
)
# com_name = '卓新市万达铸业有限公司'
if
social_code
==
''
or
social_code
is
None
:
time
.
sleep
(
20
)
continue
if
'搜索不到'
in
social_code
:
continue
else
:
log
.
info
(
f
'====={com_code}===={company_id}=====获取企业id成功====='
)
# todo:企查查id写入gpdm表中
updateSql
=
f
"update gpdm set QCCID = '{company_id}' where gpdm = '{com_code}'"
cursor_
.
execute
(
updateSql
)
cnx_
.
commit
()
pass
dic_info
=
baseCore
.
getInfomation
(
social_code
)
log
.
info
(
f
'----当前企业{social_code}--开始处理---'
)
com_name
=
dic_info
[
1
]
#企查查id
company_id
=
dic_info
[
3
]
#如果没有信用代码 就通过名字搜索 如果有信用代码 就通过信用代码
if
company_id
==
None
or
company_id
==
False
:
if
social_code
:
company_id
=
find_id_by_name
(
start_time
,
token
,
social_code
)
else
:
company_id
=
find_id_by_name
(
start_time
,
token
,
com_name
)
if
company_id
==
'null'
:
log
.
info
(
'=====搜索不到该企业===='
)
#todo:搜不到的企业没有信用代码 传输不过去 生成一个信用代码
baseCore
.
rePutIntoR
(
'china100:baseinfo'
,
social_code
+
':搜索不到'
)
continue
if
not
company_id
:
log
.
info
(
com_name
+
":企业ID获取失败===重新放入redis"
)
list_weicha
.
append
(
com_name
+
":企业ID获取失败"
)
baseCore
.
rePutIntoR
(
'china100:baseinfo'
,
com_name
)
baseCore
.
delete_token
(
token
)
log
.
info
(
'=====已重新放入redis,失效token已删除======'
)
time
.
sleep
(
20
)
continue
else
:
log
.
info
(
f
'====={com_name}===={company_id}=====获取企业id成功====='
)
# todo:写入数据库
updateqccid
=
f
"update China100 set qccid = '{company_id}' where CompanyName = '{com_name}'"
cursor_
.
execute
(
updateqccid
)
cnx_
.
commit
()
try
:
post_data_list
=
info_by_id
(
company_id
,
''
,
com_code1
)
post_data_list
=
info_by_id
(
company_id
,
com_name
)
except
:
log
.
info
(
f
'====={com_code}=====获取基本信息失败,重新放入redis====='
)
baseCore
.
rePutIntoR
(
'EnterpriseIpo:nq_gpdm'
,
com_code
)
log
.
info
(
f
'====={social_code}=====获取基本信息失败,重新放入redis====='
)
baseCore
.
rePutIntoR
(
'china100:baseinfo'
,
com_name
)
baseCore
.
delete_token
(
token
)
log
.
info
(
'=====已重新放入redis,失效token已删除======'
)
continue
if
post_data_list
:
pass
else
:
log
.
info
(
f
'======{com
_code}====企查查token失效===='
)
# log.info(f'======{social
_code}====企查查token失效====')
time
.
sleep
(
20
)
continue
for
post_data
in
post_data_list
:
list_all_info
.
append
(
post_data
)
#
list_all_info.append(post_data)
if
post_data
is
None
:
print
(
com_
cod
e
+
":企业信息获取失败"
)
list_weicha
.
append
(
com_
cod
e
+
":企业信息获取失败"
)
print
(
com_
nam
e
+
":企业信息获取失败"
)
list_weicha
.
append
(
com_
nam
e
+
":企业信息获取失败"
)
continue
get_name
=
post_data
[
'name'
]
get_socialcode
=
post_data
[
'socialCreditCode'
]
#todo:将信用代码更新到表中
updatesocialcode
=
f
"update China100 set SocialCode = '{get_socialcode}' where CompanyName = '{com_name}'"
cursor_
.
execute
(
updatesocialcode
)
cnx_
.
commit
()
name_compile
=
{
'yuan_name'
:
com_
cod
e
,
'yuan_name'
:
com_
nam
e
,
'get_name'
:
get_name
}
name_list
.
append
(
name_compile
)
log
.
info
(
f
'采集{com_code}成功=======耗时{baseCore.getTimeCost(start_time,time.time())}'
)
# dataList.append(post_data)
baseCore
.
writerToExcel
(
name_list
,
'中国100强企业.xlsx'
)
log
.
info
(
f
'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time,time.time())}'
)
try
:
producer
=
KafkaProducer
(
bootstrap_servers
=
[
'114.115.159.144:9092'
],
api_version
=
(
2
,
0
,
2
))
kafka_result
=
producer
.
send
(
"regionInfo"
,
json
.
dumps
(
post_data
,
ensure_ascii
=
False
)
.
encode
(
'utf8'
))
...
...
@@ -392,13 +425,9 @@ if __name__ == '__main__':
takeTime
=
baseCore
.
getTimeCost
(
start_time
,
time
.
time
())
baseCore
.
recordLog
(
get_socialcode
,
taskType
,
state
,
takeTime
,
''
,
exception
)
log
.
info
(
f
"{get_name}--{get_socialcode}--kafka传输失败"
)
# 信息采集完成后将该企业的采集次数更新
# break
nowtime
=
baseCore
.
getNowTime
(
1
)
.
replace
(
'-'
,
'_'
)[:
10
]
companyName
=
pd
.
DataFrame
(
name_list
)
companyName
.
to_excel
(
f
'./data/企业名称对比_{nowtime}.xlsx'
,
index
=
False
)
false_com
=
pd
.
DataFrame
(
list_weicha
)
false_com
.
to_excel
(
f
'./data/采集失败企业名单_{nowtime}.xlsx'
,
index
=
False
)
...
...
comData/
dfcfwGpdm/NQenterprise
/getQccId.py
→
comData/
newlist/china100
/getQccId.py
浏览文件 @
b923d30f
...
...
@@ -5,21 +5,43 @@ import time
from
urllib.parse
import
quote
import
requests
import
urllib3
from
base.
BaseCore
import
BaseCore
from
BaseCore
import
BaseCore
baseCore
=
BaseCore
()
log
=
baseCore
.
getLogger
()
# headers = {
# 'Host': 'xcx.qcc.com',
# 'Connection': 'keep-alive',
# 'Qcc-Platform': 'mp-weixin',
# 'Qcc-Timestamp': '',
# 'Qcc-Version': '1.0.0',
# 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat',
# 'content-type': 'application/json',
# 'Referer': 'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html',
# 'Accept-Encoding': 'gzip, deflate, br,'
# }
headers
=
{
'Host'
:
'xcx.qcc.com'
,
'Connection'
:
'keep-alive'
,
'Qcc-Platform'
:
'mp-weixin'
,
'Qcc-Timestamp'
:
''
,
'x-request-device-type'
:
'Android'
,
'User-Agent'
:
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF XWEB/8391'
,
'Content-Type'
:
'application/json'
,
'Qcc-Version'
:
'1.0.0'
,
'User-Agent'
:
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat'
,
'content-type'
:
'application/json'
,
'Referer'
:
'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html'
,
'Accept-Encoding'
:
'gzip, deflate, br,'
}
'authMini'
:
'Bearer f51dae1a2fcb109fa9ec58bd4a85e5c5'
,
'xweb_xhr'
:
'1'
,
'xcx-version'
:
'2023.09.27'
,
'Qcc-Platform'
:
'mp-weixin'
,
'Qcc-CurrentPage'
:
'/company-subpackages/business/index'
,
'Qcc-Timestamp'
:
'1696661787803'
,
'Qcc-RefPage'
:
'/company-subpackages/detail/index'
,
'Accept'
:
'*/*'
,
'Sec-Fetch-Site'
:
'cross-site'
,
'Sec-Fetch-Mode'
:
'cors'
,
'Sec-Fetch-Dest'
:
'empty'
,
'Referer'
:
'https://servicewechat.com/wx395200814fcd7599/307/page-frame.html'
,
'Accept-Encoding'
:
'gzip, deflate, br'
,
'Accept-Language'
:
'zh-CN,zh'
}
# 通过企业名称或信用代码获取企查查id
def
find_id_by_name
(
start
,
token
,
name
):
urllib3
.
disable_warnings
()
...
...
@@ -32,8 +54,8 @@ def find_id_by_name(start,token,name):
try
:
resp_dict
=
requests
.
get
(
url
=
url
,
headers
=
headers
,
verify
=
False
)
.
json
()
break
except
:
print
(
'
重试'
)
except
Exception
as
e
:
print
(
f
'{e}-------------
重试'
)
time
.
sleep
(
5
)
continue
time
.
sleep
(
2
)
...
...
@@ -46,19 +68,23 @@ def find_id_by_name(start,token,name):
KeyNo
=
False
log
.
info
(
f
'=======您的账号访问超频,请升级小程序版本=====时间{baseCore.getTimeCost(start, time.time())}'
)
return
KeyNo
if
resp_dict
[
'status'
]
==
40102
:
KeyNo
=
False
log
.
info
(
f
'=======无效的session=====时间{baseCore.getTimeCost(start, time.time())}'
)
return
KeyNo
try
:
if
resp_dict
[
'result'
][
'Result'
]:
result_dict
=
resp_dict
[
'result'
][
'Result'
][
0
]
KeyNo
=
result_dict
[
'KeyNo'
]
Name
=
result_dict
[
'Name'
]
.
replace
(
'<em>'
,
''
)
.
replace
(
'</em>'
,
''
)
.
strip
()
if
Name
==
''
:
KeyNo
=
''
KeyNo
=
'
null
'
else
:
KeyNo
=
''
KeyNo
=
'
null
'
except
:
KeyNo
=
False
log
.
info
(
f
'====token失效====时间{baseCore.getTimeCost(start,time.time())}'
)
return
KeyNo
print
(
"{},企业代码为:{}"
.
format
(
qcc_key
,
KeyNo
))
log
.
info
(
"{},企业代码为:{}"
.
format
(
qcc_key
,
KeyNo
))
return
KeyNo
\ No newline at end of file
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论