丁双波 / zzsn_spider / Commits

Commit ed571f70, authored Aug 16, 2023 by 刘伟刚

    Merge remote-tracking branch 'origin/master'

Parents: 66f88aa6, 18c4a5a7

Showing 9 changed files with 475 additions and 360 deletions (+475 -360)
base/BaseCore.py                     +13   -7
base/RedisPPData.py                  +70   -15
comData/tcyQydt/getTycId.py          +5    -1
comData/tcyQydt/tyc_qydt_add.py      +33   -23
comData/weixin_solo/oneWeixin.py     +231  -257
comData/weixin_solo/test.py          +35   -26
comData/yhcj/雅虎财经_企业动态.py    +62   -19
test/stealth.min.js                  +0    -0
test/test.py                         +26   -12
base/BaseCore.py

@@ -421,15 +421,15 @@ class BaseCore:
         chrome_options.add_experimental_option('useAutomationExtension', False)
         chrome_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
-        chrome_options.add_argument(self.getRandomUserAgent())
+        chrome_options.add_argument('user-agent=' + self.getRandomUserAgent())
         # 'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
         driver = webdriver.Chrome(chrome_options=chrome_options, service=service)
-        with open('../../base/stealth.min.js') as f:
-            js = f.read()
-        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
-            "source": js
-        })
+        # with open(r'F:\zzsn\zzsn_spider\base\stealth.min.js') as f:
+        #     js = f.read()
+        # driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
+        #     "source": js
+        # })
         return driver

     # Look up company information by social credit code
@@ -458,6 +458,7 @@ class BaseCore:
             print(e)
             self.cnx.commit()

+    # Fetch the QCC (企查查) token
     def GetToken(self):
         # Fetch the QCC token
         query = "select token from QCC_token "
@@ -476,6 +477,7 @@ class BaseCore:
             return 'cn'
         return result[0]

+    # Append rows to an Excel file
     def writerToExcel(self, detailList, filename):
         # filename='baidu搜索.xlsx'
         # Read the existing xlsx file
@@ -488,4 +490,8 @@ class BaseCore:
         combined_data.to_excel(filename, index=False)
         # return combined_data

+    # Put companies that failed or dropped out back into redis
+    def rePutIntoR(self, item):
+        self.r.rpush('NewsEnterprise:gwqy_socialCode', item)
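For orientation, the stealth-injection pattern this hunk toggles looks like the sketch below. It is a minimal, hedged reconstruction: the chromedriver path is a placeholder, and stealth.min.js is the file added under test/ later in this commit.

# Minimal sketch of CDP-based stealth injection (placeholder paths).
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

service = Service('/path/to/chromedriver')              # placeholder path
options = webdriver.ChromeOptions()
options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(options=options, service=service)

# Page.addScriptToEvaluateOnNewDocument runs the script before any page JS,
# so navigator.webdriver and similar automation fingerprints are patched
# before a site can probe them.
with open('stealth.min.js') as f:
    driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": f.read()})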
base/RedisPPData.py

@@ -29,28 +29,34 @@ r = basecore.r
 # gn_social_list = [item[0] for item in gn_result]
 # return gn_social_list,gw_social_list

+# Company news
 def NewsEnterprise():
-    # Domestic companies
-    gn_query = "select SocialCode from EnterpriseInfo where Place = '1'"
-    cursor.execute(gn_query)
-    gn_result = cursor.fetchall()
+    # # Domestic companies
+    # gn_query = "select SocialCode from EnterpriseInfo where Place = '1'"
+    # cursor.execute(gn_query)
+    # gn_result = cursor.fetchall()
     # Overseas companies
     gw_query = "select SocialCode from EnterpriseInfo where Place = '2'"
     cursor.execute(gw_query)
     gw_result = cursor.fetchall()
     gw_social_list = [item[0] for item in gw_result]
-    gn_social_list = [item[0] for item in gn_result]
-    # return gn_social_list, gw_social_list
+    # gn_social_list = [item[0] for item in gn_result]
+    # todo: print the length
+    print(len(gw_social_list))
     print('=======')
-    # gn_social_list,gw_social_list = pullDateFromSql()
     # Push the data into redis
-    for item in gn_social_list:
-        r.rpush('NewsEnterprise:gnqy_socialCode', item)
+    # for item in gn_social_list:
+    #     r.rpush('NewsEnterprise:gnqy_socialCode', item)
+    count = 0
     for item in gw_social_list:
         r.rpush('NewsEnterprise:gwqy_socialCode', item)
+        count += 1
+        print(item)
+    print(count)

+# Scheduled task for company news
 def NewsEnterprise_task():
     # Instantiate a scheduler
     scheduler = BlockingScheduler()
@@ -63,6 +69,7 @@ def NewsEnterprise_task():
         print('定时采集异常', e)
         pass

+# Company announcements
 def NoticeEnterprise():
     # Domestic companies
     gn_query = "select SocialCode from EnterpriseInfo where Place = '1' and SecuritiesCode is not null limit 1 "
@@ -72,7 +79,7 @@ def NoticeEnterprise():
     print('=======')
     for item in gn_social_list:
         r.rpush('NoticeEnterprise:gnqy_socialCode', item)

+# Scheduled task for company announcements
 def NoticeEnterprise_task():
     # Instantiate a scheduler
     scheduler = BlockingScheduler()
@@ -85,6 +92,7 @@ def NoticeEnterprise_task():
         print('定时采集异常', e)
         pass

+# Company annual reports
 def AnnualEnterprise():
     # Domestic companies
     gn_query = "select SocialCode from EnterpriseInfo where Place = '1' and SecuritiesCode is not null"
@@ -94,7 +102,7 @@ def AnnualEnterprise():
     print('=======')
     for item in gn_social_list:
         r.rpush('AnnualEnterprise:gnqy_socialCode', item)

+# Scheduled task for company annual reports
 def AnnualEnterprise_task():
     # Instantiate a scheduler
     scheduler = BlockingScheduler()
@@ -107,6 +115,7 @@ def AnnualEnterprise_task():
         print('定时采集异常', e)
         pass

+# Company basic information
 def BaseInfoEnterprise():
     # Domestic companies
     gn_query = "select SocialCode from EnterpriseInfo where Place = '1' limit 1 "
@@ -117,7 +126,7 @@ def BaseInfoEnterprise():
     for item in gn_social_list:
         r.rpush('BaseInfoEnterprise:gnqy_socialCode', item)

-# Company basic information
+# Scheduled task for company basic information
 def BaseInfoEnterprise_task():
     # Instantiate a scheduler
     scheduler = BlockingScheduler()
@@ -130,12 +139,58 @@ def BaseInfoEnterprise_task():
         print('定时采集异常', e)
         pass

+# WeChat official accounts
+def WeiXingetFromSql():
+    selectSql = "SELECT info_source_code from info_source where site_uri like '%mp.weixin.qq.com%'"
+    cursor.execute(selectSql)
+    results = cursor.fetchall()
+    result_list = [item[0] for item in results]
+    # Put them into redis
+    for item in result_list:
+        r.rpush('WeiXinGZH:infoSourceCode', item)
+
+# Scheduled task for WeChat official accounts
+def weixin_task():
+    # Instantiate a scheduler
+    scheduler = BlockingScheduler()
+    # Run once a day
+    scheduler.add_job(WeiXingetFromSql, 'cron', hour=12, minute=0)
+    try:
+        # redisPushData  # run once before the schedule starts
+        scheduler.start()
+    except Exception as e:
+        print('定时采集异常', e)
+        pass
+
+## Forbes ===== read companies from the database and put them into redis
+def FBS():
+    # todo: switch this to the Forbes database
+    gw_query = "select SocialCode from EnterpriseInfo where Place = '2'"
+    cursor.execute(gw_query)
+    gw_result = cursor.fetchall()
+    # # Domestic companies
+    gn_query = "select SocialCode from EnterpriseInfo where Place = '1'"
+    cursor.execute(gn_query)
+    gn_result = cursor.fetchall()
+    gn_social_list = [item[0] for item in gn_result]
+    gw_social_list = [item[0] for item in gw_result]
+    for item in gw_social_list:
+        r.rpush('NewsEnterprise:gwqy_socialCode', item)
+    for item in gn_social_list:
+        r.rpush('NewsEnterprise:gnqy_socialCode', item)

 if __name__ == "__main__":
     start = time.time()
     # NewsEnterprise_task()
+    # NewsEnterprise()
+    FBS()
     # NoticeEnterprise_task()
-    AnnualEnterprise_task()
+    # AnnualEnterprise_task()
     log.info(f'====={basecore.getNowTime(1)}=====添加数据成功======耗时:{basecore.getTimeCost(start,time.time())}===')
     # cnx.close()
     # cursor.close()
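The queue handshake these seed functions rely on can be reduced to the sketch below. The rpush side mirrors the diff; the lpop-based consumer is an assumption about how the workers' redicPullData behaves, not code from this commit, and the connection parameters are placeholders.

# Sketch of the Redis list used as a work queue (placeholder connection).
import redis

r = redis.Redis(host='127.0.0.1', port=6379)

def seed(codes):
    # Producer side: push each social credit code onto the list.
    for code in codes:
        r.rpush('NewsEnterprise:gwqy_socialCode', code)

def pull_one():
    # Consumer side: pop one code; returns None when the queue is drained,
    # which is why the workers sleep and retry when nothing comes back.
    raw = r.lpop('NewsEnterprise:gwqy_socialCode')
    return raw.decode() if raw else None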
comData/tcyQydt/getTycId.py

@@ -7,7 +7,7 @@ import pymysql
 import requests
 from base.BaseCore import BaseCore
+requests.adapters.DEFAULT_RETRIES = 5
 baseCore = BaseCore()
 log = baseCore.getLogger()
 headers = {
@@ -34,13 +34,17 @@ def getTycIdByXYDM(xydm):
         if matchType == '信用代码匹配':
             retData['state'] = True
             retData['tycData'] = retJsonData['data'][0]
+            response.close()
             return retData
         else:
             log.error(f"{xydm}------{retJsonData}")
+            response.close()
             return retData
     except Exception as e:
         log.error(f"{xydm}---exception---{e}")
         return retData

 # Update Tianyancha basic company information
 def updateTycInfo(id, retData):
     state = retData['state']
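Why the two response.close() calls matter: a short sketch, assuming keep-alive connections as requests uses by default. An unclosed response pins a pooled connection, and a long-running crawler can exhaust the pool; a try/finally (or context manager) releases it even when JSON parsing fails.

# Sketch: always release the pooled connection (illustrative function name).
import requests

def lookup(url, headers):
    resp = requests.get(url, headers=headers, timeout=10)
    try:
        return resp.json()
    finally:
        resp.close()   # release the connection even on parse errors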
comData/tcyQydt/tyc_qydt_add.py

-"""
-Incremental collection:
-Pick companies whose state is 3 and whose update_state is empty, i.e. companies
-collected successfully last time. A new update_state column marks progress: it is
-set to 2 while a company is being collected and to 1 once collection finishes.
-Ordering by date_time guarantees the same rows are never picked up twice.
-Three new columns up_okCount / up_errorCount / up_repectCount mirror
-okCount / errorCount / repectCount; the update counts are added to the originals
-to track how many news items each company has accumulated.
-Reworked on Aug 8: company news is now pushed to Kafka as well.
-"""
 import json
 import requests, time, pymysql
 import jieba
 import sys
 from kafka import KafkaProducer
+from getTycId import getTycIdByXYDM
 from base.BaseCore import BaseCore
 from base.smart import smart_extractor
 # sys.path.append('D:\\KK\\zzsn_spider\\base')
@@ -49,12 +34,13 @@ headers = {
     'Referer': 'https://www.tianyancha.com/',
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.51'
 }
+cnx_ = baseCore.cnx
+cursor_ = baseCore.cursor
 taskType = '企业动态/天眼查'

-def beinWork(tyc_code, social_code):
-    start_time = time.time()
+def beinWork(tyc_code, social_code, start_time):
     time.sleep(3)
     # retData={'up_state':False,'total':0,'up_okCount':0,'up_errorCount':0,'up_repetCount':0}
     retData = {'total': 0, 'up_okCount': 0, 'up_errorCount': 0, 'up_repetCount': 0}
@@ -230,12 +216,13 @@ def beinWork(tyc_code, social_code):
                     'sid': '1684032033495392257',
                     'sourceAddress': link,  # link to the original article
                     'summary': info_page['abstracts'],
-                    'title': contentText,
+                    'title': title,
                     'type': 2,
                     'socialCreditCode': social_code,
                     'year': time_format[:4]
                 }
             except Exception as e:
                 log.info(f'传输失败:{social_code}----{link}')
                 e = '数据库传输失败'
                 state = 0
@@ -263,6 +250,7 @@ def beinWork(tyc_code, social_code):
                 baseCore.recordLog(social_code, taskType, state, takeTime, link, '')
                 # return True
             except Exception as e:
                 dic_result = {
                     'success': 'false',
                     'message': '操作失败',
@@ -276,8 +264,6 @@ def beinWork(tyc_code, social_code):
                 baseCore.recordLog(social_code, taskType, state, takeTime, link, e)
         log.info(f"获取分页数据--{tyc_code}----分页{num},耗时{baseCore.getTimeCost(start_page, time.time())}")
     retData['up_okCount'] = up_okCount
     retData['up_errorCount'] = up_errorCount
     retData['up_repetCount'] = up_repetCount
@@ -295,10 +281,26 @@ def doJob():
         if social_code == 'None':
             time.sleep(20)
             continue
+        start = time.time()
+        try:
             data = baseCore.getInfomation(social_code)
             id = data[0]
             xydm = data[2]
             tycid = data[11]
+            if tycid == None:
+                try:
+                    retData = getTycIdByXYDM(xydm)
+                    tycid = retData['tycData']['id']
+                    # todo: write it back to the database
+                    updateSql = f"update Enterprise set TYCID = '{tycid}' where SocialCode = '{xydm}'"
+                    cursor_.execute(updateSql)
+                    cnx_.commit()
+                except:
+                    state = 0
+                    takeTime = baseCore.getTimeCost(start, time.time())
+                    baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
+                    baseCore.rePutIntoR(social_code)
+                    continue
             count = data[17]
             log.info(f"{id}---{xydm}----{tycid}----开始处理")
             start_time = time.time()
@@ -308,7 +310,7 @@ def doJob():
             # cnx.commit()
             # Start collecting the company's news
-            retData = beinWork(tycid, xydm)
+            retData = beinWork(tycid, xydm, start_time)
             # Update the collection count once the company is done
             runType = 'NewsRunCount'
             count += 1
@@ -319,6 +321,12 @@ def doJob():
             up_repetCount = retData['up_repetCount']
             log.info(
                 f"{id}---{xydm}----{tycid}----结束处理,耗时{baseCore.getTimeCost(start_time, time.time())}---总数:{total}---成功数:{up_okCount}----失败数:{up_errorCount}--重复数:{up_repetCount}")
+        except:
+            log.info(f'==={social_code}=====获取企业信息失败====')
+            state = 0
+            takeTime = baseCore.getTimeCost(start, time.time())
+            baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取企业信息失败')
     cursor.close()
     cnx.close()
@@ -328,4 +336,6 @@ def doJob():
 # Press the green button in the gutter to run the script.
 if __name__ == '__main__':
     doJob()
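The new tycid handling in doJob() boils down to a backfill-and-requeue step. Below is a self-contained sketch of that pattern; the callable parameters stand in for getTycIdByXYDM, the update/commit pair, and baseCore.rePutIntoR from the diff, so none of the names here are claimed to exist in the repo.

# Sketch: return a record's TYC id, backfilling it on first sight.
def ensure_tycid(record, fetch_id, persist, requeue):
    tycid = record.get('tycid')
    if tycid is None:
        try:
            tycid = fetch_id(record['xydm'])['tycData']['id']
            persist(record['xydm'], tycid)       # write back so the next run skips the lookup
        except Exception:
            requeue(record['social_code'])       # do not lose the company on failure
            return None
    return tycid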
comData/weixin_solo/oneWeixin.py

+# -*- coding: utf-8 -*-
 '''
-Track how many official accounts can be collected in a day
+Track how many official accounts can be collected in a day, and keep a database table that records each account's status
 '''
 import requests, time, random, json, pymysql, redis
@@ -17,13 +18,17 @@ from base.BaseCore import BaseCore
 import os

 baseCore = BaseCore()
 log = baseCore.getLogger()
+cnx_ = baseCore.cnx
+cursor_ = baseCore.cursor
+cnx = pymysql.connect(host="114.116.44.11", user="root", password="f7s0&7qqtK", db="clb_project", charset="utf8mb4")
+cursor = cnx.cursor()
+r = baseCore.r
 urllib3.disable_warnings()

 def check_url(sid, article_url):
     r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn')
-    res = r.sismember(f'wx_url_{sid}', article_url)
+    res = r.sismember(f'wx_url_{sid}', article_url)  # note: stored as a set
     if res == 1:
         return True
     else:
@@ -63,7 +68,7 @@ def get_proxy():
     return proxy_list

-def get_info(json_search):
+def get_info(sid, json_search):
     num_caiji = 0
     kaishi_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
     obsClient = ObsClient(
@@ -81,9 +86,9 @@ def get_info(json_search):
         url_news = one_news['link']
-        # url_ft = check_url(sid, url_news)
-        # if url_ft:
-        #     return list_all_info,url_news,news_title
+        url_ft = check_url(sid, url_news)
+        if url_ft:
+            return list_all_info, num_caiji
         try:
             res_news = requests.get(url_news, timeout=20)
         except:
@@ -97,10 +102,24 @@ def get_info(json_search):
             del news_html['class']
         except:
             pass
-        news_content = news_html.text
+        try:
+            news_content = news_html.text
+        except:
+            log.info(f'--------内容为空--------{url_news}--------')
+            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+            false = [news_title, url_news, news_html, '文章内容为空', time_now]
+            insertSql = f"insert into WeixinGZH (site_name,site_url,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s)"
+            cursor_.execute(insertSql, tuple(false))
+            cnx_.commit()
+            continue
         list_img = news_html.find_all('img')
         for num_img in range(len(list_img)):
             img_one = list_img[num_img]
@@ -149,18 +168,19 @@ def get_info(json_search):
             'source': '11',
             'createDate': time_now
         }
-        # for nnn in range(0, 3):
-        #     try:
-        #         producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
-        #         kafka_result = producer.send("crawlerInfo", json.dumps(dic_info, ensure_ascii=False).encode('utf8'))
-        #         kafka_time_out = kafka_result.get(timeout=10)
-        #         # add_url(sid, url_news)
-        #         break
-        #     except:
-        #         time.sleep(5)
-        #         continue
+        for nnn in range(0, 3):
+            try:
+                producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
+                kafka_result = producer.send("crawlerInfo", json.dumps(dic_info, ensure_ascii=False).encode('utf8'))
+                kafka_time_out = kafka_result.get(timeout=10)
+                # add_url(sid, url_news)
+                break
+            except:
+                time.sleep(5)
+                continue
         num_caiji = num_caiji + 1
         list_all_info.append(dic_info)
         time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
         dic_info2 = {
             'infoSourceId': sid,
@@ -171,19 +191,48 @@ def get_info(json_search):
             'dispatcherStatus': '1',
             'source': '1',
         }
-        # for nnn2 in range(0, 3):
-        #     try:
-        #         producer2 = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
-        #         kafka_result2 = producer2.send("collectionAndDispatcherInfo",
-        #                                        json.dumps(dic_info2, ensure_ascii=False).encode('utf8'))
-        #         break
-        #     except:
-        #         time.sleep(5)
-        #         continue
-    return list_all_info, url_news, news_title
+        for nnn2 in range(0, 3):
+            try:
+                producer2 = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
+                kafka_result2 = producer2.send("collectionAndDispatcherInfo",
+                                               json.dumps(dic_info2, ensure_ascii=False).encode('utf8'))
+                break
+            except:
+                time.sleep(5)
+                continue
+    return list_all_info, num_caiji

+# Seed redis from the database (also used by the scheduled task)
+def getFromSql():
+    selectSql = "SELECT info_source_code from info_source where site_uri like '%mp.weixin.qq.com%'"
+    cursor.execute(selectSql)
+    results = cursor.fetchall()
+    result_list = [item[0] for item in results]
+    # put them into redis
+    for item in result_list:
+        r.rpush('WeiXinGZH:infoSourceCode', item)
+
+# Refresh the browser and pull a fresh token
+def flushAndGetToken(list_b):
+    browser_run = list_b[0]
+    log.info('======刷新浏览器=====')
+    browser_run.refresh()
+    cookie_list = browser_run.get_cookies()
+    cur_url = browser_run.current_url
+    token = cur_url.split('token=')[1]
+    log.info(f'===========当前token为:{token}============')
+    cookies = {}
+    for cookie in cookie_list:
+        cookies[cookie['name']] = cookie['value']
+    return token, cookies
+
+# Put official accounts that failed back into redis
+def rePutIntoR(item):
+    r.rpush('WeiXinGZH:infoSourceCode', item)

 if __name__ == "__main__":
+    requests.DEFAULT_RETRIES = 5
     time_start = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
     print(f'开始时间为:{time_start}')
@@ -195,288 +244,213 @@ if __name__=="__main__":
(the main loop was rewritten wholesale in this commit; the updated version reads as follows)
     opt = webdriver.ChromeOptions()
     opt.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36')
     # opt.add_argument(f"--proxy-server={ip}")
     opt.add_argument("--ignore-certificate-errors")
     opt.add_argument("--ignore-ssl-errors")
     opt.add_experimental_option("excludeSwitches", ["enable-automation"])
     opt.add_experimental_option('excludeSwitches', ['enable-logging'])
     opt.add_experimental_option('useAutomationExtension', False)
     # opt.binary_location = r'D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe'
     # chromedriver = r'C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe'
     chromedriver = r'D:/chrome/chromedriver.exe'
     browser1 = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
     list_b = [browser1]
     url = "https://mp.weixin.qq.com/"
     browser1.get(url)
     # adjustable
     time.sleep(30)
     num_b = 0
     # todo: read from the database into redis; a scheduled task seeds the data daily
     # getFromSql()
     s = requests.session()
     # count of official accounts processed this run
     count = 0
     while True:
         all = []
         # refresh the browser and grab the current token and cookies
         token, cookies = flushAndGetToken(list_b)
         s.cookies.update(cookies)
         list_all_info = []
         log.info('===========获取公众号============')
         start_ = time.time()
         # todo: pop one entry from redis
         infoSourceCode = baseCore.redicPullData('WeiXinGZH:infoSourceCode')
         if infoSourceCode == 'None':
             # once a full pass is done, reseed the queue and wait for the insert to finish
             getFromSql()
             time.sleep(20)
             log.info(f'========本次公众号已采集完毕,共采集{count}个公众号=========总耗时:{baseCore.getTimeCost(start_,time.time())}')
             continue
         sql = f"SELECT site_uri,id,site_name,info_source_code from info_source where info_source_code = '{infoSourceCode}' "
         # sql = f"SELECT site_uri,id,site_name,info_source_code from info_source where site_name = '一带一路百人论坛' "
         cursor.execute(sql)
         row = cursor.fetchone()
         dic_url = {
             'url_': row[0],
             'sid': row[1],
             'name': row[2],
             'info_source_code': row[3],
             'biz': ''
         }
         s.keep_alive = False
         url_ = dic_url['url_']
         origin = dic_url['name']
         info_source_code = dic_url['info_source_code']
         sid = dic_url['sid']
         try:
             biz = url_.split('__biz=')[1].split('==&')[0].split('=')[0]
             dic_url['biz'] = biz
         except Exception as e:
             log.info(f'---公众号--{origin}---biz错误')
             time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
             error = [origin, url_, info_source_code, e, 'biz错误', time_now]
             insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s,%s)"
             cursor_.execute(insertSql, tuple(error))
             cnx_.commit()
             continue
         fakeid = biz + '=='
         url_search = f'https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=5&count=5&fakeid={fakeid}&type=9&query=&token={token}&lang=zh_CN&f=json&ajax=1'
         try:
             ip = get_proxy()[random.randint(0, 3)]
             json_search = s.get(url_search, headers=headers, proxies=ip,
                                 verify=False).json()  # , proxies=ip, verify=False
             str_t = json.dumps(json_search)
             time.sleep(2)
         except Exception as e:
             log.error(f'===公众号{origin}请求失败!当前时间:{baseCore.getNowTime(1)}======={e}===')
             rePutIntoR(info_source_code)
             continue
         try:
             list_all = json_search['app_msg_list']
         except:
             # TODO: inspect the return value to tell a banned account from a bad biz
             # {'base_resp': {'err_msg': 'freq control', 'ret': 200013}}  ========= account throttled
             # {'base_resp': {'err_msg': 'invalid args', 'ret': 200002}}  bad biz / link
             # {"base_resp": {"ret": 200003, "err_msg": "invalid session"}}
             # 'base_resp': {'err_msg': 'ok', 'ret': 0}  normal
             ret = json_search['base_resp']['ret']
             if ret == 0:
                 count += 1
                 log.info(f'{fakeid}:biz错误!、公众号为{origin}=====当前时间:{baseCore.getNowTime(1)}')
                 continue
             elif ret == 200013:
                 # put it back into redis and back off
                 rePutIntoR(info_source_code)
                 # time.sleep(3600)
                 # refresh; a stopgap for now
                 log.info(f'======该账号被封=======')
                 log.info(f'=============刷新浏览器=============')
                 wait_time = time.sleep(600)
                 log.info(f'=======等待时间{wait_time}秒=====刷新浏览器=====')
                 browser_run = list_b[0]
                 browser_run.refresh()
                 continue
             elif ret == 200002:
                 # bad account link: save it to the database with the error type
                 time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                 error = [origin, url_, info_source_code, str_t, '无效biz参数', time_now]
                 insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s,%s)"
                 cursor_.execute(insertSql, tuple(error))
                 cnx_.commit()
                 log.info(f'公众号----{origin}----耗时{baseCore.getTimeCost(start_,time.time())}')
                 continue
             elif ret == 200003:
                 # invalid session
                 time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                 error = [origin, url_, info_source_code, str_t, '无效session', time_now]
                 insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s,%s)"
                 cursor_.execute(insertSql, tuple(error))
                 cnx_.commit()
                 log.info(f'公众号----{origin}----耗时{baseCore.getTimeCost(start_, time.time())}')
                 continue
             else:
                 log.info(f'----其他情况-----{json_search}---公众号{origin}------')
                 time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                 error = [origin, url_, info_source_code, str_t, '其他错误', time_now]
                 insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s,%s)"
                 cursor_.execute(insertSql, tuple(error))
                 cnx_.commit()
                 continue
         if list_all:
             try:
                 list_all_info, num_caiji = get_info(sid, json_search)
                 time.sleep(2)
                 count += 1
                 if len(list_all_info):
                     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                     success = [origin, url_, info_source_code, '采集成功', num_caiji, time_now]
                     # save the success record
                     insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,success_info,success_num,create_time) values (%s,%s,%s,%s,%s,%s)"
                     cursor_.execute(insertSql, tuple(success))
                     cnx_.commit()
                     log.info(f'{fakeid}、公众号{origin}:采集成功!、已采集{count}个公众号、耗时{baseCore.getTimeCost(start_,time.time())}')
                 else:
                     log.info(f'{fakeid}、公众号{origin}、网址已存在!耗时{baseCore.getTimeCost(start_,time.time())}')
             except Exception as e:
                 count += 1
                 time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                 false = [origin, url_, info_source_code, e, '采集失败', time_now]
                 insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s,%s)"
                 cursor_.execute(insertSql, tuple(false))
                 cnx_.commit()
                 log.info(f'{fakeid}、公众号:{origin}采集失败!!!!!!耗时{baseCore.getTimeCost(start_, time.time())}')
         else:
             count += 1
             time_end = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())
             log.info(f'{fakeid}、公众号{origin}:list_all为空!已采集{count}个公众号、时间为:{time_end}')
         time.sleep(2)
     # release resources
     cnx.close()
     cursor.close()
     baseCore.close()
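The base_resp return codes this loop now distinguishes can be summarised in one table. A compact sketch follows; the code-to-meaning mapping comes from the comments in the diff, while the function and dictionary names are illustrative only.

# Sketch: classify a WeChat appmsg API response by its base_resp.ret code.
RET_MEANINGS = {
    0:      "ok, but an empty app_msg_list is treated as a biz mismatch",
    200002: "invalid args: bad biz or link, log to WeixinGZH and move on",
    200003: "invalid session, log to WeixinGZH and move on",
    200013: "freq control: account throttled, requeue and back off",
}

def classify(json_search):
    ret = json_search.get('base_resp', {}).get('ret')
    return ret, RET_MEANINGS.get(ret, "other error, log the raw JSON")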
comData/weixin_solo/test.py

(the old Excel-append test is commented out in place; a Hong Kong ticker check replaces it as the active code)

import pandas as pd
# def writeaa():
#     detailList=[]
#     aa={
#         'id':3,
#         'name':'qqqwe'
#     }
#     detailList.append(aa)
#     writerToExcel(detailList)
# # Append the data to an Excel file
# def writerToExcel(detailList):
#     # filename='baidu搜索.xlsx'
#     # Read the existing xlsx file
#     existing_data = pd.read_excel(filename,engine='openpyxl')
#     # Build the new rows
#     new_data = pd.DataFrame(data=detailList)
#     # Append the new rows to the end of the existing data
#     combined_data = existing_data.append(new_data, ignore_index=True)
#     # Write the result back to the xlsx file
#     combined_data.to_excel(filename, index=False)
#
# from openpyxl import Workbook
#
# if __name__ == '__main__':
#     filename='test1.xlsx'
#     # Create a workbook
#     workbook = Workbook(filename)
#     workbook.save(filename)
#     writeaa()

gpdm = '01109.HK'
if 'HK' in str(gpdm):
    tmp_g = str(gpdm).split('.')[0]
    if len(tmp_g) == 5:
        gpdm = str(gpdm)[1:]
        print(gpdm)
    else:
        pass
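What this snippet checks, folded into a function for clarity: Hong Kong tickers padded to five digits drop their leading zero. The function name below is illustrative, not part of the repo.

# Sketch: normalise a five-digit HK stock code.
def normalize_hk(gpdm: str) -> str:
    if 'HK' in gpdm and len(gpdm.split('.')[0]) == 5:
        return gpdm[1:]
    return gpdm

assert normalize_hk('01109.HK') == '1109.HK'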
comData/yhcj/雅虎财经_企业动态.py

 # Collect company news from Yahoo Finance
@@ -5,14 +5,18 @@ import pymysql
 from kafka import KafkaProducer
 from selenium.webdriver.common.by import By
 import sys
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.wait import WebDriverWait
-from base import BaseCore
-from base.smart import smart_extractor
+sys.path.append('D:/zzsn_spider/base')
+import BaseCore
+from smart import smart_extractor
 import urllib3
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 baseCore = BaseCore.BaseCore()
 log = baseCore.getLogger()
+r = baseCore.r
 taskType = '企业动态/雅虎财经'
 smart = smart_extractor.SmartExtractor('cn')
@@ -45,6 +49,8 @@ def getZx(xydm, url, title, cnx, path):
     content = contentElement.replace("'", "''")
     driverContent.close()
+    # driverContent.quit()
     # news info list
     list_info = [
         xydm,
@@ -85,8 +91,8 @@ def getZx(xydm, url, title, cnx, path):
         'deleteFlag': '0',
         'id': '',
         'keyWords': '',
-        'lang': 'zh',
+        'lang': 'en',
-        'origin': '天眼查',
+        'origin': '雅虎财经',
         'publishDate': pub_time,
         'sid': '1684032033495392257',
         'sourceAddress': url,  # link to the original article
@@ -155,40 +161,44 @@ def getLastUrl():
 def scroll(xydm, name, gpdm):
     last_url_ = ''
+    try:
+        last_url = getLastUrl()
+    except:
+        log.error(f"{name}--{gpdm}--获取不到最后一条链接")
     while True:
         js = "var q=document.documentElement.scrollTop=100000"
         driver.execute_script(js)
         time.sleep(1)
         try:
-            last_url_ = getLastUrl()
+            last_url = getLastUrl()
         except Exception as e:
             log.error(f"{name}--{gpdm}--获取不到最后一条链接")
             break
-        try:
-            selects = selectUrl(last_url_, xydm)
-        except:
-            break
-        if selects:
-            break
+        # try:
+        #     selects = selectUrl(last_url_,xydm)
+        # except:
+        #     break
+        # if selects:
+        #     break
         if last_url_ == last_url:
             break
-        last_url = last_url_
+        last_url_ = last_url

+# Put companies that failed back into redis
+def rePutIntoR(item):
+    r.rpush('NewsEnterprise:gwqy_socialCode', item)

 if __name__ == "__main__":
-    path = r'D:\zzsn_spider\comData\cmd6\chromedriver.exe'
+    path = r'F:\spider\115\chromedriver.exe'
     driver = baseCore.buildDriver(path)
     cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
     cursor = cnx.cursor()
     while True:
         # Use the social credit code popped from Redis to look up the company's base info
         social_code = baseCore.redicPullData('NewsEnterprise:gwqy_socialCode')
         # If Redis has no more data, wait
+        if not social_code:
+            time.sleep(20)
+            continue
         if social_code == 'None':
             time.sleep(20)
             continue
@@ -214,10 +224,13 @@ if __name__ == "__main__":
             takeTime = baseCore.getTimeCost(start_time, time.time())
             baseCore.recordLog(xydm, taskType, state, takeTime, '', exception)
             continue
+        try:
             url = f"https://finance.yahoo.com/quote/{gpdm}/press-releases?p={gpdm}"
             driver.get(url)
             try:
+                WebDriverWait(driver, 15).until(EC.visibility_of_element_located((By.ID, 'summaryPressStream-0-Stream')))
                 news_div = driver.find_element(By.ID, 'summaryPressStream-0-Stream')
+                news_div.find_element(By.TAG_NAME, 'a')
             except Exception as e:
                 log.error(f"{name}--{gpdm}--没找到新闻元素")
                 exception = '没找到新闻元素'
@@ -232,16 +245,30 @@ if __name__ == "__main__":
                 log.error(f"{name}--{gpdm}--拖拽出现问题")
             news_lis = news_div.find_elements(By.XPATH, "./ul/li")
             log.info(f"{name}--{gpdm}--{len(news_lis)}条信息")
+            # flag marking whether the script lost its connection
+            flag = 0
             for i in range(0, len(news_lis)):
                 try:
-                    a_ele = news_lis[i].find_element(By.XPATH, "./div[1]/div[1]/div[2]/h3[1]/a")
+                    try:
+                        a_ele = news_lis[i].find_element(By.XPATH, "./div[1]/div[1]/div[2]/h3[1]/a")
+                    except:
+                        a_ele = news_lis[i].find_element(By.XPATH, "./div[1]/div[1]/div[1]/h3[1]/a")
                 except Exception as e:
+                    if news_lis[i].is_displayed():
                         log.error(f"{name}--{gpdm}--{i}----a标签没找到")
                         exception = 'a标签没找到'
                         state = 0
                         takeTime = baseCore.getTimeCost(start_time, time.time())
                         baseCore.recordLog(xydm, taskType, state, takeTime, url, exception)
                         continue
+                    else:
+                        log.error(f"{name}--{gpdm}--{i}----与网站断开连接")
+                        # todo: put it back into redis
+                        rePutIntoR(xydm)
+                        time.sleep(300)
+                        flag = 1
+                        break
                 news_url = a_ele.get_attribute("href").lstrip().strip().replace("'", "''")
                 if (news_url.startswith("https://finance.yahoo.com")):
                     pass
@@ -257,6 +284,9 @@ if __name__ == "__main__":
                     state = 0
                     takeTime = baseCore.getTimeCost(start_time, time.time())
                     baseCore.recordLog(xydm, taskType, state, takeTime, news_url, exception)
+                    # for incremental runs
+                    # break
+                    # for full runs
                     continue
                 title = a_ele.text.lstrip().strip().replace("'", "''")
                 exception = getZx(xydm, news_url, title, cnx, path)
@@ -268,12 +298,25 @@ if __name__ == "__main__":
                 baseCore.recordLog(xydm, taskType, state, takeTime, news_url, exception)
                 log.info(f"{name}--{gpdm}--{i}----{news_url}")
+            if flag == 1:
+                continue
             log.info(f"{name}--{gpdm}--企业整体,耗时{baseCore.getTimeCost(start_time, time.time())}")
             # Update the collection count once the company is done
             runType = 'NewsRunCount'
             count += 1
             baseCore.updateRun(social_code, runType, count)
+        except:
+            rePutIntoR(xydm)
+            state = 0
+            takeTime = baseCore.getTimeCost(start_time, time.time())
+            baseCore.recordLog(xydm, taskType, state, takeTime, '', '远程主机强迫关闭了一个现有的连接。')
+            log.info(f"-------{name}--{gpdm}---'远程主机强迫关闭了一个现有的连接。'--------")
+            log.info('===========连接已被关闭========等待重新连接===========')
+            driver.quit()
+            driver = baseCore.buildDriver(path)
+            time.sleep(5)
+            continue
     cursor.close()
     cnx.close()
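The explicit-wait pattern this file adopts, reduced to its core: block until the press-release stream is visible instead of sleeping a fixed interval. The locator and timeout mirror the diff; the driver setup and function name are assumptions.

# Sketch: wait for Yahoo Finance's press-release stream before scraping.
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

def wait_for_stream(driver, timeout=15):
    # Raises TimeoutException if the element never becomes visible,
    # which the caller can log as '没找到新闻元素' and skip the company.
    WebDriverWait(driver, timeout).until(
        EC.visibility_of_element_located((By.ID, 'summaryPressStream-0-Stream')))
    return driver.find_element(By.ID, 'summaryPressStream-0-Stream')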
test/stealth.min.js (new file, mode 100644)

This source diff is too large to display.
test/test.py

+import time
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
 from base.BaseCore import BaseCore
 baseCore = BaseCore()
@@ -6,13 +8,25 @@ log =baseCore.getLogger()
 if __name__ == '__main__':
     log.info("ok")
-    # Get a serial number
-    print(baseCore.getNextSeq())
-    print(baseCore.getNextSeq())
-    # Get a random user agent
-    print(baseCore.getRandomUserAgent())
-    # Get the proxy pool
-    print(baseCore.get_proxy())
-    # Release resources
-    baseCore.close()
+    path = r'F:\spider\115\chromedriver.exe'
+    driver = baseCore.buildDriver(path, headless=False)
+    # service = Service(r'F:\spider\115\chromedriver.exe')
+    # chrome_options = webdriver.ChromeOptions()
+    # # chrome_options.add_argument('--headless')
+    # # chrome_options.add_argument('--disable-gpu')
+    # chrome_options.add_experimental_option(
+    #     "excludeSwitches", ["enable-automation"])
+    # chrome_options.add_experimental_option('useAutomationExtension', False)
+    # chrome_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
+    # chrome_options.add_argument('user-agent='+baseCore.getRandomUserAgent())
+    #
+    # bro = webdriver.Chrome(chrome_options=chrome_options, service=service)
+    # with open('stealth.min.js') as f:
+    #     js = f.read()
+    #
+    # bro.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
+    #     "source": js
+    # })
+    gpdm = '9021.T'
+    url = f"https://finance.yahoo.com/quote/{gpdm}/press-releases?p={gpdm}"
+    driver.get(url)
\ No newline at end of file