王景浩 / zzsn_spider · Commits · 70f8ebff

Commit 70f8ebff — Merge remote-tracking branch 'origin/master'
Authored August 22, 2023 by 刘伟刚
Parents: f1e58921, 27673f2e

Showing 5 changed files with 806 additions and 147 deletions (whitespace changes hidden):

base/BaseCore.py                          +5    -4
base/RedisPPData.py                       +127  -41
comData/caiwushuju/东方财富网财务数据.py     +537  -0
comData/dfcfwGpdm/gpdm.py                 +25   -6
comData/weixin_solo/oneWeixin.py          +112  -96
base/BaseCore.py

@@ -17,14 +17,15 @@ import langid
 # Note: call BaseCore.close() before the program exits to release the related resources
 class BaseCore:
     # Sequence number
     __seq = 0
     # Proxy-pool database connection
     __cnx_proxy = None
     __cursor_proxy = None
     cnx = None
     cursor = None
     r = None
     # User-agent pool
     __USER_AGENT_LIST = [
         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
         ...

@@ -392,7 +393,7 @@ class BaseCore:
     # Fetch one element from a Redis list, removing it from the list
     def redicPullData(self, key):
-        item = self.r.rpop(key)
+        item = self.r.lpop(key)
         return item.decode() if item else None

     # Get the PID of the script's process
     ...
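The rpop-to-lpop switch changes the queue discipline: the push side throughout this repo uses rpush, so rpush with lpop drains codes first-in-first-out, whereas the old rpush with rpop pair behaved as a stack. A minimal sketch of the difference, assuming a reachable Redis instance; the key name is illustrative:

    import redis

    r = redis.Redis(host='localhost', port=6379, db=0)
    r.delete('demo:queue')
    r.rpush('demo:queue', 'a', 'b', 'c')  # producers append at the tail
    print(r.rpop('demo:queue'))           # b'c' -- old behaviour: last-in, first-out
    print(r.lpop('demo:queue'))           # b'a' -- new behaviour: first-in, first-out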
@@ -480,7 +481,7 @@ class BaseCore:
     def writerToExcel(self, detailList, filename):
         # filename='baidu搜索.xlsx'
         # Read the existing xlsx file
-        existing_data = pd.read_excel(filename, engine='openpyxl')
+        existing_data = pd.read_excel(filename, engine='openpyxl', dtype=str)
         # Build the new data
         new_data = pd.DataFrame(data=detailList)
         # Append the new data to the end of the existing data
         ...
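Adding dtype=str makes pandas keep every cell as text instead of inferring numeric types, which matters for identifiers such as stock codes whose leading zeros would otherwise be lost. A small sketch of the difference; the file and column names are illustrative:

    import pandas as pd

    raw = pd.read_excel('codes.xlsx', engine='openpyxl')
    text = pd.read_excel('codes.xlsx', engine='openpyxl', dtype=str)
    print(raw['code'].iloc[0])   # 1        -- '000001' parsed as a number, zeros lost
    print(text['code'].iloc[0])  # '000001' -- preserved verbatim as text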
base/RedisPPData.py

import time
import pymysql
from base import BaseCore
from apscheduler.schedulers.blocking import BlockingScheduler

basecore = BaseCore.BaseCore()
log = basecore.getLogger()
cnx = basecore.cnx
cursor = basecore.cursor
r = basecore.r

def cnn11():
    # Database on the 114.116.44.11 host
    cnx_ = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
    cursor_ = cnx_.cursor()
    return cnx_, cursor_

def close11(cnx_, cursor_):
    cnx_.close()
    cursor_.close()

# # Connect to Redis
# r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
#
...
@@ -30,11 +40,24 @@ r = basecore.r
 # return gn_social_list,gw_social_list

 # Company news
 # Create the database connection
 def connectSql():
     cnx = basecore.cnx
     cursor = basecore.cursor
     return cnx, cursor

 # Close the database connection
 def closeSql(cnx, cursor):
     cnx.close()
     cursor.close()

 def NewsEnterprise():
     cnx, cursor = connectSql()
-    # # Get domestic companies
-    # gn_query = "select SocialCode from EnterpriseInfo where Place = '1'"
-    # cursor.execute(gn_query)
-    # gn_result = cursor.fetchall()
+    gn_query = "select SocialCode from EnterpriseInfo where Place = '1'"
+    cursor.execute(gn_query)
+    gn_result = cursor.fetchall()
     # Get foreign companies
     gw_query = "select SocialCode from EnterpriseInfo where Place = '2'"
     cursor.execute(gw_query)
     ...
@@ -42,28 +65,26 @@ def NewsEnterprise():
     gw_social_list = [item[0] for item in gw_result]
     # todo: print the length
     print(len(gw_social_list))
     # gn_social_list = [item[0] for item in gn_result]
     # print(len(gw_social_list))
     gn_social_list = [item[0] for item in gn_result]
     print('=======')
     # Push the data into Redis
     # for item in gn_social_list:
     #     r.rpush('NewsEnterprise:gnqy_socialCode', item)
     count = 0
     for item in gn_social_list:
         r.rpush('NewsEnterprise:gnqy_socialCode', item)
     for item in gw_social_list:
         r.rpush('NewsEnterprise:gwqy_socialCode', item)
         count += 1
         print(item)
     print(count)
     closeSql(cnx, cursor)

 # Scheduled task for company news
 def NewsEnterprise_task():
     # Instantiate a scheduler
     scheduler = BlockingScheduler()
     # Run at 00:00 and 12:00 every day
     scheduler.add_job(NewsEnterprise, 'cron', hour=12, minute=0, max_instances=2)
     scheduler.add_job(NewsEnterprise, 'cron', hour=0, minute=0, max_instances=2)
     try:
         # redisPushData # run once before the schedule starts
         NewsEnterprise()
         scheduler.start()
     except Exception as e:
         print('定时采集异常', e)
 ...
@@ -71,14 +92,17 @@ def NewsEnterprise_task():
 # Company announcements
 def NoticeEnterprise():
     cnx, cursor = connectSql()
     # Get domestic companies
-    gn_query = "select SocialCode from EnterpriseInfo where Place = '1' and SecuritiesCode is not null limit 10 "
+    gn_query = "select SocialCode from EnterpriseInfo where Place = '1' and SecuritiesCode is not null "
     cursor.execute(gn_query)
     gn_result = cursor.fetchall()
     gn_social_list = [item[0] for item in gn_result]
     print('=======')
     for item in gn_social_list:
         r.rpush('NoticeEnterprise:gnqy_socialCode', item)
     closeSql(cnx, cursor)

 # Scheduled task for company announcements
 def NoticeEnterprise_task():
     # Instantiate a scheduler
 ...
@@ -94,14 +118,16 @@ def NoticeEnterprise_task():
 # Company annual reports
 def AnnualEnterprise():
     cnx, cursor = connectSql()
     # Get domestic companies
-    gn_query = "select SocialCode from EnterpriseInfo where Place = '1' and SecuritiesCode is not null limit 10 "
+    gn_query = "select SocialCode from EnterpriseInfo where Place = '1' and SecuritiesCode is not null"
     cursor.execute(gn_query)
     gn_result = cursor.fetchall()
     gn_social_list = [item[0] for item in gn_result]
     print('=======')
     for item in gn_social_list:
         r.rpush('AnnualEnterprise:gnqy_socialCode', item)
     closeSql(cnx, cursor)

 # Scheduled task for annual reports
 def AnnualEnterprise_task():
 ...
@@ -110,7 +136,8 @@ def AnnualEnterprise_task():
     # Run yearly
     scheduler.add_job(AnnualEnterprise, 'cron', second='*/10')
     try:
-        # redisPushData # run once before the schedule starts
+        # Run once before the schedule starts
         AnnualEnterprise()
         scheduler.start()
     except Exception as e:
         print('定时采集异常', e)
 ...
@@ -118,23 +145,51 @@ def AnnualEnterprise_task():
 # Company basic information
 def BaseInfoEnterprise():
     cnx, cursor = connectSql()
     # Get domestic companies
-    gn_query = "select SocialCode from EnterpriseInfo where Place = '1' limit 10 "
+    gn_query = "select SocialCode from EnterpriseInfo where Place = '1'"
     cursor.execute(gn_query)
     gn_result = cursor.fetchall()
     gn_social_list = [item[0] for item in gn_result]
     print('=======')
     for item in gn_social_list:
         r.rpush('BaseInfoEnterprise:gnqy_socialCode', item)
     closeSql(cnx, cursor)

 # Scheduled task for company basic information
 def BaseInfoEnterprise_task():
     # Instantiate a scheduler
     scheduler = BlockingScheduler()
     # Run yearly
     scheduler.add_job(BaseInfoEnterprise, 'cron', second='*/10')
     scheduler.add_job(BaseInfoEnterprise, 'cron', month=1, day=1, hour=0, minute=0)
     try:
-        # redisPushData # run once before the schedule starts
+        # Run once before the schedule starts
         BaseInfoEnterprise()
         scheduler.start()
     except Exception as e:
         print('定时采集异常', e)
     pass

 # Financial data from Eastmoney (东方财富网)
 def FinanceFromEast():
     cnx_, cursor_ = cnn11()
     # Read the data from the listed-company table
     sql_sel = '''select social_credit_code from sys_base_enterprise_ipo where category = '1' limit 10 '''
     cursor_.execute(sql_sel)
     finance = cursor_.fetchall()
     finance_list = [item[0] for item in finance]
     print('=======')
     for item in finance_list:
         r.rpush('FinanceFromEast:finance_socialCode', item)
     close11(cnx_, cursor_)

 def FinanceFromEase_task():
     # Instantiate a scheduler
     scheduler = BlockingScheduler()
     # Run quarterly
     scheduler.add_job(FinanceFromEast, 'cron', month='1-12/3', day='1', hour=0, minute=0)
     try:
         # Run once before the schedule starts
         scheduler.start()
     except Exception as e:
         print('定时采集异常', e)
 ...
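Every *_task helper in this file repeats the same pattern: build a BlockingScheduler, register one or more cron triggers, usually run the job once up front, then block on start(). A standalone sketch of that pattern; the job body and the quarterly schedule are illustrative:

    from apscheduler.schedulers.blocking import BlockingScheduler

    def push_codes():
        print('pushing social codes into Redis...')

    scheduler = BlockingScheduler()
    # Fire at 00:00 on day 1 of months 1, 4, 7 and 10, i.e. once a quarter.
    scheduler.add_job(push_codes, 'cron', month='1-12/3', day='1', hour=0, minute=0)
    try:
        push_codes()       # one immediate run before the schedule takes over
        scheduler.start()  # blocks the calling thread until shutdown
    except Exception as e:
        print('scheduler error', e)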
@@ -142,29 +197,33 @@ def BaseInfoEnterprise_task():
 # WeChat official accounts
 def WeiXingetFromSql():
     cnx_, cursor_ = cnn11()
     selectSql = "SELECT info_source_code from info_source where site_uri like '%mp.weixin.qq.com%'"
-    cursor.execute(selectSql)
-    results = cursor.fetchall()
+    cursor_.execute(selectSql)
+    results = cursor_.fetchall()
     result_list = [item[0] for item in results]
     # Push into Redis
     for item in result_list:
         r.rpush('WeiXinGZH:infoSourceCode', item)
     close11(cnx_, cursor_)

 # Scheduled task for WeChat official accounts
 def weixin_task():
     # Instantiate a scheduler
     scheduler = BlockingScheduler()
     # Run at 00:00 and 12:00 every day
     scheduler.add_job(WeiXingetFromSql, 'cron', hour=12, minute=0)
     scheduler.add_job(WeiXingetFromSql, 'cron', hour=0, minute=0)
     try:
-        # redisPushData # run once before the schedule starts
+        # Run once before the schedule starts
         scheduler.start()
     except Exception as e:
         print('定时采集异常', e)
     pass

 # Company annual reports -- Xueqiu (雪球网)
-def AnnualEnterpriseIPO():
+def AnnualEnterpriseXueQ():
     cnx, cursor = connectSql()
     # Get domestic listed companies
     gn_query = "select SocialCode from EnterpriseInfo where Place = '1' and SecuritiesCode is not null and isIPO = 1 limit 10"
     cursor.execute(gn_query)
 ...
@@ -173,8 +232,24 @@ def AnnualEnterpriseIPO():
     print('=======')
     for item in gn_social_list:
         r.rpush('AnnualEnterprise:gnshqy_socialCode', item)
     closeSql(cnx, cursor)

 # Scheduled task for Xueqiu annual reports
 def AnnualEnterpriseXueQ_task():
     # Instantiate a scheduler
     scheduler = BlockingScheduler()
     # Run yearly
     scheduler.add_job(AnnualEnterpriseXueQ, 'cron', month=1, day=1, hour=0, minute=0)
     try:
         # Run once before the schedule starts
         AnnualEnterpriseXueQ()
         scheduler.start()
     except Exception as e:
         print('定时采集异常', e)
     pass

 # Basic information of foreign companies
 def BaseInfoEnterpriseAbroad():
     cnx, cursor = connectSql()
     # Get foreign companies
     gn_query = "select id from EnterpriseInfo where Place = '2' limit 10 "
     cursor.execute(gn_query)
 ...
@@ -183,9 +258,24 @@ def BaseInfoEnterpriseAbroad():
     print('=======')
     for item in gn_social_list:
         r.rpush('BaseInfoEnterprise:gwqy_socialCode', item)
     closeSql(cnx, cursor)

 # Scheduled task for foreign basic information
 def BaseInfoAbroad_task():
     # Instantiate a scheduler
     scheduler = BlockingScheduler()
     # Run monthly
     scheduler.add_job(BaseInfoEnterpriseAbroad, 'cron', day=1, hour=0, minute=0)
     try:
         # redisPushData # run once before the schedule starts
         BaseInfoEnterpriseAbroad()
         scheduler.start()
     except Exception as e:
         print('定时采集异常', e)
     pass

-# # Forbes ===== read the info from the database and push it into Redis
+# Forbes ===== read the info from the database and push it into Redis
 def FBS():
     cnx, cursor = connectSql()
     # todo: switch to the Forbes database
     # gw_query = "select id from EnterpriseInfo where ext1='fbs2000' and ext2='1' and Place=2"
     # cursor.execute(gw_query)
 ...

@@ -198,17 +288,14 @@ def FBS():
     gn_social_list = [item[0] for item in gn_result]
     # gw_social_list = [item[0] for item in gw_result]
     #
     # for item in gw_social_list:
     #     r.rpush('NewsEnterpriseFbs:gwqy_socialCode', item)
     for item in gn_social_list:
         if not r.exists(item):
             r.rpush('NewsEnterpriseFbs:gnqy_socialCode', item)
     closeSql(cnx, cursor)

 if __name__ == "__main__":
     start = time.time()
 ...
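One detail worth noting in FBS: r.exists(item) tests whether a top-level Redis key named after the code exists, not whether the code is already in the NewsEnterpriseFbs list. If list-level dedup is the intent, a set is the usual idiom; a hedged sketch with illustrative key names and a hypothetical code:

    import redis

    r = redis.Redis(host='localhost', port=6379, db=0)
    code = '91110000XXXXXXXXXX'  # hypothetical social credit code
    # sadd returns 1 only when the member is new, so it doubles as the dedup check.
    if r.sadd('NewsEnterpriseFbs:seen', code):
        r.rpush('NewsEnterpriseFbs:gnqy_socialCode', code)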
@@ -216,14 +303,13 @@ if __name__ == "__main__":
     # AnnualEnterpriseIPO()
     # AnnualEnterprise()
     # BaseInfoEnterpriseAbroad()
-    # NewsEnterprise_task()
+    NewsEnterprise_task()
     # NewsEnterprise()
     # BaseInfoEnterprise()
     # FBS()
     # NoticeEnterprise_task()
     # AnnualEnterprise_task()
-    NoticeEnterprise()
+    NoticeEnterprise_task()
+    AnnualEnterprise_task()
     # NoticeEnterprise()
     # FinanceFromEast()
     log.info(f'====={basecore.getNowTime(1)}=====添加数据成功======耗时:{basecore.getTimeCost(start,time.time())}===')
     # cnx.close()
     # cursor.close()
     # basecore.close()
comData/caiwushuju/东方财富网财务数据.py  (new file, mode 100644)

"""
Eastmoney (东方财富网) financial data -- revised the way results are stored in Redis; the change works.
"""
import requests, json, time, re, random, pymysql, redis
from datetime import datetime, timedelta
import pandas as pd
from bs4 import BeautifulSoup
from base.BaseCore import BaseCore

baseCore = BaseCore()
cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
cursor = cnx.cursor()
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
log = baseCore.getLogger()

# Check whether the stock code has been collected before
def check_code(com_code):
    r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=3)
    res = r.exists('com_caiwushuju_code::' + com_code)
    # If the key exists, this is not the first collection of the company (res = 1)
    if res:
        return False  # not the first collection
    else:
        return True   # the first collection

def check_date(com_code, info_date):
    r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=3)
    res = r.sismember('com_caiwushuju_date::' + com_code, info_date)
    # Note: the dates are stored as a set
    if res:
        return True
    else:
        return False

# Save the reporting periods collected for a stock code into Redis
def add_date(com_code, date_list):
    r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=3)
    # Iterate over date_list and add each date to Redis
    for date in date_list:
        res = r.sadd('com_caiwushuju_code::' + com_code, date)
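check_date and add_date lean on Redis set semantics: sismember is an O(1) membership test and sadd is idempotent. Note, though, that check_date reads the 'com_caiwushuju_date::' prefix while add_date writes 'com_caiwushuju_code::'; if the two are meant to be the same dedup set, one prefix should be picked. A minimal sketch of the intended round trip, assuming a single shared prefix and an illustrative stock code:

    import redis

    r = redis.Redis(host='localhost', port=6379, db=3)
    key = 'com_caiwushuju_date::sz000001'   # one prefix for both the read and the write
    r.sadd(key, '2022-12-31')               # record a collected reporting period
    print(r.sismember(key, '2022-12-31'))   # True  -- already collected, skip it
    print(r.sismember(key, '2021-12-31'))   # False -- not collected yet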
# Collect the three statements by credit code, stock code and reporting date
def get_info(social_code, com_code, info_date, delist_all, info_date_list, taskType):
    dic_info = {}
    # Handle stock codes that are being collected for the first time
    for nnn in range(0, 3):
        try:
            ynFirst = check_code(com_code)
            break
        except:
            time.sleep(1)
    # Check whether this reporting period has already been collected
    for nnn in range(0, 3):
        try:
            panduan = check_date(com_code, info_date)
            if panduan:
                return dic_info
            else:
                pass
            break
        except:
            time.sleep(1)
    # Page url, used to collect the field names
    url_name = f'https://emweb.eastmoney.com/PC_HSF10/NewFinanceAnalysis/Index?type=web&code={com_code}'
    # print(f'url_name:{url_name}')
    # todo: record the cases that still cannot be collected after 20 loop iterations
    try:
        start_time = time.time()
        for i in range(1, 20):
            # Balance sheet: returns the balance-sheet json
            url_data_zcfzb1 = f'https://emweb.eastmoney.com/PC_HSF10/NewFinanceAnalysis/zcfzbAjaxNew?companyType={i}&reportDateType=0&reportType=1&dates={info_date}&code={com_code}'
            # Income statement: returns the income-statement json
            url_data_lrb1 = f'https://emweb.eastmoney.com/PC_HSF10/NewFinanceAnalysis/lrbAjaxNew?companyType={i}&reportDateType=0&reportType=1&dates={info_date}&code={com_code}'
            # Cash flow statement: returns the cash-flow json
            url_data_xjllb1 = f'https://emweb.eastmoney.com/PC_HSF10/NewFinanceAnalysis/xjllbAjaxNew?companyType={i}&reportDateType=0&reportType=1&dates={info_date}&code={com_code}'
            res_data_zcfzb = requests.get(url_data_zcfzb1)
            res_data_lrb = requests.get(url_data_lrb1)
            res_data_xjllb = requests.get(url_data_xjllb1)
            # If parsing fails, keep looping over companyType
            try:
                data_json_zcfzb = res_data_zcfzb.json()['data'][0]
                print(f'{info_date}第{i}次解析成功')
            except:
                continue
            # Once the first one parses, the other two parse as well
            data_json_lrb = res_data_lrb.json()['data'][0]
            data_json_xjllb = res_data_xjllb.json()['data'][0]
            res_name = requests.get(url_name)
            soup_name = BeautifulSoup(res_name.content, 'html.parser')
            # First table (balance sheet)
            try:
                script_zcfzb = soup_name.find('script', {'id': 'zcfzb_qy'})
                if script_zcfzb:
                    soup_zcfzb = BeautifulSoup(script_zcfzb.text.strip(), 'lxml')
                else:
                    script_zcfzb = soup_name.find('script', {'id': 'zcfzb_qs'})
                    if script_zcfzb:
                        soup_zcfzb = BeautifulSoup(script_zcfzb.text.strip(), 'lxml')
                    else:
                        script_zcfzb = soup_name.find('script', {'id': 'zcfzb_yh'})
                        if script_zcfzb:
                            soup_zcfzb = BeautifulSoup(script_zcfzb.text.strip(), 'lxml')
                        # bx
                        else:
                            script_zcfzb = soup_name.find('script', {'id': 'zcfzb_bx'})
                            soup_zcfzb = BeautifulSoup(script_zcfzb.text.strip(), 'lxml')
            except:
                log.info(f'---error: {social_code}, {com_code}---')
                state = 0
                takeTime = baseCore.getTimeCost(start_time, time.time())
                baseCore.recordLog(social_code, taskType, state, takeTime, url_name, f'{info_date}资产负债表失败')
            # Second table (income statement)
            try:
                script_lrb = soup_name.find('script', {'id': 'lrb_qy'})
                if script_lrb:
                    soup_lrb = BeautifulSoup(script_lrb.text.strip(), 'lxml')
                else:
                    script_lrb = soup_name.find('script', {'id': 'lrb_qs'})
                    if script_lrb:
                        soup_lrb = BeautifulSoup(script_lrb.text.strip(), 'lxml')
                    else:
                        # zcfzb_yh
                        script_lrb = soup_name.find('script', {'id': 'lrb_yh'})
                        if script_lrb:
                            soup_lrb = BeautifulSoup(script_lrb.text.strip(), 'lxml')
                        else:
                            script_lrb = soup_name.find('script', {'id': 'lrb_bx'})
                            soup_lrb = BeautifulSoup(script_lrb.text.strip(), 'lxml')
            except:
                state = 0
                takeTime = baseCore.getTimeCost(start_time, time.time())
                baseCore.recordLog(social_code, taskType, state, takeTime, url_name, f'{info_date}利润表失败')
            # Third table (cash flow statement)
            try:
                script_xjllb = soup_name.find('script', {'id': 'xjllb_qy'})
                if script_xjllb:
                    soup_xjllb = BeautifulSoup(script_xjllb.text.strip(), 'lxml')
                else:
                    script_xjllb = soup_name.find('script', {'id': 'xjllb_qs'})
                    if script_xjllb:
                        soup_xjllb = BeautifulSoup(script_xjllb.text.strip(), 'lxml')
                    else:
                        script_xjllb = soup_name.find('script', {'id': 'xjllb_yh'})
                        if script_xjllb:
                            soup_xjllb = BeautifulSoup(script_xjllb.text.strip(), 'lxml')
                        else:
                            script_xjllb = soup_name.find('script', {'id': 'xjllb_bx'})
                            soup_xjllb = BeautifulSoup(script_xjllb.text.strip(), 'lxml')
            except:
                log.info(f'---error: {social_code}, {com_code}---')
                state = 0
                takeTime = baseCore.getTimeCost(start_time, time.time())
                baseCore.recordLog(social_code, taskType, state, takeTime, url_name, f'{info_date}现金流量表失败')
            list_zcfzb = []
            for one_info in soup_zcfzb.find_all('tr')[2:]:
                if 'value.' not in one_info.text:
                    continue
                info_name = one_info.find('span').text
                if '审计意见' in info_name:
                    continue
                info_name_en = re.findall('value\.(.*?)\)}}', one_info.text)[0]
                if info_name_en:
                    try:
                        info_data = data_json_zcfzb[info_name_en]
                    except:
                        continue
                else:
                    continue
                if not info_data:
                    info_data = '--'
                dic_info_zcfzb = {"name": info_name, 'enName': info_name_en, "value": info_data}
                list_zcfzb.append(dic_info_zcfzb)
            log.info(f'----list_zcfzb:采集条数{len(list_zcfzb)}----')
            list_lrb = []
            for one_info in soup_lrb.find_all('tr')[2:]:
                if 'value.' not in one_info.text:
                    continue
                info_name = one_info.find('span').text
                if '审计意见' in info_name:
                    continue
                info_name_en = re.findall('value\.(.*?)\)}}', one_info.text)[0]
                if info_name_en:
                    try:
                        info_data = data_json_lrb[info_name_en]
                    except:
                        continue
                else:
                    continue
                if not info_data:
                    info_data = '--'
                dic_info_lrb = {"name": info_name, 'enName': info_name_en, "value": info_data}
                list_lrb.append(dic_info_lrb)
            list_xjllb = []
            for one_info in soup_xjllb.find_all('tr')[2:]:
                if '补充资料' in one_info.text:
                    break
                if 'value.' not in one_info.text:
                    continue
                info_name = one_info.find('span').text
                if '审计意见' in info_name:
                    continue
                info_name_en = re.findall('value\.(.*?)\)}}', one_info.text)[0]
                if info_name_en:
                    try:
                        info_data = data_json_xjllb[info_name_en]
                    except:
                        continue
                else:
                    continue
                if not info_data:
                    info_data = '--'
                dic_info_xjllb = {"name": info_name, 'enName': info_name_en, "value": info_data}
                list_xjllb.append(dic_info_xjllb)
            dic_info = {
                "socialCreditCode": social_code,
                "securitiesCode": com_code[2:],
                "date": info_date,
                "debt": list_zcfzb,
                "profit": list_lrb,
                "cash": list_xjllb,
                "ynFirst": ynFirst,
            }
            # Add the current reporting period to the list
            info_date_list.append(info_date)
            return dic_info
    except:
        start_time = time.time()
        try:
            for i in range(1, 20):
                # Balance sheet: returns the balance-sheet json
                url_data_zcfzb1 = f'https://emweb.eastmoney.com/PC_HSF10/NewFinanceAnalysis/zcfzbAjaxNew?companyType={i}&reportDateType=0&reportType=1&dates=2023-03-31%2C2022-12-31%2C2022-09-30%2C2022-06-30%2C2022-03-31&code={com_code}'
                # Income statement: returns the income-statement json
                url_data_lrb1 = f'https://emweb.eastmoney.com/PC_HSF10/NewFinanceAnalysis/lrbAjaxNew?companyType={i}&reportDateType=0&reportType=1&dates=2023-03-31%2C2022-12-31%2C2022-09-30%2C2022-06-30%2C2022-03-31&code={com_code}'
                # Cash flow statement: returns the cash-flow json
                url_data_xjllb1 = f'https://emweb.eastmoney.com/PC_HSF10/NewFinanceAnalysis/xjllbAjaxNew?companyType={i}&reportDateType=0&reportType=1&dates=2023-03-31%2C2022-12-31%2C2022-09-30%2C2022-06-30%2C2022-03-31&code={com_code}'
                res_data_zcfzb = requests.get(url_data_zcfzb1)
                res_data_lrb = requests.get(url_data_lrb1)
                res_data_xjllb = requests.get(url_data_xjllb1)
                # If parsing fails, keep looping over companyType
                try:
                    data_json_zcfzb = res_data_zcfzb.json()['data'][0]
                    log.info(f'----{com_code}---{info_date}--第{i}次解析成功-----')
                except:
                    continue
                # Once the first one parses, the other two parse as well
                data_json_lrb = res_data_lrb.json()['data'][0]
                data_json_xjllb = res_data_xjllb.json()['data'][0]
                res_name = requests.get(url_name)
                soup_name = BeautifulSoup(res_name.content, 'html.parser')
                # First table (balance sheet)
                try:
                    script_zcfzb = soup_name.find('script', {'id': 'zcfzb_qy'})
                    if script_zcfzb:
                        soup_zcfzb = BeautifulSoup(script_zcfzb.text.strip(), 'lxml')
                    else:
                        script_zcfzb = soup_name.find('script', {'id': 'zcfzb_qs'})
                        if script_zcfzb:
                            soup_zcfzb = BeautifulSoup(script_zcfzb.text.strip(), 'lxml')
                        else:
                            script_zcfzb = soup_name.find('script', {'id': 'zcfzb_yh'})
                            if script_zcfzb:
                                soup_zcfzb = BeautifulSoup(script_zcfzb.text.strip(), 'lxml')
                            # bx
                            else:
                                script_zcfzb = soup_name.find('script', {'id': 'zcfzb_bx'})
                                soup_zcfzb = BeautifulSoup(script_zcfzb.text.strip(), 'lxml')
                except:
                    log.info(f'---error: {social_code}, {com_code}---')
                    state = 0
                    takeTime = baseCore.getTimeCost(start_time, time.time())
                    baseCore.recordLog(social_code, taskType, state, takeTime, url_name, f'{info_date}资产负债表失败')
                # Second table (income statement)
                try:
                    script_lrb = soup_name.find('script', {'id': 'lrb_qy'})
                    if script_lrb:
                        soup_lrb = BeautifulSoup(script_lrb.text.strip(), 'lxml')
                    else:
                        script_lrb = soup_name.find('script', {'id': 'lrb_qs'})
                        if script_lrb:
                            soup_lrb = BeautifulSoup(script_lrb.text.strip(), 'lxml')
                        else:
                            # zcfzb_yh
                            script_lrb = soup_name.find('script', {'id': 'lrb_yh'})
                            if script_lrb:
                                soup_lrb = BeautifulSoup(script_lrb.text.strip(), 'lxml')
                            else:
                                script_lrb = soup_name.find('script', {'id': 'lrb_bx'})
                                soup_lrb = BeautifulSoup(script_lrb.text.strip(), 'lxml')
                except:
                    log.info(f'---error: {social_code}, {com_code}---')
                    state = 0
                    takeTime = baseCore.getTimeCost(start_time, time.time())
                    baseCore.recordLog(social_code, taskType, state, takeTime, url_name, f'{info_date}利润表失败')
                # Third table (cash flow statement)
                try:
                    script_xjllb = soup_name.find('script', {'id': 'xjllb_qy'})
                    if script_xjllb:
                        soup_xjllb = BeautifulSoup(script_xjllb.text.strip(), 'lxml')
                    else:
                        script_xjllb = soup_name.find('script', {'id': 'xjllb_qs'})
                        if script_xjllb:
                            soup_xjllb = BeautifulSoup(script_xjllb.text.strip(), 'lxml')
                        else:
                            script_xjllb = soup_name.find('script', {'id': 'xjllb_yh'})
                            if script_xjllb:
                                soup_xjllb = BeautifulSoup(script_xjllb.text.strip(), 'lxml')
                            else:
                                script_xjllb = soup_name.find('script', {'id': 'xjllb_bx'})
                                soup_xjllb = BeautifulSoup(script_xjllb.text.strip(), 'lxml')
                except:
                    log.info(f'---error: {social_code}, {com_code}---')
                    state = 0
                    takeTime = baseCore.getTimeCost(start_time, time.time())
                    baseCore.recordLog(social_code, taskType, state, takeTime, url_name, f'{info_date}现金流量表失败')
                list_zcfzb = []
                for one_info in soup_zcfzb.find_all('tr')[2:]:
                    if 'value.' not in one_info.text:
                        continue
                    info_name = one_info.find('span').text
                    if '审计意见' in info_name:
                        continue
                    info_name_en = re.findall('value\.(.*?)\)}}', one_info.text)[0]
                    if info_name_en:
                        try:
                            info_data = data_json_zcfzb[info_name_en]
                        except:
                            continue
                    else:
                        continue
                    if not info_data:
                        info_data = '--'
                    dic_info_zcfzb = {"name": info_name, 'enName': info_name_en, "value": info_data}
                    list_zcfzb.append(dic_info_zcfzb)
                log.info(f'----list_zcfzb:采集条数{len(list_zcfzb)}----')
                list_lrb = []
                for one_info in soup_lrb.find_all('tr')[2:]:
                    if 'value.' not in one_info.text:
                        continue
                    info_name = one_info.find('span').text
                    if '审计意见' in info_name:
                        continue
                    info_name_en = re.findall('value\.(.*?)\)}}', one_info.text)[0]
                    if info_name_en:
                        try:
                            info_data = data_json_lrb[info_name_en]
                        except:
                            continue
                    else:
                        continue
                    if not info_data:
                        info_data = '--'
                    dic_info_lrb = {"name": info_name, 'enName': info_name_en, "value": info_data}
                    list_lrb.append(dic_info_lrb)
                list_xjllb = []
                for one_info in soup_xjllb.find_all('tr')[2:]:
                    if '补充资料' in one_info.text:
                        break
                    if 'value.' not in one_info.text:
                        continue
                    info_name = one_info.find('span').text
                    if '审计意见' in info_name:
                        continue
                    info_name_en = re.findall('value\.(.*?)\)}}', one_info.text)[0]
                    if info_name_en:
                        try:
                            info_data = data_json_xjllb[info_name_en]
                        except:
                            continue
                    else:
                        continue
                    if not info_data:
                        info_data = '--'
                    dic_info_xjllb = {"name": info_name, 'enName': info_name_en, "value": info_data}
                    list_xjllb.append(dic_info_xjllb)
                dic_info = {
                    "socialCreditCode": social_code,
                    "securitiesCode": com_code[2:],
                    "date": info_date,
                    "debt": list_zcfzb,
                    "profit": list_lrb,
                    "cash": list_xjllb,
                    "ynFirst": ynFirst,
                }
                info_date_list.append(info_date)
                return dic_info
        except:
            # delist_json = {'info_date':info_date,'com_code': com_code, 'social_code': social_code}
            log.info(f'---{info_date}报告期无数据,股票代码:{com_code}----')
            state = 0
            takeTime = baseCore.getTimeCost(start_time, time.time())
            baseCore.recordLog(social_code, taskType, state, takeTime, url_name, f'{info_date}--报告期无数据--{com_code}')
            # If this period has no data, record the date
            delist_all.append(info_date)
def getReportTime():
    # timeNow = baseCore.getNowTime(1)[:10]
    list_date = []
    # 2023-04-01
    # Get the current date and time
    current_date = datetime.now()
    # Compute yesterday's date
    yesterday = current_date - timedelta(days=1)
    # Format yesterday's date
    report_date = yesterday.strftime('%Y-%m-%d')
    list_date.append(report_date)
    year = int(current_date.strftime('%Y'))
    # list_date = ['2023-03-31']
    list_month = ['-12-31', '-09-30', '-06-30', '-03-31']
    for year in range(2022, 2018, -1):
        for month in list_month:
            date = str(year) + month
            list_date.append(date)
    return list_date
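The list therefore starts with yesterday's calendar date (which is not a quarter end) followed by the four quarter ends of each year from 2022 down to 2019. For a run on, say, 2023-08-22 the function would return the values sketched below; the run date is illustrative:

    # getReportTime() on a hypothetical 2023-08-22 run:
    # ['2023-08-21',
    #  '2022-12-31', '2022-09-30', '2022-06-30', '2022-03-31',
    #  '2021-12-31', '2021-09-30', '2021-06-30', '2021-03-31',
    #  '2020-12-31', '2020-09-30', '2020-06-30', '2020-03-31',
    #  '2019-12-31', '2019-09-30', '2019-06-30', '2019-03-31']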
def job(taskType):
    # Put all A-share codes from the listed-company table into a list
    # Requires the stock code and the company credit code
    while True:
        # Pull a company credit code from Redis
        social_code = baseCore.redicPullData('FinanceFromEast:finance_socialCode')
        # If Redis has no more data, wait
        if social_code == None:
            time.sleep(20)
            continue
        sql_sel = f'''select securities_code,exchange from sys_base_enterprise_ipo where category = '1' and social_credit_code='{social_code}' '''
        cursor.execute(sql_sel)
        row = cursor.fetchone()
        securities_code = row[0]
        exchange = row[1]
        # for code in list_code:
        #     social_code = rows[0]
        #     exchange = rows[2]
        #     if code==rows[1]:
        #         securities_code = code
        #     else:
        #         continue
        if exchange == 1:
            com_code = 'bj' + securities_code
        if exchange == 2:
            com_code = 'sh' + securities_code
        if exchange == 3:
            com_code = 'sz' + securities_code
        # if com_code=='sz002163':
        list_date = getReportTime()
        delist = []     # all reporting periods with no data for this company
        date_list = []  # all reporting periods with data for this company
        start_time = time.time()
        # Collect each reporting period separately
        for info_date in list_date:
            delist_all = []
            info_date_list = []
            dic_info = get_info(social_code, com_code, info_date, delist_all, info_date_list, taskType)
            # print(dic_info)
            # Save the collected reporting period into Redis
            if len(dic_info) != 0:
                # Store the data through Kaige's interface
                data = json.dumps(dic_info)
                # print(data)
                url_baocun = 'http://114.115.236.206:8088/sync/finance/df'
                for nnn in range(0, 3):
                    try:
                        res_baocun = requests.post(url_baocun, data=data)
                        break
                    except:
                        time.sleep(1)
                print(res_baocun.text)
            if len(info_date_list) != 0:
                for date in info_date_list:
                    date_list.append(date)
        print(date_list)
        date_list = str(date_list)
        for nnn in range(0, 3):
            try:
                add_date(com_code, date_list)
                break
            except:
                time.sleep(1)
        end_time = time.time()
        log.info(f'===={com_code}====该企业耗时{end_time-start_time}===')
    cnx.close()
    cursor.close()
    baseCore.close()

if __name__ == '__main__':
    task_type = '财务数据/东方财富网'
    job(task_type)
comData/dfcfwGpdm/gpdm.py

@@ -20,7 +20,9 @@ headers={
 cnx = pymysql.connect(host='114.115.159.144', user='root', password='zzsn9988', db='caiji', charset='utf8mb4')
 cursor = cnx.cursor()

+taskType = '股票代码/东方财富网'
-def getTotal(pageSize):
+def getTotal(pageSize, start):
     total = 0
     for num in range(3):
         try:
 ...

@@ -34,10 +36,15 @@ def getTotal(pageSize):
             content = baseCore.getSubStr(content, '{', '}')
             retJson = json.loads(content)
             total = retJson['data']['total']
             response.close()
             break
         except Exception as e:
             log.info(f"------第{num}次出错---{e}")
             continue
+    exception = '链接失败'
+    state = 0
+    takeTime = baseCore.getTimeCost(start, time.time())
+    baseCore.recordLog('', taskType, state, takeTime, 'http://quote.eastmoney.com/center/gridlist.html?st=ChangePercent&sortType=C&sortRule=-1#hs_a_board', exception)
     return total
 ...

@@ -67,21 +74,30 @@ def getPageDta(pageIndex,pageSize,totalPage):
                     continue
                 else:
                     log.info(f"{gpdm}-------{name}---新增")
                 insertSql = f"insert into gpdm(gpdm,name,state,create_date) values ('{gpdm}','{name}',1,now())"
                 cursor.execute(insertSql)
                 cnx.commit()
             response.close()
             log.info(f"【{pageIndex}/{totalPage}】-----------end,耗时{baseCore.getTimeCost(start, time.time())}")
             break
         except Exception as e:
             log.info(f"------第{num}次出错---{e}")
             continue
+    exception = f'第{pageIndex}页链接失败'
+    state = 0
+    takeTime = baseCore.getTimeCost(start, time.time())
+    baseCore.recordLog('', taskType, state, takeTime, '', exception)

 def doJob():
     pageSize = 20
-    total = getTotal(pageSize)
+    start_time = time.time()
+    total = getTotal(pageSize, start_time)
+    if total == 0:
+        exception = '股票代码总数为零'
+        state = 0
+        takeTime = baseCore.getTimeCost(start_time, time.time())
+        baseCore.recordLog('', taskType, state, takeTime, 'http://quote.eastmoney.com/center/gridlist.html?st=ChangePercent&sortType=C&sortRule=-1#hs_a_board', exception)
+        log.info(f"股票代码总数-----------{total},请检查")
+        return
     log.info(f"股票代码总数-----------{total}")
 ...

@@ -91,6 +107,9 @@ def doJob():
     totalPage = total // pageSize + 1
     for pageIndex in range(1, totalPage + 1):
         getPageDta(pageIndex, pageSize, totalPage)
+    state = 1
+    takeTime = baseCore.getTimeCost(start_time, time.time())
+    baseCore.recordLog('', taskType, state, takeTime, '', '')
     # Release resources
     cursor.close()
     cnx.close()
 ...
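A small arithmetic note on the page count in doJob: total // pageSize + 1 requests one extra, empty page whenever total is an exact multiple of pageSize. A ceiling-division sketch for comparison:

    import math

    total, pageSize = 400, 20
    print(total // pageSize + 1)        # 21 -- one empty trailing page when it divides evenly
    print(math.ceil(total / pageSize))  # 20 -- ceiling division avoids the extra request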
comData/weixin_solo/oneWeixin.py

@@ -41,30 +41,6 @@ def add_url(sid, article_url):
     else:
         return False

-def get_proxy():
-    cnx = pymysql.connect(host="114.115.159.144", user="root", password="zzsn9988", db="clb_project", charset="utf8mb4")
-    with cnx.cursor() as cursor:
-        sql = "select proxy from clb_proxy"
-        cursor.execute(sql)
-        proxy_lists = cursor.fetchall()
-        ip_list = []
-        for proxy_ in proxy_lists:
-            ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
-        proxy_list = []
-        for str_ip in ip_list:
-            str_ip_list = str_ip.split('-')
-            proxyMeta = "http://%(host)s:%(port)s" % {
-                "host": str_ip_list[0],
-                "port": str_ip_list[1],
-            }
-            proxy = {
-                "HTTP": proxyMeta,
-                "HTTPS": proxyMeta
-            }
-            proxy_list.append(proxy)
-        return proxy_list

 # Scheduled
 def getFromSql():
     selectSql = "SELECT info_source_code from info_source where site_uri like '%mp.weixin.qq.com%'"
     cursor.execute(selectSql)
 ...
@@ -93,7 +69,7 @@ def flushAndGetToken(list_b):
 def rePutIntoR(item):
     r.rpush('WeiXinGZH:infoSourceCode', item)

-def get_info(sid, json_search, origin, info_source_code):
+def get_info(sid, json_search, origin, url_, info_source_code, page):
     list_all_info = []
     num_caiji = 0
     kaishi_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
 ...
@@ -103,7 +79,7 @@ def get_info(sid,json_search,origin,info_source_code):
         server='https://obs.cn-north-1.myhuaweicloud.com'  # the address of your bucket
     )
     list_all_news = json_search['app_msg_list']
     # Which article is being collected
     for one_news in list_all_news:
         news_title = one_news['title']
         timestamp = one_news['create_time']
 ...
@@ -114,10 +90,12 @@ def get_info(sid,json_search,origin,info_source_code):
         url_ft = check_url(sid, url_news)
         if url_ft:
-            log.info(f'已采过该篇文章----文章链接-----{url_news}')
+            log.info(f'-----{origin}--第{page}页--已采过该篇文章--文章链接--{url_news}-----')
             return list_all_info, num_caiji
         try:
-            res_news = requests.get(url_news, timeout=20)
+            ip = baseCore.get_proxy()
+            res_news = requests.get(url_news, timeout=20, proxies=ip)
+            time.sleep(2)
         except:
             continue
         soup_news = BeautifulSoup(res_news.content, 'html.parser')
 ...
@@ -132,16 +110,17 @@ def get_info(sid,json_search,origin,info_source_code):
         try:
             news_content = news_html.text
         except:
-            log.info(f'--------内容为空--------{url_news}--------')
+            log.info(f'----{origin}--第{page}页--该篇文章内容为空--{url_news}-----')
             time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-            false = [news_title, url_news, news_html, '文章内容为空', time_now]
-            insertSql = f"insert into WeixinGZH (site_name,site_url,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s)"
+            false = [news_title, url_, info_source_code, '文章内容为空', time_now, url_news]
+            insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,error_type,create_time,news_url) values (%s,%s,%s,%s,%s,%s)"
             cursor_.execute(insertSql, tuple(false))
             cnx_.commit()
             continue
 ...
@@ -182,6 +161,8 @@ def get_info(sid,json_search,origin,info_source_code):
             section.name = 'div'
         time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
         # Send the info to Kafka
         dic_info = {
             'sid': sid,
             'title': news_title,
 ...
@@ -196,8 +177,8 @@ def get_info(sid,json_search,origin,info_source_code):
             'createDate': time_now
         }
         for nnn in range(0, 3):
-            producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
             try:
+                producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
                 kafka_result = producer.send("crawlerInfo", json.dumps(dic_info, ensure_ascii=False).encode('utf8'))
                 kafka_time_out = kafka_result.get(timeout=10)
                 add_url(sid, url_news)
 ...

@@ -205,8 +186,11 @@ def get_info(sid,json_search,origin,info_source_code):
             except:
                 time.sleep(5)
                 continue
+            finally:
+                producer.close()
         num_caiji = num_caiji + 1
         list_all_info.append(dic_info)
         time.sleep(5)
     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
     dic_info2 = {
         'infoSourceId': sid,
 ...
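Moving the KafkaProducer construction inside the try and closing it in a finally is the safer lifecycle: the constructor itself can raise when the broker is unreachable, and without a close on every path each retry would leak a connection. A minimal sketch of the pattern, assuming the same broker address used in this commit; the helper name is ours:

    import json
    from kafka import KafkaProducer

    def send_with_retry(dic_info, attempts=3):
        for _ in range(attempts):
            producer = None
            try:
                producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
                future = producer.send('crawlerInfo', json.dumps(dic_info, ensure_ascii=False).encode('utf8'))
                future.get(timeout=10)       # block until the broker acknowledges
                return True
            except Exception:
                continue                     # retry on construction, send or timeout errors
            finally:
                if producer is not None:
                    producer.close()         # release the connection on every path
        return False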
@@ -235,18 +219,18 @@ def job(count,key):
     log.info('===========获取公众号============')
     start_ = time.time()
-    # # todo: pop one item from redis
+    # todo: pop one item from redis
     infoSourceCode = baseCore.redicPullData('WeiXinGZH:infoSourceCode')
-    if infoSourceCode == 'None':
+    if infoSourceCode == 'None' or infoSourceCode == None:
         # After one full pass, re-insert the data and wait for the insert to finish
         getFromSql()
         time.sleep(20)
         log.info(f'========本次公众号已采集完毕,共采集{count}个公众号=========总耗时:{baseCore.getTimeCost(start_,time.time())}')
-        return
+        return count

     sql = f"SELECT site_uri,id,site_name,info_source_code from info_source where info_source_code = '{infoSourceCode}' "
     # '一带一路百人论坛'
-    # sql = f"SELECT site_uri,id,site_name,info_source_code from info_source where site_name = '一带一路百人论坛' "
+    # sql = f"-- SELECT site_uri,id,site_name,info_source_code from info_source where info_source_code = 'IN-20220609-57436' "
     cursor.execute(sql)
     row = cursor.fetchone()
 ...
@@ -282,28 +266,28 @@ def job(count,key):
         insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s,%s)"
         cursor_.execute(insertSql, tuple(error))
         cnx_.commit()
-        return
+        return count
     fakeid = biz + '=='
     url_search = f'https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=0&count=5&fakeid={fakeid}&type=9&query=&token={token}&lang=zh_CN&f=json&ajax=1'
     # https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=0&count=5&fakeid=MzAwNDA5Njc1Mg==&type=9&query=&token=550883192&lang=zh_CN&f=json&ajax=1
     # Get the page count
     try:
-        ip = get_proxy()[random.randint(0, 3)]
-        json_search = s.get(url_search, headers=headers, proxies=ip, verify=False).json()  # , proxies=ip, verify=False
+        # ip = baseCore.get_proxy()
+        json_search = s.get(url_search, headers=headers, verify=False).json()  # , proxies=ip, verify=False
         str_t = json.dumps(json_search)
-        time.sleep(2)
+        time.sleep(1)
     except Exception as e:
         log.error(f'===公众号{origin}请求失败!当前时间:{baseCore.getNowTime(1)}======={e}===')
         rePutIntoR(info_source_code)
-        return
+        time.sleep(20)
+        return count
-    # {"base_resp": {"ret": 200003, "err_msg": "invalid session"}}
-    ret = json_search['base_resp']['ret']
+    # {"base_resp": {"ret": 200003, "err_msg": "invalid session"}}
+    # TODO: inspect the return value to distinguish a banned account from a wrong biz
+    # {'base_resp': {'err_msg': 'freq control', 'ret': 200013}} ========= account banned (rate-limited)
+    # {'base_resp': {'err_msg': 'invalid args', 'ret': 200002}} wrong official-account biz / link
+    # 'base_resp': {'err_msg': 'ok', 'ret': 0} normal
+    ret = json_search['base_resp']['ret']
     if ret == 0:
         pass
     elif ret == 200013:
 ...
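The base_resp.ret dispatch that follows encodes the states listed in the comments above. As a compact reference, a sketch of the same mapping (the action strings are ours; the codes and meanings come from this commit's comments):

    # base_resp.ret codes handled in job():
    RET_MEANINGS = {
        0:      'ok -- proceed with collection',
        200013: 'freq control -- account rate-limited / banned, back off',
        200002: 'invalid args -- wrong biz or account link, record the error and skip',
        200003: 'invalid session -- login token expired, record the error and skip',
    }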
@@ -318,10 +302,9 @@ def job(count,key):
         # log.info('=======等待时间600秒=====刷新浏览器=====')
         # browser_run = list_b[0]
         # browser_run.refresh()
         r.set(key, 50)
-        r.expire(key, 3600)
-        return
+        r.expire(key, 5400)
+        return count
     elif ret == 200002:
         # Wrong official-account link: save the error info and error type to the database
         time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
 ...
@@ -336,8 +319,8 @@ def job(count,key):
         insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s,%s)"
         cursor_.execute(insertSql, tuple(error))
         cnx_.commit()
-        log.info(f'公众号----{origin}----耗时{baseCore.getTimeCost(start_,time.time())}')
-        return
+        log.info(f'公众号----{origin}----耗时{baseCore.getTimeCost(start_, time.time())}')
+        return count
     elif ret == 200003:
         # Invalid session
         time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
 ...
@@ -353,7 +336,7 @@ def job(count,key):
         cursor_.execute(insertSql, tuple(error))
         cnx_.commit()
         log.info(f'公众号----{origin}----耗时{baseCore.getTimeCost(start_, time.time())}')
-        return
+        return count
     else:
         log.info(f'----其他情况-----{json_search}---公众号{origin}------')
         time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
 ...
@@ -368,55 +351,84 @@ def job(count,key):
         insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s,%s)"
         cursor_.execute(insertSql, tuple(error))
         cnx_.commit()
-        return
+        return count
-    list_all = json_search['app_msg_list']
-    try:
-        list_all_info, num_caiji = get_info(sid, json_search, origin, info_source_code)
-        print(f'----------{len(list_all_info)}------{num_caiji}-------')
-        time.sleep(2)
-        if len(list_all_info) != 0:
-            count += 1
-            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-            success = [origin, url_, info_source_code, '采集成功', num_caiji, time_now]
-            # Save the success info
-            insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,success_info,success_num,create_time) values (%s,%s,%s,%s,%s,%s)"
-            cursor_.execute(insertSql, tuple(success))
-            cnx_.commit()
-            # All articles of this official account are collected
-            log.info(f'{fakeid}、公众号{origin}:采集成功!、已采集{count}个公众号、耗时{baseCore.getTimeCost(start_,time.time())}')
-        else:
-            log.info(f'{fakeid}、公众号{origin}、网址已存在!耗时{baseCore.getTimeCost(start_,time.time())}')
-    except Exception as e:
-        # The account's json parsed but collecting its data failed
-        count += 1
-        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-        false = [origin, url_, info_source_code, e, '采集失败', time_now]
-        # Save the failure info
-        insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s,%s)"
-        cursor_.execute(insertSql, tuple(false))
-        cnx_.commit()
-        log.info(f'{fakeid}、公众号:{origin}采集失败!!!!!!耗时{baseCore.getTimeCost(start_, time.time())}')
-    return count
+    try:
+        Max_data = int(json_search['app_msg_cnt'])
+        Max_page = int(int(json_search['app_msg_cnt']) / 5)
+        if int(json_search['app_msg_cnt']) % 5 != 0:
+            Max_page = Max_page + 1
+        else:
+            Max_page = Max_page
+    except:
+        Max_page = 1
+        Max_data = 5
+    log.info(f'开始采集{origin}-----共{Max_page}页---{Max_data}条数据-----')
+    for i in range(0, Max_data, 5):
+        url_search = f'https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin={i}&count=5&fakeid={fakeid}&type=9&query=&token={token}&lang=zh_CN&f=json&ajax=1'
+        # url_search = f'https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=0&count=5&fakeid={fakeid}&type=9&query=&token={token}&lang=zh_CN&f=json&ajax=1'
+        # https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=0&count=5&fakeid=MzAwNDA5Njc1Mg==&type=9&query=&token=550883192&lang=zh_CN&f=json&ajax=1
+        try:
+            # ip = get_proxy()[random.randint(0, 3)]
+            json_search = s.get(url_search, headers=headers, verify=False).json()  # , proxies=ip, verify=False
+            str_t = json.dumps(json_search)
+            time.sleep(2)
+        except Exception as e:
+            log.error(f'===公众号{origin}请求失败!当前时间:{baseCore.getNowTime(1)}======={e}===')
+            rePutIntoR(info_source_code)
+            return count
+        list_all = json_search['app_msg_list']
+        try:
+            # Collect the article info of this page
+            page = int(i / 5 + 1)
+            log.info(f'---{origin}---------开始采集第{page}个分页-----------')
+            list_all_info, num_caiji = get_info(sid, json_search, origin, url_, info_source_code, page)
+            print(f'----第{page}页采集到文章个数-----{len(list_all_info)}------{num_caiji}-------')
+            time.sleep(2)
+            if len(list_all_info) != 0:
+                count += 1
+                time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+                success = [origin, url_, info_source_code, '采集成功', num_caiji, time_now]
+                # Save the success info
+                insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,success_info,success_num,create_time) values (%s,%s,%s,%s,%s,%s)"
+                cursor_.execute(insertSql, tuple(success))
+                cnx_.commit()
+                # All articles of this official account are collected
+                log.info(f'---第{page}页采集到文章个数---{len(list_all_info)}---{num_caiji}---耗时{baseCore.getTimeCost(start_,time.time())}')
+            else:
+                log.info(f'----第{page}页采集到文章个数{num_caiji}--网址已存在!-----耗时{baseCore.getTimeCost(start_,time.time())}')
+                return count
+        except Exception as e:
+            # The account's json parsed but collecting its data failed
+            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+            false = [origin, url_, info_source_code, e, '采集失败', time_now]
+            # Save the failure info
+            insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s,%s)"
+            cursor_.execute(insertSql, tuple(false))
+            cnx_.commit()
+            log.info(f'{fakeid}、公众号:{origin}采集失败!!!!!!耗时{baseCore.getTimeCost(start_, time.time())}')
+    count += 1
+    log.info(f'{fakeid}、公众号{origin}:采集成功!、已采集{count}个公众号、耗时{baseCore.getTimeCost(start_, time.time())}')
+    time.sleep(2)
+    return count

 if __name__ == "__main__":
     requests.DEFAULT_RETRIES = 5
     time_start = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
 ...
@@ -456,15 +468,19 @@ if __name__=="__main__":
     # Initialise the counter to 0
     key = baseCore.getNextSeq()
     r.set(key, 0)
-    # Set the key to expire after 10 seconds
-    r.expire(key, 3600)
+    # Set the key to expire after an hour and a half
+    r.expire(key, 5400)
     while True:
         time.sleep(2)
         new_value = baseCore.incrSet(key)
         baseCore.getttl(key)
         if new_value < 50:
-            aa = job(count, key)
-            count = aa
+            try:
+                aa = job(count, key)
+                count = aa
+                time.sleep(20)
+            except:
+                time.sleep(10)
         else:
             # Refresh the browser
             browser_run = list_b[0]
 ...
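The main loop uses a Redis key as a self-expiring request counter: it is set to 0, given a 5400-second TTL, incremented once per iteration, and once it reaches 50 the script pauses and refreshes the browser session. A standalone sketch of that throttling pattern; the key name and limits are illustrative:

    import time
    import redis

    r = redis.Redis(host='localhost', port=6379, db=0)
    key = 'weixin:req_counter'
    r.set(key, 0)
    r.expire(key, 5400)           # the whole window disappears after 90 minutes

    while True:
        time.sleep(2)
        n = r.incr(key)           # one unit of work per iteration
        if n < 50:
            pass                  # ... do one collection step here ...
        else:
            time.sleep(600)       # budget exhausted: back off, then start a new window
            r.set(key, 0)
            r.expire(key, 5400)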