Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
王景浩
zzsn_spider
Commits
07b5b32c
提交
07b5b32c
authored
8月 24, 2023
作者:
刘伟刚
浏览文件
操作
浏览文件
下载
差异文件
Merge remote-tracking branch 'origin/master'
上级
2509556c
eaa6815d
隐藏空白字符变更
内嵌
并排
正在显示
3 个修改的文件
包含
136 行增加
和
114 行删除
+136
-114
BaseCore.py
base/BaseCore.py
+2
-2
RedisPPData.py
base/RedisPPData.py
+39
-16
雅虎财经_企业基本信息_高管信息.py
comData/yhcj/雅虎财经_企业基本信息_高管信息.py
+95
-96
没有找到文件。
base/BaseCore.py
浏览文件 @
07b5b32c
...
...
@@ -228,10 +228,10 @@ class BaseCore:
__USER_PHONE_AGENT_LIST
=
[
'Mozilla/5.0 (Linux; Android 7.1.1; OPPO R9sk) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.111 Mobile Safari/537.36'
]
def
__init__
(
self
):
self
.
__cnx_proxy
=
pymysql
.
connect
(
host
=
'114.115.159.144'
,
user
=
'
root
'
,
password
=
'zzsn9988'
,
db
=
'clb_project'
,
self
.
__cnx_proxy
=
pymysql
.
connect
(
host
=
'114.115.159.144'
,
user
=
'
caiji
'
,
password
=
'zzsn9988'
,
db
=
'clb_project'
,
charset
=
'utf8mb4'
)
self
.
__cursor_proxy
=
self
.
__cnx_proxy
.
cursor
()
self
.
cnx
=
pymysql
.
connect
(
host
=
'114.115.159.144'
,
user
=
'
root
'
,
password
=
'zzsn9988'
,
db
=
'caiji'
,
self
.
cnx
=
pymysql
.
connect
(
host
=
'114.115.159.144'
,
user
=
'
caiji
'
,
password
=
'zzsn9988'
,
db
=
'caiji'
,
charset
=
'utf8mb4'
)
self
.
cursor
=
self
.
cnx
.
cursor
()
...
...
base/RedisPPData.py
浏览文件 @
07b5b32c
...
...
@@ -171,18 +171,40 @@ def BaseInfoEnterprise_task():
pass
#企业核心人员
def
CorPerson
():
cnx
,
cursor
=
connectSql
()
gn_query
=
"select SocialCode from EnterpriseInfo where Place = '1'"
cursor
.
execute
(
gn_query
)
gn_result
=
cursor
.
fetchall
()
gn_social_list
=
[
item
[
0
]
for
item
in
gn_result
]
print
(
'======='
)
for
item
in
gn_social_list
:
r
.
rpush
(
'CorPersonEnterprise:gnqy_socialCode'
,
item
)
closeSql
(
cnx
,
cursor
)
#企业核心人员定时任务:
def
CorPerson_task
():
# 实例化一个调度器
scheduler
=
BlockingScheduler
()
# 每个月执行一次
scheduler
.
add_job
(
CorPerson
,
'cron'
,
day
=
'1'
,
hour
=
0
,
minute
=
0
)
try
:
scheduler
.
start
()
except
Exception
as
e
:
print
(
'定时采集异常'
,
e
)
pass
#东方财富网财务数据
def
FinanceFromEast
():
cnx_
,
cursor_
=
cnn11
()
#从上市企业库中读取数据
sql_sel
=
'''select social_credit_code from sys_base_enterprise_ipo where category = '1'
limit 10
'''
sql_sel
=
'''select social_credit_code from sys_base_enterprise_ipo where category = '1' '''
cursor_
.
execute
(
sql_sel
)
finance
=
cursor_
.
fetchall
()
finance_list
=
[
item
[
0
]
for
item
in
finance
]
print
(
'======='
)
for
item
in
finance_list
:
r
.
rpush
(
'FinanceFromEast:finance_socialCode'
,
item
)
r
.
rpush
(
'FinanceFromEast:
east
finance_socialCode'
,
item
)
close11
(
cnx_
,
cursor_
)
#东方财富网财务数据定时任务
...
...
@@ -192,7 +214,6 @@ def FinanceFromEase_task():
# 每个季度执行一次
scheduler
.
add_job
(
FinanceFromEast
,
'cron'
,
month
=
'1-12/3'
,
day
=
'1'
,
hour
=
0
,
minute
=
0
)
try
:
# 定时开始前执行一次
scheduler
.
start
()
except
Exception
as
e
:
print
(
'定时采集异常'
,
e
)
...
...
@@ -250,11 +271,11 @@ def AnnualEnterpriseXueQ_task():
print
(
'定时采集异常'
,
e
)
pass
#国外企业基本信息
#国外企业基本信息
redis中放入id
def
BaseInfoEnterpriseAbroad
():
cnx
,
cursor
=
connectSql
()
# 获取国外企业
gn_query
=
"select
SocialCode
from EnterpriseInfo where Place = '2' "
gn_query
=
"select
id
from EnterpriseInfo where Place = '2' "
cursor
.
execute
(
gn_query
)
gn_result
=
cursor
.
fetchall
()
gn_social_list
=
[
item
[
0
]
for
item
in
gn_result
]
...
...
@@ -280,24 +301,26 @@ def BaseInfoAbroad_task():
def
FBS
():
cnx
,
cursor
=
connectSql
()
# todo:调整为获取福布斯的数据库
# gw_query = "select id from EnterpriseInfo where ext1='fbs2000' and ext2='1' and
Place=2"
#
cursor.execute(gw_query)
#
gw_result = cursor.fetchall()
gw_query
=
"select a.SocialCode from EnterpriseInfo a,EnterpriseType b where a.SocialCode=b.SocialCode and b.type=3 and a.
Place=2"
cursor
.
execute
(
gw_query
)
gw_result
=
cursor
.
fetchall
()
#获取国内企业
gn_query
=
"select
id from EnterpriseInfo where ext1='fbs2000' and ext2='1' and Place=1
"
gn_query
=
"select
a.SocialCode from EnterpriseInfo a,EnterpriseType b where a.SocialCode=b.SocialCode and b.type=3 and a.Place=1
"
cursor
.
execute
(
gn_query
)
gn_result
=
cursor
.
fetchall
()
gn_social_list
=
[
item
[
0
]
for
item
in
gn_result
]
# gw_social_list = [item[0] for item in gw_result]
# for item in gw_social_list:
# r.rpush('NewsEnterpriseFbs:gwqy_socialCode', item)
gw_social_list
=
[
item
[
0
]
for
item
in
gw_result
]
for
item
in
gw_social_list
:
r
.
rpush
(
'NewsEnterpriseFbs:gwqy_socialCode'
,
item
)
r
.
rpush
(
'BaseInfoEnterpriseFbs:gwqy_social_code'
,
item
)
for
item
in
gn_social_list
:
if
not
r
.
exists
(
item
):
r
.
rpush
(
'NewsEnterpriseFbs:gnqy_socialCode'
,
item
)
r
.
rpush
(
'NoticeEnterpriseFbs:gnqy_socialCode'
,
item
)
r
.
rpush
(
'BaseInfoEnterpriseFbs:gnqy_social_code'
,
item
)
closeSql
(
cnx
,
cursor
)
#将IPO的国外股票代码放到redis中
...
...
@@ -310,7 +333,7 @@ def yahooCodeFromSql():
gn_social_list
=
[
item
[
0
]
for
item
in
gn_result
]
print
(
'======='
)
for
item
in
gn_social_list
:
r
.
rpush
(
'
NoticeEnterprise:
securities_code'
,
item
)
r
.
rpush
(
'
FinanceFromEast:yahoo_
securities_code'
,
item
)
except
Exception
as
e
:
log
.
info
(
"数据查询异常"
)
finally
:
...
...
@@ -337,11 +360,11 @@ if __name__ == "__main__":
# NoticeEnterprise()
# AnnualEnterpriseIPO()
# AnnualEnterprise()
BaseInfoEnterpriseAbroad
()
#
BaseInfoEnterpriseAbroad()
# NewsEnterprise_task()
# NewsEnterprise()
# BaseInfoEnterprise()
#
FBS()
FBS
()
# NoticeEnterprise_task()
# AnnualEnterprise_task()
# NoticeEnterprise()
...
...
comData/yhcj/雅虎财经_企业基本信息_高管信息.py
浏览文件 @
07b5b32c
impor
t
json
impor
t
json
import
json
import
time
import
numpy
as
np
import
pandas
as
pd
import
pymysql
import
requests
import
sys
from
bs4
import
BeautifulSoup
from
kafka
import
KafkaProducer
from
NewsYahoo
import
news
from
base.BaseCore
import
BaseCore
sys
.
path
.
append
(
r'F:\zzsn\zzsn_spider\base'
)
import
BaseCore
import
urllib3
urllib3
.
disable_warnings
(
urllib3
.
exceptions
.
InsecureRequestWarning
)
taskType
=
'企业基本信息/雅虎财经'
baseCore
=
BaseCore
()
baseCore
=
BaseCore
.
BaseCore
()
r
=
baseCore
.
r
log
=
baseCore
.
getLogger
()
headers
=
{
...
...
@@ -38,7 +34,7 @@ headers = {
# 根据股票代码 获取企业基本信息 高管信息
def
getInfo
(
name
,
enname
,
gpdm
,
xydm
,
start
):
def
getInfo
(
enname
,
gpdm
,
xydm
,
start
):
if
'HK'
in
str
(
gpdm
):
tmp_g
=
str
(
gpdm
)
.
split
(
'.'
)[
0
]
if
len
(
tmp_g
)
==
5
:
...
...
@@ -49,17 +45,9 @@ def getInfo(name,enname,gpdm, xydm, start):
gpdm_
=
gpdm
retData
=
{}
retData
[
'base_info'
]
=
{
'公司名称'
:
name
,
'公司名称'
:
en
name
,
'英文名'
:
enname
,
'信用代码'
:
xydm
,
'股票代码'
:
gpdm
,
'地址'
:
''
,
'电话'
:
''
,
'公司网站'
:
''
,
'部门'
:
''
,
'行业'
:
''
,
'员工人数'
:
''
,
'公司简介'
:
''
}
retData
[
'people_info'
]
=
[]
# https://finance.yahoo.com/quote/VOW3.DE/profile?p=VOW3.DE
...
...
@@ -76,22 +64,36 @@ def getInfo(name,enname,gpdm, xydm, start):
log
.
error
(
f
"{gpdm}---第{i}次---获取基本信息接口返回失败:{response.status_code}"
)
except
:
continue
if
(
response
.
status_code
==
200
):
pass
else
:
try
:
if
'lookup'
in
response
.
url
:
log
.
error
(
f
"{gpdm}------股票代码错误:{response.status_code}"
)
exeception
=
'股票代码错误'
state
=
1
takeTime
=
baseCore
.
getTimeCost
(
start
,
time
.
time
())
baseCore
.
recordLog
(
xydm
,
taskType
,
0
,
takeTime
,
url
,
exeception
)
return
[
state
,
retData
]
elif
response
.
status_code
!=
200
:
log
.
error
(
f
"{gpdm}------获取基本信息接口重试后依然失败失败:{response.status_code}"
)
exeception
=
'获取基本信息接口返回失败'
state
=
0
takeTime
=
baseCore
.
getTimeCost
(
start
,
time
.
time
())
baseCore
.
recordLog
(
xydm
,
taskType
,
state
,
takeTime
,
url
,
exeception
)
baseCore
.
rePutIntoR
(
'BaseInfoEnterprise:gwqy_socialCode'
,
xydm
)
return
[
state
,
retData
]
except
:
log
.
error
(
f
"{gpdm}------获取基本信息接口重试后依然失败失败:{response.status_code}"
)
exeception
=
'获取基本信息接口返回失败'
state
=
0
takeTime
=
baseCore
.
getTimeCost
(
start
,
time
.
time
())
baseCore
.
recordLog
(
xydm
,
taskType
,
state
,
takeTime
,
url
,
exeception
)
rePutIntoR
(
''
)
return
[
state
,
retData
]
baseCore
.
rePutIntoR
(
'BaseInfoEnterprise:gwqy_socialCode'
,
xydm
)
return
[
state
,
retData
]
state
=
1
soup
=
BeautifulSoup
(
response
.
content
,
'html.parser'
)
page
=
soup
.
find
(
'div'
,
{
'id'
:
'Col1-0-Profile-Proxy'
})
name
=
page
.
find
(
'h3'
,{
'class'
:
'Fz(m) Mb(10px)'
})
.
text
name
=
page
.
find
(
'h3'
,
{
'class'
:
'Fz(m) Mb(10px)'
})
.
text
try
:
com_info
=
page
.
find
(
'div'
,
{
'class'
:
'Mb(25px)'
})
except
:
...
...
@@ -126,7 +128,7 @@ def getInfo(name,enname,gpdm, xydm, start):
com_jianjie
=
''
dic_com_info
=
{
'公司名称'
:
name
,
'英文名'
:
en
name
,
'英文名'
:
name
,
'信用代码'
:
xydm
,
'股票代码'
:
gpdm
,
'地址'
:
com_address
,
...
...
@@ -189,24 +191,31 @@ def getInfo(name,enname,gpdm, xydm, start):
retData
[
'people_info'
]
=
retPeople
log
.
info
(
f
"获取基本信息--{gpdm},耗时{baseCore.getTimeCost(start, time.time())}"
)
response
.
close
()
return
[
state
,
retData
]
return
[
state
,
retData
]
# 保存基本信息
def
saveBaseInfo
(
info
,
start
):
def
saveBaseInfo
(
info
,
start
):
# 基本信息发送到kafka
company_dict
=
{
'name'
:
info
[
'base_info'
][
'公司名称'
],
# 企业名称
'shortName'
:
''
,
# 企业简称
'socialCreditCode'
:
info
[
'base_info'
][
'信用代码'
],
# 统一社会信用代码
'officialPhone'
:
info
[
'base_info'
][
'电话'
],
# 电话
'officialUrl'
:
info
[
'base_info'
][
'公司网站'
],
# 官网
'briefInfo'
:
info
[
'base_info'
][
'公司简介'
],
# 简介
'industry'
:
info
[
'base_info'
][
'行业'
],
# 所属行业
'englishName'
:
info
[
'base_info'
][
'英文名'
],
# 英文名
'address'
:
info
[
'base_info'
][
'地址'
],
# 地址
'status'
:
0
,
# 状态
}
try
:
company_dict
=
{
'name'
:
info
[
'base_info'
][
'公司名称'
],
# 企业名称
'shortName'
:
''
,
# 企业简称
'socialCreditCode'
:
info
[
'base_info'
][
'信用代码'
],
# 统一社会信用代码
'officialPhone'
:
info
[
'base_info'
][
'电话'
],
# 电话
'officialUrl'
:
info
[
'base_info'
][
'公司网站'
],
# 官网
'briefInfo'
:
info
[
'base_info'
][
'公司简介'
],
# 简介
'industry'
:
info
[
'base_info'
][
'行业'
],
# 所属行业
'englishName'
:
info
[
'base_info'
][
'英文名'
],
# 英文名
'address'
:
info
[
'base_info'
][
'地址'
],
# 地址
'status'
:
0
,
# 状态
}
except
:
company_dict
=
{
'name'
:
info
[
'base_info'
][
'公司名称'
],
# 企业名称
'socialCreditCode'
:
info
[
'base_info'
][
'信用代码'
],
# 统一社会信用代码
'englishName'
:
info
[
'base_info'
][
'英文名'
],
# 英文名
}
# print(company_dict)
producer
=
KafkaProducer
(
bootstrap_servers
=
[
'114.115.159.144:9092'
],
api_version
=
(
2
,
0
,
2
))
kafka_result
=
producer
.
send
(
"regionInfo"
,
json
.
dumps
(
company_dict
,
ensure_ascii
=
False
)
.
encode
(
'utf8'
))
...
...
@@ -216,7 +225,7 @@ def saveBaseInfo(info,start):
# 保存高管信息
def
savePeopleInfo
(
info
,
start
):
def
savePeopleInfo
(
info
,
start
):
# 高管信息调用接口
list_people
=
info
[
'people_info'
]
list_one_info
=
[]
...
...
@@ -240,6 +249,7 @@ def savePeopleInfo(info,start):
json_updata
=
json
.
dumps
(
list_one_info
)
# print(json_updata)
if
json_updata
==
'[]'
:
log
.
info
(
"没有高管"
)
pass
else
:
for
i
in
range
(
0
,
3
):
...
...
@@ -274,18 +284,6 @@ def savePeopleInfo(info,start):
return
state
def
rePutIntoR
(
item
):
r
.
rpush
(
'BaseInfoEnterprise:gwqy_socialCode'
,
item
)
# def getInfomation(social_code):
# sql = f"SELECT * FROM EnterpriseInfo WHERE SocialCode = '{social_code}'"
# cursor.execute(sql)
# data = cursor.fetchone()
# return data
# 采集工作
def
beginWork
():
while
True
:
...
...
@@ -298,65 +296,66 @@ def beginWork():
continue
# 数据库中获取基本信息
data
=
baseCore
.
getInfomation
(
social_code
)
name
=
data
[
1
]
enname
=
data
[
5
]
gpdm
=
data
[
3
]
gpdm
=
'0123'
xydm
=
data
[
2
]
# 获取该企业对应项目的采集次数
count
=
data
[
13
]
start_time
=
time
.
time
()
# 股票代码为空跳过
if
gpdm
is
None
:
log
.
error
(
f
"{name}--股票代码为空 跳过"
)
exception
=
'股票代码为空'
state
=
0
takeTime
=
baseCore
.
getTimeCost
(
start_time
,
time
.
time
())
baseCore
.
recordLog
(
xydm
,
taskType
,
state
,
takeTime
,
''
,
exception
)
continue
try
:
retData
=
getInfo
(
name
,
enname
,
gpdm
,
xydm
,
start_time
)
# 基本信息采集成功 进行数据入库,否则不入库
if
retData
[
0
]
==
1
:
# 企业基本信息入库
try
:
saveBaseInfo
(
retData
[
1
],
start_time
)
except
:
log
.
error
(
f
'{name}....企业基本信息Kafka操作失败'
)
exception
=
'Kafka操作失败'
state
=
0
takeTime
=
baseCore
.
getTimeCost
(
start_time
,
time
.
time
())
baseCore
.
recordLog
(
xydm
,
taskType
,
state
,
takeTime
,
''
,
exception
)
# 企业高管信息入库
state
=
savePeopleInfo
(
retData
[
1
],
start_time
)
# 只有企业高管信息和企业基本信息都采集到,该企业才算采集成功
if
state
==
1
:
takeTime
=
baseCore
.
getTimeCost
(
start_time
,
time
.
time
())
baseCore
.
recordLog
(
xydm
,
taskType
,
state
,
takeTime
,
''
,
''
)
if
gpdm
==
''
:
info
=
{
"base_info"
:
{
'公司名称'
:
enname
,
'英文名'
:
enname
,
'信用代码'
:
xydm
,
}}
log
.
error
(
f
'{xydm}....股票代码为空'
)
try
:
saveBaseInfo
(
info
,
start_time
)
except
:
log
.
error
(
f
'{enname}....企业基本信息Kafka操作失败'
)
exception
=
'Kafka操作失败'
state
=
0
takeTime
=
baseCore
.
getTimeCost
(
start_time
,
time
.
time
())
baseCore
.
recordLog
(
xydm
,
taskType
,
state
,
takeTime
,
''
,
exception
)
else
:
try
:
retData
=
getInfo
(
enname
,
gpdm
,
xydm
,
start_time
)
# 基本信息采集成功 进行数据入库,否则不入库
if
retData
[
0
]
==
1
:
# 企业基本信息入库
try
:
saveBaseInfo
(
retData
[
1
],
start_time
)
except
:
log
.
error
(
f
'{enname}....企业基本信息Kafka操作失败'
)
exception
=
'Kafka操作失败'
state
=
0
takeTime
=
baseCore
.
getTimeCost
(
start_time
,
time
.
time
())
baseCore
.
recordLog
(
xydm
,
taskType
,
state
,
takeTime
,
''
,
exception
)
# 企业高管信息入库
state
=
savePeopleInfo
(
retData
[
1
],
start_time
)
# 只有企业高管信息和企业基本信息都采集到,该企业才算采集成功
if
state
==
1
:
takeTime
=
baseCore
.
getTimeCost
(
start_time
,
time
.
time
())
baseCore
.
recordLog
(
xydm
,
taskType
,
state
,
takeTime
,
''
,
''
)
else
:
pass
else
:
pass
else
:
pass
except
Exception
as
e
:
# 若出现尚未发现的错误,则保存错误信息以及出错位置
ee
=
e
.
__traceback__
.
tb_lineno
log
.
error
(
f
'{name}...{xydm}...{gpdm}.....数据采集失败,原因:{ee}行 {e}'
)
state
=
0
takeTime
=
baseCore
.
getTimeCost
(
start_time
,
time
.
time
())
baseCore
.
recordLog
(
xydm
,
taskType
,
state
,
takeTime
,
''
,
f
'数据采集失败,原因:{ee}行 {e}'
)
except
Exception
as
e
:
# 若出现尚未发现的错误,则保存错误信息以及出错位置
ee
=
e
.
__traceback__
.
tb_lineno
log
.
error
(
f
'{enname}...{xydm}...{gpdm}.....数据采集失败,原因:{ee}行 {e}'
)
state
=
0
takeTime
=
baseCore
.
getTimeCost
(
start_time
,
time
.
time
())
baseCore
.
recordLog
(
xydm
,
taskType
,
state
,
takeTime
,
''
,
f
'数据采集失败,原因:{ee}行 {e}'
)
# 企业数据采集完成,采集次数加一
count
+=
1
runType
=
'BaseInfoRunCount'
baseCore
.
updateRun
(
social_code
,
runType
,
count
)
baseCore
.
updateRun
(
social_code
,
runType
,
count
)
# 释放资源
baseCore
.
close
()
if
__name__
==
'__main__'
:
cnx
=
pymysql
.
connect
(
host
=
'114.115.159.144'
,
user
=
'root'
,
password
=
'zzsn9988'
,
db
=
'caiji'
,
charset
=
'utf8mb4'
)
cursor
=
cnx
.
cursor
()
beginWork
()
cursor
.
close
()
cnx
.
close
()
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论