Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
1
合并请求
1
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
丁双波
zzsn_spider
Commits
1de3a9f5
提交
1de3a9f5
authored
9月 13, 2023
作者:
薛凌堃
浏览文件
操作
浏览文件
下载
差异文件
Merge remote-tracking branch 'origin/master'
上级
d667977d
70938797
隐藏空白字符变更
内嵌
并排
正在显示
2 个修改的文件
包含
42 行增加
和
29 行删除
+42
-29
雅虎财经_企业动态.py
comData/yhcj/雅虎财经_企业动态.py
+13
-11
雅虎财经_企业基本信息_高管信息.py
comData/yhcj/雅虎财经_企业基本信息_高管信息.py
+29
-18
没有找到文件。
comData/yhcj/雅虎财经_企业动态.py
浏览文件 @
1de3a9f5
# -*-
coding: utf-8 -*-
# -*-
coding: utf-8 -*-
...
@@ -162,12 +162,12 @@ def scroll(xydm,name,gpdm):
...
@@ -162,12 +162,12 @@ def scroll(xydm,name,gpdm):
log
.
error
(
f
"{name}--{gpdm}--获取不到最后一条链接"
)
log
.
error
(
f
"{name}--{gpdm}--获取不到最后一条链接"
)
break
break
# todo:增量时 需打开注释
# todo:增量时 需打开注释
#
try:
try
:
#
selects = selectUrl(last_url_,xydm)
selects
=
selectUrl
(
last_url_
,
xydm
)
#
except:
except
:
#
break
break
#
if selects:
if
selects
:
#
break
break
if
last_url_
==
last_url
:
if
last_url_
==
last_url
:
break
break
last_url_
=
last_url
last_url_
=
last_url
...
@@ -178,7 +178,7 @@ def rePutIntoR(item):
...
@@ -178,7 +178,7 @@ def rePutIntoR(item):
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
path
=
r'
D:\chrome
\chromedriver.exe'
path
=
r'
F:\spider\1
\chromedriver.exe'
driver
=
baseCore
.
buildDriver
(
path
)
driver
=
baseCore
.
buildDriver
(
path
)
cnx
=
baseCore
.
cnx
cnx
=
baseCore
.
cnx
cursor
=
baseCore
.
cursor
cursor
=
baseCore
.
cursor
...
@@ -186,7 +186,7 @@ if __name__ == "__main__":
...
@@ -186,7 +186,7 @@ if __name__ == "__main__":
while
True
:
while
True
:
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
social_code
=
baseCore
.
redicPullData
(
'NewsEnterprise
Fbs
:gwqy_socialCode'
)
social_code
=
baseCore
.
redicPullData
(
'NewsEnterprise:gwqy_socialCode'
)
# social_code = 'ZZSN22080900000046'
# social_code = 'ZZSN22080900000046'
# 判断 如果Redis中已经没有数据,则等待
# 判断 如果Redis中已经没有数据,则等待
...
@@ -207,6 +207,8 @@ if __name__ == "__main__":
...
@@ -207,6 +207,8 @@ if __name__ == "__main__":
gpdm
=
str
(
gpdm
)[
1
:]
gpdm
=
str
(
gpdm
)[
1
:]
else
:
else
:
pass
pass
elif
str
(
gpdm
)[
-
2
:]
==
'.N'
or
str
(
gpdm
)[
-
2
:]
==
'.O'
:
gpdm
=
gpdm
[:
-
2
]
xydm
=
data
[
2
]
xydm
=
data
[
2
]
# 获取该企业对应项目的采集次数
# 获取该企业对应项目的采集次数
...
@@ -280,9 +282,9 @@ if __name__ == "__main__":
...
@@ -280,9 +282,9 @@ if __name__ == "__main__":
takeTime
=
baseCore
.
getTimeCost
(
start_time
,
time
.
time
())
takeTime
=
baseCore
.
getTimeCost
(
start_time
,
time
.
time
())
baseCore
.
recordLog
(
xydm
,
taskType
,
state
,
takeTime
,
news_url
,
exception
)
baseCore
.
recordLog
(
xydm
,
taskType
,
state
,
takeTime
,
news_url
,
exception
)
# 增量使用
# 增量使用
#
break
break
# 全量使用
# 全量使用
continue
#
continue
title
=
a_ele
.
text
.
lstrip
()
.
strip
()
.
replace
(
"'"
,
"''"
)
title
=
a_ele
.
text
.
lstrip
()
.
strip
()
.
replace
(
"'"
,
"''"
)
exception
=
getZx
(
xydm
,
news_url
,
title
,
cnx
,
path
)
exception
=
getZx
(
xydm
,
news_url
,
title
,
cnx
,
path
)
if
exception
==
''
:
if
exception
==
''
:
...
...
comData/yhcj/雅虎财经_企业基本信息_高管信息.py
浏览文件 @
1de3a9f5
impor
t
json
impor
t
json
...
@@ -4,8 +4,9 @@ import requests
...
@@ -4,8 +4,9 @@ import requests
import
sys
import
sys
from
bs4
import
BeautifulSoup
from
bs4
import
BeautifulSoup
from
kafka
import
KafkaProducer
from
kafka
import
KafkaProducer
sys
.
path
.
append
(
r'F:\zzsn\zzsn_spider\base'
)
# sys.path.append(r'F:\zzsn\zzsn_spider\base')
import
BaseCore
# import BaseCore
from
base
import
BaseCore
import
urllib3
import
urllib3
urllib3
.
disable_warnings
(
urllib3
.
exceptions
.
InsecureRequestWarning
)
urllib3
.
disable_warnings
(
urllib3
.
exceptions
.
InsecureRequestWarning
)
...
@@ -20,7 +21,6 @@ headers = {
...
@@ -20,7 +21,6 @@ headers = {
'accept-encoding'
:
'gzip, deflate, br'
,
'accept-encoding'
:
'gzip, deflate, br'
,
'accept-language'
:
'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7'
,
'accept-language'
:
'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7'
,
'cache-control'
:
'max-age=0'
,
'cache-control'
:
'max-age=0'
,
# 'cookie': 'maex=%7B%22v2%22%3A%7B%7D%7D; GUC=AQEBBwFjY49jkEIa8gQo&s=AQAAABw20C7P&g=Y2JIFQ; A1=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc; A3=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc; A1S=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc&j=WORLD; PRF=t%3D6954.T%252BTEL%252BSOLB.BR%252BSTM%252BEMR%252BGT%252BAMD%252BSYM.DE%252BPEMEX%252BSGO.PA%252BLRLCF%252BSYNH%252B001040.KS; cmp=t=1669714927&j=0&u=1---',
'sec-ch-ua'
:
'"Chromium";v="106", "Google Chrome";v="106", "Not;A=Brand";v="99"'
,
'sec-ch-ua'
:
'"Chromium";v="106", "Google Chrome";v="106", "Not;A=Brand";v="99"'
,
'sec-ch-ua-mobile'
:
'?0'
,
'sec-ch-ua-mobile'
:
'?0'
,
'sec-ch-ua-platform'
:
"Windows"
,
'sec-ch-ua-platform'
:
"Windows"
,
...
@@ -41,6 +41,8 @@ def getInfo(enname, gpdm, xydm, start):
...
@@ -41,6 +41,8 @@ def getInfo(enname, gpdm, xydm, start):
gpdm_
=
str
(
gpdm
)[
1
:]
gpdm_
=
str
(
gpdm
)[
1
:]
else
:
else
:
pass
pass
elif
str
(
gpdm
)[
-
2
:]
==
'.N'
or
str
(
gpdm
)[
-
2
:]
==
'.O'
:
gpdm_
=
gpdm
[:
-
2
]
else
:
else
:
gpdm_
=
gpdm
gpdm_
=
gpdm
retData
=
{}
retData
=
{}
...
@@ -50,7 +52,6 @@ def getInfo(enname, gpdm, xydm, start):
...
@@ -50,7 +52,6 @@ def getInfo(enname, gpdm, xydm, start):
'信用代码'
:
xydm
,
'信用代码'
:
xydm
,
}
}
retData
[
'people_info'
]
=
[]
retData
[
'people_info'
]
=
[]
# https://finance.yahoo.com/quote/VOW3.DE/profile?p=VOW3.DE
url
=
f
'https://finance.yahoo.com/quote/{gpdm_}/profile?p={gpdm_}'
url
=
f
'https://finance.yahoo.com/quote/{gpdm_}/profile?p={gpdm_}'
time
.
sleep
(
3
)
time
.
sleep
(
3
)
...
@@ -78,7 +79,7 @@ def getInfo(enname, gpdm, xydm, start):
...
@@ -78,7 +79,7 @@ def getInfo(enname, gpdm, xydm, start):
state
=
0
state
=
0
takeTime
=
baseCore
.
getTimeCost
(
start
,
time
.
time
())
takeTime
=
baseCore
.
getTimeCost
(
start
,
time
.
time
())
baseCore
.
recordLog
(
xydm
,
taskType
,
state
,
takeTime
,
url
,
exeception
)
baseCore
.
recordLog
(
xydm
,
taskType
,
state
,
takeTime
,
url
,
exeception
)
baseCore
.
rePutIntoR
(
'BaseInfoEnterprise
:gwqy_socialC
ode'
,
xydm
)
baseCore
.
rePutIntoR
(
'BaseInfoEnterprise
Fbs:gwqy_social_c
ode'
,
xydm
)
return
[
state
,
retData
]
return
[
state
,
retData
]
except
:
except
:
log
.
error
(
f
"{gpdm}------获取基本信息接口重试后依然失败失败:{response.status_code}"
)
log
.
error
(
f
"{gpdm}------获取基本信息接口重试后依然失败失败:{response.status_code}"
)
...
@@ -86,7 +87,7 @@ def getInfo(enname, gpdm, xydm, start):
...
@@ -86,7 +87,7 @@ def getInfo(enname, gpdm, xydm, start):
state
=
0
state
=
0
takeTime
=
baseCore
.
getTimeCost
(
start
,
time
.
time
())
takeTime
=
baseCore
.
getTimeCost
(
start
,
time
.
time
())
baseCore
.
recordLog
(
xydm
,
taskType
,
state
,
takeTime
,
url
,
exeception
)
baseCore
.
recordLog
(
xydm
,
taskType
,
state
,
takeTime
,
url
,
exeception
)
baseCore
.
rePutIntoR
(
'BaseInfoEnterprise
:gwqy_socialC
ode'
,
xydm
)
baseCore
.
rePutIntoR
(
'BaseInfoEnterprise
Fbs:gwqy_social_c
ode'
,
xydm
)
return
[
state
,
retData
]
return
[
state
,
retData
]
state
=
1
state
=
1
...
@@ -216,7 +217,6 @@ def saveBaseInfo(info, start):
...
@@ -216,7 +217,6 @@ def saveBaseInfo(info, start):
'socialCreditCode'
:
info
[
'base_info'
][
'信用代码'
],
# 统一社会信用代码
'socialCreditCode'
:
info
[
'base_info'
][
'信用代码'
],
# 统一社会信用代码
'englishName'
:
info
[
'base_info'
][
'英文名'
],
# 英文名
'englishName'
:
info
[
'base_info'
][
'英文名'
],
# 英文名
}
}
# print(company_dict)
producer
=
KafkaProducer
(
bootstrap_servers
=
[
'114.115.159.144:9092'
],
api_version
=
(
2
,
0
,
2
))
producer
=
KafkaProducer
(
bootstrap_servers
=
[
'114.115.159.144:9092'
],
api_version
=
(
2
,
0
,
2
))
kafka_result
=
producer
.
send
(
"regionInfo"
,
json
.
dumps
(
company_dict
,
ensure_ascii
=
False
)
.
encode
(
'utf8'
))
kafka_result
=
producer
.
send
(
"regionInfo"
,
json
.
dumps
(
company_dict
,
ensure_ascii
=
False
)
.
encode
(
'utf8'
))
kafka_result
.
get
(
timeout
=
10
)
kafka_result
.
get
(
timeout
=
10
)
...
@@ -247,7 +247,6 @@ def savePeopleInfo(info, start):
...
@@ -247,7 +247,6 @@ def savePeopleInfo(info, start):
}
}
list_one_info
.
append
(
dic_json
)
list_one_info
.
append
(
dic_json
)
json_updata
=
json
.
dumps
(
list_one_info
)
json_updata
=
json
.
dumps
(
list_one_info
)
# print(json_updata)
if
json_updata
==
'[]'
:
if
json_updata
==
'[]'
:
log
.
info
(
"没有高管"
)
log
.
info
(
"没有高管"
)
pass
pass
...
@@ -265,14 +264,14 @@ def savePeopleInfo(info, start):
...
@@ -265,14 +264,14 @@ def savePeopleInfo(info, start):
if
(
retJson
[
'success'
]
or
retJson
[
'success'
]
==
'true'
):
if
(
retJson
[
'success'
]
or
retJson
[
'success'
]
==
'true'
):
pass
pass
else
:
else
:
log
.
error
(
"保存高管接口失败---{retJson}"
)
log
.
error
(
f
"保存高管接口失败---{retJson}"
)
exception
=
'保存高管接口失败'
exception
=
'保存高管接口失败'
state
=
0
state
=
0
takeTime
=
baseCore
.
getTimeCost
(
start
,
time
.
time
())
takeTime
=
baseCore
.
getTimeCost
(
start
,
time
.
time
())
baseCore
.
recordLog
(
dic_json
[
'socialCreditCode'
],
taskType
,
state
,
takeTime
,
''
,
exception
)
baseCore
.
recordLog
(
dic_json
[
'socialCreditCode'
],
taskType
,
state
,
takeTime
,
''
,
exception
)
return
state
return
state
else
:
else
:
log
.
error
(
"保存高管接口失败---{response.status_code}"
)
log
.
error
(
f
"保存高管接口失败---{response.status_code}"
)
exception
=
'保存高管接口失败'
exception
=
'保存高管接口失败'
state
=
0
state
=
0
takeTime
=
baseCore
.
getTimeCost
(
start
,
time
.
time
())
takeTime
=
baseCore
.
getTimeCost
(
start
,
time
.
time
())
...
@@ -288,6 +287,7 @@ def savePeopleInfo(info, start):
...
@@ -288,6 +287,7 @@ def savePeopleInfo(info, start):
def
beginWork
():
def
beginWork
():
while
True
:
while
True
:
social_code
=
baseCore
.
redicPullData
(
'BaseInfoEnterprise:gwqy_socialCode'
)
social_code
=
baseCore
.
redicPullData
(
'BaseInfoEnterprise:gwqy_socialCode'
)
# social_code = 'ZZSN230824151229535'
if
not
social_code
:
if
not
social_code
:
time
.
sleep
(
20
)
time
.
sleep
(
20
)
continue
continue
...
@@ -297,7 +297,7 @@ def beginWork():
...
@@ -297,7 +297,7 @@ def beginWork():
# 数据库中获取基本信息
# 数据库中获取基本信息
data
=
baseCore
.
getInfomation
(
social_code
)
data
=
baseCore
.
getInfomation
(
social_code
)
enname
=
data
[
5
]
enname
=
data
[
5
]
gpdm
=
'0123'
gpdm
=
data
[
3
]
xydm
=
data
[
2
]
xydm
=
data
[
2
]
# 获取该企业对应项目的采集次数
# 获取该企业对应项目的采集次数
...
@@ -305,7 +305,7 @@ def beginWork():
...
@@ -305,7 +305,7 @@ def beginWork():
start_time
=
time
.
time
()
start_time
=
time
.
time
()
# 股票代码为空跳过
# 股票代码为空跳过
if
gpdm
==
''
:
if
gpdm
==
''
:
info
=
{
"base_info"
:
{
'公司名称'
:
enname
,
'英文名'
:
enname
,
'信用代码'
:
xydm
,
}}
info
=
{
"base_info"
:
{
'公司名称'
:
enname
,
'英文名'
:
enname
,
'信用代码'
:
xydm
,
}}
log
.
error
(
f
'{xydm}....股票代码为空'
)
log
.
error
(
f
'{xydm}....股票代码为空'
)
try
:
try
:
saveBaseInfo
(
info
,
start_time
)
saveBaseInfo
(
info
,
start_time
)
...
@@ -323,7 +323,7 @@ def beginWork():
...
@@ -323,7 +323,7 @@ def beginWork():
# 企业基本信息入库
# 企业基本信息入库
try
:
try
:
saveBaseInfo
(
retData
[
1
],
start_time
)
saveBaseInfo
(
retData
[
1
],
start_time
)
time
.
sleep
(
1
)
except
:
except
:
log
.
error
(
f
'{enname}....企业基本信息Kafka操作失败'
)
log
.
error
(
f
'{enname}....企业基本信息Kafka操作失败'
)
exception
=
'Kafka操作失败'
exception
=
'Kafka操作失败'
...
@@ -332,6 +332,7 @@ def beginWork():
...
@@ -332,6 +332,7 @@ def beginWork():
baseCore
.
recordLog
(
xydm
,
taskType
,
state
,
takeTime
,
''
,
exception
)
baseCore
.
recordLog
(
xydm
,
taskType
,
state
,
takeTime
,
''
,
exception
)
# 企业高管信息入库
# 企业高管信息入库
state
=
savePeopleInfo
(
retData
[
1
],
start_time
)
state
=
savePeopleInfo
(
retData
[
1
],
start_time
)
time
.
sleep
(
1
)
# 只有企业高管信息和企业基本信息都采集到,该企业才算采集成功
# 只有企业高管信息和企业基本信息都采集到,该企业才算采集成功
if
state
==
1
:
if
state
==
1
:
takeTime
=
baseCore
.
getTimeCost
(
start_time
,
time
.
time
())
takeTime
=
baseCore
.
getTimeCost
(
start_time
,
time
.
time
())
...
@@ -342,16 +343,26 @@ def beginWork():
...
@@ -342,16 +343,26 @@ def beginWork():
pass
pass
except
Exception
as
e
:
except
Exception
as
e
:
# 若出现尚未发现的错误,则保存错误信息以及出错位置
# 若出现尚未发现的错误,则保存错误信息以及出错位置
info
=
{
"base_info"
:
{
'公司名称'
:
enname
,
'英文名'
:
enname
,
'信用代码'
:
xydm
,
}}
try
:
saveBaseInfo
(
info
,
start_time
)
log
.
info
(
f
'{enname}.....股票代码出错只保存基本信息'
)
except
:
log
.
error
(
f
'{enname}....企业基本信息Kafka操作失败'
)
exception
=
'Kafka操作失败'
state
=
0
takeTime
=
baseCore
.
getTimeCost
(
start_time
,
time
.
time
())
baseCore
.
recordLog
(
xydm
,
taskType
,
state
,
takeTime
,
''
,
exception
)
ee
=
e
.
__traceback__
.
tb_lineno
ee
=
e
.
__traceback__
.
tb_lineno
log
.
error
(
f
'{enname}...{xydm}...{gpdm}.....数据采集失败,原因:{ee}行 {e}'
)
log
.
error
(
f
'{enname}...{xydm}...{gpdm}.....数据采集失败,原因:{ee}行 {e}'
)
state
=
0
state
=
0
takeTime
=
baseCore
.
getTimeCost
(
start_time
,
time
.
time
())
takeTime
=
baseCore
.
getTimeCost
(
start_time
,
time
.
time
())
baseCore
.
recordLog
(
xydm
,
taskType
,
state
,
takeTime
,
''
,
f
'数据采集失败,原因:{ee}行 {e}'
)
baseCore
.
recordLog
(
xydm
,
taskType
,
state
,
takeTime
,
''
,
f
'数据采集失败,原因:{ee}行 {e}'
)
# 企业数据采集完成,采集次数加一
# 企业数据采集完成,采集次数加一
count
+=
1
count
+=
1
runType
=
'BaseInfoRunCount'
runType
=
'BaseInfoRunCount'
baseCore
.
updateRun
(
social_code
,
runType
,
count
)
baseCore
.
updateRun
(
social_code
,
runType
,
count
)
# 释放资源
# 释放资源
baseCore
.
close
()
baseCore
.
close
()
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论