Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
1
合并请求
1
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
丁双波
zzsn_spider
Commits
a9946e9a
提交
a9946e9a
authored
8月 09, 2023
作者:
薛凌堃
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
国外企业基本信息-高管信息-企业动态
上级
8f9f0213
全部展开
隐藏空白字符变更
内嵌
并排
正在显示
5 个修改的文件
包含
281 行增加
和
57 行删除
+281
-57
tyc_qydt.py
comData/tcyQydt/tyc_qydt.py
+1
-1
tyc_qydt_add.py
comData/tcyQydt/tyc_qydt_add.py
+0
-0
NewsYahoo.py
comData/yhcj/NewsYahoo.py
+164
-0
雅虎财经_企业动态.py
comData/yhcj/雅虎财经_企业动态.py
+7
-6
雅虎财经_企业基本信息_高管信息.py
comData/yhcj/雅虎财经_企业基本信息_高管信息.py
+109
-50
没有找到文件。
comData/tcyQydt/tyc_qydt.py
浏览文件 @
a9946e9a
...
...
@@ -41,7 +41,7 @@ def beinWork(tyc_code,social_code):
# time.sleep(random.randint(3, 5))
break
except
Exception
as
e
:
log
.
error
(
"request请求异常----m-----{e}"
)
log
.
error
(
f
"request请求异常----m-----{e}"
)
pass
if
(
response
.
status_code
==
200
):
...
...
comData/tcyQydt/tyc_qydt_add.py
0 → 100644
浏览文件 @
a9946e9a
差异被折叠。
点击展开。
comData/yhcj/NewsYahoo.py
0 → 100644
浏览文件 @
a9946e9a
# 雅虎财经企业动态获取
import
time
import
pandas
as
pd
import
pymysql
import
requests
from
bs4
import
BeautifulSoup
from
selenium.webdriver.common.by
import
By
from
selenium
import
webdriver
from
base.BaseCore
import
BaseCore
# Module-wide BaseCore helper instance and the logger obtained from it;
# both are referenced by the functions defined below.
baseCore = BaseCore()
log = baseCore.getLogger()
#获取资讯详情
def getZx(xydm, url, title, cnx):
    """Scrape one article page and insert it into ``brpa_source_article``.

    A dedicated headless Chrome instance is started for the article, the
    author, publish time and body text are read from the page, and one row
    is inserted through ``cnx``.

    Args:
        xydm: Value written to the ``social_credit_code`` column.
        url: Article URL; loaded in the browser and stored as ``source_address``.
        title: Article title, stored unchanged.
        cnx: Open database connection providing ``cursor()``, ``commit()``
            and ``rollback()``.

    Returns:
        None. Scraping and database failures are logged, not raised.
    """
    start_time_content = time.time()
    try:
        chrome_options_content = webdriver.ChromeOptions()
        chrome_options_content.add_argument('--disable-gpu')
        chrome_options_content.add_argument('--ignore-certificate-errors')
        chrome_options_content.add_experimental_option('excludeSwitches', ['enable-automation'])
        chrome_options_content.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options_content.add_argument("--start-maximized")
        prefs_content = {'profile.managed_default_content_settings.images': 2}
        chrome_options_content.add_experimental_option('prefs', prefs_content)
        chrome_options_content.add_argument('--headless')
        executable_path = r'D:\chrome\chromedriver.exe'
        driverContent = webdriver.Chrome(options=chrome_options_content, executable_path=executable_path)
        try:
            driverContent.get(url)
            try:
                # The "collapse-button" element is optional: failing to find
                # or click it is deliberately ignored.
                driverContent.find_element(By.CLASS_NAME, "collapse-button").click()
            except Exception:
                pass
            time.sleep(0.5)
            authorElement = driverContent.find_element(By.CLASS_NAME, "caas-author-byline-collapse")
            timeElement = driverContent.find_element(By.CLASS_NAME, "caas-attr-time-style").find_element(By.TAG_NAME, "time")
            contentElement = driverContent.find_element(By.CLASS_NAME, "caas-body")
            # NOTE(review): the values are sent as query parameters below, so
            # doubling apostrophes here presumably stores a literal '' in the
            # row. Kept unchanged so new rows match existing ones -- confirm.
            author = authorElement.text.strip().replace("'", "''")
            pub_time = timeElement.get_attribute("datetime").strip().replace("'", "''").replace("T", " ")
            # Keep only the first 19 characters ("YYYY-MM-DD HH:MM:SS" length).
            pub_time = pub_time[0:19]
            content = contentElement.text.strip().replace("'", "''")
        finally:
            # Always end the browser session, including when scraping fails;
            # otherwise every failed article leaks a Chrome process.
            driverContent.quit()
        # Row values, in the column order of the INSERT statement below.
        list_info = [
            xydm,
            title,
            '',
            content,
            pub_time,
            url,
            '雅虎财经',
            author,
            '2',
            'zh',
        ]
        with cnx.cursor() as cursor:
            try:
                insert_sql = '''insert into brpa_source_article(social_credit_code,title,summary,content,publish_date,source_address,origin,author,type,lang) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
                cursor.execute(insert_sql, tuple(list_info))
                cnx.commit()
            except Exception as e1:
                log.error(f"保存数据库失败: {e1}")
                # Discard the failed transaction so the shared connection
                # stays usable for the next article.
                cnx.rollback()
        log.info(f"文章耗时,耗时{baseCore.getTimeCost(start_time_content,time.time())}")
    except Exception as e:
        log.error(f"获取正文失败: {e}")
# Module-level headless Chrome instance; news() uses it to load the
# press-release list pages. It is created when this module is imported.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("--start-maximized")
# NOTE(review): value 2 for this preference presumably blocks image loading -- confirm.
prefs = {'profile.managed_default_content_settings.images': 2}
chrome_options.add_experimental_option('prefs', prefs)
chrome_options.add_argument('--headless')
# Local chromedriver binary (Windows path).
executable_path = r'D:\chrome\chromedriver.exe'
driver = webdriver.Chrome(options=chrome_options, executable_path=executable_path)
# Database connection shared by news() and, through its cnx argument, getZx().
# NOTE(review): host, user and password are hard-coded in source control;
# consider loading them from configuration or the environment.
cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
def scroll(driver, times=30, pause=0.1):
    """Repeatedly scroll the current page of ``driver`` far down.

    Each iteration sets ``document.documentElement.scrollTop`` to 100000 and
    then waits ``pause`` seconds (presumably so that more list items load
    between jumps -- confirm against the target page).

    Args:
        driver: Object exposing ``execute_script(js)``.
        times: Number of scroll iterations; defaults to the original 30.
        pause: Seconds to sleep after each scroll; defaults to the original 0.1.

    Returns:
        None.
    """
    # Alternative script: "window.scrollTo(0,document.body.scrollHeight)"
    js = "var q=document.documentElement.scrollTop=100000"
    for _ in range(times):
        driver.execute_script(js)
        time.sleep(pause)
# #读取excel数据
# df_all = pd.read_excel(r'./../data/2023年500强新上榜名单.xlsx', sheet_name='500强23年国外', keep_default_na=False)
# for num in range(len(df_all)):
# start_time = time.time()
# # country = df_all['国别'][num]
# # if(country!='国外'):
# # continue
# enname=df_all['英文名称'][num]
# gpdm = df_all['股票票代码'][num]
# xydm = df_all['信用代码'][num]
# if(gpdm==''):
# log.error(f"{num}--{gpdm}--股票代码为空 跳过")
# continue
# if (xydm == ''):
# log.error(f"{num}--{gpdm}--信用代码为空 跳过")
# continue
# count = int(df_all['企业动态数量(7.15)'][num])
# # if(count>0):
# # log.error(f"{num}--{gpdm}--动态大于0 跳过")
# # continue
#https://finance.yahoo.com/quote/GOOG/press-releases?p=GOOG
def news(num, gpdm, xydm):
    """Collect press releases for one ticker and store the ones not yet saved.

    Loads the press-release page for ``gpdm`` in the module-level ``driver``,
    scrolls it, and for every list item whose link points at
    ``https://finance.yahoo.com`` and is not already stored for ``xydm``
    calls ``getZx`` to scrape and save the article.

    Args:
        num: Row number of the company; used only in log messages.
        gpdm: Ticker symbol interpolated into the page URL.
        xydm: Value matched against and written to ``social_credit_code``.

    Returns:
        None. Returns early (after logging) when the news list is missing.
    """
    start_time = time.time()
    url = f"https://finance.yahoo.com/quote/{gpdm}/press-releases?p={gpdm}"
    driver.get(url)
    scroll(driver)
    try:
        news_div = driver.find_element(By.ID, 'summaryPressStream-0-Stream')
    except Exception:
        log.error(f"{num}--{gpdm}--没找到新闻元素")
        return
    news_lis = news_div.find_elements(By.XPATH, "./ul/li")
    log.info(f"{num}--{gpdm}--{len(news_lis)}条信息")
    for i, news_li in enumerate(news_lis):
        try:
            a_ele = news_li.find_element(By.XPATH, "./div[1]/div[1]/div[2]/h3[1]/a")
        except Exception:
            log.error(f"{num}--{gpdm}--{i}----a标签没找到")
            continue
        href = a_ele.get_attribute("href")
        if not href:
            # get_attribute returns None when the attribute is absent; skip
            # the item instead of crashing the whole company run.
            log.error(f"{num}--{gpdm}--{i}----a标签没有href")
            continue
        news_url = href.strip().replace("'", "''")
        # Only articles hosted on finance.yahoo.com are collected.
        if not news_url.startswith("https://finance.yahoo.com"):
            continue
        # Skip URLs already stored for this company.
        with cnx.cursor() as cursor:
            sel_sql = '''select social_credit_code from brpa_source_article where source_address = %s and social_credit_code=%s '''
            cursor.execute(sel_sql, (news_url, xydm))
            selects = cursor.fetchall()
        if selects:
            log.error(f"{num}--{gpdm}--网址已经存在----{news_url}")
            continue
        title = a_ele.text.strip().replace("'", "''")
        getZx(xydm, news_url, title, cnx)
        log.info(f"{num}--{gpdm}--{i}----{news_url}----------{news_url}")
    log.info(f"{num}--{gpdm}--企业整体,耗时{baseCore.getTimeCost(start_time,time.time())}")
# Release resources held by the BaseCore helper.
# NOTE(review): this is a top-level statement, so it also runs when the
# module is imported (e.g. via `from NewsYahoo import news`), before news()
# is ever called -- confirm this is intended.
baseCore.close()
\ No newline at end of file
comData/yhcj/雅虎财经_企业动态.py
浏览文件 @
a9946e9a
# 雅虎财
经企业动态获取
# 雅虎财
经企业动态获取
...
...
@@ -100,12 +100,12 @@ def scroll(driver):
#读取excel数据
df_all
=
pd
.
read_excel
(
r'.
\data\国外企业.xlsx'
,
sheet_name
=
0
,
keep_default_na
=
False
)
for
num
in
range
(
718
,
len
(
df_all
)):
df_all
=
pd
.
read_excel
(
r'.
/../data/2023年500强新上榜名单.xlsx'
,
sheet_name
=
'500强23年国外'
,
keep_default_na
=
False
)
for
num
in
range
(
len
(
df_all
)):
start_time
=
time
.
time
()
country
=
df_all
[
'国别'
][
num
]
if
(
country
!=
'国外'
):
continue
#
country = df_all['国别'][num]
#
if(country!='国外'):
#
continue
enname
=
df_all
[
'英文名称'
][
num
]
gpdm
=
df_all
[
'股票票代码'
][
num
]
xydm
=
df_all
[
'信用代码'
][
num
]
...
...
@@ -121,6 +121,7 @@ for num in range(718,len(df_all)):
# continue
#https://finance.yahoo.com/quote/GOOG/press-releases?p=GOOG
# def news(i,gpdm):
url
=
f
"https://finance.yahoo.com/quote/{gpdm}/press-releases?p={gpdm}"
driver
.
get
(
url
)
scroll
(
driver
)
...
...
comData/yhcj/雅虎财经_企业基本信息_高管信息.py
浏览文件 @
a9946e9a
impor
t
json
impor
t
json
...
...
@@ -5,11 +5,15 @@ import pandas as pd
import
requests
from
bs4
import
BeautifulSoup
from
kafka
import
KafkaProducer
from
NewsYahoo
import
news
from
base.BaseCore
import
BaseCore
import
urllib3
urllib3
.
disable_warnings
(
urllib3
.
exceptions
.
InsecureRequestWarning
)
baseCore
=
BaseCore
()
log
=
B
aseCore
.
getLogger
()
log
=
b
aseCore
.
getLogger
()
headers
=
{
'accept'
:
'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
,
'accept-encoding'
:
'gzip, deflate, br'
,
...
...
@@ -185,29 +189,54 @@ def getInfo(name,gpdm,xydm):
}
retPeople
.
append
(
dic_main_people
)
retData
[
'people_info'
]
=
retPeople
df_retData
=
pd
.
DataFrame
(
retPeople
)
# df_a = pd.DataFrame(retData['base_info'])
df_retData
.
to_excel
(
'采集高管结果1.xlsx'
,
index
=
False
)
log
.
info
(
f
"获取基本信息--{gpdm},耗时{baseCore.getTimeCost(start, time.time())}"
)
return
retData
#保存基本信息
def
saveBaseInfo
(
info
):
def
Nongpdm
(
xydm
,
name
,
officialUrl
,
industry
,
englishName
,
address
):
start
=
time
.
time
()
#基本信息发送到kafka
company_dict
=
{
'name'
:
info
[
'base_info'
][
'公司名称'
]
,
# 企业名称
'shortName'
:
info
[
'base_info'
][
'公司名称'
]
,
# 企业简称
'socialCreditCode'
:
info
[
'base_info'
][
'信用代码'
]
,
# 统一社会信用代码
'officialPhone'
:
info
[
'base_info'
][
'电话'
]
,
# 电话
'officialUrl'
:
info
[
'base_info'
][
'公司网站'
]
,
# 官网
'briefInfo'
:
info
[
'base_info'
][
'公司简介'
]
,
# 简介
'industry'
:
in
fo
[
'base_info'
][
'行业'
]
,
# 所属行业
'englishName'
:
info
[
'base_info'
][
'公司名称'
]
,
# 英文名
'address'
:
info
[
'base_info'
][
'地址'
]
,
# 地址
'name'
:
name
,
# 企业名称
'shortName'
:
''
,
# 企业简称
'socialCreditCode'
:
xydm
,
# 统一社会信用代码
'officialPhone'
:
''
,
# 电话
'officialUrl'
:
officialUrl
,
# 官网
'briefInfo'
:
''
,
# 简介
'industry'
:
in
dustry
,
# 所属行业
'englishName'
:
englishName
,
# 英文名
'address'
:
address
,
# 地址
'status'
:
0
,
# 状态
}
producer
=
KafkaProducer
(
bootstrap_servers
=
[
'114.115.159.144:9092'
],
api_version
=
(
2
,
0
,
2
))
kafka_result
=
producer
.
send
(
"regionInfo"
,
json
.
dumps
(
company_dict
,
ensure_ascii
=
False
)
.
encode
(
'utf8'
))
kafka_result
.
get
(
timeout
=
10
)
log
.
info
(
f
"保存基本信息--{info['base_info']['信用代码']},耗时{baseCore.getTimeCost(start, time.time())}"
)
# log.info(f"保存基本信息--{info['base_info']['信用代码']},耗时{baseCore.getTimeCost(start, time.time())}")
log
.
info
(
f
"保存基本信息--{company_dict['name']},耗时{baseCore.getTimeCost(start, time.time())}"
)
return
company_dict
#保存基本信息
# def saveBaseInfo(info):
# start = time.time()
# #基本信息发送到kafka
# company_dict = {
# 'name': info['base_info']['公司名称'], # 企业名称
# 'shortName': info['base_info']['公司名称'], # 企业简称
# 'socialCreditCode': info['base_info']['信用代码'], # 统一社会信用代码
# 'officialPhone': info['base_info']['电话'], # 电话
# 'officialUrl': info['base_info']['公司网站'], # 官网
# 'briefInfo': info['base_info']['公司简介'], # 简介
# 'industry': info['base_info']['行业'], # 所属行业
# 'englishName': info['base_info']['公司名称'], # 英文名
# 'address': info['base_info']['地址'], # 地址
# 'status': 0, # 状态
# }
# producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 0, 2))
# kafka_result = producer.send("regionInfo", json.dumps(company_dict, ensure_ascii=False).encode('utf8'))
# kafka_result.get(timeout=10)
# log.info(f"保存基本信息--{info['base_info']['信用代码']},耗时{baseCore.getTimeCost(start, time.time())}")
# # log.info(f"保存基本信息--{company_dict['name']},耗时{baseCore.getTimeCost(start, time.time())}")
#保存高管信息
def
savePeopleInfo
(
info
):
...
...
@@ -269,43 +298,73 @@ def beginWork():
#给定excel名单 保存股票代码
okCount
=
0
errorCount
=
0
df_all
=
pd
.
read_excel
(
'./data/96-22的500强企业清单.xlsx'
,
dtype
=
str
,
keep_default_na
=
False
)
for
i
in
range
(
300
,
len
(
df_all
)):
log
.
info
(
f
"{i}----------开始"
)
country
=
df_all
[
'国内外'
][
i
]
if
country
==
'国外'
:
pass
else
:
log
.
info
(
f
"{i}----------为国内企业 跳过"
)
continue
gpdm
=
df_all
[
'股票代码'
][
i
]
if
gpdm
==
''
:
pass
else
:
log
.
info
(
f
"{i}----------为股票代码不为空 跳过"
)
continue
enname
=
df_all
[
'英文名称'
][
i
]
if
enname
!=
''
:
pass
else
:
log
.
info
(
f
"{i}----------英文名字为空 跳过"
)
continue
log
.
info
(
f
"{i}----------开始股票代码"
)
gpdm
=
getGpdm
(
enname
)
df_all_xydm
=
pd
.
read_excel
(
'../../data/工作簿1.xlsx'
,
dtype
=
str
,
keep_default_na
=
False
)
df_all
=
pd
.
read_excel
(
'../../data/23年500强企业新榜股票代码.xlsx'
,
dtype
=
str
,
keep_default_na
=
False
)
for
i
in
range
(
len
(
df_all_xydm
)):
# name = df_all['中文名称'][i]
# rank = df_all['排名'][i]
# officialUrl = df_all['企业官网'][i]
# industry = df_all['行业'][i]
# englishName = df_all['英文名称'][i]
# address = df_all['企业总部地址'][i]
if
gpdm
!=
''
:
okCount
=
okCount
+
1
else
:
errorCount
=
errorCount
+
1
log
.
info
(
f
"{i}-------成功{okCount}--失败-{errorCount}"
)
if
gpdm
==
''
:
continue
else
:
pass
df_all
[
'股票代码'
][
i
]
=
gpdm
xydm_name
=
df_all_xydm
[
'名称'
][
i
]
# print(xydm_name)
for
j
in
range
(
len
(
df_all
)):
name
=
df_all
[
'中文名称'
][
j
]
if
name
==
xydm_name
:
print
(
name
,
xydm_name
)
xydm
=
df_all_xydm
[
'信用代码'
][
i
]
if
i
>=
22
:
pass
else
:
continue
log
.
info
(
f
"{i}----------开始"
)
# country = df_all['企业所属国家'][i]
# if country=='中国':
# continue
# else:
# log.info(f"{i}----------为国外企业 继续")
gpdm
=
df_all
[
'股票代码'
][
j
]
#没有股票代码,就保存榜单中的数据
if
gpdm
==
''
:
continue
# xydm = baseCore.getNextXydm()
# Nongpdm(xydm,name,officialUrl,industry,englishName,address)
else
:
log
.
info
(
f
"{j}----------为股票代码不为空 继续"
)
pass
enname
=
df_all
[
'英文名称'
][
j
]
if
enname
!=
''
:
pass
else
:
log
.
info
(
f
"{j}----------英文名字为空 跳过"
)
continue
# log.info(f"{i}----------开始股票代码")
# gpdm = getGpdm(enname)
# xydm=baseCore.getNextXydm()
retData
=
getInfo
(
enname
,
gpdm
,
xydm
)
# saveBaseInfo(retData)
savePeopleInfo
(
retData
)
#也可以去采集企业动态
news
(
j
,
gpdm
,
xydm
)
if
gpdm
!=
''
:
okCount
=
okCount
+
1
else
:
errorCount
=
errorCount
+
1
log
.
info
(
f
"{j}-------成功{okCount}--失败-{errorCount}"
)
if
gpdm
==
''
:
continue
else
:
pass
df_all
[
'股票代码'
][
j
]
=
gpdm
else
:
continue
if
(
i
%
10
==
0
):
df_all
.
to_excel
(
r'.
\data\96-22的500强企业清单_ret
.xlsx'
,
sheet_name
=
'Sheet1'
,
index
=
False
,
header
=
True
)
df_all
.
to_excel
(
r'.
\data\96-22的500强企业清单_ret
.xlsx'
,
sheet_name
=
'Sheet1'
,
index
=
False
,
header
=
True
)
df_all
.
to_excel
(
r'.
.\..\data\23年500强企业新上榜_ret22
.xlsx'
,
sheet_name
=
'Sheet1'
,
index
=
False
,
header
=
True
)
df_all
.
to_excel
(
r'.
.\..\data\23年500强企业新榜_ret22
.xlsx'
,
sheet_name
=
'Sheet1'
,
index
=
False
,
header
=
True
)
# 释放资源
baseCore
.
close
()
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论