Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
1
合并请求
1
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
丁双波
zzsn_spider
Commits
a9946e9a
提交
a9946e9a
authored
8月 09, 2023
作者:
薛凌堃
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
国外企业基本信息-高管信息-企业动态
上级
8f9f0213
全部展开
显示空白字符变更
内嵌
并排
正在显示
5 个修改的文件
包含
264 行增加
和
40 行删除
+264
-40
tyc_qydt.py
comData/tcyQydt/tyc_qydt.py
+1
-1
tyc_qydt_add.py
comData/tcyQydt/tyc_qydt_add.py
+0
-0
NewsYahoo.py
comData/yhcj/NewsYahoo.py
+164
-0
雅虎财经_企业动态.py
comData/yhcj/雅虎财经_企业动态.py
+7
-6
雅虎财经_企业基本信息_高管信息.py
comData/yhcj/雅虎财经_企业基本信息_高管信息.py
+92
-33
没有找到文件。
comData/tcyQydt/tyc_qydt.py
浏览文件 @
a9946e9a
...
@@ -41,7 +41,7 @@ def beinWork(tyc_code,social_code):
...
@@ -41,7 +41,7 @@ def beinWork(tyc_code,social_code):
# time.sleep(random.randint(3, 5))
# time.sleep(random.randint(3, 5))
break
break
except
Exception
as
e
:
except
Exception
as
e
:
log
.
error
(
"request请求异常----m-----{e}"
)
log
.
error
(
f
"request请求异常----m-----{e}"
)
pass
pass
if
(
response
.
status_code
==
200
):
if
(
response
.
status_code
==
200
):
...
...
comData/tcyQydt/tyc_qydt_add.py
0 → 100644
浏览文件 @
a9946e9a
差异被折叠。
点击展开。
comData/yhcj/NewsYahoo.py
0 → 100644
浏览文件 @
a9946e9a
# 雅虎财经企业动态获取
import
time
import
pandas
as
pd
import
pymysql
import
requests
from
bs4
import
BeautifulSoup
from
selenium.webdriver.common.by
import
By
from
selenium
import
webdriver
from
base.BaseCore
import
BaseCore
# Shared project helper instance; the code below uses it for logging
# (getLogger), elapsed-time formatting (getTimeCost) and cleanup (close).
baseCore = BaseCore()
# Module-wide logger used by every function in this file.
log = baseCore.getLogger()
#获取资讯详情
def getZx(xydm, url, title, cnx):
    """Fetch one article page with headless Chrome and save it to the database.

    A dedicated Chrome instance is started for the article, the author,
    publication time and body text are read from the page, and one row is
    inserted into ``brpa_source_article``.

    Args:
        xydm: Value stored in the ``social_credit_code`` column.
        url: Article URL to open; stored in ``source_address``.
        title: Article title; stored in ``title``.
        cnx: Open database connection exposing ``cursor()`` (usable as a
            context manager) and ``commit()``.

    Returns:
        None. Failures are logged, never raised to the caller.
    """
    start_time_content = time.time()
    try:
        chrome_options_content = webdriver.ChromeOptions()
        chrome_options_content.add_argument('--disable-gpu')
        chrome_options_content.add_argument('--ignore-certificate-errors')
        chrome_options_content.add_experimental_option('excludeSwitches', ['enable-automation'])
        chrome_options_content.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options_content.add_argument("--start-maximized")
        # NOTE(review): presumably disables image loading to speed up the fetch.
        prefs_content = {'profile.managed_default_content_settings.images': 2}
        chrome_options_content.add_experimental_option('prefs', prefs_content)
        chrome_options_content.add_argument('--headless')
        executable_path = r'D:\chrome\chromedriver.exe'
        driverContent = webdriver.Chrome(options=chrome_options_content, executable_path=executable_path)
        try:
            driverContent.get(url)
            try:
                # Best effort: the expand button is not present on every article.
                driverContent.find_element(By.CLASS_NAME, "collapse-button").click()
            except Exception:
                pass
            time.sleep(0.5)
            authorElement = driverContent.find_element(By.CLASS_NAME, "caas-author-byline-collapse")
            timeElement = driverContent.find_element(By.CLASS_NAME, "caas-attr-time-style").find_element(By.TAG_NAME, "time")
            contentElement = driverContent.find_element(By.CLASS_NAME, "caas-body")
            # NOTE(review): the quote doubling is kept from the original, but the
            # INSERT below is parameterized, so doubled quotes are stored as-is.
            author = authorElement.text.strip().replace("'", "''")
            pub_time = timeElement.get_attribute("datetime").strip().replace("'", "''").replace("T", " ")
            # Keep only the first 19 characters of the timestamp.
            pub_time = pub_time[0:19]
            content = contentElement.text.strip().replace("'", "''")
        finally:
            # Always shut the browser down, even when an element lookup fails;
            # otherwise every failed article leaves a Chrome process behind.
            driverContent.quit()
        # Row values, in the column order of the INSERT statement below.
        list_info = [
            xydm,
            title,
            '',
            content,
            pub_time,
            url,
            '雅虎财经',
            author,
            '2',
            'zh',
        ]
        with cnx.cursor() as cursor:
            try:
                insert_sql = '''insert into brpa_source_article(social_credit_code,title,summary,content,publish_date,source_address,origin,author,type,lang) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
                cursor.execute(insert_sql, tuple(list_info))
                cnx.commit()
            except Exception as e1:
                log.error(f"保存数据库失败----{e1}")
        log.info(f"文章耗时,耗时{baseCore.getTimeCost(start_time_content,time.time())}")
    except Exception as e:
        log.error(f"获取正文失败----{e}")
# Module-level setup: runs as soon as this file is executed or imported.
# Builds the shared headless Chrome driver used by news() for list pages.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--ignore-certificate-errors')
# NOTE(review): the next two options presumably reduce automation detection — confirm.
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("--start-maximized")
# NOTE(review): presumably disables image loading to speed up page loads.
prefs = {'profile.managed_default_content_settings.images': 2}
chrome_options.add_experimental_option('prefs', prefs)
chrome_options.add_argument('--headless')
# Hard-coded Windows path to the chromedriver binary.
executable_path = r'D:\chrome\chromedriver.exe'
# NOTE(review): the `executable_path` keyword is deprecated in Selenium 4 and
# removed in later 4.x releases — confirm the pinned Selenium version.
driver = webdriver.Chrome(options=chrome_options, executable_path=executable_path)
# Shared database connection used by news() and passed on to getZx().
# NOTE(review): host, user and password are hard-coded in source control;
# consider loading them from configuration or environment variables.
cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
def scroll(driver, times=30, pause=0.1):
    """Repeatedly jump the page to the bottom so more list items can load.

    Args:
        driver: Object exposing ``execute_script(js)`` (a Selenium WebDriver).
        times: Number of scroll jumps to perform. Defaults to 30.
        pause: Seconds to sleep after each jump. Defaults to 0.1.

    Returns:
        None.
    """
    # Setting scrollTop to a very large value jumps to the bottom of the page.
    js = "var q=document.documentElement.scrollTop=100000"
    for _ in range(times):
        driver.execute_script(js)
        time.sleep(pause)
# #读取excel数据
# df_all = pd.read_excel(r'./../data/2023年500强新上榜名单.xlsx', sheet_name='500强23年国外', keep_default_na=False)
# for num in range(len(df_all)):
# start_time = time.time()
# # country = df_all['国别'][num]
# # if(country!='国外'):
# # continue
# enname=df_all['英文名称'][num]
# gpdm = df_all['股票票代码'][num]
# xydm = df_all['信用代码'][num]
# if(gpdm==''):
# log.error(f"{num}--{gpdm}--股票代码为空 跳过")
# continue
# if (xydm == ''):
# log.error(f"{num}--{gpdm}--信用代码为空 跳过")
# continue
# count = int(df_all['企业动态数量(7.15)'][num])
# # if(count>0):
# # log.error(f"{num}--{gpdm}--动态大于0 跳过")
# # continue
#https://finance.yahoo.com/quote/GOOG/press-releases?p=GOOG
def news(num, gpdm, xydm):
    """Collect the press releases listed on a company's Yahoo Finance page.

    Opens the press-release page for ``gpdm`` with the module-level
    ``driver``, scrolls it, and for every list entry whose link points at
    finance.yahoo.com and is not yet stored for ``xydm``, calls ``getZx`` to
    fetch and save the article.

    Args:
        num: Row number of the company; used only in log messages.
        gpdm: Ticker symbol placed into the Yahoo Finance URL.
        xydm: Credit code matched against / stored in ``social_credit_code``.

    Returns:
        None. Returns early when the news stream element is not found.
    """
    start_time = time.time()
    url = f"https://finance.yahoo.com/quote/{gpdm}/press-releases?p={gpdm}"
    driver.get(url)
    scroll(driver)
    try:
        news_div = driver.find_element(By.ID, 'summaryPressStream-0-Stream')
    except Exception:
        log.error(f"{num}--{gpdm}--没找到新闻元素")
        return
    news_lis = news_div.find_elements(By.XPATH, "./ul/li")
    log.info(f"{num}--{gpdm}--{len(news_lis)}条信息")
    for i, news_li in enumerate(news_lis):
        try:
            a_ele = news_li.find_element(By.XPATH, "./div[1]/div[1]/div[2]/h3[1]/a")
        except Exception:
            log.error(f"{num}--{gpdm}--{i}----a标签没找到")
            continue
        # get_attribute may return None when the anchor has no href; treat that
        # like any other non-Yahoo link instead of crashing the whole company.
        news_url = (a_ele.get_attribute("href") or "").strip().replace("'", "''")
        if not news_url.startswith("https://finance.yahoo.com"):
            continue
        # Skip articles already stored for this company.
        with cnx.cursor() as cursor:
            sel_sql = '''select social_credit_code from brpa_source_article where source_address = %s and social_credit_code=%s '''
            cursor.execute(sel_sql, (news_url, xydm))
            selects = cursor.fetchall()
        if selects:
            log.error(f"{num}--{gpdm}--网址已经存在----{news_url}")
            continue
        title = a_ele.text.strip().replace("'", "''")
        getZx(xydm, news_url, title, cnx)
        log.info(f"{num}--{gpdm}--{i}----{news_url}----------{news_url}")
    log.info(f"{num}--{gpdm}--企业整体,耗时{baseCore.getTimeCost(start_time,time.time())}")
# Release the resources held by the shared BaseCore helper.
# NOTE(review): if this statement sits at module level it also runs whenever
# this file is imported, not only when it is run as a script — confirm intent.
baseCore.close()
\ No newline at end of file
comData/yhcj/雅虎财经_企业动态.py
浏览文件 @
a9946e9a
# 雅虎财
经企业动态获取
# 雅虎财
经企业动态获取
...
@@ -100,12 +100,12 @@ def scroll(driver):
...
@@ -100,12 +100,12 @@ def scroll(driver):
#读取excel数据
#读取excel数据
df_all
=
pd
.
read_excel
(
r'.
\data\国外企业.xlsx'
,
sheet_name
=
0
,
keep_default_na
=
False
)
df_all
=
pd
.
read_excel
(
r'.
/../data/2023年500强新上榜名单.xlsx'
,
sheet_name
=
'500强23年国外'
,
keep_default_na
=
False
)
for
num
in
range
(
718
,
len
(
df_all
)):
for
num
in
range
(
len
(
df_all
)):
start_time
=
time
.
time
()
start_time
=
time
.
time
()
country
=
df_all
[
'国别'
][
num
]
#
country = df_all['国别'][num]
if
(
country
!=
'国外'
):
#
if(country!='国外'):
continue
#
continue
enname
=
df_all
[
'英文名称'
][
num
]
enname
=
df_all
[
'英文名称'
][
num
]
gpdm
=
df_all
[
'股票票代码'
][
num
]
gpdm
=
df_all
[
'股票票代码'
][
num
]
xydm
=
df_all
[
'信用代码'
][
num
]
xydm
=
df_all
[
'信用代码'
][
num
]
...
@@ -121,6 +121,7 @@ for num in range(718,len(df_all)):
...
@@ -121,6 +121,7 @@ for num in range(718,len(df_all)):
# continue
# continue
#https://finance.yahoo.com/quote/GOOG/press-releases?p=GOOG
#https://finance.yahoo.com/quote/GOOG/press-releases?p=GOOG
# def news(i,gpdm):
url
=
f
"https://finance.yahoo.com/quote/{gpdm}/press-releases?p={gpdm}"
url
=
f
"https://finance.yahoo.com/quote/{gpdm}/press-releases?p={gpdm}"
driver
.
get
(
url
)
driver
.
get
(
url
)
scroll
(
driver
)
scroll
(
driver
)
...
...
comData/yhcj/雅虎财经_企业基本信息_高管信息.py
浏览文件 @
a9946e9a
impor
t
json
impor
t
json
...
@@ -5,11 +5,15 @@ import pandas as pd
...
@@ -5,11 +5,15 @@ import pandas as pd
import
requests
import
requests
from
bs4
import
BeautifulSoup
from
bs4
import
BeautifulSoup
from
kafka
import
KafkaProducer
from
kafka
import
KafkaProducer
from
NewsYahoo
import
news
from
base.BaseCore
import
BaseCore
from
base.BaseCore
import
BaseCore
import
urllib3
urllib3
.
disable_warnings
(
urllib3
.
exceptions
.
InsecureRequestWarning
)
baseCore
=
BaseCore
()
baseCore
=
BaseCore
()
log
=
B
aseCore
.
getLogger
()
log
=
b
aseCore
.
getLogger
()
headers
=
{
headers
=
{
'accept'
:
'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
,
'accept'
:
'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
,
'accept-encoding'
:
'gzip, deflate, br'
,
'accept-encoding'
:
'gzip, deflate, br'
,
...
@@ -185,29 +189,54 @@ def getInfo(name,gpdm,xydm):
...
@@ -185,29 +189,54 @@ def getInfo(name,gpdm,xydm):
}
}
retPeople
.
append
(
dic_main_people
)
retPeople
.
append
(
dic_main_people
)
retData
[
'people_info'
]
=
retPeople
retData
[
'people_info'
]
=
retPeople
df_retData
=
pd
.
DataFrame
(
retPeople
)
# df_a = pd.DataFrame(retData['base_info'])
df_retData
.
to_excel
(
'采集高管结果1.xlsx'
,
index
=
False
)
log
.
info
(
f
"获取基本信息--{gpdm},耗时{baseCore.getTimeCost(start, time.time())}"
)
log
.
info
(
f
"获取基本信息--{gpdm},耗时{baseCore.getTimeCost(start, time.time())}"
)
return
retData
return
retData
#保存基本信息
def
Nongpdm
(
xydm
,
name
,
officialUrl
,
industry
,
englishName
,
address
):
def
saveBaseInfo
(
info
):
start
=
time
.
time
()
start
=
time
.
time
()
#基本信息发送到kafka
company_dict
=
{
company_dict
=
{
'name'
:
info
[
'base_info'
][
'公司名称'
]
,
# 企业名称
'name'
:
name
,
# 企业名称
'shortName'
:
info
[
'base_info'
][
'公司名称'
]
,
# 企业简称
'shortName'
:
''
,
# 企业简称
'socialCreditCode'
:
info
[
'base_info'
][
'信用代码'
]
,
# 统一社会信用代码
'socialCreditCode'
:
xydm
,
# 统一社会信用代码
'officialPhone'
:
info
[
'base_info'
][
'电话'
]
,
# 电话
'officialPhone'
:
''
,
# 电话
'officialUrl'
:
info
[
'base_info'
][
'公司网站'
]
,
# 官网
'officialUrl'
:
officialUrl
,
# 官网
'briefInfo'
:
info
[
'base_info'
][
'公司简介'
]
,
# 简介
'briefInfo'
:
''
,
# 简介
'industry'
:
in
fo
[
'base_info'
][
'行业'
]
,
# 所属行业
'industry'
:
in
dustry
,
# 所属行业
'englishName'
:
info
[
'base_info'
][
'公司名称'
]
,
# 英文名
'englishName'
:
englishName
,
# 英文名
'address'
:
info
[
'base_info'
][
'地址'
]
,
# 地址
'address'
:
address
,
# 地址
'status'
:
0
,
# 状态
'status'
:
0
,
# 状态
}
}
producer
=
KafkaProducer
(
bootstrap_servers
=
[
'114.115.159.144:9092'
],
api_version
=
(
2
,
0
,
2
))
producer
=
KafkaProducer
(
bootstrap_servers
=
[
'114.115.159.144:9092'
],
api_version
=
(
2
,
0
,
2
))
kafka_result
=
producer
.
send
(
"regionInfo"
,
json
.
dumps
(
company_dict
,
ensure_ascii
=
False
)
.
encode
(
'utf8'
))
kafka_result
=
producer
.
send
(
"regionInfo"
,
json
.
dumps
(
company_dict
,
ensure_ascii
=
False
)
.
encode
(
'utf8'
))
kafka_result
.
get
(
timeout
=
10
)
kafka_result
.
get
(
timeout
=
10
)
log
.
info
(
f
"保存基本信息--{info['base_info']['信用代码']},耗时{baseCore.getTimeCost(start, time.time())}"
)
# log.info(f"保存基本信息--{info['base_info']['信用代码']},耗时{baseCore.getTimeCost(start, time.time())}")
log
.
info
(
f
"保存基本信息--{company_dict['name']},耗时{baseCore.getTimeCost(start, time.time())}"
)
return
company_dict
#保存基本信息
# def saveBaseInfo(info):
# start = time.time()
# #基本信息发送到kafka
# company_dict = {
# 'name': info['base_info']['公司名称'], # 企业名称
# 'shortName': info['base_info']['公司名称'], # 企业简称
# 'socialCreditCode': info['base_info']['信用代码'], # 统一社会信用代码
# 'officialPhone': info['base_info']['电话'], # 电话
# 'officialUrl': info['base_info']['公司网站'], # 官网
# 'briefInfo': info['base_info']['公司简介'], # 简介
# 'industry': info['base_info']['行业'], # 所属行业
# 'englishName': info['base_info']['公司名称'], # 英文名
# 'address': info['base_info']['地址'], # 地址
# 'status': 0, # 状态
# }
# producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 0, 2))
# kafka_result = producer.send("regionInfo", json.dumps(company_dict, ensure_ascii=False).encode('utf8'))
# kafka_result.get(timeout=10)
# log.info(f"保存基本信息--{info['base_info']['信用代码']},耗时{baseCore.getTimeCost(start, time.time())}")
# # log.info(f"保存基本信息--{company_dict['name']},耗时{baseCore.getTimeCost(start, time.time())}")
#保存高管信息
#保存高管信息
def
savePeopleInfo
(
info
):
def
savePeopleInfo
(
info
):
...
@@ -269,43 +298,73 @@ def beginWork():
...
@@ -269,43 +298,73 @@ def beginWork():
#给定excel名单 保存股票代码
#给定excel名单 保存股票代码
okCount
=
0
okCount
=
0
errorCount
=
0
errorCount
=
0
df_all
=
pd
.
read_excel
(
'./data/96-22的500强企业清单.xlsx'
,
dtype
=
str
,
keep_default_na
=
False
)
df_all_xydm
=
pd
.
read_excel
(
'../../data/工作簿1.xlsx'
,
dtype
=
str
,
keep_default_na
=
False
)
for
i
in
range
(
300
,
len
(
df_all
)):
df_all
=
pd
.
read_excel
(
'../../data/23年500强企业新榜股票代码.xlsx'
,
dtype
=
str
,
keep_default_na
=
False
)
log
.
info
(
f
"{i}----------开始"
)
for
i
in
range
(
len
(
df_all_xydm
)):
country
=
df_all
[
'国内外'
][
i
]
# name = df_all['中文名称'][i]
if
country
==
'国外'
:
# rank = df_all['排名'][i]
# officialUrl = df_all['企业官网'][i]
# industry = df_all['行业'][i]
# englishName = df_all['英文名称'][i]
# address = df_all['企业总部地址'][i]
xydm_name
=
df_all_xydm
[
'名称'
][
i
]
# print(xydm_name)
for
j
in
range
(
len
(
df_all
)):
name
=
df_all
[
'中文名称'
][
j
]
if
name
==
xydm_name
:
print
(
name
,
xydm_name
)
xydm
=
df_all_xydm
[
'信用代码'
][
i
]
if
i
>=
22
:
pass
pass
else
:
else
:
log
.
info
(
f
"{i}----------为国内企业 跳过"
)
continue
continue
gpdm
=
df_all
[
'股票代码'
][
i
]
log
.
info
(
f
"{i}----------开始"
)
# country = df_all['企业所属国家'][i]
# if country=='中国':
# continue
# else:
# log.info(f"{i}----------为国外企业 继续")
gpdm
=
df_all
[
'股票代码'
][
j
]
#没有股票代码,就保存榜单中的数据
if
gpdm
==
''
:
if
gpdm
==
''
:
pass
else
:
log
.
info
(
f
"{i}----------为股票代码不为空 跳过"
)
continue
continue
enname
=
df_all
[
'英文名称'
][
i
]
# xydm = baseCore.getNextXydm()
# Nongpdm(xydm,name,officialUrl,industry,englishName,address)
else
:
log
.
info
(
f
"{j}----------为股票代码不为空 继续"
)
pass
enname
=
df_all
[
'英文名称'
][
j
]
if
enname
!=
''
:
if
enname
!=
''
:
pass
pass
else
:
else
:
log
.
info
(
f
"{i
}----------英文名字为空 跳过"
)
log
.
info
(
f
"{j
}----------英文名字为空 跳过"
)
continue
continue
log
.
info
(
f
"{i}----------开始股票代码"
)
# log.info(f"{i}----------开始股票代码")
gpdm
=
getGpdm
(
enname
)
# gpdm = getGpdm(enname)
# xydm=baseCore.getNextXydm()
retData
=
getInfo
(
enname
,
gpdm
,
xydm
)
# saveBaseInfo(retData)
savePeopleInfo
(
retData
)
#也可以去采集企业动态
news
(
j
,
gpdm
,
xydm
)
if
gpdm
!=
''
:
if
gpdm
!=
''
:
okCount
=
okCount
+
1
okCount
=
okCount
+
1
else
:
else
:
errorCount
=
errorCount
+
1
errorCount
=
errorCount
+
1
log
.
info
(
f
"{i
}-------成功{okCount}--失败-{errorCount}"
)
log
.
info
(
f
"{j
}-------成功{okCount}--失败-{errorCount}"
)
if
gpdm
==
''
:
if
gpdm
==
''
:
continue
continue
else
:
else
:
pass
pass
df_all
[
'股票代码'
][
i
]
=
gpdm
df_all
[
'股票代码'
][
j
]
=
gpdm
else
:
continue
if
(
i
%
10
==
0
):
if
(
i
%
10
==
0
):
df_all
.
to_excel
(
r'.
\data\96-22的500强企业清单_ret
.xlsx'
,
sheet_name
=
'Sheet1'
,
index
=
False
,
header
=
True
)
df_all
.
to_excel
(
r'.
.\..\data\23年500强企业新上榜_ret22
.xlsx'
,
sheet_name
=
'Sheet1'
,
index
=
False
,
header
=
True
)
df_all
.
to_excel
(
r'.
\data\96-22的500强企业清单_ret
.xlsx'
,
sheet_name
=
'Sheet1'
,
index
=
False
,
header
=
True
)
df_all
.
to_excel
(
r'.
.\..\data\23年500强企业新榜_ret22
.xlsx'
,
sheet_name
=
'Sheet1'
,
index
=
False
,
header
=
True
)
# 释放资源
# 释放资源
baseCore
.
close
()
baseCore
.
close
()
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论