Commit a9946e9a by 薛凌堃

Foreign company basic info - executive info - company news

Parent 8f9f0213
...@@ -41,7 +41,7 @@ def beinWork(tyc_code,social_code):
             # time.sleep(random.randint(3, 5))
             break
         except Exception as e :
-            log.error("request请求异常----m-----{e}")
+            log.error(f"request请求异常----m-----{e}")
         pass
     if (response.status_code == 200):
......
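The only functional change in this hunk is the added f prefix on the log format string: without it, Python writes the braces literally instead of interpolating the caught exception. A minimal sketch of the difference:

e = ValueError("connection reset")
print("request exception----m-----{e}")   # prints the braces literally: ...{e}
print(f"request exception----m-----{e}")  # prints the message: ...connection reset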
# Fetch company news (press releases) from Yahoo Finance
import time
import pandas as pd
import pymysql
import requests
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium import webdriver
from base.BaseCore import BaseCore

baseCore = BaseCore()
log = baseCore.getLogger()

# Fetch the detail page of one article and save it to MySQL
def getZx(xydm, url, title, cnx):
    start_time_content = time.time()
    try:
        # Headless Chrome, configured to look less like automation and to skip images
        chrome_options_content = webdriver.ChromeOptions()
        chrome_options_content.add_argument('--disable-gpu')
        chrome_options_content.add_argument('--ignore-certificate-errors')
        chrome_options_content.add_experimental_option('excludeSwitches', ['enable-automation'])
        chrome_options_content.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options_content.add_argument("--start-maximized")
        prefs_content = {'profile.managed_default_content_settings.images': 2}
        chrome_options_content.add_experimental_option('prefs', prefs_content)
        chrome_options_content.add_argument('--headless')
        executable_path = r'D:\chrome\chromedriver.exe'
        driverContent = webdriver.Chrome(options=chrome_options_content, executable_path=executable_path)
        driverContent.get(url)
        # Expand the collapsed article body if the button is present
        try:
            clickButton = driverContent.find_element(By.CLASS_NAME, "collapse-button")
            clickButton.click()
        except Exception:
            pass
        time.sleep(0.5)
        authorElement = driverContent.find_element(By.CLASS_NAME, "caas-author-byline-collapse")
        timeElement = driverContent.find_element(By.CLASS_NAME, "caas-attr-time-style").find_element(By.TAG_NAME, "time")
        contentElement = driverContent.find_element(By.CLASS_NAME, "caas-body")
        author = authorElement.text.strip().replace("'", "''")
        pub_time = timeElement.get_attribute("datetime").strip().replace("'", "''").replace("T", " ")
        pub_time = pub_time[0:19]
        content = contentElement.text.strip().replace("'", "''")
        driverContent.close()
        # One news record, in the column order of the INSERT below
        list_info = [
            xydm,
            title,
            '',
            content,
            pub_time,
            url,
            '雅虎财经',
            author,
            '2',
            'zh'
        ]
        with cnx.cursor() as cursor:
            try:
                insert_sql = '''insert into brpa_source_article(social_credit_code,title,summary,content,publish_date,source_address,origin,author,type,lang) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
                cursor.execute(insert_sql, tuple(list_info))
                cnx.commit()
            except Exception as e1:
                log.error("保存数据库失败")
        log.info(f"文章耗时,耗时{baseCore.getTimeCost(start_time_content, time.time())}")
    except Exception as e:
        log.error("获取正文失败")

# Shared browser for the news-list page, same options as above
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("--start-maximized")
prefs = {'profile.managed_default_content_settings.images': 2}
chrome_options.add_experimental_option('prefs', prefs)
chrome_options.add_argument('--headless')
executable_path = r'D:\chrome\chromedriver.exe'
driver = webdriver.Chrome(options=chrome_options, executable_path=executable_path)
cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')

# Scroll down repeatedly so the infinite news list keeps loading
def scroll(driver):
    for i in range(0, 30):
        # js = "window.scrollTo(0,document.body.scrollHeight)"
        js = "var q=document.documentElement.scrollTop=100000"
        driver.execute_script(js)
        time.sleep(0.1)

# # Read the excel data
# df_all = pd.read_excel(r'./../data/2023年500强新上榜名单.xlsx', sheet_name='500强23年国外', keep_default_na=False)
# for num in range(len(df_all)):
#     start_time = time.time()
#     # country = df_all['国别'][num]
#     # if(country!='国外'):
#     #     continue
#     enname=df_all['英文名称'][num]
#     gpdm = df_all['股票票代码'][num]
#     xydm = df_all['信用代码'][num]
#     if(gpdm==''):
#         log.error(f"{num}--{gpdm}--股票代码为空 跳过")
#         continue
#     if (xydm == ''):
#         log.error(f"{num}--{gpdm}--信用代码为空 跳过")
#         continue
#     count = int(df_all['企业动态数量(7.15)'][num])
#     # if(count>0):
#     #     log.error(f"{num}--{gpdm}--动态大于0 跳过")
#     #     continue

# https://finance.yahoo.com/quote/GOOG/press-releases?p=GOOG
def news(num, gpdm, xydm):
    start_time = time.time()
    url = f"https://finance.yahoo.com/quote/{gpdm}/press-releases?p={gpdm}"
    driver.get(url)
    scroll(driver)
    # if True:
    #     continue
    try:
        news_div = driver.find_element(By.ID, 'summaryPressStream-0-Stream')
    except Exception:
        log.error(f"{num}--{gpdm}--没找到新闻元素")
        return
    news_lis = news_div.find_elements(By.XPATH, "./ul/li")
    log.info(f"{num}--{gpdm}--{len(news_lis)}条信息")
    for i in range(0, len(news_lis)):
        try:
            a_ele = news_lis[i].find_element(By.XPATH, "./div[1]/div[1]/div[2]/h3[1]/a")
        except Exception:
            log.error(f"{num}--{gpdm}--{i}----a标签没找到")
            continue
        news_url = a_ele.get_attribute("href").strip().replace("'", "''")
        # Only keep links that stay on finance.yahoo.com
        if not news_url.startswith("https://finance.yahoo.com"):
            continue
        # Skip URLs that are already in the database for this company
        with cnx.cursor() as cursor:
            sel_sql = '''select social_credit_code from brpa_source_article where source_address = %s and social_credit_code=%s '''
            cursor.execute(sel_sql, (news_url, xydm))
            selects = cursor.fetchall()
            if selects:
                log.error(f"{num}--{gpdm}--网址已经存在----{news_url}")
                continue
        title = a_ele.text.strip().replace("'", "''")
        getZx(xydm, news_url, title, cnx)
        log.info(f"{num}--{gpdm}--{i}----{news_url}")
    log.info(f"{num}--{gpdm}--企业整体,耗时{baseCore.getTimeCost(start_time, time.time())}")

# Release resources
baseCore.close()
\ No newline at end of file
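NewsYahoo.py only defines news(); the companion collector drives it via the import shown further below (from NewsYahoo import news). A one-off standalone invocation might look like this sketch, where the ticker and credit code are illustrative placeholders rather than values from the commit:

# Hypothetical one-off run: collect press releases for a single ticker.
# 'GOOG' and 'XYDM_PLACEHOLDER' are made-up example arguments.
if __name__ == '__main__':
    news(0, 'GOOG', 'XYDM_PLACEHOLDER')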
# Fetch company news (press releases) from Yahoo Finance
...@@ -100,12 +100,12 @@ def scroll(driver):
 # Read the excel data
-df_all = pd.read_excel(r'.\data\国外企业.xlsx', sheet_name=0, keep_default_na=False)
-for num in range(718,len(df_all)):
+df_all = pd.read_excel(r'./../data/2023年500强新上榜名单.xlsx', sheet_name='500强23年国外', keep_default_na=False)
+for num in range(len(df_all)):
     start_time = time.time()
-    country = df_all['国别'][num]
-    if(country!='国外'):
-        continue
+    # country = df_all['国别'][num]
+    # if(country!='国外'):
+    #     continue
     enname=df_all['英文名称'][num]
     gpdm = df_all['股票票代码'][num]
     xydm = df_all['信用代码'][num]
...@@ -121,6 +121,7 @@ for num in range(718,len(df_all)):
     # continue
     # https://finance.yahoo.com/quote/GOOG/press-releases?p=GOOG
+    # def news(i,gpdm):
     url=f"https://finance.yahoo.com/quote/{gpdm}/press-releases?p={gpdm}"
     driver.get(url)
     scroll(driver)
......
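Both the list page and the detail fetch guard against re-inserting the same article with a SELECT before the INSERT. The pattern, extracted into a hypothetical helper for clarity (table and column names are as in the code above; the function name is ours, not the repo's):

def already_collected(cnx, news_url, xydm):
    # True if this (source URL, credit code) pair is already in the table.
    sel_sql = '''select social_credit_code from brpa_source_article
                 where source_address = %s and social_credit_code = %s'''
    with cnx.cursor() as cursor:
        cursor.execute(sel_sql, (news_url, xydm))
        return bool(cursor.fetchall())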
 import json
...@@ -5,11 +5,15 @@ import pandas as pd
 import requests
 from bs4 import BeautifulSoup
 from kafka import KafkaProducer
+from NewsYahoo import news
 from base.BaseCore import BaseCore
+import urllib3
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 baseCore = BaseCore()
-log= BaseCore.getLogger()
+log= baseCore.getLogger()
 headers = {
     'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
     'accept-encoding': 'gzip, deflate, br',
...@@ -185,29 +189,54 @@ def getInfo(name,gpdm,xydm):
         }
         retPeople.append(dic_main_people)
     retData['people_info'] = retPeople
+    df_retData = pd.DataFrame(retPeople)
+    # df_a = pd.DataFrame(retData['base_info'])
+    df_retData.to_excel('采集高管结果1.xlsx',index=False)
     log.info(f"获取基本信息--{gpdm},耗时{baseCore.getTimeCost(start, time.time())}")
     return retData

-# Save basic info
-def saveBaseInfo(info):
+def Nongpdm(xydm,name,officialUrl,industry,englishName,address):
     start = time.time()
-    # Send basic info to kafka
     company_dict = {
-        'name': info['base_info']['公司名称'],  # company name
-        'shortName': info['base_info']['公司名称'],  # short name
-        'socialCreditCode': info['base_info']['信用代码'],  # unified social credit code
-        'officialPhone': info['base_info']['电话'],  # phone
-        'officialUrl': info['base_info']['公司网站'],  # official website
-        'briefInfo': info['base_info']['公司简介'],  # brief introduction
-        'industry': info['base_info']['行业'],  # industry
-        'englishName': info['base_info']['公司名称'],  # English name
-        'address': info['base_info']['地址'],  # address
+        'name': name,  # company name
+        'shortName': '',  # short name
+        'socialCreditCode': xydm,  # unified social credit code
+        'officialPhone': '',  # phone
+        'officialUrl': officialUrl,  # official website
+        'briefInfo': '',  # brief introduction
+        'industry': industry,  # industry
+        'englishName': englishName,  # English name
+        'address': address,  # address
         'status': 0,  # status
     }
     producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 0, 2))
     kafka_result = producer.send("regionInfo", json.dumps(company_dict, ensure_ascii=False).encode('utf8'))
     kafka_result.get(timeout=10)
-    log.info(f"保存基本信息--{info['base_info']['信用代码']},耗时{baseCore.getTimeCost(start, time.time())}")
+    # log.info(f"保存基本信息--{info['base_info']['信用代码']},耗时{baseCore.getTimeCost(start, time.time())}")
+    log.info(f"保存基本信息--{company_dict['name']},耗时{baseCore.getTimeCost(start, time.time())}")
+    return company_dict

+# Save basic info
+# def saveBaseInfo(info):
+#     start = time.time()
+#     # Send basic info to kafka
+#     company_dict = {
+#         'name': info['base_info']['公司名称'],  # company name
+#         'shortName': info['base_info']['公司名称'],  # short name
+#         'socialCreditCode': info['base_info']['信用代码'],  # unified social credit code
+#         'officialPhone': info['base_info']['电话'],  # phone
+#         'officialUrl': info['base_info']['公司网站'],  # official website
+#         'briefInfo': info['base_info']['公司简介'],  # brief introduction
+#         'industry': info['base_info']['行业'],  # industry
+#         'englishName': info['base_info']['公司名称'],  # English name
+#         'address': info['base_info']['地址'],  # address
+#         'status': 0,  # status
+#     }
+#     producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 0, 2))
+#     kafka_result = producer.send("regionInfo", json.dumps(company_dict, ensure_ascii=False).encode('utf8'))
+#     kafka_result.get(timeout=10)
+#     log.info(f"保存基本信息--{info['base_info']['信用代码']},耗时{baseCore.getTimeCost(start, time.time())}")
+#     # log.info(f"保存基本信息--{company_dict['name']},耗时{baseCore.getTimeCost(start, time.time())}")

 # Save executive info
 def savePeopleInfo(info):
...@@ -269,43 +298,73 @@ def beginWork():
     # Given an excel list, save the stock codes
     okCount=0
     errorCount=0
-    df_all = pd.read_excel('./data/96-22的500强企业清单.xlsx', dtype=str, keep_default_na=False)
-    for i in range(300, len(df_all)):
-        log.info(f"{i}----------开始")
-        country = df_all['国内外'][i]
-        if country=='国外':
+    df_all_xydm = pd.read_excel('../../data/工作簿1.xlsx',dtype=str,keep_default_na=False)
+    df_all = pd.read_excel('../../data/23年500强企业新榜股票代码.xlsx',dtype=str, keep_default_na=False)
+    for i in range(len(df_all_xydm)):
+        # name = df_all['中文名称'][i]
+        # rank = df_all['排名'][i]
+        # officialUrl = df_all['企业官网'][i]
+        # industry = df_all['行业'][i]
+        # englishName = df_all['英文名称'][i]
+        # address = df_all['企业总部地址'][i]
+        xydm_name = df_all_xydm['名称'][i]
+        # print(xydm_name)
+        for j in range(len(df_all)):
+            name = df_all['中文名称'][j]
+            if name == xydm_name:
+                print(name,xydm_name)
+                xydm = df_all_xydm['信用代码'][i]
+                if i>=22:
                     pass
                 else:
-            log.info(f"{i}----------为国内企业 跳过")
                     continue
-        gpdm = df_all['股票代码'][i]
+                log.info(f"{i}----------开始")
+                # country = df_all['企业所属国家'][i]
+                # if country=='中国':
+                #     continue
+                # else:
+                #     log.info(f"{i}----------为国外企业 继续")
+                gpdm = df_all['股票代码'][j]
+                # No stock code: save the data from the ranking list instead
                 if gpdm == '':
-            pass
-        else:
-            log.info(f"{i}----------为股票代码不为空 跳过")
                     continue
-        enname = df_all['英文名称'][i]
+                    # xydm = baseCore.getNextXydm()
+                    # Nongpdm(xydm,name,officialUrl,industry,englishName,address)
+                else:
+                    log.info(f"{j}----------为股票代码不为空 继续")
+                    pass
+                enname = df_all['英文名称'][j]
                 if enname != '':
                     pass
                 else:
-            log.info(f"{i}----------英文名字为空 跳过")
+                    log.info(f"{j}----------英文名字为空 跳过")
                     continue
-        log.info(f"{i}----------开始股票代码")
-        gpdm = getGpdm(enname)
+                # log.info(f"{i}----------开始股票代码")
+                # gpdm = getGpdm(enname)
+                # xydm=baseCore.getNextXydm()
+                retData = getInfo(enname,gpdm,xydm)
+                # saveBaseInfo(retData)
+                savePeopleInfo(retData)
+                # Company news can be collected at this point as well
+                news(j,gpdm,xydm)
                 if gpdm!='':
                     okCount=okCount+1
                 else:
                     errorCount=errorCount+1
-        log.info(f"{i}-------成功{okCount}--失败-{errorCount}")
+                log.info(f"{j}-------成功{okCount}--失败-{errorCount}")
                 if gpdm == '':
                     continue
                 else:
                     pass
-        df_all['股票代码'][i]=gpdm
+                df_all['股票代码'][j]=gpdm
+            else:
+                continue
         if (i % 10 == 0):
-            df_all.to_excel(r'.\data\96-22的500强企业清单_ret.xlsx', sheet_name='Sheet1', index=False, header=True)
+            df_all.to_excel(r'..\..\data\23年500强企业新上榜_ret22.xlsx', sheet_name='Sheet1', index=False, header=True)
-    df_all.to_excel(r'.\data\96-22的500强企业清单_ret.xlsx', sheet_name='Sheet1', index=False, header=True)
+    df_all.to_excel(r'..\..\data\23年500强企业新榜_ret22.xlsx', sheet_name='Sheet1', index=False, header=True)
     # Release resources
     baseCore.close()
......
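The nested loops in beginWork above are effectively an inner join on company name between the two spreadsheets. For reference, the same matching can be expressed with a pandas merge (a sketch, not part of the commit; file paths and column names are taken from the diff above):

import pandas as pd

# Sketch only: express the name-matching loops above as a single merge.
# Column names ('名称', '中文名称', '信用代码', ...) come from the diff.
df_all_xydm = pd.read_excel('../../data/工作簿1.xlsx', dtype=str, keep_default_na=False)
df_all = pd.read_excel('../../data/23年500强企业新榜股票代码.xlsx', dtype=str, keep_default_na=False)
merged = df_all_xydm.merge(df_all, left_on='名称', right_on='中文名称', how='inner')
for _, row in merged.iterrows():
    print(row['信用代码'], row['股票代码'], row['英文名称'])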