提交 3646fd0b 作者: 薛凌堃

Merge remote-tracking branch 'origin/master'

import json import json
...@@ -11,18 +11,19 @@ from NewsYahoo import news ...@@ -11,18 +11,19 @@ from NewsYahoo import news
from base.BaseCore import BaseCore from base.BaseCore import BaseCore
import urllib3 import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4') taskType = '企业基本信息/雅虎财经'
cursor = cnx.cursor()
baseCore = BaseCore() baseCore = BaseCore()
log= baseCore.getLogger() log = baseCore.getLogger()
headers = { headers = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'accept-encoding': 'gzip, deflate, br', 'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7', 'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
'cache-control': 'max-age=0', 'cache-control': 'max-age=0',
#'cookie': 'maex=%7B%22v2%22%3A%7B%7D%7D; GUC=AQEBBwFjY49jkEIa8gQo&s=AQAAABw20C7P&g=Y2JIFQ; A1=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc; A3=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc; A1S=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc&j=WORLD; PRF=t%3D6954.T%252BTEL%252BSOLB.BR%252BSTM%252BEMR%252BGT%252BAMD%252BSYM.DE%252BPEMEX%252BSGO.PA%252BLRLCF%252BSYNH%252B001040.KS; cmp=t=1669714927&j=0&u=1---', # 'cookie': 'maex=%7B%22v2%22%3A%7B%7D%7D; GUC=AQEBBwFjY49jkEIa8gQo&s=AQAAABw20C7P&g=Y2JIFQ; A1=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc; A3=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc; A1S=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc&j=WORLD; PRF=t%3D6954.T%252BTEL%252BSOLB.BR%252BSTM%252BEMR%252BGT%252BAMD%252BSYM.DE%252BPEMEX%252BSGO.PA%252BLRLCF%252BSYNH%252B001040.KS; cmp=t=1669714927&j=0&u=1---',
'sec-ch-ua': '"Chromium";v="106", "Google Chrome";v="106", "Not;A=Brand";v="99"', 'sec-ch-ua': '"Chromium";v="106", "Google Chrome";v="106", "Not;A=Brand";v="99"',
'sec-ch-ua-mobile': '?0', 'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': "Windows", 'sec-ch-ua-platform': "Windows",
...@@ -33,42 +34,22 @@ headers = { ...@@ -33,42 +34,22 @@ headers = {
'upgrade-insecure-requests': '1', 'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36' 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
} }
# 获取股票代码
# def getGpdm(name):
# start=time.time()
# gpdm=""
# try:
#
# url = f'https://query1.finance.yahoo.com/v1/finance/search?q={name}&lang=en-US&region=US&quotesCount=6&newsCount=2&listsCount=2&enableFuzzyQuery=false&quotesQueryId=tss_match_phrase_query&multiQuoteQueryId=multi_quote_single_token_query&newsQueryId=news_cie_vespa&enableCb=true&enableNavLinks=true&enableEnhancedTrivialQuery=true&enableResearchReports=true&enableCulturalAssets=true&enableLogoUrl=true&researchReportsCount=2'
# response = requests.get(url, headers=headers, verify=False,timeout=(3.05, 3))
# time.sleep(3)
# except:
# return gpdm
# if (response.status_code == 200):
# pass
# else:
# log.error(f"{name}------获取股票接口返回失败:{response.status_code}")
# return gpdm
# retJson = json.loads(response.content.decode('utf-8'))
# try:
# gpdm =retJson['quotes'][0]['symbol']
# except:
# log.error(f"{name}---获取股票代码异常")
# return gpdm
# log.info(f"获取股票代码--{name},耗时{baseCore.getTimeCost(start, time.time())}")
#
# return gpdm
# 根据股票代码 获取企业基本信息 高管信息 # 根据股票代码 获取企业基本信息 高管信息
def getInfo(gpdm,xydm): def getInfo(name,enname,gpdm, xydm, start):
if 'HK' in str(gpdm):
tmp_g = str(gpdm).split('.')[0]
if len(tmp_g) == 5:
gpdm_ = str(gpdm)[1:]
else:
pass
else:
gpdm_ = gpdm gpdm_ = gpdm
if 'HK' in gpdm_: retData = {}
gpdm_ = gpdm_
start = time.time()
retData={}
retData['base_info'] = { retData['base_info'] = {
'公司名称': '', '公司名称': name,
'英文名':'', '英文名': enname,
'信用代码': xydm, '信用代码': xydm,
'股票代码': gpdm, '股票代码': gpdm,
'地址': '', '地址': '',
...@@ -79,12 +60,12 @@ def getInfo(gpdm,xydm): ...@@ -79,12 +60,12 @@ def getInfo(gpdm,xydm):
'员工人数': '', '员工人数': '',
'公司简介': '' '公司简介': ''
} }
retData['people_info']=[] retData['people_info'] = []
# https://finance.yahoo.com/quote/VOW3.DE/profile?p=VOW3.DE # https://finance.yahoo.com/quote/VOW3.DE/profile?p=VOW3.DE
url = f'https://finance.yahoo.com/quote/{gpdm}/profile?p={gpdm}' url = f'https://finance.yahoo.com/quote/{gpdm_}/profile?p={gpdm_}'
time.sleep(3) time.sleep(3)
for i in range(0,3): for i in range(0, 3):
try: try:
response = requests.get(url, headers=headers, verify=False) response = requests.get(url, headers=headers, verify=False)
time.sleep(1) time.sleep(1)
...@@ -92,14 +73,20 @@ def getInfo(gpdm,xydm): ...@@ -92,14 +73,20 @@ def getInfo(gpdm,xydm):
break break
else: else:
log.error(f"{gpdm}---第{i}次---获取基本信息接口返回失败:{response.status_code}") log.error(f"{gpdm}---第{i}次---获取基本信息接口返回失败:{response.status_code}")
except : except:
continue continue
if (response.status_code == 200): if (response.status_code == 200):
pass pass
else: else:
log.error(f"{gpdm}------获取基本信息接口重试后依然失败失败:{response.status_code}") log.error(f"{gpdm}------获取基本信息接口重试后依然失败失败:{response.status_code}")
return retData exeception = '获取基本信息接口返回失败'
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(xydm, taskType, state, takeTime, url, exeception)
return [state,retData]
state = 1
soup = BeautifulSoup(response.content, 'html.parser') soup = BeautifulSoup(response.content, 'html.parser')
page = soup.find('div', {'id': 'Col1-0-Profile-Proxy'}) page = soup.find('div', {'id': 'Col1-0-Profile-Proxy'})
try: try:
...@@ -135,8 +122,8 @@ def getInfo(gpdm,xydm): ...@@ -135,8 +122,8 @@ def getInfo(gpdm,xydm):
except: except:
com_jianjie = '' com_jianjie = ''
dic_com_info = { dic_com_info = {
'公司名称':'', '公司名称': name,
'英文名':'', '英文名': enname,
'信用代码': xydm, '信用代码': xydm,
'股票代码': gpdm, '股票代码': gpdm,
'地址': com_address, '地址': com_address,
...@@ -147,8 +134,8 @@ def getInfo(gpdm,xydm): ...@@ -147,8 +134,8 @@ def getInfo(gpdm,xydm):
'员工人数': com_people, '员工人数': com_people,
'公司简介': com_jianjie '公司简介': com_jianjie
} }
retData['base_info']=dic_com_info retData['base_info'] = dic_com_info
#高管信息 # 高管信息
retPeople = [] retPeople = []
try: try:
list_people = page.find('table', {'class': 'W(100%)'}).find_all('tr')[1:] list_people = page.find('table', {'class': 'W(100%)'}).find_all('tr')[1:]
...@@ -177,8 +164,8 @@ def getInfo(gpdm,xydm): ...@@ -177,8 +164,8 @@ def getInfo(gpdm,xydm):
except: except:
p_year = '' p_year = ''
if(p_zhiwu=="N/A"): if (p_zhiwu == "N/A"):
p_zhiwu="" p_zhiwu = ""
if (p_money == "N/A"): if (p_money == "N/A"):
p_money = "" p_money = ""
if (p_xingshi == "N/A"): if (p_xingshi == "N/A"):
...@@ -186,7 +173,7 @@ def getInfo(gpdm,xydm): ...@@ -186,7 +173,7 @@ def getInfo(gpdm,xydm):
if (p_year == "N/A"): if (p_year == "N/A"):
p_year = "" p_year = ""
dic_main_people = { dic_main_people = {
'公司名称': '', '公司名称': name,
'股票代码': gpdm, '股票代码': gpdm,
'信用代码': xydm, '信用代码': xydm,
'姓名': p_name, '姓名': p_name,
...@@ -197,41 +184,15 @@ def getInfo(gpdm,xydm): ...@@ -197,41 +184,15 @@ def getInfo(gpdm,xydm):
} }
retPeople.append(dic_main_people) retPeople.append(dic_main_people)
retData['people_info'] = retPeople retData['people_info'] = retPeople
df_retData = pd.DataFrame(retPeople)
# df_a = pd.DataFrame(retData['base_info'])
df_retData.to_excel('./data/采集高管结果1.xlsx',index=False)
log.info(f"获取基本信息--{gpdm},耗时{baseCore.getTimeCost(start, time.time())}") log.info(f"获取基本信息--{gpdm},耗时{baseCore.getTimeCost(start, time.time())}")
return retData return [state,retData]
# def Nongpdm(xydm,name):
# start = time.time()
# company_dict = {
# 'name': name, # 企业名称
# 'shortName': '', # 企业简称
# 'socialCreditCode': xydm, # 统一社会信用代码
# 'officialPhone': '', # 电话
# 'officialUrl': '', # 官网
# 'briefInfo': '', # 简介
# 'industry': '', # 所属行业
# 'englishName': name, # 英文名
# 'address': '', # 地址
# 'status': 0, # 状态
# }
# # print(company_dict)
# producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 0, 2))
# kafka_result = producer.send("regionInfo", json.dumps(company_dict, ensure_ascii=False).encode('utf8'))
# kafka_result.get(timeout=10)
# # log.info(f"保存基本信息--{info['base_info']['信用代码']},耗时{baseCore.getTimeCost(start, time.time())}")
# log.info(f"保存基本信息--{company_dict['name']},耗时{baseCore.getTimeCost(start, time.time())}")
# return company_dict
#
#保存基本信息 # 保存基本信息
def saveBaseInfo(info): def saveBaseInfo(info,start):
start = time.time() # 基本信息发送到kafka
#基本信息发送到kafka
company_dict = { company_dict = {
'name': '', # 企业名称 'name': info['base_info']['公司名称'], # 企业名称
'shortName': '', # 企业简称 'shortName': '', # 企业简称
'socialCreditCode': info['base_info']['信用代码'], # 统一社会信用代码 'socialCreditCode': info['base_info']['信用代码'], # 统一社会信用代码
'officialPhone': info['base_info']['电话'], # 电话 'officialPhone': info['base_info']['电话'], # 电话
...@@ -249,13 +210,13 @@ def saveBaseInfo(info): ...@@ -249,13 +210,13 @@ def saveBaseInfo(info):
log.info(f"保存基本信息--{info['base_info']['信用代码']},耗时{baseCore.getTimeCost(start, time.time())}") log.info(f"保存基本信息--{info['base_info']['信用代码']},耗时{baseCore.getTimeCost(start, time.time())}")
log.info(f"保存基本信息--{company_dict['name']},耗时{baseCore.getTimeCost(start, time.time())}") log.info(f"保存基本信息--{company_dict['name']},耗时{baseCore.getTimeCost(start, time.time())}")
#保存高管信息
def savePeopleInfo(info): # 保存高管信息
start = time.time() def savePeopleInfo(info,start):
# 高管信息调用接口 # 高管信息调用接口
list_people = info['people_info'] list_people = info['people_info']
list_one_info = [] list_one_info = []
for i in range(0,len(list_people)): for i in range(0, len(list_people)):
dic_json = { dic_json = {
"socialCreditCode": list_people[i]['信用代码'], "socialCreditCode": list_people[i]['信用代码'],
"name": list_people[i]['姓名'], "name": list_people[i]['姓名'],
...@@ -269,7 +230,7 @@ def savePeopleInfo(info): ...@@ -269,7 +230,7 @@ def savePeopleInfo(info):
"benefitShare": '', "benefitShare": '',
"currentTerm": '', "currentTerm": '',
"personInfo": '', "personInfo": '',
"sort": str(i+1) "sort": str(i + 1)
} }
list_one_info.append(dic_json) list_one_info.append(dic_json)
json_updata = json.dumps(list_one_info) json_updata = json.dumps(list_one_info)
...@@ -277,11 +238,12 @@ def savePeopleInfo(info): ...@@ -277,11 +238,12 @@ def savePeopleInfo(info):
if json_updata == '[]': if json_updata == '[]':
pass pass
else: else:
for i in range(0,3): for i in range(0, 3):
response = requests.post('http://114.115.236.206:9988/datapull/sync/executive',data=json_updata,timeout=300, verify=False) response = requests.post('http://114.115.236.206:9988/datapull/sync/executive', data=json_updata,
timeout=300, verify=False)
if (response.status_code == 200): if (response.status_code == 200):
retJson = json.loads(response.content.decode('utf-8')) retJson = json.loads(response.content.decode('utf-8'))
if(retJson['success'] or retJson['success']=='true'): if (retJson['success'] or retJson['success'] == 'true'):
break break
if (response.status_code == 200): if (response.status_code == 200):
...@@ -290,123 +252,91 @@ def savePeopleInfo(info): ...@@ -290,123 +252,91 @@ def savePeopleInfo(info):
pass pass
else: else:
log.error("保存高管接口失败---{retJson}") log.error("保存高管接口失败---{retJson}")
exception = '保存高管接口失败'
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(dic_json['socialCreditCode'], taskType, state, takeTime, '', exception)
return state
else: else:
log.error("保存高管接口失败---{response.status_code}") log.error("保存高管接口失败---{response.status_code}")
exception = '保存高管接口失败'
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(dic_json['socialCreditCode'], taskType, state, takeTime, '', exception)
return state
state = 1
log.info(f"保存高管信息--{info['base_info']['信用代码']},耗时{baseCore.getTimeCost(start, time.time())}") log.info(f"保存高管信息--{info['base_info']['信用代码']},耗时{baseCore.getTimeCost(start, time.time())}")
return state
#根据名字获取股票代码 必须是英文名字 如果提供的数据有股票代码 则跳过此步骤
#gpdm=getGpdm("Volkswagen")
#生成一个新的信用代码 如果提供的原始数据有信用代码 则不能生成新的信用代码
#xydm=baseCore.getNextXydm()
# xydm='ZZSN230710201009006'
# retData=getInfo("Volkswagen","VOW3.DE",xydm)
# saveBaseInfo(retData)
# savePeopleInfo(retData)
# print(retData)
#采集工作 # 采集工作
def beginWork(): def beginWork():
#给定excel名单 保存股票代码 while True:
okCount=0 social_code = baseCore.redicPullData('')
errorCount=0 if not social_code:
df_all = pd.read_excel('./data/福布斯美国企业股票代码.xlsx',dtype=str,keep_default_na=False) time.sleep(20)
a = [] continue
b = [] if social_code == 'None':
# df_all = pd.read_excel('../../data/23年500强企业新榜股票代码.xlsx',dtype=str, keep_default_na=False) time.sleep(20)
for i in range(len(df_all)): continue
# # name = df_all['中文名称'][i] # 数据库中获取基本信息
# # rank = df_all['排名'][i] data = baseCore.getInfomation(social_code)
# # officialUrl = df_all['企业官网'][i] name = data[1]
# # industry = df_all['行业'][i] enname = data[5]
# # englishName = df_all['英文名称'][i] gpdm = data[3]
# # address = df_all['企业总部地址'][i] xydm = data[2]
#
# xydm_name = df_all_xydm['名称'][i] # 获取该企业对应项目的采集次数
# # print(xydm_name) count = data[13]
# for j in range(len(df_all)): start_time = time.time()
# name = df_all['中文名称'][j] # 股票代码为空跳过
# if name == xydm_name: if (gpdm == ''):
# print(name,xydm_name) log.error(f"{name}--股票代码为空 跳过")
# xydm = df_all_xydm['信用代码'][i] exception = '股票代码为空'
# if i>=22: state = 0
# pass takeTime = baseCore.getTimeCost(start_time, time.time())
# else: baseCore.recordLog(xydm, taskType, state, takeTime, '', exception)
# continue continue
# log.info(f"{i}----------开始")
# # country = df_all['企业所属国家'][i]
# # if country=='中国':
# # continue
# # else:
# # log.info(f"{i}----------为国外企业 继续")
name = df_all['name'][i]
ename = df_all['companyName'][i]
gpdm= df_all['code'][i]
xydm = df_all['xydm'][i]
# if gpdm == '':
# Nongpdm(xydm,name)
# continue
# # print(f'名字:{name},股票代码:{gpdm},信用代码:{xydm}')
# # #没有股票代码,就保存榜单中的数据
# if gpdm == '':
# Nongpdm(gpdm,name)
# continue
# # xydm = baseCore.getNextXydm()
# # # Nongpdm(xydm,name,officialUrl,industry,englishName,address)
# # else:
# # log.info(f"{j}----------为股票代码不为空 继续")
# # pass
# # enname = df_all['英文名称'][j]
# # if enname != '':
# # pass
# # else:
# # log.info(f"{j}----------英文名字为空 跳过")
# # continue
# # # log.info(f"{i}----------开始股票代码")
# # # gpdm = getGpdm(enname)
# # # xydm=baseCore.getNextXydm()
# #
try: try:
retData = getInfo(name,ename,gpdm,xydm) retData = getInfo(name,enname,gpdm, xydm, start_time)
saveBaseInfo(retData) # 基本信息采集成功 进行数据入库,否则不入库
savePeopleInfo(retData) if retData[0] == 1:
#采集企业动态 # 企业基本信息入库
news(i,gpdm,xydm) try:
okCount += 1 saveBaseInfo(retData[1],start_time)
b.append([name,gpdm,xydm])
print(name,'.......采集成功')
except: except:
a.append([name,gpdm,xydm]) log.error(f'{name}....企业基本信息Kafka操作失败')
errorCount += 1 exception = 'Kafka操作失败'
print(name,'......采集失败') state = 0
print(f'成功{okCount}个,失败{errorCount}个') takeTime = baseCore.getTimeCost(start_time, time.time())
df_a = pd.DataFrame(np.array(a)) baseCore.recordLog(xydm, taskType, state, takeTime, '', exception)
df_a.columns = ['name','gpdm','xydm'] # 企业高管信息入库
df_a.to_excel('./data/没有采集到.xlsx',index=False) state = savePeopleInfo(retData[1],start_time)
df_b = pd.DataFrame(np.array(b)) # 只有企业高管信息和企业基本信息都采集到,该企业才算采集成功
df_b.columns = ['name','gpdm','xydm'] if state == 1:
df_b.to_excel('./data/采集到.xlsx',index=False) takeTime = baseCore.getTimeCost(start_time, time.time())
# baseCore.recordLog(xydm, taskType, state, takeTime, '', '')
# if gpdm!='': else:
# okCount=okCount+1 pass
# else: else:
# errorCount=errorCount+1 pass
# log.info(f"{i}-------成功{okCount}--失败-{errorCount}") except Exception as e:
# # if gpdm == '': # 若出现尚未发现的错误,则保存错误信息以及出错位置
# # continue ee = e.__traceback__.tb_lineno
# # else: log.error(f'{name}...{xydm}...{gpdm}.....数据采集失败,原因:{ee}行 {e}')
# # pass state = 0
# # df_all['股票代码'][j]=gpdm takeTime = baseCore.getTimeCost(start_time, time.time())
# # else: baseCore.recordLog(xydm, taskType, state, takeTime, '', f'数据采集失败,原因:{ee}行 {e}')
# # continue
# # if (i % 10 == 0): # 企业数据采集完成,采集次数加一
# # df_all.to_excel(r'./data/23年500强企业新上榜_ret22.xlsx', sheet_name='Sheet1', index=False, header=True) count += 1
# # df_all.to_excel(r'./data/23年500强企业新榜_ret22.xlsx', sheet_name='Sheet1', index=False, header=True) runType = 'BaseInfoRunCount'
# # 释放资源 baseCore.updateRun(social_code,runType,count)
# 释放资源
baseCore.close() baseCore.close()
if __name__ == '__main__': if __name__ == '__main__':
#gpdm = getGpdm("Volkswagen")
#print(gpdm)
beginWork() beginWork()
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论