提交 1de3a9f5 作者: 薛凌堃

Merge remote-tracking branch 'origin/master'

# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
...@@ -162,12 +162,12 @@ def scroll(xydm,name,gpdm): ...@@ -162,12 +162,12 @@ def scroll(xydm,name,gpdm):
log.error(f"{name}--{gpdm}--获取不到最后一条链接") log.error(f"{name}--{gpdm}--获取不到最后一条链接")
break break
# todo:增量时 需打开注释 # todo:增量时 需打开注释
# try: try:
# selects = selectUrl(last_url_,xydm) selects = selectUrl(last_url_,xydm)
# except: except:
# break break
# if selects: if selects:
# break break
if last_url_ == last_url: if last_url_ == last_url:
break break
last_url_ = last_url last_url_ = last_url
...@@ -178,7 +178,7 @@ def rePutIntoR(item): ...@@ -178,7 +178,7 @@ def rePutIntoR(item):
if __name__ == "__main__": if __name__ == "__main__":
path = r'D:\chrome\chromedriver.exe' path = r'F:\spider\1\chromedriver.exe'
driver = baseCore.buildDriver(path) driver = baseCore.buildDriver(path)
cnx = baseCore.cnx cnx = baseCore.cnx
cursor = baseCore.cursor cursor = baseCore.cursor
...@@ -186,7 +186,7 @@ if __name__ == "__main__": ...@@ -186,7 +186,7 @@ if __name__ == "__main__":
while True: while True:
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息 # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
social_code = baseCore.redicPullData('NewsEnterpriseFbs:gwqy_socialCode') social_code = baseCore.redicPullData('NewsEnterprise:gwqy_socialCode')
# social_code = 'ZZSN22080900000046' # social_code = 'ZZSN22080900000046'
# 判断 如果Redis中已经没有数据,则等待 # 判断 如果Redis中已经没有数据,则等待
...@@ -207,6 +207,8 @@ if __name__ == "__main__": ...@@ -207,6 +207,8 @@ if __name__ == "__main__":
gpdm = str(gpdm)[1:] gpdm = str(gpdm)[1:]
else: else:
pass pass
elif str(gpdm)[-2:] == '.N' or str(gpdm)[-2:] == '.O':
gpdm = gpdm[:-2]
xydm = data[2] xydm = data[2]
# 获取该企业对应项目的采集次数 # 获取该企业对应项目的采集次数
...@@ -280,9 +282,9 @@ if __name__ == "__main__": ...@@ -280,9 +282,9 @@ if __name__ == "__main__":
takeTime = baseCore.getTimeCost(start_time, time.time()) takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(xydm, taskType, state, takeTime, news_url, exception) baseCore.recordLog(xydm, taskType, state, takeTime, news_url, exception)
# 增量使用 # 增量使用
# break break
# 全量使用 # 全量使用
continue # continue
title = a_ele.text.lstrip().strip().replace("'", "''") title = a_ele.text.lstrip().strip().replace("'", "''")
exception = getZx(xydm, news_url, title, cnx, path) exception = getZx(xydm, news_url, title, cnx, path)
if exception == '': if exception == '':
......
import json import json
...@@ -4,8 +4,9 @@ import requests ...@@ -4,8 +4,9 @@ import requests
import sys import sys
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from kafka import KafkaProducer from kafka import KafkaProducer
sys.path.append(r'F:\zzsn\zzsn_spider\base') # sys.path.append(r'F:\zzsn\zzsn_spider\base')
import BaseCore # import BaseCore
from base import BaseCore
import urllib3 import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
...@@ -20,7 +21,6 @@ headers = { ...@@ -20,7 +21,6 @@ headers = {
'accept-encoding': 'gzip, deflate, br', 'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7', 'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
'cache-control': 'max-age=0', 'cache-control': 'max-age=0',
# 'cookie': 'maex=%7B%22v2%22%3A%7B%7D%7D; GUC=AQEBBwFjY49jkEIa8gQo&s=AQAAABw20C7P&g=Y2JIFQ; A1=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc; A3=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc; A1S=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc&j=WORLD; PRF=t%3D6954.T%252BTEL%252BSOLB.BR%252BSTM%252BEMR%252BGT%252BAMD%252BSYM.DE%252BPEMEX%252BSGO.PA%252BLRLCF%252BSYNH%252B001040.KS; cmp=t=1669714927&j=0&u=1---',
'sec-ch-ua': '"Chromium";v="106", "Google Chrome";v="106", "Not;A=Brand";v="99"', 'sec-ch-ua': '"Chromium";v="106", "Google Chrome";v="106", "Not;A=Brand";v="99"',
'sec-ch-ua-mobile': '?0', 'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': "Windows", 'sec-ch-ua-platform': "Windows",
...@@ -41,6 +41,8 @@ def getInfo(enname, gpdm, xydm, start): ...@@ -41,6 +41,8 @@ def getInfo(enname, gpdm, xydm, start):
gpdm_ = str(gpdm)[1:] gpdm_ = str(gpdm)[1:]
else: else:
pass pass
elif str(gpdm)[-2:] == '.N' or str(gpdm)[-2:] == '.O':
gpdm_ = gpdm[:-2]
else: else:
gpdm_ = gpdm gpdm_ = gpdm
retData = {} retData = {}
...@@ -50,7 +52,6 @@ def getInfo(enname, gpdm, xydm, start): ...@@ -50,7 +52,6 @@ def getInfo(enname, gpdm, xydm, start):
'信用代码': xydm, '信用代码': xydm,
} }
retData['people_info'] = [] retData['people_info'] = []
# https://finance.yahoo.com/quote/VOW3.DE/profile?p=VOW3.DE
url = f'https://finance.yahoo.com/quote/{gpdm_}/profile?p={gpdm_}' url = f'https://finance.yahoo.com/quote/{gpdm_}/profile?p={gpdm_}'
time.sleep(3) time.sleep(3)
...@@ -78,7 +79,7 @@ def getInfo(enname, gpdm, xydm, start): ...@@ -78,7 +79,7 @@ def getInfo(enname, gpdm, xydm, start):
state = 0 state = 0
takeTime = baseCore.getTimeCost(start, time.time()) takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(xydm, taskType, state, takeTime, url, exeception) baseCore.recordLog(xydm, taskType, state, takeTime, url, exeception)
baseCore.rePutIntoR('BaseInfoEnterprise:gwqy_socialCode', xydm) baseCore.rePutIntoR('BaseInfoEnterpriseFbs:gwqy_social_code', xydm)
return [state, retData] return [state, retData]
except: except:
log.error(f"{gpdm}------获取基本信息接口重试后依然失败失败:{response.status_code}") log.error(f"{gpdm}------获取基本信息接口重试后依然失败失败:{response.status_code}")
...@@ -86,7 +87,7 @@ def getInfo(enname, gpdm, xydm, start): ...@@ -86,7 +87,7 @@ def getInfo(enname, gpdm, xydm, start):
state = 0 state = 0
takeTime = baseCore.getTimeCost(start, time.time()) takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(xydm, taskType, state, takeTime, url, exeception) baseCore.recordLog(xydm, taskType, state, takeTime, url, exeception)
baseCore.rePutIntoR('BaseInfoEnterprise:gwqy_socialCode', xydm) baseCore.rePutIntoR('BaseInfoEnterpriseFbs:gwqy_social_code', xydm)
return [state, retData] return [state, retData]
state = 1 state = 1
...@@ -216,7 +217,6 @@ def saveBaseInfo(info, start): ...@@ -216,7 +217,6 @@ def saveBaseInfo(info, start):
'socialCreditCode': info['base_info']['信用代码'], # 统一社会信用代码 'socialCreditCode': info['base_info']['信用代码'], # 统一社会信用代码
'englishName': info['base_info']['英文名'], # 英文名 'englishName': info['base_info']['英文名'], # 英文名
} }
# print(company_dict)
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 0, 2)) producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 0, 2))
kafka_result = producer.send("regionInfo", json.dumps(company_dict, ensure_ascii=False).encode('utf8')) kafka_result = producer.send("regionInfo", json.dumps(company_dict, ensure_ascii=False).encode('utf8'))
kafka_result.get(timeout=10) kafka_result.get(timeout=10)
...@@ -247,7 +247,6 @@ def savePeopleInfo(info, start): ...@@ -247,7 +247,6 @@ def savePeopleInfo(info, start):
} }
list_one_info.append(dic_json) list_one_info.append(dic_json)
json_updata = json.dumps(list_one_info) json_updata = json.dumps(list_one_info)
# print(json_updata)
if json_updata == '[]': if json_updata == '[]':
log.info("没有高管") log.info("没有高管")
pass pass
...@@ -265,14 +264,14 @@ def savePeopleInfo(info, start): ...@@ -265,14 +264,14 @@ def savePeopleInfo(info, start):
if (retJson['success'] or retJson['success'] == 'true'): if (retJson['success'] or retJson['success'] == 'true'):
pass pass
else: else:
log.error("保存高管接口失败---{retJson}") log.error(f"保存高管接口失败---{retJson}")
exception = '保存高管接口失败' exception = '保存高管接口失败'
state = 0 state = 0
takeTime = baseCore.getTimeCost(start, time.time()) takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(dic_json['socialCreditCode'], taskType, state, takeTime, '', exception) baseCore.recordLog(dic_json['socialCreditCode'], taskType, state, takeTime, '', exception)
return state return state
else: else:
log.error("保存高管接口失败---{response.status_code}") log.error(f"保存高管接口失败---{response.status_code}")
exception = '保存高管接口失败' exception = '保存高管接口失败'
state = 0 state = 0
takeTime = baseCore.getTimeCost(start, time.time()) takeTime = baseCore.getTimeCost(start, time.time())
...@@ -288,6 +287,7 @@ def savePeopleInfo(info, start): ...@@ -288,6 +287,7 @@ def savePeopleInfo(info, start):
def beginWork(): def beginWork():
while True: while True:
social_code = baseCore.redicPullData('BaseInfoEnterprise:gwqy_socialCode') social_code = baseCore.redicPullData('BaseInfoEnterprise:gwqy_socialCode')
# social_code = 'ZZSN230824151229535'
if not social_code: if not social_code:
time.sleep(20) time.sleep(20)
continue continue
...@@ -297,7 +297,7 @@ def beginWork(): ...@@ -297,7 +297,7 @@ def beginWork():
# 数据库中获取基本信息 # 数据库中获取基本信息
data = baseCore.getInfomation(social_code) data = baseCore.getInfomation(social_code)
enname = data[5] enname = data[5]
gpdm = '0123' gpdm = data[3]
xydm = data[2] xydm = data[2]
# 获取该企业对应项目的采集次数 # 获取该企业对应项目的采集次数
...@@ -305,7 +305,7 @@ def beginWork(): ...@@ -305,7 +305,7 @@ def beginWork():
start_time = time.time() start_time = time.time()
# 股票代码为空跳过 # 股票代码为空跳过
if gpdm == '': if gpdm == '':
info = {"base_info": {'公司名称': enname,'英文名': enname,'信用代码': xydm, }} info = {"base_info": {'公司名称': enname, '英文名': enname, '信用代码': xydm, }}
log.error(f'{xydm}....股票代码为空') log.error(f'{xydm}....股票代码为空')
try: try:
saveBaseInfo(info, start_time) saveBaseInfo(info, start_time)
...@@ -323,7 +323,7 @@ def beginWork(): ...@@ -323,7 +323,7 @@ def beginWork():
# 企业基本信息入库 # 企业基本信息入库
try: try:
saveBaseInfo(retData[1], start_time) saveBaseInfo(retData[1], start_time)
time.sleep(1)
except: except:
log.error(f'{enname}....企业基本信息Kafka操作失败') log.error(f'{enname}....企业基本信息Kafka操作失败')
exception = 'Kafka操作失败' exception = 'Kafka操作失败'
...@@ -332,6 +332,7 @@ def beginWork(): ...@@ -332,6 +332,7 @@ def beginWork():
baseCore.recordLog(xydm, taskType, state, takeTime, '', exception) baseCore.recordLog(xydm, taskType, state, takeTime, '', exception)
# 企业高管信息入库 # 企业高管信息入库
state = savePeopleInfo(retData[1], start_time) state = savePeopleInfo(retData[1], start_time)
time.sleep(1)
# 只有企业高管信息和企业基本信息都采集到,该企业才算采集成功 # 只有企业高管信息和企业基本信息都采集到,该企业才算采集成功
if state == 1: if state == 1:
takeTime = baseCore.getTimeCost(start_time, time.time()) takeTime = baseCore.getTimeCost(start_time, time.time())
...@@ -342,6 +343,16 @@ def beginWork(): ...@@ -342,6 +343,16 @@ def beginWork():
pass pass
except Exception as e: except Exception as e:
# 若出现尚未发现的错误,则保存错误信息以及出错位置 # 若出现尚未发现的错误,则保存错误信息以及出错位置
info = {"base_info": {'公司名称': enname,'英文名': enname,'信用代码': xydm, }}
try:
saveBaseInfo(info, start_time)
log.info(f'{enname}.....股票代码出错只保存基本信息')
except:
log.error(f'{enname}....企业基本信息Kafka操作失败')
exception = 'Kafka操作失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(xydm, taskType, state, takeTime, '', exception)
ee = e.__traceback__.tb_lineno ee = e.__traceback__.tb_lineno
log.error(f'{enname}...{xydm}...{gpdm}.....数据采集失败,原因:{ee}行 {e}') log.error(f'{enname}...{xydm}...{gpdm}.....数据采集失败,原因:{ee}行 {e}')
state = 0 state = 0
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论