提交 4c6ee47a 作者: 刘伟刚

Merge remote-tracking branch 'origin/master'

...@@ -15,6 +15,11 @@ from selenium.webdriver.chrome.service import Service ...@@ -15,6 +15,11 @@ from selenium.webdriver.chrome.service import Service
from openpyxl import Workbook from openpyxl import Workbook
import langid import langid
#创建连接池
import pymysql
from pymysql import connections
from DBUtils.PooledDB import PooledDB
# 注意 程序退出前 调用BaseCore.close() 关闭相关资源 # 注意 程序退出前 调用BaseCore.close() 关闭相关资源
class BaseCore: class BaseCore:
...@@ -233,6 +238,20 @@ class BaseCore: ...@@ -233,6 +238,20 @@ class BaseCore:
# 连接到Redis # 连接到Redis
self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6) self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
self.pool_caiji = PooledDB(
creator=pymysql,
maxconnections=5,
mincached=2,
maxcached=5,
blocking=True,
host='114.115.159.144',
port=3306,
user='root',
password='zzsn9988',
database='caiji',
charset='utf8mb4'
)
def close(self): def close(self):
try: try:
self.__cursor_proxy.close() self.__cursor_proxy.close()
...@@ -434,32 +453,66 @@ class BaseCore: ...@@ -434,32 +453,66 @@ class BaseCore:
# 根据社会信用代码获取企业信息 # 根据社会信用代码获取企业信息
def getInfomation(self, social_code): def getInfomation(self, social_code):
sql = f"SELECT * FROM EnterpriseInfo WHERE SocialCode = '{social_code}'" data = []
self.cursor.execute(sql) try:
data = self.cursor.fetchone() sql = f"SELECT * FROM EnterpriseInfo WHERE SocialCode = '{social_code}'"
# self.cursor.execute(sql)
# data = self.cursor.fetchone()
conn = self.pool_caiji.connection()
cursor = conn.cursor()
cursor.execute(sql)
data = cursor.fetchone()
data = list(data)
cursor.close()
conn.close()
except:
log = self.getLogger()
log.info('=========数据库操作失败========')
return data return data
# 更新企业采集次数 # 更新企业采集次数
def updateRun(self, social_code, runType, count): def updateRun(self, social_code, runType, count):
sql_update = f"UPDATE EnterpriseInfo SET {runType} = {count} WHERE SocialCode = '{social_code}'" try:
self.cursor.execute(sql_update) sql_update = f"UPDATE EnterpriseInfo SET {runType} = {count} WHERE SocialCode = '{social_code}'"
self.cnx.commit() # self.cursor.execute(sql_update)
# self.cnx.commit()
conn = self.pool_caiji.connection()
cursor = conn.cursor()
cursor.execute(sql_update)
conn.commit()
cursor.close()
conn.close()
except:
log = self.getLogger()
log.info('======更新数据库失败======')
# 保存日志入库 # 保存日志入库
def recordLog(self, xydm, taskType, state, takeTime, url, e): def recordLog(self, xydm, taskType, state, takeTime, url, e):
createTime = self.getNowTime(1)
ip = self.getIP()
pid = self.getPID()
sql = "INSERT INTO LogTable(SocialCode,TaskType,state,TakeTime,url,CreateTime,ProcessIp,PID,Exception) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
values = [xydm, taskType, state, takeTime, url, createTime, ip, pid, e]
try: try:
self.cursor.execute(sql, values) createTime = self.getNowTime(1)
except Exception as e: ip = self.getIP()
print(e) pid = self.getPID()
self.cnx.commit() sql = "INSERT INTO LogTable(SocialCode,TaskType,state,TakeTime,url,CreateTime,ProcessIp,PID,Exception) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
values = [xydm, taskType, state, takeTime, url, createTime, ip, pid, e]
# try:
# self.cursor.execute(sql, values)
# except Exception as e:
# print(e)
# self.cnx.commit()
cnn = self.pool_caiji.connection()
cursor = cnn.cursor()
cursor.execute(sql,values)
cnn.commit()
cursor.close()
cnn.close()
except:
log = self.getLogger()
log.info('======保存日志失败=====')
#获取企查查token #获取企查查token
def GetToken(self): def GetToken(self):
#获取企查查token #获取企查查token
query = "select token from QCC_token " query = "select token from QCC_token "
# token = '67ec7402166df1da84ae83c4b95cefc0' # 需要隔两个小时左右抓包修改 # token = '67ec7402166df1da84ae83c4b95cefc0' # 需要隔两个小时左右抓包修改
...@@ -491,8 +544,8 @@ class BaseCore: ...@@ -491,8 +544,8 @@ class BaseCore:
# return combined_data # return combined_data
#对失败或者断掉的企业 重新放入redis #对失败或者断掉的企业 重新放入redis
def rePutIntoR(self,item): def rePutIntoR(self,key,item):
self.r.rpush('NewsEnterprise:gwqy_socialCode', item) self.r.rpush(key, item)
#增加计数器的值并返回增加后的值 #增加计数器的值并返回增加后的值
def incrSet(self,key): def incrSet(self,key):
...@@ -518,3 +571,10 @@ class BaseCore: ...@@ -518,3 +571,10 @@ class BaseCore:
...@@ -170,6 +170,8 @@ def BaseInfoEnterprise_task(): ...@@ -170,6 +170,8 @@ def BaseInfoEnterprise_task():
print('定时采集异常', e) print('定时采集异常', e)
pass pass
#企业核心人员
#东方财富网财务数据 #东方财富网财务数据
def FinanceFromEast(): def FinanceFromEast():
cnx_,cursor_ = cnn11() cnx_,cursor_ = cnn11()
...@@ -183,6 +185,7 @@ def FinanceFromEast(): ...@@ -183,6 +185,7 @@ def FinanceFromEast():
r.rpush('FinanceFromEast:finance_socialCode', item) r.rpush('FinanceFromEast:finance_socialCode', item)
close11(cnx_,cursor_) close11(cnx_,cursor_)
#东方财富网财务数据定时任务
def FinanceFromEase_task(): def FinanceFromEase_task():
# 实例化一个调度器 # 实例化一个调度器
scheduler = BlockingScheduler() scheduler = BlockingScheduler()
...@@ -251,7 +254,7 @@ def AnnualEnterpriseXueQ_task(): ...@@ -251,7 +254,7 @@ def AnnualEnterpriseXueQ_task():
def BaseInfoEnterpriseAbroad(): def BaseInfoEnterpriseAbroad():
cnx,cursor = connectSql() cnx,cursor = connectSql()
# 获取国外企业 # 获取国外企业
gn_query = "select id from EnterpriseInfo where Place = '2' limit 10 " gn_query = "select SocialCode from EnterpriseInfo where Place = '2' "
cursor.execute(gn_query) cursor.execute(gn_query)
gn_result = cursor.fetchall() gn_result = cursor.fetchall()
gn_social_list = [item[0] for item in gn_result] gn_social_list = [item[0] for item in gn_result]
...@@ -334,13 +337,13 @@ if __name__ == "__main__": ...@@ -334,13 +337,13 @@ if __name__ == "__main__":
# NoticeEnterprise() # NoticeEnterprise()
# AnnualEnterpriseIPO() # AnnualEnterpriseIPO()
# AnnualEnterprise() # AnnualEnterprise()
# BaseInfoEnterpriseAbroad() BaseInfoEnterpriseAbroad()
NewsEnterprise_task() # NewsEnterprise_task()
# NewsEnterprise() # NewsEnterprise()
# BaseInfoEnterprise() # BaseInfoEnterprise()
# FBS() # FBS()
NoticeEnterprise_task() # NoticeEnterprise_task()
AnnualEnterprise_task() # AnnualEnterprise_task()
# NoticeEnterprise() # NoticeEnterprise()
# FinanceFromEast() # FinanceFromEast()
log.info(f'====={basecore.getNowTime(1)}=====添加数据成功======耗时:{basecore.getTimeCost(start,time.time())}===') log.info(f'====={basecore.getNowTime(1)}=====添加数据成功======耗时:{basecore.getTimeCost(start,time.time())}===')
......
"""
企业上市信息:只有上市的企业才能如企业库,未上市企业跳过采集步骤。退市企业标注为0
"""
import json
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import urllib3
from base.BaseCore import BaseCore
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from gpdm import Gpdm
baseCore = BaseCore()
chromedriver = "./chromedriver"
browser = webdriver.Chrome(chromedriver)
taskType = '上市信息/东方财富网'
gpdm = Gpdm()
gpdmList = gpdm.doJob()
log = baseCore.getLogger()
error_list = []
list_all_info = []
# 需要提供股票代码、企业信用代码
for com_code1 in gpdmList:
start = time.time()
# 股票代码0、2、3开头的为深圳交易所,6、9开头的为上海交易所,8开头的为北京交易所
if com_code1[0] == '2' or com_code1[0] == '0' or com_code1[0] == '3':
com_code = 'sz' + com_code1
if com_code1[0] == '9' or com_code1[0] == '6':
com_code = 'sh' + com_code1
if com_code1[0] == '8' or com_code1[0] == '4':
com_code = 'bj' + com_code1
if com_code1[0] == 'A':
com_code = ''
log.info(f'======开始采集{com_code}======')
url = f'https://quote.eastmoney.com/{com_code}.html'
url_1 = f'https://emweb.eastmoney.com/PC_HSF10/CompanySurvey/PageAjax?code={com_code}'
url_2 = f'https://emweb.eastmoney.com/PC_HSF10/BusinessAnalysis/PageAjax?code={com_code}'
browser.get(url)
time.sleep(8)
page_source = browser.page_source
soup_t = BeautifulSoup(page_source, 'html.parser')
try:
result = soup_t.find('div',class_='quote_quotenums').text
# print(f'result:{result}')
# if result=='未上市'or result=='已退市':
if result == '未上市' :
continue
if result == '已退市':
tag = 0
else:
tag = 1
except Exception as e:
error_list.append(com_code)
log.info(f'={com_code}===解析上市状态失败=====')
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog('', taskType, state, takeTime, '', f'{com_code}解析上市状态失败--e:{e}')
print('error')
requests.adapters.DEFAULT_RETRIES = 5
json_1 = requests.get(url_1,verify=False).json()
json_2 = requests.get(url_2,verify=False).json()
# SECURITY_TYPE
try:
jys = json_1['jbzl'][0]['TRADE_MARKET']
except Exception as e:
log.info(f'====={com_code}=====解析交易所失败======')
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog('', taskType, state, takeTime, '', f'{com_code}解析交易所失败--e:{e}')
continue
try:
if "上海" in jys:
jys_code = '2'
if "深圳" in jys:
jys_code = '3'
except:
jys = json_1['jbzl'][0]['SECURITY_TYPE']
if "北京" in jys:
jys_code = '1'
short_name = json_1['jbzl'][0]['STR_NAMEA']
zhengquan_type = json_1['jbzl'][0]['SECURITY_TYPE']
# print(zhengquan_type)
if 'A' in zhengquan_type:
# print(zhengquan_type)
category = '1'
if 'B' in zhengquan_type:
category = '2'
if '新三板' in zhengquan_type:
category = '3'
if 'H' in zhengquan_type:
category = '4'
id_code = json_1['jbzl'][0]['REG_NUM']
dongcai = json_1['jbzl'][0]['EM2016']
zhengjian = json_1['jbzl'][0]['INDUSTRYCSRC1']
try:
shangshishijian = json_1['fxxg'][0]['LISTING_DATE'][:10]
except:
shangshishijian = ''
zhuyingfanwei = json_2['zyfw'][0]['BUSINESS_SCOPE']
dic_cwsj = {
"exchange": jys_code,
"category": category, # 股票类型(1-A股;2-B股;3-新三板;4-H股)
'listed':tag,
"listingDate": shangshishijian,
"securitiesCode": com_code[2:],
"securitiesShortName": short_name,
"securitiesType": zhengquan_type,
"socialCreditCode": id_code,
"businessScope": zhuyingfanwei,
"eastIndustry": dongcai,
"csrcIndustry": zhengjian
}
list_all_info.append(dic_cwsj)
log.info(f'======{com_code}====采集成功=====')
# 通过接口将数据保存进数据库
for num in range(0, len(list_all_info),100):
json_updata = json.dumps(list_all_info[num:num+100])
# print(json_updata)
try:
response = requests.post('http://114.115.236.206:8088/sync/enterpriseIpo', data=json_updata, timeout=300,
verify=False)
except Exception as e:
print(e)
print("{}:到:{}".format(num, num + 100))
print(response.text)
import json import json
...@@ -91,6 +91,7 @@ def getInfo(name,enname,gpdm, xydm, start): ...@@ -91,6 +91,7 @@ def getInfo(name,enname,gpdm, xydm, start):
state = 1 state = 1
soup = BeautifulSoup(response.content, 'html.parser') soup = BeautifulSoup(response.content, 'html.parser')
page = soup.find('div', {'id': 'Col1-0-Profile-Proxy'}) page = soup.find('div', {'id': 'Col1-0-Profile-Proxy'})
name = page.find('h3',{'class':'Fz(m) Mb(10px)'}).text
try: try:
com_info = page.find('div', {'class': 'Mb(25px)'}) com_info = page.find('div', {'class': 'Mb(25px)'})
except: except:
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论