Commit b931eea2  Author: 薛凌堃

12/13

Parent 1f595f59
@@ -74,7 +74,7 @@ def NewsEnterprise():
     print('=======')
     # Push the data into redis
     for item in gn_social_list:
-        r.rpush('NewsEnterprise:gnqy_socialCode', item)
+        r.rpush('NewsResend:newsInfo', item)
         # r.rpush('NewsEnterprise:gnqybc_socialCode', item)
     # for item in gw_social_list:
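This hunk redirects the domestic-company payloads from NewsEnterprise:gnqy_socialCode to NewsResend:newsInfo. A minimal consumer sketch for the new list, assuming the Redis connection settings that appear later in this commit; drain_news_resend and its handler argument are hypothetical stand-ins for the downstream processing:

import redis

def drain_news_resend(handler):
    # Assumption: same Redis instance/db as the ConnectionPool used below.
    r = redis.Redis(host='114.115.236.206', port=6379, password='clbzzsn', db=6)
    while True:
        item = r.lpop('NewsResend:newsInfo')
        if item is None:  # list drained
            break
        handler(item.decode())  # hand the payload to downstream processing

# e.g. drain_news_resend(print)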
@@ -126,30 +126,69 @@ def NoticeEnterprise_task():
 def NoticeDF():
     cnx, cursor = connectSql()
     # Fetch the US-listed enterprises
-    # # mg_query = "SELECT a.SocialCode From EnterpriseInfo a ,EnterpriseType b WHERE a.SocialCode = b.SocialCode AND b.type=2 AND a.Place=0 AND SecuritiesCode is not null AND SecuritiesCode not LIKE '%.%'"
-    # mg_query = "SELECT a.SocialCode From EnterpriseInfo a ,EnterpriseType b WHERE a.SocialCode = b.SocialCode AND b.type=6 AND a.Place=0 AND SecuritiesCode is not null AND SecuritiesCode not LIKE '%.%'"
-    # cursor.execute(mg_query)
-    # cnx.commit()
-    # mg_result = cursor.fetchall()
-    # mg_social_list = [item[0] for item in mg_result]
-    # print('=======')
-    # for item in mg_social_list:
-    #     if r.lrem('NoticeEnterprise:mgqy_socialCode_add', 0, item) == 0:
-    #         r.lpush('NoticeEnterprise:mgqy_socialCode_add', item)
-    #     else:
-    #         continue
-    # # r.rpush('NoticeEnterprise:mgqy_socialCode_add', item)
+    om_mg_query = "SELECT a.SocialCode From EnterpriseInfo a ,EnterpriseType b WHERE a.SocialCode = b.SocialCode AND b.type=6 AND a.Place=0 AND SecuritiesCode is not null AND SecuritiesCode not LIKE '%.%'"
+    cursor.execute(om_mg_query)
+    cnx.commit()
+    om_mg_result = cursor.fetchall()
+    om_mg_social_list = [item[0] for item in om_mg_result]
+    print('欧盟美股企业=======')
+    for item in om_mg_social_list:
+        if r.lrem('NoticeEnterprise:mgqy_socialCode_add', 0, item) == 0:
+            r.lpush('NoticeEnterprise:mgqy_socialCode_add', item)
+        else:
+            continue
+
+    fq_mg_query = "SELECT a.SocialCode From EnterpriseInfo a ,EnterpriseType b WHERE a.SocialCode = b.SocialCode AND b.type=2 AND a.Place=0 AND SecuritiesCode is not null AND SecuritiesCode not LIKE '%.%'"
+    cursor.execute(fq_mg_query)
+    cnx.commit()
+    fq_mg_result = cursor.fetchall()
+    fq_mg_social_list = [item[0] for item in fq_mg_result]
+    print('500强美股企业=======')
+    for item in fq_mg_social_list:
+        if r.lrem('NoticeEnterprise:mgqy_socialCode_add', 0, item) == 0:
+            r.lpush('NoticeEnterprise:mgqy_socialCode_add', item)
+        else:
+            continue
+
+    fbs_mg_query = "SELECT a.SocialCode From EnterpriseInfo a ,EnterpriseType b WHERE a.SocialCode = b.SocialCode AND b.type=3 AND a.Place=0 AND SecuritiesCode is not null AND SecuritiesCode not LIKE '%.%'"
+    cursor.execute(fbs_mg_query)
+    cnx.commit()
+    fbs_mg_result = cursor.fetchall()
+    fbs_mg_social_list = [item[0] for item in fbs_mg_result]
+    print('福布斯美股企业=======')
+    for item in fbs_mg_social_list:
+        if r.lrem('NoticeEnterprise:mgqy_socialCode_add', 0, item) == 0:
+            r.lpush('NoticeEnterprise:mgqy_socialCode_add', item)
+        else:
+            continue
+
     # Fetch the HK-listed enterprises
-    gg_query = "SELECT a.SocialCode From EnterpriseInfo a ,EnterpriseType b WHERE a.SocialCode = b.SocialCode AND b.type=6 And SecuritiesCode like '%.HK'"
-    cursor.execute(gg_query)
+    om_gg_query = "SELECT a.SocialCode From EnterpriseInfo a ,EnterpriseType b WHERE a.SocialCode = b.SocialCode AND b.type=6 And SecuritiesCode like '%.HK'"
+    cursor.execute(om_gg_query)
     cnx.commit()
-    gg_result = cursor.fetchall()
-    gg_social_list = [item[0] for item in gg_result]
-    print('=======')
-    for item in gg_social_list:
+    om_gg_result = cursor.fetchall()
+    om_gg_social_list = [item[0] for item in om_gg_result]
+    print('欧盟港股企业=======')
+    for item in om_gg_social_list:
         r.rpush('NoticeEnterprise:ggqy_socialCode_add', item)
+
+    fq_gg_query = "SELECT a.SocialCode From EnterpriseInfo a ,EnterpriseType b WHERE a.SocialCode = b.SocialCode AND b.type=2 And SecuritiesCode like '%.HK'"
+    cursor.execute(fq_gg_query)
+    cnx.commit()
+    fq_gg_result = cursor.fetchall()
+    fq_gg_social_list = [item[0] for item in fq_gg_result]
+    print('500强港股企业=======')
+    for item in fq_gg_social_list:
+        r.rpush('NoticeEnterprise:ggqy_socialCode_add', item)
+
+    fbs_gg_query = "SELECT a.SocialCode From EnterpriseInfo a ,EnterpriseType b WHERE a.SocialCode = b.SocialCode AND b.type=3 And SecuritiesCode like '%.HK'"
+    cursor.execute(fbs_gg_query)
+    cnx.commit()
+    fbs_gg_result = cursor.fetchall()
+    fbs_gg_social_list = [item[0] for item in fbs_gg_result]
+    print('福布斯港股企业=======')
+    for item in fbs_gg_social_list:
+        r.rpush('NoticeEnterprise:ggqy_socialCode_add', item)
     closeSql(cnx, cursor)
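The remove-then-push pair now repeats once per US-stock list above, and it has a subtlety: r.lrem(key, 0, item) removes all occurrences and returns the count removed, so a code that is already queued gets deleted as a side effect of the membership check and is never re-added, which is likely unintended. A helper sketch that always re-queues at the head (the function name push_unique is an assumption):

def push_unique(r, key, items):
    """Queue each item at the head of `key` exactly once: any copy already
    in the list is removed first, so reruns cannot create duplicates."""
    for item in items:
        r.lrem(key, 0, item)  # drop all existing occurrences (no-op if absent)
        r.lpush(key, item)    # re-queue at the head

# e.g. push_unique(r, 'NoticeEnterprise:mgqy_socialCode_add', om_mg_social_list)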
@@ -612,7 +651,7 @@ if __name__ == "__main__":
     # BaseInfoEnterprise()
     # BaseInfoEnterpriseAbroad()
     # NewsEnterprise_task()
-    # NewsEnterprise()
+    NewsEnterprise()
     # CorPerson()
     # china100()
     # global100()
@@ -630,6 +669,6 @@ if __name__ == "__main__":
     # NoticeEnterprise_task()
     # AnnualEnterprise_task()
     # FinanceFromEast()
-    ipo_code()
+    # ipo_code()
     log.info(f'====={basecore.getNowTime(1)}=====添加数据成功======耗时:{basecore.getTimeCost(start,time.time())}===')
@@ -23,10 +23,10 @@ es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn998
 index_name = 'researchreportdata'
 pool = redis.ConnectionPool(host="114.115.236.206", port=6379, password='clbzzsn', db=6)

-def searchATT(title):
-    sql = "select id from clb_sys_attachment where type_id=4 and name=%s "
+def searchATT():
+    sql = "select id from clb_sys_attachment where type_id=4 and create_time>'2023-12-08' "
     # lock.acquire()
-    cursor_.execute(sql, title+'.pdf')
+    cursor_.execute(sql)
     selects = cursor_.fetchone()
     # lock.release()
     return selects
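searchATT() now ignores its former title argument and bakes the cutoff date into the SQL string, yet still calls fetchone() for what has become a range query, so at most one id comes back. A parameterized sketch against the same module-level cursor_ (the function name and default value are assumptions):

def searchATT_since(cutoff='2023-12-08'):
    # Bind the cutoff as a query parameter and return every matching id.
    sql = "select id from clb_sys_attachment where type_id=4 and create_time > %s"
    cursor_.execute(sql, (cutoff,))
    return cursor_.fetchall()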
......
@@ -45,7 +45,7 @@ def sendKafka(dic_news,xydm):
     start_time = time.time()
     try:  # 114.116.116.241
         producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], max_request_size=1024 * 1024 * 20)
-        kafka_result = producer.send("researchReportTopic",
+        kafka_result = producer.send("researchReportYearTopic",
                                      json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
         print(kafka_result.get(timeout=10))
......
@@ -31,7 +31,7 @@ pathType = 'QYYearReport/'
 def sendKafka(dic_news):
     try:  # 114.116.116.241
         producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], max_request_size=1024*1024*20)
-        kafka_result = producer.send("researchReportTopic",
+        kafka_result = producer.send("researchReportYearTopic",
                                      json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
         print(kafka_result.get(timeout=10))
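The same topic rename (researchReportTopic to researchReportYearTopic) is made by hand in two files in this commit. A sketch of hoisting the broker list and topic into one shared module so the next switch is a single edit; the module name kafka_config is an assumption:

# kafka_config.py (hypothetical shared module)
BOOTSTRAP_SERVERS = ['114.115.159.144:9092']
YEAR_REPORT_TOPIC = 'researchReportYearTopic'
MAX_REQUEST_SIZE = 1024 * 1024 * 20

# In either script:
# from kafka_config import BOOTSTRAP_SERVERS, YEAR_REPORT_TOPIC, MAX_REQUEST_SIZE
# producer = KafkaProducer(bootstrap_servers=BOOTSTRAP_SERVERS,
#                          max_request_size=MAX_REQUEST_SIZE)
# producer.send(YEAR_REPORT_TOPIC, json.dumps(dic_news, ensure_ascii=False).encode('utf8'))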
@@ -93,74 +93,78 @@ def main():
     redis_conn = redis.Redis(connection_pool=pool)
     # info_ = redis_conn.lpop("NoIPO:info")
-    info_ = '91310000132206289R|1725799077425945040|2022'
-    if info_:
-        pass
-    else:
-        log.info("++++已没有数据++++")
-
-        return
-    # info = info_.decode()
-    info = info_
-    xydm = info.split('|')[0]
-    att_id = info.split('|')[1]
-    year = info.split('|')[2]
-    if not xydm or not year:
-        redis_conn.lpush('info', info)
-    else:
-        selects = secrchATT('1', xydm, year)
-        if len(selects) > 1:
-            redis_conn.lpush('NianBao:info', info)
-        elif len(selects) == 1:
-            # results = selectShortName(xydm)
-            # if results:
-            #     pass
-            # else:
-            #     redis_conn.lpush('NoIPO:info', info)
-            #     return
-            select = selects[0]
-            # name = results[3]
-            name = select[1]
-            if name:
-                # file_name = results[3] + ':' + year + '年年度报告'
-                file_name = name.split('.')[0]
-            else:
-                redis_conn.lpush('Noname:info', info)
-                return
-            log.info(f'-----------{file_name}-----------')
-            origin = select[18]
-            create_time = select[13]
-            publishDate = select[21]
-            if publishDate == '2023-12-31':
-                publishDate = '2023-08-31'
-            file_href = 'http://zzsn.luyuen.com' + str(select[5])
-            content = getContent(file_href)
-
-            lang = baseCore.detect_language(content)
-            if lang == 'cn':
-                lang = 'zh'
-            dic_info = {
-                'attachmentIds': att_id,
-                'author': '',
-                'content': content,
-                'contentWithTag': '',
-                'createDate': str(create_time),
-                'deleteFlag': '0',
-                'id': '',
-                'keyWords': '',
-                'lang': lang,
-                'origin': origin,
-                'publishDate': publishDate,
-                'sid': '1684032033495392257',
-                'sourceAddress': '',  # link to the original text
-                'summary': '',
-                'title': file_name,
-                'type': 1,
-                'socialCreditCode': xydm,
-                'year': year
-            }
-            sendKafka(dic_info)
-            time.sleep(1)
+    info_list = ['91130100236018805C|18703781588|2018', '915203002147892034|18703781589|2013',
+                 '913200007455797746|18703781592|2018', '91440500723817938W|18703781594|2019',
+                 '91340000704920454F|18703781596|2021']
+    for info_ in info_list:
+        if info_:
+            pass
+        else:
+            log.info("++++已没有数据++++")
+
+            return
+        # info = info_.decode()
+        info = info_
+        xydm = info.split('|')[0]
+        att_id = info.split('|')[1]
+        year = info.split('|')[2]
+        if not xydm or not year:
+            redis_conn.lpush('info', info)
+        else:
+            selects = secrchATT('1', xydm, year)
+            if len(selects) > 1:
+                redis_conn.lpush('NianBao:info', info)
+            elif len(selects) == 1:
+                # results = selectShortName(xydm)
+                # if results:
+                #     pass
+                # else:
+                #     redis_conn.lpush('NoIPO:info', info)
+                #     return
+                select = selects[0]
+                # name = results[3]
+                name = select[1]
+                if name:
+                    # file_name = results[3] + ':' + year + '年年度报告'
+                    file_name = name.split('.')[0]
+                else:
+                    redis_conn.lpush('Noname:info', info)
+                    return
+                log.info(f'-----------{file_name}-----------')
+                origin = select[18]
+                create_time = select[13]
+                publishDate = select[21]
+                if publishDate == '2023-12-31':
+                    publishDate = '2023-08-31'
+                file_href = 'http://zzsn.luyuen.com' + str(select[5])
+                content = getContent(file_href)
+
+                lang = baseCore.detect_language(content)
+                if lang == 'cn':
+                    lang = 'zh'
+                dic_info = {
+                    'attachmentIds': att_id,
+                    'author': '',
+                    'content': content,
+                    'contentWithTag': '',
+                    'createDate': str(create_time),
+                    'deleteFlag': '0',
+                    'id': '',
+                    'keyWords': '',
+                    'lang': lang,
+                    'origin': origin,
+                    'publishDate': publishDate,
+                    'sid': '1684032033495392257',
+                    'sourceAddress': '',  # link to the original text
+                    'summary': '',
+                    'title': file_name,
+                    'type': 1,
+                    'socialCreditCode': xydm,
+                    'year': year
+                }
+                sendKafka(dic_info)
+                time.sleep(1)

 def run_threads(num_threads):
     threads = []
@@ -178,7 +182,7 @@ def run_threads(num_threads):
 if __name__ == "__main__":
     while True:
         start = time.time()
-        num_threads = 5
+        num_threads = 1
         run_threads(num_threads)
         log.info(f'5线程 总耗时{time.time() - start}秒')
\ No newline at end of file
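Each work item above is a 'socialCreditCode|attachmentId|year' triple that the loop splits three times and then re-checks field by field. A small parsing sketch that validates once; the helper name is an assumption:

def parse_info(info):
    """Split one work item; return (xydm, att_id, year) or None if malformed."""
    parts = info.split('|')
    if len(parts) != 3 or not parts[0] or not parts[2]:
        return None
    return parts[0], parts[1], parts[2]

# e.g. parse_info('91130100236018805C|18703781588|2018')
# -> ('91130100236018805C', '18703781588', '2018')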
"""
将需要新增的企业入redis
"""
import json
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import urllib3
from base.BaseCore import BaseCore
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
baseCore = BaseCore()
chromedriver = "./chromedriver"
browser = webdriver.Chrome(chromedriver)
taskType = '上市信息/东方财富网'
log = baseCore.getLogger()
error_list = []
list_all_info = []
# 需要提供股票代码、企业信用代码
while True:
    com_code1 = baseCore.redicPullData('Ipo_newsAdd:comCode')
    if not com_code1:
        # Queue drained; stop crawling and fall through to the bulk upload below
        break
    start = time.time()
    # Codes starting with 0/2/3 trade in Shenzhen, 6/9 in Shanghai, 4/8 in Beijing
    jys_code = ''
    category = ''
    com_code = ''
    if com_code1[0] == '2' or com_code1[0] == '0' or com_code1[0] == '3':
        com_code = 'sz' + com_code1
    elif com_code1[0] == '9' or com_code1[0] == '6':
        com_code = 'sh' + com_code1
    elif com_code1[0] == '8' or com_code1[0] == '4':
        com_code = 'bj' + com_code1
    elif com_code1[0] == 'A':
        com_code = ''
    if not com_code:
        # 'A'-prefixed or unrecognized codes cannot be mapped to an exchange page; skip them
        log.info(f'======skip {com_code1}: no exchange prefix======')
        continue
    log.info(f'======开始采集{com_code}======')
    url = f'https://quote.eastmoney.com/{com_code}.html'
    url_1 = f'https://emweb.eastmoney.com/PC_HSF10/CompanySurvey/PageAjax?code={com_code}'
    url_2 = f'https://emweb.eastmoney.com/PC_HSF10/BusinessAnalysis/PageAjax?code={com_code}'
    # browser.get(url)
    # time.sleep(8)
    # page_source = browser.page_source
    # soup_t = BeautifulSoup(page_source, 'html.parser')
    # try:
    #     result = soup_t.find('div', class_='quote_quotenums').text
    #     # print(f'result:{result}')
    #     # if result == '未上市' or result == '已退市':
    #     if result == '未上市':
    #         continue
    #     if result == '已退市':
    #         tag = 0
    #     else:
    #         tag = 1
    # except Exception as e:
    #     error_list.append(com_code)
    #     log.info(f'={com_code}===解析上市状态失败=====')
    #     state = 0
    #     takeTime = baseCore.getTimeCost(start, time.time())
    #     baseCore.recordLog('', taskType, state, takeTime, '', f'{com_code}解析上市状态失败--e:{e}')
    #     print('error')
    requests.adapters.DEFAULT_RETRIES = 5
    json_1 = requests.get(url_1, verify=False).json()
    json_2 = requests.get(url_2, verify=False).json()
    # SECURITY_TYPE
    try:
        jys = json_1['jbzl'][0]['TRADE_MARKET']
    except Exception as e:
        log.info(f'====={com_code}=====解析交易所失败======')
        state = 0
        takeTime = baseCore.getTimeCost(start, time.time())
        baseCore.recordLog('', taskType, state, takeTime, '', f'{com_code}解析交易所失败--e:{e}')
        continue
    try:
        if "上海" in jys:
            jys_code = '2'
        if "深圳" in jys:
            jys_code = '3'
    except:
        jys = json_1['jbzl'][0]['SECURITY_TYPE']
        if "北京" in jys:
            jys_code = '1'
    short_name = json_1['jbzl'][0]['STR_NAMEA']
    zhengquan_type = json_1['jbzl'][0]['SECURITY_TYPE']
    # print(zhengquan_type)
    if 'A' in zhengquan_type:
        category = '1'
    if 'B' in zhengquan_type:
        category = '2'
    if '新三板' in zhengquan_type:
        category = '3'
    if 'H' in zhengquan_type:
        category = '4'
    id_code = json_1['jbzl'][0]['REG_NUM']
    dongcai = json_1['jbzl'][0]['EM2016']
    zhengjian = json_1['jbzl'][0]['INDUSTRYCSRC1']
    try:
        shangshishijian = json_1['fxxg'][0]['LISTING_DATE'][:10]
    except:
        shangshishijian = ''
    zhuyingfanwei = json_2['zyfw'][0]['BUSINESS_SCOPE']
    dic_cwsj = {
        "exchange": jys_code,
        "category": category,  # stock type (1 = A-share; 2 = B-share; 3 = NEEQ; 4 = H-share)
        'listed': '1',
        "listingDate": shangshishijian,
        "securitiesCode": com_code[2:],
        "securitiesShortName": short_name,
        "securitiesType": zhengquan_type,
        "socialCreditCode": id_code,
        "businessScope": zhuyingfanwei,
        "eastIndustry": dongcai,
        "csrcIndustry": zhengjian
    }
    list_all_info.append(dic_cwsj)
    log.info(f'======{com_code}====采集成功=====')

# Save the collected data to the database through the sync interface, 100 records
# per request. (The flat listing above lost its indentation; running the upload
# after the crawl loop finishes is an assumption.)
for num in range(0, len(list_all_info), 100):
    json_updata = json.dumps(list_all_info[num:num + 100])
    # print(json_updata)
    try:
        response = requests.post('http://114.115.236.206:8088/sync/enterpriseIpo', data=json_updata, timeout=300,
                                 verify=False)
    except Exception as e:
        print(e)
        continue
    print("{}:到:{}".format(num, num + 100))
    print(response.text)
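The exchange-prefix chain near the top of this script can also be table-driven, so unknown leading characters (including 'A', which currently yields an empty com_code) map to None and the caller skips them explicitly. A sketch; the helper and table names are assumptions:

PREFIX_BY_FIRST_CHAR = {
    '0': 'sz', '2': 'sz', '3': 'sz',  # Shenzhen
    '6': 'sh', '9': 'sh',             # Shanghai
    '4': 'bj', '8': 'bj',             # Beijing
}

def to_quote_code(com_code1):
    # One lookup instead of a four-branch if-chain; None means "skip this code".
    prefix = PREFIX_BY_FIRST_CHAR.get(com_code1[0])
    return prefix + com_code1 if prefix else None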
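The closing upload posts list_all_info in slices of 100. The same chunking as a reusable helper, reading the response only when the request succeeded; the helper name is an assumption:

import json
import requests

def post_in_batches(rows, url, batch_size=100, timeout=300):
    for start in range(0, len(rows), batch_size):
        batch = rows[start:start + batch_size]
        try:
            resp = requests.post(url, data=json.dumps(batch),
                                 timeout=timeout, verify=False)
        except requests.RequestException as e:
            print(f'{start}:{start + len(batch)} failed: {e}')
            continue
        print(f'{start}:{start + len(batch)} -> {resp.status_code}: {resp.text}')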
......
@@ -31,26 +31,50 @@ class EsMethod(object):
     def queryatt(self,index_name,pnum):
         body = {
-            "size":0,
-            "aggs":{
-                "duplicate_titles":{
-                    "terms":{
-                        "field":"sourceAddress.keyword",
-                        "min_doc_count":2,
-                        "size":1000
-                    },
-                    "aggs":{
-                        "duplicate_docs":{
-                            "top_hits":{
-                                "_source":{
-                                    "includes":["id","title","subjectId","sourceAddress","createDate"]
-                                },
-                                "size":10
-                            }
-                        }
-                    }
-                }
-            }
-        }
+            "query": {
+                "bool": {
+                    "must": [
+                        {
+                            "nested" : {
+                                "query" : {
+                                    "bool" : {
+                                        "must" : [
+                                            {
+                                                "match_phrase" : {
+                                                    "labels.relationId" : {
+                                                        "query" : "1677"
+                                                    }
+                                                }
+                                            }
+                                        ]
+                                    }
+                                },
+                                "path" : "labels"
+                            }
+                        }
+                    ]
+                }
+            },
+            "size":0,
+            "aggs":{
+                "duplicate_titles":{
+                    "terms":{
+                        "field":"title.keyword",
+                        "min_doc_count":2,
+                        "size":1000
+                    },
+                    "aggs":{
+                        "duplicate_docs":{
+                            "top_hits":{
+                                "_source":{
+                                    "includes":["id","title","subjectId","sourceAddress","createDate","labels.relationId","attachmentIds"]
+                                },
+                                "size":10
+                            }
+                        }
+                    }
+                }
+            }
+        }
        # filter_path = ['hits.aggregations.duplicate_titles',
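The rewritten query scopes deduplication to documents whose nested labels.relationId matches 1677, and groups by title.keyword instead of sourceAddress.keyword. Note that a terms aggregation returns at most `size` buckets (1000 here), so duplicate groups beyond that are silently missed. If exhaustive coverage matters, a composite aggregation can page through every title via after_key; a body sketch (composite sources cannot express min_doc_count, so the >= 2 filter moves client-side):

composite_body = {
    "size": 0,
    "aggs": {
        "dup": {
            "composite": {
                "size": 1000,
                "sources": [{"title": {"terms": {"field": "title.keyword"}}}]
            }
        }
    }
}
# Re-issue the search with {"after": <after_key>} inside "composite" until the
# response no longer returns an after_key, keeping buckets with doc_count >= 2.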
@@ -66,6 +90,12 @@ class EsMethod(object):
         # log.info(result)
         return result

+    def delete(self, index_name, id):
+        result = self.es.delete(index=index_name
+                                , doc_type="_doc"
+                                , id=id)
+        log.info('删除结果 %s' % result)
+

 def main(page, p, esMethod):
     result = esMethod.queryatt(index_name=esMethod.index_name, pnum=p)
@@ -75,18 +105,10 @@ def main(page, p, esMethod):
         log.info('++++已没有数据+++++')
         return
     documents = result["aggregations"]["duplicate_titles"]["buckets"]
-    for bucket in documents:
-        info_list = bucket["duplicate_docs"]["hits"]["hits"]
-        for info in info_list:
-            att_id_list = info['_source']['attachmentIds']
-            if len(att_id_list) == 0:
-                unique_document_ids = info["_id"]
-                log.info(f'==={unique_document_ids}===')
-    # # Delete the duplicated documents
-    # for doc_id in unique_document_ids:
-    #     esMethod.delete(index="policy", id=doc_id)
+    unique_document_ids = [bucket["duplicate_docs"]["hits"]["hits"][-1]["_id"] for bucket in documents]
+    # Delete the duplicated documents
+    for doc_id in unique_document_ids:
+        esMethod.delete(index_name="policy", id=doc_id)
......
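The rewritten main() deletes hits[-1] of each bucket, that is, one essentially arbitrary member per duplicate group per pass, since top_hits is relevance-ordered unless given an explicit sort. A sketch that instead keeps the newest copy and deletes the rest, assuming createDate compares correctly as a string (e.g. 'YYYY-MM-DD HH:MM:SS'); the helper name is an assumption:

def dedupe_bucket(esMethod, bucket, index_name='policy'):
    hits = bucket["duplicate_docs"]["hits"]["hits"]
    # Keep the most recently created document, delete every other duplicate.
    keep = max(hits, key=lambda h: h["_source"].get("createDate", ""))
    for hit in hits:
        if hit["_id"] != keep["_id"]:
            esMethod.delete(index_name, hit["_id"])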