
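# 1. Records with no year or no social credit code are stored under a separate Redis key.
# 2. Records that have a social credit code, an id and a year:
#    (1) the id in ES must be updated to the id from the attachment table;
#    (2) use the credit code to count the records with the same name. If there are two, one of them is
#        not in the ES index: fetch the id that is in ES and delete the record that is not in ES.
#        If there is exactly one record, that record needs to be uploaded.


# TODO: for entries with exactly one record, update first; save the others to another Redis key for now.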
import json
import threading

import redis
import requests, re, time, pymysql,  fitz
import urllib3
from kafka import KafkaProducer

from base import BaseCore

# Silence the InsecureRequestWarning that urllib3 emits for the verify=False requests made below.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Shared project helper: supplies the logger, random User-Agent strings and language detection used below.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
# NOTE(review): the Redis and MySQL hosts and passwords are hard-coded in source; consider moving them
# to configuration or environment variables.
pool = redis.ConnectionPool(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
# One cursor shared by every worker thread; the query helpers serialize access to it with `lock`.
cursor_ = cnx.cursor()
lock = threading.Lock()

# Not referenced in the visible part of this file.
taskType = '企业年报'
pathType = 'QYYearReport/'

def sendKafka(dic_news):
    """Publish one record to the ``researchReportYearTopic`` Kafka topic.

    The record is serialized as UTF-8 JSON and the call blocks for up to
    10 seconds waiting for the broker acknowledgement. The producer created
    for the call is always closed afterwards so its sockets and background
    thread are not leaked.

    :param dic_news: JSON-serializable dict to publish.
    :return: True if the message was acknowledged, False on any error
        (the error is logged, not raised).
    """
    producer = None
    try:  # 114.116.116.241
        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'],
                                 max_request_size=1024 * 1024 * 20)
        kafka_result = producer.send("researchReportYearTopic",
                                     json.dumps(dic_news, ensure_ascii=False).encode('utf8'))

        # Waiting on the future makes delivery failures surface in this call.
        print(kafka_result.get(timeout=10))

        dic_result = {
            'success': 'true',
            'message': '操作成功',
            'code': '200',
        }
        log.info(dic_result)
        return True
    except Exception as e:
        dic_result = {
            'success': 'false',
            'message': '操作失败',
            'code': '204',
            'e': e
        }
        log.info(dic_result)
        return False
    finally:
        # Release the producer's network resources whether or not the send worked.
        if producer is not None:
            try:
                producer.close(timeout=10)
            except Exception as close_err:
                # A failure while closing must not mask the send result.
                log.info(f'关闭KafkaProducer失败: {close_err}')

def getContent(file_href):
    """Download the PDF at *file_href* and return the text of all its pages.

    The download is attempted up to three times with a 3 second pause
    between attempts. TLS certificate verification is disabled and a random
    User-Agent from ``baseCore`` is sent.

    :param file_href: URL of the PDF file.
    :return: text of every page, concatenated in page order.
    :raises requests.RequestException: if every download attempt fails.
    """
    headers = {'User-Agent': baseCore.getRandomUserAgent()}
    max_attempts = 3
    response = None
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(file_href, headers=headers, verify=False, timeout=20)
            break
        except requests.RequestException as e:
            log.info(f'下载失败({attempt}/{max_attempts}): {file_href} {e}')
            if attempt == max_attempts:
                # Surface the real network error instead of continuing with an
                # unbound `response`.
                raise
            time.sleep(3)

    with fitz.open(stream=response.content, filetype='pdf') as doc:
        log.info(f'当前页码----{doc.page_count}')
        # join avoids repeated string concatenation on large documents.
        return ''.join(page.get_text() for page in doc.pages())

def secrchATT(type_id, xydm, year):
    """Fetch the attachment rows matching a credit code, type and year.

    Runs a parameterized query against ``clb_sys_attachment`` on the shared
    cursor while holding the module lock.

    :param type_id: value matched against the ``type_id`` column.
    :param xydm: value matched against the ``item_id`` column.
    :param year: value matched against the ``year`` column.
    :return: all matching rows, as returned by ``cursor_.fetchall()``.
    """
    sel_sql = '''select * from clb_sys_attachment where item_id=%s and type_id=%s and year=%s'''
    # `with` releases the lock even if the query raises, so a failing query
    # cannot leave other threads blocked forever.
    with lock:
        cursor_.execute(sel_sql, (xydm, type_id, year))
        return cursor_.fetchall()

def selectShortName(xydm):
    """Fetch the enterprise row for a social credit code.

    Runs a parameterized query against ``sys_base_enterprise`` on the shared
    cursor while holding the module lock.

    :param xydm: value matched against the ``social_credit_code`` column.
    :return: the first matching row from ``cursor_.fetchone()``, or None if
        there is no match.
    """
    sel_sql = "select * from sys_base_enterprise where social_credit_code = %s"
    # `with` releases the lock even if the query raises, so a failing query
    # cannot leave other threads blocked forever.
    with lock:
        # Parameters are passed as a one-element tuple, the documented form.
        cursor_.execute(sel_sql, (xydm,))
        return cursor_.fetchone()

def main():
    """Process a batch of ``credit_code|attachment_id|year`` records.

    For each record:

    * if the credit code or year is empty, push it to the Redis list ``info``;
    * if more than one attachment row matches, push it to ``NianBao:info``;
    * if exactly one row matches but it has no name, push it to
      ``Noname:info``;
    * if exactly one named row matches, download the PDF, extract its text
      and publish the record to Kafka;
    * if no row matches, do nothing.

    A problem with one record (missing name, failed download) skips only
    that record; the rest of the batch is still processed.
    """
    redis_conn = redis.Redis(connection_pool=pool)

    # NOTE: hard-coded batch standing in for `redis_conn.lpop("NoIPO:info")`.
    info_list = ['91130100236018805C|18703781588|2018', '915203002147892034|18703781589|2013',
                 '913200007455797746|18703781592|2018', '91440500723817938W|18703781594|2019',
                 '91340000704920454F|18703781596|2021']
    for info_ in info_list:
        if not info_:
            # An empty entry is treated as "no more data": stop the batch.
            log.info("++++已没有数据++++")
            return

        # Values popped from Redis would need `info_.decode()` here.
        info = info_
        fields = info.split('|')
        xydm, att_id, year = fields[0], fields[1], fields[2]
        if not xydm or not year:
            redis_conn.lpush('info', info)
            continue

        selects = secrchATT('1', xydm, year)
        if len(selects) > 1:
            redis_conn.lpush('NianBao:info', info)
            continue
        if len(selects) != 1:
            # No attachment row for this record: nothing to do.
            continue

        # Column positions below are assumed from the variable names they
        # feed — TODO confirm against the clb_sys_attachment schema.
        select = selects[0]
        name = select[1]
        if not name:
            redis_conn.lpush('Noname:info', info)
            # Skip only this record; a `return` here would drop the rest of the batch.
            continue
        # Text before the first '.' is used as the title.
        file_name = name.split('.')[0]
        log.info(f'-----------{file_name}-----------')
        origin = select[18]
        create_time = select[13]
        publishDate = select[21]
        # presumably publishDate is stored as a string; a date object would
        # never equal this literal — TODO confirm.
        if publishDate == '2023-12-31':
            publishDate = '2023-08-31'
        file_href = 'http://zzsn.luyuen.com' + str(select[5])
        try:
            content = getContent(file_href)
        except Exception as e:
            # One unreadable or unreachable PDF must not abort the whole batch.
            log.info(f'获取文件内容失败,跳过 {info}: {e}')
            continue

        lang = baseCore.detect_language(content)
        if lang == 'cn':
            lang = 'zh'
        dic_info = {
            'attachmentIds': att_id,
            'author': '',
            'content': content,
            'contentWithTag': '',
            'createDate': str(create_time),
            'deleteFlag': '0',
            'id': '',
            'keyWords': '',
            'lang': lang,
            'origin': origin,
            'publishDate': publishDate,
            'sid': '1684032033495392257',
            'sourceAddress': '',  # link to the original article
            'summary': '',
            'title': file_name,
            'type': 1,
            'socialCreditCode': xydm,
            'year': year
        }
        sendKafka(dic_info)
        time.sleep(1)

def run_threads(num_threads):
    """Run ``main`` in *num_threads* threads and wait for all of them to finish.

    :param num_threads: number of worker threads to start.
    """
    workers = [threading.Thread(target=main, args=()) for _ in range(num_threads)]

    for worker in workers:
        worker.start()

    # Block until every worker has completed.
    for worker in workers:
        worker.join()

if __name__ == "__main__":
    # Number of worker threads started on each pass.
    num_threads = 1
    # NOTE(review): main() currently iterates a hard-coded list, so each pass of
    # this endless loop processes the same records again — confirm this is
    # intended before running against production Kafka.
    while True:
        start = time.time()
        run_threads(num_threads)

        # Report the real thread count rather than a fixed "5".
        log.info(f'{num_threads}线程  总耗时{time.time() - start}秒')