import requests,time, json, sys
from kafka import KafkaProducer
from base import BaseCore
from obs import ObsClient
import fitz
from urllib.parse import unquote
# Shared project helper; it supplies the logger and both database connections used in this file.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
# Connection/cursor pair used below for the brpa_source_article table.
cnx = baseCore.cnx
cursor = baseCore.cursor

# Connection/cursor pair used below for the clb_sys_attachment table.
cnx_ = baseCore.cnx_
cursor_ = baseCore.cursor_

# NOTE(review): the access key and secret below are hard-coded in source. They should be
# rotated and loaded from configuration or the environment instead of being committed.
obsClient = ObsClient(
        access_key_id='VEHN7D0TJ9316H8AHCAV',  # Huawei Cloud access key (AK)
        secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY',  # Huawei Cloud secret key (SK)
        server='https://obs.cn-north-1.myhuaweicloud.com'  # endpoint address of the bucket
    )

# Convert a byte count into a human-readable size string
def convert_size(size_bytes):
    """Format a byte count as a two-decimal number followed by a unit.

    The value is divided by 1024 until it is below 1024 or the largest
    unit ('TB') has been reached.

    :param size_bytes: size in bytes.
    :return: a string such as ``'1.50 KB'``.
    """
    scaled = size_bytes
    suffix = 'bytes'
    # Step up one unit at a time; stop once the value fits the current unit.
    for bigger_suffix in ('KB', 'MB', 'GB', 'TB'):
        if not scaled >= 1024:
            break
        scaled = scaled / 1024
        suffix = bigger_suffix
    return f"{scaled:.2f} {suffix}"

def uptoOBS(pdf_url,pdf_name,type_id,social_code):
    """Download a PDF, upload it to the OBS bucket and extract its text.

    The PDF is fetched with ``requests`` (up to three attempts), stored in
    bucket ``'zzsn'`` under the key ``'QYNotice/' + pdf_name`` and parsed
    with ``fitz``; the upload/parse step is also attempted up to three times.

    When filling in the metadata fails, the error is recorded through
    ``baseCore.recordLog`` using the module-level globals ``start_time`` and
    ``taskType`` (assigned in the ``__main__`` section of this file).

    :param pdf_url: URL the PDF is downloaded from.
    :param pdf_name: object name used inside the ``QYNotice/`` prefix.
    :param type_id: copied into the ``type_id`` field of the result.
    :param social_code: copied into the ``item_id`` field of the result.
    :return: dict describing the attachment. ``state`` is True only when the
        file was downloaded, uploaded, parsed into at least one page and all
        metadata fields were filled in; ``content`` holds the extracted text.
    """
    retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
               'full_path': '',
               'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
               'create_time': '', 'page_size': '', 'content': ''}
    headers = {'User-Agent': baseCore.getRandomUserAgent()}

    # Step 1: download the PDF (best effort, three attempts).
    response = None
    for _ in range(3):
        try:
            response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
            break
        except Exception as e:
            log.info(f'pdf下载失败:{pdf_url}===={e}')
            time.sleep(3)
    if response is None:
        # Nothing was downloaded, so there is nothing to upload or parse.
        return retData

    pdf_bytes = response.content
    # Content-Length may be absent (e.g. chunked responses); fall back to the
    # size of the downloaded body instead of treating the download as failed.
    try:
        file_size = int(response.headers.get('Content-Length'))
    except (TypeError, ValueError):
        file_size = len(pdf_bytes)

    # Step 2: upload to OBS and extract the text (three attempts).
    result = None
    page_size = 0
    for _ in range(3):
        try:
            if result is None:
                # Do not re-upload when only the parsing step is being retried.
                result = obsClient.putContent('zzsn', 'QYNotice/' + pdf_name, content=pdf_bytes)
            with fitz.open(stream=pdf_bytes, filetype='pdf') as doc:
                page_texts = [page.get_text() for page in doc.pages()]
                page_count = doc.page_count
            # Assign (not append) so a retried attempt cannot duplicate text.
            retData['content'] = ''.join(page_texts)
            page_size = page_count
            break
        except Exception as e:
            log.info(f'pdf上传或解析失败:{pdf_url}===={e}')
            time.sleep(3)

    if page_size < 1:
        # PDF could not be parsed.
        return retData

    try:
        object_url = result['body']['objectUrl']
        retData['path'] = unquote(object_url.split('.com')[1])
        retData['full_path'] = unquote(object_url)
        retData['file_size'] = convert_size(file_size)
        retData['create_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        retData['page_size'] = page_size
        # Only report success once every field above has been filled in.
        retData['state'] = True
    except Exception as e:
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
    return retData

def secrchATT(item_id, name, type_id,order_by):
    """Look up an attachment row in ``clb_sys_attachment``.

    :param item_id: value matched against the ``item_id`` column.
    :param name: attachment name without the ``.pdf`` suffix (appended here).
    :param type_id: value matched against the ``type_id`` column.
    :param order_by: value matched against the ``order_by`` column.
    :return: the last row returned by the query (its first element is the
        ``id`` column), or ``None`` when no row matches.
    """
    sel_sql = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s and order_by=%s '''
    cursor_.execute(sel_sql, (item_id, name+'.pdf', type_id,order_by))
    rows = cursor_.fetchall()
    if not rows:
        # Indexing an empty result would raise IndexError; report "not found" instead.
        return None
    return rows[-1]

# Insert into the attachment table and return the attachment id
def tableUpdate(retData, com_name, year, pdf_name, num):
    """Insert one attachment row into ``clb_sys_attachment`` and return its id.

    :param retData: attachment description dict as produced by ``uptoOBS``.
    :param com_name: not used by this function; kept so existing calls work.
    :param year: value for the ``year`` column.
    :param pdf_name: attachment name without the ``.pdf`` suffix.
    :param num: value for the ``order_by`` column.
    :return: the id of the matching attachment row, or ``''`` when no such
        row can be found (callers treat a falsy value as failure).
    """
    item_id = retData['item_id']
    type_id = retData['type_id']
    group_name = retData['group_name']
    path = retData['path']
    full_path = retData['full_path']
    category = retData['category']
    file_size = retData['file_size']
    status = retData['status']
    create_by = retData['create_by']
    page_size = retData['page_size']
    create_time = retData['create_time']
    order_by = num
    file_name = pdf_name + '.pdf'
    try:
        Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size,object_key,bucket_name) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''

        # The object key is the part of the full URL after the bucket host.
        object_key = full_path.split('https://zzsn.obs.cn-north-1.myhuaweicloud.com/')[1]
        values = (
            year, file_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
            status, create_by,
            create_time, page_size, object_key, 'zzsn')
        cursor_.execute(Upsql, values)  # insert
        cnx_.commit()  # commit
        log.info(f"更新完成:{item_id}===={file_name}")
    except Exception as e:
        # Report the failure instead of claiming the update completed.
        log.info(f"附件入库失败:{item_id}===={file_name}===={e}")

    # As in the original flow, the lookup runs even after a failed insert,
    # because a matching row may already exist.
    try:
        selects = secrchATT(item_id, pdf_name, type_id, order_by)
    except IndexError:
        selects = None
    if not selects:
        return ''
    return selects[0]


def InsterInto(social_code, pdf_url,pub_time,pdf_name):
    """Record a processed announcement in ``brpa_source_article``.

    On failure the error is logged and recorded through
    ``baseCore.recordLog`` using the module-level globals ``start_time`` and
    ``taskType`` (assigned in the ``__main__`` section of this file).

    :param social_code: stored in ``social_credit_code``.
    :param pdf_url: stored in ``source_address``.
    :param pub_time: stored in ``publish_time``.
    :param pdf_name: stored in ``title``.
    :return: True when the row was inserted and committed, otherwise False.
    """
    insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,publish_time,title,create_time) values(%s,%s,%s,%s,%s,%s,now())'''
    row = (social_code, pdf_url, '东方财富网', '1', pub_time, pdf_name)
    try:
        # The original comment refers to this connection as the "144 database".
        cursor.execute(insert_sql, row)
        cnx.commit()
    except Exception as e:
        # Catch Exception (not a bare except) so Ctrl-C / SystemExit still
        # propagate, and keep the real error in the log.
        log.info(f'数据库传输失败:{pdf_url}===={e}')
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '数据库传输失败')
        return False
    return True


def ifInstert(short_name, social_code, pdf_url):
    """Tell whether an announcement still has to be processed.

    Looks for a row in ``brpa_source_article`` with the given social credit
    code and source address, ``type='1'`` and the same ``origin`` value that
    ``InsterInto`` writes ('东方财富网'). The previous filter used a different
    origin and therefore could never match rows written by this script.

    :param short_name: company name, used only in the log message.
    :param social_code: matched against ``social_credit_code``.
    :param pdf_url: matched against ``source_address``.
    :return: False when a matching row already exists, True otherwise.
    """
    sel_sql = '''select social_credit_code,source_address from brpa_source_article where social_credit_code = %s and source_address = %s and origin='东方财富网' and type='1' '''
    cursor.execute(sel_sql, (social_code, pdf_url))
    # Skip the announcement when it is already in the database.
    if cursor.fetchone():
        log.info(f'com_name:{short_name}、{pdf_url}已存在')
        return False
    return True

def GetContent(pdf_url,info_url, pdf_name, social_code, year, pub_time, start_time,com_name,num):
    """Store one announcement PDF and publish its content to Kafka.

    Steps: upload/parse the PDF via ``uptoOBS`` (attachment type id 8),
    insert the attachment row via ``tableUpdate`` (with ``num + 1`` as the
    order value), then send a JSON message to the Kafka topic
    ``researchReportTopicaaaas``.

    A Kafka failure is recorded through ``baseCore.recordLog`` using the
    module-level global ``taskType``.

    :param pdf_url: download URL of the PDF.
    :param info_url: announcement page URL, sent as ``sourceAddress``.
    :param pdf_name: announcement title; also used as the attachment name.
    :param social_code: company social credit code.
    :param year: year of the announcement.
    :param pub_time: publish date, sent as ``publishDate``.
    :param start_time: start timestamp used to compute the elapsed time.
    :param com_name: company name, forwarded to ``tableUpdate``.
    :param num: base order number; ``num + 1`` is stored.
    :return: True when the message was delivered to Kafka, False on any failure.
    """
    # Upload to the Huawei Cloud bucket and parse the PDF.
    retData = uptoOBS(pdf_url, pdf_name, 8, social_code)
    if not retData['state']:
        log.info('====pdf解析失败====')
        return False

    # Insert the attachment into the attachment table.
    num = num + 1
    att_id = tableUpdate(retData, com_name, year, pdf_name, num)
    if not att_id:
        return False

    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    dic_news = {
        'attachmentIds': att_id,
        'author': '',
        'content': retData['content'],
        'contentWithTag': '',
        'createDate': time_now,
        'deleteFlag': '0',
        'id': '',
        'keyWords': '',
        'lang': 'zh',
        'origin': '东方财富网',
        'publishDate': pub_time,
        'sid': '1684032033495392257',
        'sourceAddress': info_url,  # link to the original announcement page
        'summary': '',
        'title': pdf_name.replace('.pdf', ''),
        'type': 3,
        'socialCreditCode': social_code,
        'year': year
    }

    # Send the fields through Kafka for persistence.
    producer = None
    try:
        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
        kafka_result = producer.send("researchReportTopicaaaas",
                                     json.dumps(dic_news, ensure_ascii=False).encode('utf8'))

        # Block until the broker acknowledges the message (or 10 s pass).
        print(kafka_result.get(timeout=10))

        dic_result = {
            'success': 'true',
            'message': '操作成功',
            'code': '200',
        }
        log.info(dic_result)
        return True
    except Exception as e:
        dic_result = {
            'success': 'false',
            'message': '操作失败',
            'code': '204',
            'e': e
        }
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, 'Kafka操作失败')
        log.info(dic_result)
        return False
    finally:
        # A producer is created per call, so release its connections and
        # background thread before returning.
        if producer is not None:
            producer.close(timeout=10)


def _fetch_json(url):
    """GET *url* and return the decoded JSON body.

    Up to three attempts are made with a 5 second pause in between; when the
    third attempt fails the process exits via ``sys.exit(0)``.
    """
    for attempt in range(3):
        try:
            return requests.get(url, verify=False, timeout=20).json()
        except Exception as e:
            log.info(f'请求失败:{url}===={e}')
            if attempt == 2:
                sys.exit(0)
            time.sleep(5)


def gonggao_info(dic_info):
    """Collect the Eastmoney announcements of one company.

    Walks the announcement list API page by page (50 items per page, pages
    1-99). For every announcement that is not yet recorded, the PDF is
    processed through ``GetContent`` and then registered with ``InsterInto``.

    Reads the module-level globals ``start_time``, ``taskType`` and ``num``
    (assigned in the ``__main__`` section of this file).

    :param dic_info: company record; index 2 is used as the social credit
        code, index 3 as the stock code and index 4 as the company name.
    :return: None.
    """
    code = dic_info[3]
    com_name = dic_info[4]
    social_code = dic_info[2]
    if 'HK' in code:
        return

    # Left-pad the stock code with zeros to six characters.
    code1 = str(code).zfill(6)

    # The first digit of the code selects the prefix used in the detail-page URL.
    first_digit = code1[0]
    if first_digit in ('0', '3', '2'):
        com_code = 'SZ' + code1
    elif first_digit in ('6', '9'):
        com_code = 'SH' + code1
    elif first_digit in ('8', '4'):
        com_code = 'BJ' + code1
    else:
        # Without a prefix the detail URL cannot be built; skip this company
        # instead of failing later on an unbound variable.
        log.info(f'======={com_name}========{code}===无法识别交易所前缀')
        return

    for page1 in range(1, 100):
        url = f'https://np-anotice-stock.eastmoney.com/api/security/ann?sr=-1&page_size=50&page_index={page1}&ann_type=A&client_source=web&stock_list={code1}&f_node=0&s_node=0'
        res_json = _fetch_json(url)
        list_all = res_json['data']['list']
        if not list_all:
            # An empty page means there is nothing further to collect.
            break

        reached_cutoff = False
        for one_info in list_all:
            title = one_info['title']
            info_date = one_info['notice_date']
            year = info_date[:4]
            if page1 > 1 and '2022' in info_date:
                reached_cutoff = True
                break
            if '2021' in info_date:  # only data from 2022 onwards is collected
                reached_cutoff = True
                break

            art_code = one_info['art_code']
            info_url = 'https://data.eastmoney.com/notices/detail/' + com_code + '/' + art_code + '.html'
            t = int(time.time() * 1000)
            json_url = f'https://np-cnotice-stock.eastmoney.com/api/content/ann?art_code={art_code}&client_source=web&page_index=1&_={t}'
            json_2 = _fetch_json(json_url)
            try:
                pdf_url = json_2['data']['attach_url']
            except (KeyError, TypeError, IndexError):
                pdf_url = ''
            if not pdf_url:
                # Nothing to download; avoid the retry/sleep cycle in uptoOBS.
                log.info(f'======={com_name}========{title}===无pdf链接')
                continue

            # InsterInto stores pdf_url as source_address, so the duplicate
            # check has to use the same value.
            if not ifInstert(com_name, social_code, pdf_url):
                log.info(f'======={com_name}========{code}===已存在')
                continue

            # Download, upload, parse and publish the PDF.
            result = GetContent(pdf_url, info_url,title, social_code, year, info_date, start_time, com_name, num)
            if not result:
                continue

            log.info(f'{com_name}==============解析传输操作成功')
            state = 1
            takeTime = baseCore.getTimeCost(start_time, time.time())
            baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '成功')

            # Only after Kafka delivery succeeded is the article recorded in the database.
            insert = InsterInto(social_code, pdf_url, info_date, title)
            if insert:
                log.info(f'===={social_code}========{title}=====插入库成功')

        if reached_cutoff:
            break

        #         list_info = [
        #             social_code,
        #             title,
        #             info_content[:2000],
        #             info_date,
        #             info_url,
        #             pdf_url,
        #             '东方财富网',
        #             '1',
        #             'zh'
        #         ]
        #         # list_all_info.append(tuple(list_info))
        #         with cnx.cursor() as cursor:
        #             sel_sql = '''select social_credit_code from brpa_source_article where source_address = %s '''
        #
        #             cursor.execute(sel_sql, info_url)
        #             selects = cursor.fetchall()
        #             if selects:
        #                 break
        #             else:
        #                 #todo:取消入库操作
        #                 insert_sql = '''insert into brpa_source_article(social_credit_code,title,summary,publish_date,source_address,pdf_address,origin,type,lang) values(%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
        #                 cursor.execute(insert_sql, tuple(list_info))
        #                 cnx.commit()
        # else:
        #     break

if __name__ =='__main__':
    # The names assigned here (num, taskType, start_time, social_code) are
    # module-level globals that the functions above read directly.
    num = 0
    taskType = '企业公告/东方财富网'
    list_c = []
    list_all_info_1 = []

    while True:
        start_time = time.time()
        # Fetch the company to process. The Redis pull is disabled and a
        # fixed social credit code is used instead:
        # social_code = baseCore.redicPullData('NoticeEnterpriseEasteFinance:gnshqy_socialCode')
        social_code = '911100007109288314'
        # Nothing to do yet (empty value or the literal string 'None'): wait and retry.
        if not social_code or social_code == 'None':
            time.sleep(20)
            continue

        dic_info = baseCore.getInfomation(social_code)
        count, code, com_name = dic_info[15], dic_info[3], dic_info[4]
        log.info(f'-----开始处理{com_name}----{social_code}------')
        gonggao_info(dic_info)



