"""
读取文件 path = 'D:\kkwork\zzsn_spider\data\'
"""
import json
import os
import time

from kafka import KafkaProducer
from obs import ObsClient
import fitz
from urllib.parse import unquote
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from retry import retry

# Huawei Cloud OBS client shared by the whole module.
# NOTE(review): the access key pair below is committed in plain text. The
# environment variables OBS_ACCESS_KEY_ID / OBS_SECRET_ACCESS_KEY / OBS_SERVER
# take precedence so deployments can stop relying on the embedded values; the
# literals remain only as a backward-compatible fallback and should be rotated
# and removed from the source.
obsClient = ObsClient(
    # Huawei Cloud access key (AK)
    access_key_id=os.environ.get('OBS_ACCESS_KEY_ID', 'VEHN7D0TJ9316H8AHCAV'),
    # Huawei Cloud secret key (SK)
    secret_access_key=os.environ.get('OBS_SECRET_ACCESS_KEY',
                                     'heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY'),
    # Address of the bucket's OBS endpoint
    server=os.environ.get('OBS_SERVER', 'https://obs.cn-north-1.myhuaweicloud.com'),
)
import requests

import BaseCore
# Shared project helper; the logger and the database handles used below are
# taken from it.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
cnx, cursor = baseCore.cnx, baseCore.cursor

# Constants describing the attachments produced by this script.
taskType = '企业年报'  # task label handed to uptoOBS (used there for baseCore.recordLog)
pathType = 'QYYearReport/'  # prefix prepended to the file name to form the OBS object key
type_id = 1  # stored as 'type_id' in the attachment record
create_by = 'XueLingKun'  # stored as 'create_by' in the attachment record
# file_path = 'D:/kkwork/zzsn_spider/data/1_福布斯2000_PDF_50_郑'


def sendKafka(dic_news):
    """Publish one record to the ``researchReportYearTopic`` Kafka topic.

    The record is serialised as UTF-8 JSON (non-ASCII characters are kept
    as-is) and the call waits up to 10 seconds for the send result. The
    producer is created per call and is always closed before returning so
    that its network resources are not leaked.

    Args:
        dic_news: JSON-serialisable dict to publish.

    Returns:
        True when the send result was obtained, False when anything failed.
        Failures are logged and never raised to the caller.
    """
    producer = None
    try:  # 114.116.116.241
        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
        payload = json.dumps(dic_news, ensure_ascii=False).encode('utf8')
        kafka_result = producer.send("researchReportYearTopic", payload)

        # Blocks until the send completes or the timeout expires (raises on failure).
        print(kafka_result.get(timeout=10))
    except Exception as e:
        log.error({
            'success': 'false',
            'message': '操作失败',
            'code': '204',
            'e': e
        })
        return False
    else:
        # Delivery succeeded; record it in the log.
        log.info({
            'success': 'true',
            'message': '操作成功',
            'code': '200',
        })
        return True
    finally:
        if producer is not None:
            try:
                producer.close(timeout=10)
            except Exception as close_error:
                # A failed close must not change the outcome of the send itself.
                log.error(f'Kafka producer close failed: {close_error}')

# Keys copied from the caller's record into the record returned by uptoOBS.
_RET_DATA_KEYS = (
    'state', 'type_id', 'item_id', 'group_name', 'path', 'full_path',
    'category', 'file_size', 'status', 'create_by', 'create_time',
    'page_size', 'content',
)

# Browser-like headers sent with the PDF download request.
_PDF_REQUEST_HEADERS = {
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'sec-ch-ua': '"Chromium";v="112", "Microsoft Edge";v="112", "Not:A-Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.64',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-User': '?1',
    'Sec-Fetch-Dest': 'document',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
}


def _download_pdf(pdf_url, headers, attempts=3, timeout=30, pause=3):
    """Fetch ``pdf_url``, retrying on any exception; return the response or None."""
    for attempt in range(attempts):
        try:
            return requests.get(pdf_url, headers=headers, timeout=timeout)
        except Exception:
            # Best-effort retry: wait between attempts, but not after the last one.
            if attempt < attempts - 1:
                time.sleep(pause)
    return None


def uptoOBS(retData, pathType, taskType, start_time, file_name, pdf_url):
    """Download a PDF, upload it to OBS and extract its text.

    Args:
        retData: attachment record; must contain every key listed in
            ``_RET_DATA_KEYS``. It is not modified.
        pathType: prefix prepended to ``file_name`` to form the OBS object key.
        taskType: task label forwarded to ``baseCore.recordLog`` on failure.
        start_time: ``time.time()`` value taken when processing started; used
            to compute the elapsed time reported on failure.
        file_name: name of the object to create in OBS.
        pdf_url: URL the PDF is downloaded from.

    Returns:
        On success, a copy of ``retData`` with ``state`` set to True and
        ``path``, ``full_path``, ``create_time``, ``file_size``, ``page_size``
        and ``content`` filled in. When the download or the upload fails, or
        the PDF has no pages, the untouched ``retData`` is returned. When the
        upload result cannot be interpreted, the copy is returned with its
        ``state`` left as received from the caller.

    Raises:
        Whatever ``fitz.open`` / ``page.get_text`` raise for an unreadable
        document; those errors are not handled here.
    """
    log = baseCore.getLogger()
    social_code = retData['item_id']
    retData_f = {key: retData[key] for key in _RET_DATA_KEYS}

    response = _download_pdf(pdf_url, _PDF_REQUEST_HEADERS)
    if response is None:
        # Every attempt raised: report a download failure instead of letting an
        # unbound ``response`` surface later as a misleading upload error.
        log.error(f'PDF下载失败---{pdf_url}')
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, 0, takeTime, pdf_url, 'PDF下载失败')
        return retData

    try:
        result = getOBSres(pathType, file_name, response)
    except Exception as e:
        log.error(f'OBS发送失败')
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, 0, takeTime, pdf_url, f'{e}---OBS发送失败')
        return retData

    with fitz.open(stream=response.content, filetype='pdf') as doc:
        page_size = doc.page_count
        log.info(f'当前页码----{page_size}')
        # Join once instead of growing the string page by page.
        retData_f['content'] += ''.join(page.get_text() for page in doc.pages())

    # The uploaded object is exactly ``response.content``, so its length is the
    # file size; no second (timeout-less) HEAD request is needed.
    file_size = len(response.content)

    if page_size < 1:
        # PDF parsing produced no pages.
        return retData

    try:
        object_url = result['body']['objectUrl']
        updates = {
            'path': unquote(object_url.split('.com')[1]),
            'full_path': unquote(object_url),
            'create_time': time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
            'file_size': baseCore.convert_size(file_size),
            'page_size': page_size,
        }
    except Exception as e:
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, 0, takeTime, pdf_url, f'{e}')
        # ``state`` has not been switched to True, so the caller sees a failure.
        return retData_f

    # Mark success only after every derived field was computed.
    retData_f.update(updates)
    retData_f['state'] = True
    return retData_f


@retry(tries=3, delay=1)
def getOBSres(pathType, name, response):
    """Store the body of ``response`` in the ``zzsn`` OBS bucket.

    The object key is ``pathType`` immediately followed by ``name``. The call
    is wrapped by ``retry`` configured with ``tries=3, delay=1``.

    Returns:
        The value returned by ``obsClient.putContent``.
    """
    object_key = pathType + name
    return obsClient.putContent('zzsn', object_key, content=response.content)

# Seconds to wait before polling Redis again when no id is available.
_IDLE_WAIT_SECONDS = 20

# Value passed as the ``num`` argument of baseCore.tableUpdate.
_ATTACHMENT_NUM = 1


def _process_record(data, start_time):
    """Upload one ``fbspdfinfo`` row's PDF and publish it to Kafka.

    Args:
        data: row fetched from ``fbspdfinfo``; columns 1-5 are read as
            social code, English name, Chinese name, year and PDF URL.
        start_time: ``time.time()`` value taken when this row was picked up.

    Any exception raised while handling the row is logged and swallowed so the
    worker loop keeps running.
    """
    social_code = data[1]
    ename = data[2]
    cname = data[3]
    year = data[4]
    pdf_url = data[5]
    # f-strings also work when the year column comes back as a number.
    file_name = f'{cname}{year}年年度报告.pdf'
    publish_date = f'{year}-12-31'
    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
               'full_path': '',
               'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': create_by,
               'create_time': time_now, 'page_size': '', 'content': ''}
    log.info(f'开始处理{ename}---{social_code}')
    try:
        # Upload the file to the file server.
        retData_f = uptoOBS(retData, pathType, taskType, start_time, file_name, pdf_url)
        if not retData_f['state']:
            return
        content = retData_f['content']
        page_size = retData_f['page_size']
        log.info(f'当前页数为{page_size}')
        # Arguments: retData, com_name, year, pdf_name, num, pub_time
        att_id = baseCore.tableUpdate(retData_f, cname, year, file_name, _ATTACHMENT_NUM, publish_date)
        if not att_id:
            return
        dic_news = {
            'attachmentIds': att_id,
            'author': '',
            'content': content,
            'contentWithTag': '',
            'createDate': time_now,
            'deleteFlag': '0',
            'id': '',
            'keyWords': '',
            'lang': 'zh',
            'origin': '企业官网',
            'publishDate': publish_date,
            'sid': '1684032033495392257',
            'sourceAddress': pdf_url,  # link to the original document
            'summary': '',
            'title': file_name,
            'type': 1,
            'socialCreditCode': social_code,
            'year': year
        }
        if sendKafka(dic_news):
            # 100 marks success.
            new_state = 100
        else:
            # Remove the inserted attachment row; 400 marks a failed send.
            baseCore.deliteATT(att_id)
            log.info(f'已删除插入附件表的数据---{pdf_url}-----{social_code}')
            new_state = 400
        # Parameterized so a quote inside the URL cannot break the statement.
        # NOTE(review): assumes the driver behind baseCore.cursor uses the
        # '%s' paramstyle (e.g. pymysql / mysql-connector) - confirm.
        cursor.execute("update fbspdfinfo set state=%s where pdf_url = %s", (new_state, pdf_url))
        cnx.commit()
    except Exception as e:
        log.error(f'error------{e}')


if __name__ == '__main__':
    while True:
        start_time = time.time()

        record_id = baseCore.redicPullData('fbspdfinfo:id')
        if not record_id:
            # Presumably the Redis queue is empty (TODO confirm what
            # redicPullData returns in that case). Previously the missing id
            # was interpolated into the SQL text and the query raised.
            time.sleep(_IDLE_WAIT_SECONDS)
            continue
        cursor.execute("select * from fbspdfinfo where id=%s", (record_id,))
        data = cursor.fetchone()
        if data:
            _process_record(data, start_time)