import datetime
import re
import time

import numpy as np
import pandas as pd
import pymongo
import requests
import os
import json
import uuid
from urllib.parse import unquote

from fitz import fitz
from kafka import KafkaProducer
from obs import ObsClient
from retry import retry

from base import BaseCore

# NOTE(review): the connection addresses and credentials below are hard-coded
# in source; consider loading them from configuration or environment variables
# and rotating any secrets that have been committed.

# MongoDB collection handle: database `RESCenter`, collection `REITsFundAnncmnt`.
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').RESCenter[
    'REITsFundAnncmnt']
# Huawei Cloud OBS client; used by obsOperate.sendOBS to upload file contents.
obsClient = ObsClient(
    access_key_id='VEHN7D0TJ9316H8AHCAV',  # your Huawei Cloud access key (AK)
    secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY',  # your Huawei Cloud secret key (SK)
    server='https://obs.cn-north-1.myhuaweicloud.com'  # address (endpoint) of your bucket
)
# Project helper object; the cursors, connections and logger used throughout
# this file are taken from it.
baseCore = BaseCore.BaseCore()
cursor_ = baseCore.cursor_
cnx_ = baseCore.cnx_
cursor = baseCore.cursor
cnx = baseCore.cnx
log = baseCore.getLogger()
# Request headers sent by getJson to the query.sse.com.cn JSONP endpoint.
headers = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Host': 'query.sse.com.cn',
    'Pragma': 'no-cache',
    'Referer': 'http://www.sse.com.cn/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
# Minimal header set carrying only a User-Agent.
# NOTE(review): not referenced anywhere in the visible part of this file.
headers_ = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}


class obsOperate():
    """Download a PDF, upload it to OBS and register it in ``clb_sys_attachment``.

    The instance holds a DB cursor/connection pair used for the attachment
    table and a logger; the OBS client itself is the module-level
    ``obsClient``.
    """

    # (connect, read) timeout in seconds for file downloads, so a stalled
    # connection raises (and is retried) instead of blocking forever.
    request_timeout = (10, 60)

    def __init__(self, cursor_, cnx_, log):
        """Store the DB cursor, DB connection and logger used by the methods."""
        # Browser-like headers sent when downloading files from www.sse.com.cn.
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Host': 'www.sse.com.cn',
            'Pragma': 'no-cache',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
        }
        self.cursor_ = cursor_
        self.cnx_ = cnx_
        self.log = log

    def secrchATT(self, item_id, file_name, type_id, order_by):
        """Look up an attachment row by item id, name, type id and order.

        Returns whatever ``cursor.fetchone()`` yields for the ``id`` column
        query: the first matching row, or ``None`` when nothing matches.
        """
        sel_sql = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s and order_by=%s '''
        self.cursor_.execute(sel_sql, (item_id, file_name, type_id, order_by))
        return self.cursor_.fetchone()

    # Insert into the attachment table and return the attachment id
    def tableUpdate(self, retData, com_name, file_name, num, pub_time):
        """Insert one attachment row and return ``(attachment_id, full_path)``.

        Args:
            retData: result dict produced by ``uptoOBS``.
            com_name: unused; kept for interface compatibility.
            file_name: value stored in the ``name`` column.
            num: value stored in the ``order_by`` column.
            pub_time: value stored in the ``publish_time`` column.

        Raises:
            TypeError: if the freshly inserted row cannot be found again
                (``secrchATT`` returned ``None``).
        """
        item_id = retData['item_id']
        type_id = retData['type_id']
        path = retData['path']
        full_path = retData['full_path']
        order_by = num

        Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,object_key,bucket_name,publish_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''

        # `path` is stored twice: once as path and once as object_key.
        values = (
            file_name, type_id, item_id, retData['group_name'], path, full_path,
            retData['category'], retData['file_size'], order_by,
            retData['status'], retData['create_by'],
            retData['create_time'], path, 'zzsn', pub_time)

        self.cursor_.execute(Upsql, values)  # insert
        self.cnx_.commit()  # commit
        self.log.info("更新完成:{}".format(Upsql))
        # Read the row back to obtain its id.
        selects = self.secrchATT(item_id, file_name, type_id, order_by)
        att_id = selects[0]
        return att_id, full_path

    def getuuid(self):
        """Return a new ``uuid.uuid1()`` value (time-based UUID)."""
        return uuid.uuid1()

    # Format a file size
    def convert_size(self, size_bytes):
        """Format a byte count as a string such as ``'1.50 KB'``.

        The value is divided by 1024 until it drops below 1024 or the largest
        unit (TB) is reached, then rendered with two decimals.
        """
        units = ['bytes', 'KB', 'MB', 'GB', 'TB']
        i = 0
        while size_bytes >= 1024 and i < len(units) - 1:
            size_bytes /= 1024
            i += 1
        return f"{size_bytes:.2f} {units[i]}"

    @retry(tries=5, delay=10)
    def getRes(self, file_href):
        """GET ``file_href`` and return the response.

        Raises:
            RuntimeError: when the status code is not 200 (this is what lets
                the ``@retry`` decorator try again).
        """
        response = requests.get(file_href, headers=self.headers, timeout=self.request_timeout)
        if response.status_code != 200:
            # A bare `raise` here has no active exception to re-raise; raise
            # an explicit error that says what went wrong.
            raise RuntimeError(f'unexpected HTTP status {response.status_code} for {file_href}')
        return response

    @retry(tries=5, delay=10)
    def sendOBS(self, file_name, response):
        """Upload ``response.content`` to bucket ``zzsn`` under ``PolicyDocument/``.

        Returns the OBS client's result object unchanged.
        """
        # NOTE(review): the result status is not checked here; a failed upload
        # is only detected later when the object URL is read from the result.
        return obsClient.putContent('zzsn', 'PolicyDocument/' + file_name, content=response.content)

    def _content_size(self, response):
        """Return the body size in bytes, preferring the Content-Length header."""
        try:
            return int(response.headers.get('Content-Length'))
        except (TypeError, ValueError):
            # Header missing or not numeric: fall back to the downloaded body.
            return len(response.content)

    def uptoOBS(self, file_href, item_id, file_name):
        """Download a PDF, extract its text and upload the file to OBS.

        Args:
            file_href: URL of the PDF.
            item_id: value stored under ``'item_id'`` in the result.
            file_name: ignored; the uploaded object is named with a fresh
                UUID plus the extension taken from ``file_href``.

        Returns:
            A dict whose ``'state'`` is ``True`` only when download, PDF text
            extraction, upload and result parsing all succeeded. On success
            it also carries ``path``, ``full_path``, ``file_size``,
            ``create_time`` and the extracted ``content``.
        """
        category = os.path.splitext(file_href)[1]
        retData = {'state': False, 'type_id': 15, 'item_id': item_id, 'group_name': '', 'path': '',
                   'full_path': '',
                   'category': category, 'file_size': '', 'status': 1, 'create_by': 'LiuLiYuan',
                   'create_time': '', 'page_size': '', 'content': ''}
        try:
            response = self.getRes(file_href)
        except Exception as e:
            self.log.error(f'文件获取失败==={e}')
            return retData

        try:
            with fitz.open(stream=response.content, filetype='pdf') as doc:
                retData['content'] = ''.join(page.get_text() for page in doc.pages())
        except Exception as e:
            self.log.error(f'文件解析失败==={e}')
            return retData

        file_size = self._content_size(response)
        file_name = str(self.getuuid()) + category
        try:
            result = self.sendOBS(file_name, response)
        except Exception as e:
            self.log.error(f'obs上传失败==={e}')
            return retData

        try:
            object_url = result['body']['objectUrl']
            path = object_url.split('.com')[1]
            full_path = unquote(object_url)
        except Exception as e:
            # Leave 'state' False so callers do not record an attachment
            # with empty paths.
            self.log.error(f'error:{e}')
            return retData

        retData['path'] = path
        retData['full_path'] = full_path
        retData['file_size'] = self.convert_size(file_size)
        retData['create_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        # Flag success last, once every field has been filled in.
        retData['state'] = True
        return retData


# Fetch JSON data
@retry(tries=5, delay=15)
def getJson(url):
    """Fetch a JSONP response from ``url`` and return the decoded JSON payload.

    The payload is the text between the first ``(`` and the last ``)`` on the
    same line of the response body. Any exception raised here triggers the
    ``@retry`` decorator (up to 5 tries, 15 seconds apart).

    Raises:
        ValueError: if the response contains no ``(...)`` wrapper or the
            wrapped text is not valid JSON.
    """
    # NOTE(review): the proxy is fetched but never passed to requests.get;
    # the call is kept in case get_proxy() has side effects — confirm whether
    # the request was meant to go through the proxy.
    baseCore.get_proxy()
    # Timeout so a stalled connection raises and is retried instead of hanging.
    req = requests.get(url, headers=headers, timeout=30)
    try:
        req.encoding = req.apparent_encoding
        # Raw string: '\(' is an invalid escape sequence in a normal literal.
        wrapped = re.search(r'\((.*)\)', req.text)
        if wrapped is None:
            raise ValueError(f'no JSONP payload found in response from {url}')
        return json.loads(wrapped.group(1))
    finally:
        # Release the connection even when parsing fails.
        req.close()


# Get the total number of pages
def getTotal():
    """Return the page count reported by the REITS_BULLETIN query.

    Requests page 1 of the listing through ``getJson`` and converts the
    ``pageHelp.pageCount`` field of the reply to ``int``.
    """
    first_page_url = f'http://query.sse.com.cn/commonSoaQuery.do?jsonCallBack=jsonpCallback42283&sqlId=REITS_BULLETIN&isPagination=true&fundCode=&startDate=&endDate=&pageHelp.pageSize=25&pageHelp.cacheSize=1&pageHelp.pageNo=1&pageHelp.beginPage=1&pageHelp.endPage=5&_={int(time.time())}'
    page_help = getJson(first_page_url)['pageHelp']
    return int(page_help['pageCount'])


# Get the basic information of the PDF files
def getDataList(page):
    """Return the announcements listed on one result page.

    Each entry is a list ``[title, pub_time, href, name, code]`` built from
    the ``title``, ``sseDate``, ``url``, ``fundExtAbbr`` and ``securityCode``
    fields of a record; ``href`` is the record URL with backslashes removed
    and prefixed with ``http://www.sse.com.cn``.
    """

    def _to_row(record):
        # Read the fields in a fixed order, then arrange them for the caller.
        short_name = record['fundExtAbbr']
        headline = record['title']
        published = record['sseDate']
        security_code = record['securityCode']
        link = 'http://www.sse.com.cn' + record['url'].replace('\\', '')
        return [headline, published, link, short_name, security_code]

    query_url = f'http://query.sse.com.cn/commonSoaQuery.do?jsonCallBack=jsonpCallback42283&sqlId=REITS_BULLETIN&isPagination=true&fundCode=&startDate=&endDate=&pageHelp.pageSize=25&pageHelp.cacheSize=1&pageHelp.pageNo={page}&pageHelp.beginPage={page}&pageHelp.endPage=5&_={int(time.time())}'
    records = getJson(query_url)['result']
    return [_to_row(record) for record in records]


def doJob(obsOperate):
    """Crawl every listing page and store each new announcement.

    For each announcement not yet present in ``db_storage`` (matched on
    code, href and exchange) the PDF is downloaded and uploaded through
    ``obsOperate.uptoOBS``, registered with ``obsOperate.tableUpdate`` and
    then inserted into ``db_storage``. Pages or items that fail are logged
    and skipped.

    Args:
        obsOperate: an ``obsOperate`` instance providing ``uptoOBS`` and
            ``tableUpdate``.
    """
    total = getTotal()
    log.info(f'共{total}页')
    # Running counter of stored attachments; passed on as the order_by value.
    num = 0
    for page in range(1, total + 1):
        log.info(f'开始采集第{page}页')
        try:
            info_list = getDataList(page)
        except Exception as e:
            log.error(f'第{page}页数据获取失败==={e}')
            time.sleep(5)
            continue
        for title, pub_time, href, name, code in info_list:
            is_insert = db_storage.find_one({'code': code, 'href': href, 'exchange': '上海证券交易所'})
            if is_insert:
                log.info(f'{title}===已采集')
                time.sleep(2)
                continue
            # Parse the date from the same 10-character prefix that is stored
            # as 'strDate', and do it before uploading so that a bad value
            # cannot leave an attachment row without its MongoDB document.
            try:
                str_date = pub_time[:10]
                pub_date = datetime.datetime.strptime(str_date, '%Y-%m-%d')
            except (TypeError, ValueError) as e:
                log.error(f'{title}===发布时间解析失败==={e}')
                continue
            file_title = title + '.pdf'
            retData = obsOperate.uptoOBS(href, f'{code}.SH', file_title)
            time.sleep(2)
            if not retData['state']:
                log.error(f'{title}===公告下载obs失败')
                continue
            _att_id, full_path = obsOperate.tableUpdate(retData, 'RETIs文件', file_title, num, pub_time)
            dic_news = {
                'code': code,  # security code
                'name': name,  # short name
                'title': title,  # announcement title
                'path': full_path,  # OBS path
                'href': href,  # link to the original document
                'content': retData['content'],  # text extracted from the PDF
                'date': pub_date,  # publication date
                'strDate': str_date,  # publication date as a string
                'exchange': '上海证券交易所'  # exchange
            }
            num += 1
            try:
                db_storage.insert_one(dic_news)
                log.info(f'{title}===采集成功')
            except Exception as e:
                log.error(f'{title}===入库失败==={e}')
            time.sleep(4)


if __name__ == '__main__':
    # Use a distinct name so the instance does not rebind the class name.
    obs_operate = obsOperate(cursor_, cnx_, log)
    try:
        doJob(obs_operate)
    finally:
        # Run BaseCore's cleanup even when the crawl aborts with an error.
        baseCore.close()
