import json
import time
import uuid

import pymysql
import redis
import requests
from kafka import KafkaProducer

import urllib3
urllib3.disable_warnings()
from obs import ObsClient
import fitz

import sys
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
# Shared project helper object; this script takes its logger and its database
# connection from it.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
# Redis client (db 5). The crawl loop in this file keeps a Redis set of document
# URLs that were already processed, so reruns can skip them.
# NOTE(review): host and password are hard-coded here — consider loading them
# from configuration or environment variables.
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=5)
# Database connection is reused from BaseCore instead of being opened here
# (presumably a pymysql connection — confirm against BaseCore).
cnx = baseCore.cnx_
# Huawei Cloud OBS client used to upload the downloaded documents.
# NOTE(review): the access key / secret key are committed in plain text — rotate
# them and move them out of source control.
obsClient = ObsClient(
    access_key_id='VEHN7D0TJ9316H8AHCAV',  # Huawei Cloud access key (AK)
    secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY',  # Huawei Cloud secret key (SK)
    server='https://obs.cn-north-1.myhuaweicloud.com'  # address of the OBS endpoint hosting the bucket
)

# Key prefix ("folder") under which uploaded objects are stored in the OBS bucket.
pathType = 'CrowDingZhi/'
# Request headers sent with the listing queries to query.sse.com.cn.
# NOTE(review): the Cookie value is a captured browser session and the Host header
# is pinned to query.sse.com.cn, so these headers are only suitable for that host.
headers = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Cookie': 'ba17301551dcbaf9_gdp_user_key=; gdp_user_id=gioenc-9a36dgc8%2C6b5d%2C5265%2Ccdc5%2C2ea193d9g222; ba17301551dcbaf9_gdp_session_id_878c2669-93f0-43bd-91c1-cc30ca7136ef=true; ba17301551dcbaf9_gdp_session_id_194d0e44-fe9b-48e5-b10a-8ed88066d31e=true; ba17301551dcbaf9_gdp_session_id_6b4b8111-8bf8-454e-9095-e16e285874b9=true; ba17301551dcbaf9_gdp_session_id_1bb9733b-f7c9-4f8d-b375-d393646e7329=true; ba17301551dcbaf9_gdp_session_id_7c08264f-759e-4cf8-b60b-ba1894f4a647=true; ba17301551dcbaf9_gdp_session_id_cbae63ce-6754-4b86-80e8-435ec24dde71=true; ba17301551dcbaf9_gdp_session_id_371e25f6-19a8-4e37-b3a9-fafb0236b2ac=true; ba17301551dcbaf9_gdp_session_id_d5257d90-edc8-4bd6-9625-d671f80c853f=true; ba17301551dcbaf9_gdp_session_id_26c35bee-808e-4a4d-a3dd-25ad65896727=true; ba17301551dcbaf9_gdp_session_id=c1b0f1df-857f-413a-b51b-2f7fda8bb882; ba17301551dcbaf9_gdp_session_id_c1b0f1df-857f-413a-b51b-2f7fda8bb882=true; ba17301551dcbaf9_gdp_sequence_ids={%22globalKey%22:220%2C%22VISIT%22:11%2C%22PAGE%22:23%2C%22CUSTOM%22:69%2C%22VIEW_CLICK%22:118%2C%22VIEW_CHANGE%22:3}',
    'Host': 'query.sse.com.cn',
    'Referer': 'http://www.sse.com.cn/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}


def convert_size(size_bytes):
    """Format a byte count as a human-readable string with two decimals.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit (TB) is reached, e.g. ``1536`` -> ``"1.50 KB"``.
    """
    suffixes = ('bytes', 'KB', 'MB', 'GB', 'TB')
    amount = size_bytes
    # Every unit except the last one can still be scaled up to the next.
    for suffix in suffixes[:-1]:
        if not amount >= 1024:
            return f"{amount:.2f} {suffix}"
        amount = amount / 1024
    # Anything left over is expressed in the largest unit, however big.
    return f"{amount:.2f} {suffixes[-1]}"


def getuuid():
    """Return a new version-1 (timestamp/node based) ``uuid.UUID`` object."""
    return uuid.uuid1()


# Store the attachment record and return its primary-key id (the caller forwards it to Kafka).
def tableUpdate(year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status,
                create_by, create_time, come, page_size):
    """Insert one row into ``clb_sys_attachment`` and return its ``id``.

    The positional arguments map one-to-one onto the table columns (``come`` is
    written to the ``source`` column). ``object_key`` is derived from
    ``full_path`` by stripping the bucket URL prefix, and ``bucket_name`` is
    always ``'zzsn'``.

    Returns:
        The ``id`` of the row matching ``type_id`` and ``full_path``, or
        ``None`` when no such row can be read back.

    Raises:
        IndexError: if ``full_path`` does not contain the bucket URL prefix.
        Exception: any database error from the insert is re-raised after the
            transaction has been rolled back.
    """
    bucket_prefix = 'https://zzsn.obs.cn-north-1.myhuaweicloud.com/'
    # Computed before touching the database so a malformed path fails early.
    object_key = full_path.split(bucket_prefix)[1]

    insert_sql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,source,page_size,object_key,bucket_name) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
    values = (
        year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status,
        create_by, create_time, come, page_size, object_key, 'zzsn')
    # Look the row up with the same type_id that was just inserted rather than a
    # hard-coded value, so the function works for every attachment type.
    select_sql = '''select id from clb_sys_attachment where type_id = %s and full_path = %s'''

    with cnx.cursor() as cursor:
        try:
            cursor.execute(insert_sql, values)
            cnx.commit()
        except Exception:
            # The connection is shared; do not leave it inside a failed transaction.
            cnx.rollback()
            raise
        cursor.execute(select_sql, (type_id, full_path))
        row = cursor.fetchone()

    if row is None:
        log.info(f'attachment row not found after insert: {full_path}')
        return None
    return row[0]

def _download_with_retry(pdf_url, attempts=3, timeout=20):
    """Fetch ``pdf_url``; return the successful response, or None after ``attempts`` failures."""
    for attempt in range(attempts):
        try:
            response = requests.get(pdf_url, timeout=timeout)
            # An error page must not be archived as if it were the document.
            response.raise_for_status()
            return response
        except requests.RequestException as e:
            log.info(f'download failed ({attempt + 1}/{attempts}) {pdf_url}---{e}')
            if attempt < attempts - 1:
                time.sleep(3)
    return None


def _extract_pdf_text(data):
    """Return ``(page_count, text)`` for the PDF held in the bytes ``data``."""
    with fitz.open(stream=data, filetype='pdf') as doc:
        return doc.page_count, ''.join(page.get_text() for page in doc.pages())


def uptoOBS(pdf_url, name_pdf, type_id, pathType, category):
    """Download a document, upload it to the ``zzsn`` OBS bucket and describe the result.

    Args:
        pdf_url: URL of the document to download.
        name_pdf: document title; accepted for interface compatibility, not used.
        type_id: attachment type id, copied into the result.
        pathType: key prefix inside the bucket; the object key is
            ``pathType + <uuid>.<category>``.
        category: file extension; ``'pdf'`` additionally triggers text extraction.

    Returns:
        A dict with the keys ``state``, ``type_id``, ``group_name``, ``path``,
        ``full_path``, ``category``, ``file_size``, ``status``, ``create_by``,
        ``create_time``, ``page_size`` and ``content``. ``state`` is ``True``
        only when download, parsing (for PDFs) and upload all succeeded;
        otherwise the remaining fields keep their empty defaults.
    """
    retData = {'state': False, 'type_id': type_id, 'group_name': '', 'path': '',
               'full_path': '',
               'category': category, 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
               'create_time': '', 'page_size': '', 'content': ''}

    response = _download_with_retry(pdf_url)
    if response is None:
        return retData
    payload = response.content

    # Content-Length may be absent (e.g. chunked transfer); fall back to the body size.
    try:
        file_size = int(response.headers['Content-Length'])
    except (KeyError, TypeError, ValueError):
        file_size = len(payload)

    # Parse before uploading so an unreadable PDF does not leave an orphan object
    # in the bucket; parsing is deterministic, so it is not retried.
    if category == 'pdf':
        try:
            page_size, content = _extract_pdf_text(payload)
        except Exception as e:
            log.info(f'error---{e}')
            return retData
    else:
        page_size, content = 0, ''

    object_url = None
    for attempt in range(3):
        try:
            name = str(getuuid()) + '.' + category
            result = obsClient.putContent('zzsn', pathType + name, content=payload)
            object_url = result['body']['objectUrl']
            break
        except Exception as e:
            log.info(f'error---{e}')
            if attempt < 2:
                time.sleep(3)
    if object_url is None:
        return retData

    try:
        retData['path'] = object_url.split('.com')[1]
    except IndexError as e:
        log.info(f'error---{e}')
        return retData
    retData['full_path'] = object_url
    retData['file_size'] = convert_size(file_size)
    retData['create_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    retData['page_size'] = page_size
    retData['content'] = content
    # Flag success last, once every field has been filled in.
    retData['state'] = True
    return retData

def _listing_url(page_no, stamp):
    """Build the SSE listing query URL for one result page (25 entries per page)."""
    return f'http://query.sse.com.cn/commonSoaQuery.do?&isPagination=true&pageHelp.pageSize=25&pageHelp.pageNo={page_no}&pageHelp.beginPage={page_no}&pageHelp.cacheSize=1&pageHelp.endPage={page_no}&sqlId=BS_KCB_GGLL&siteId=28&channelId=10007%2C10008%2C10009%2C10010&type=&stockcode=&extTeacher=&extWTFL=&createTime=&createTimeEnd=&order=createTime%7Cdesc%2Cstockcode%7Casc&_={stamp}'


def main():
    """Crawl every page of the SSE listing and archive each new document.

    For each entry not yet recorded in the Redis set ``IN-20231227-0001`` the
    document is uploaded to OBS, registered in the attachment table and
    published to the ``crawlerInfo`` Kafka topic. The URL is added to the Redis
    set only after Kafka acknowledged the message, so failed items are retried
    on the next run.
    """
    info_code = "IN-20231227-0001"
    sid = '1739914218978594817'
    com = '上海证券交易所'
    num = 0  # running counter, stored as order_by of the attachment row
    producer = None  # created lazily and shared by all documents
    t = int(time.time() * 1000)

    first_page = requests.get(url=_listing_url(1, t), headers=headers, timeout=30).json()
    pageCount = int(first_page['pageHelp']['pageCount'])
    log.info(f'pageCount: {pageCount}')

    try:
        for i in range(1, pageCount + 1):
            req = requests.get(url=_listing_url(i, t), headers=headers, timeout=30)
            data_list = req.json()['result']
            for info in data_list:
                publishDate = info['cmsOpDate']  # handling date
                year = publishDate[:4]
                docTitle = info['docTitle']  # handling reason, used as the title
                docType = info['docType']  # document type / file extension
                # e.g. http://www.sse.com.cn/disclosure/credibility/supervision/measures/focus/c/f409d7c0-2726-47d1-ac5e-120a9cdb0727.pdf
                docURL = "http://" + info['docURL']
                if r.sismember(info_code, docURL):
                    log.info('信息已采集入库过')
                    continue

                # Upload to OBS; an empty full_path is also treated as a failure.
                retData = uptoOBS(docURL, docTitle, 15, pathType, docType)
                if not (retData['state'] and retData['full_path']):
                    log.info('====pdf解析失败====')
                    continue

                num += 1
                create_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                # Register the file in the attachment table.
                att_id = tableUpdate(year, docTitle + '.' + docType, 15, '', '', retData['path'],
                                     retData['full_path'], docType, retData['file_size'], num, 1,
                                     retData['create_by'], create_time, com, retData['page_size'])
                if not att_id:
                    continue

                dic_news = {
                    'attachmentIds': str(att_id),
                    'content': retData['content'],
                    'contentWithTag': '',
                    'id': '',
                    'origin': com,
                    'publishDate': publishDate,
                    'sid': sid,
                    'sourceAddress': docURL,
                    'title': docTitle,
                    'source': '16',
                    'type': ''
                }

                # Publish the record through Kafka.
                try:
                    if producer is None:
                        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
                    kafka_result = producer.send("crawlerInfo",
                                                 json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
                    log.info(kafka_result.get(timeout=10))
                except Exception as e:
                    log.info(e)
                    log.info(f'传输失败：{dic_news["title"]}、{dic_news["publishDate"]}')
                    # Not marked as done, so the next run tries this document again.
                    continue

                dic_result = {
                    'success': 'ture',
                    'message': '操作成功',
                    'code': '200',
                }
                log.info(dic_result)
                r.sadd(info_code, docURL)
    finally:
        if producer is not None:
            producer.close()


if __name__ == "__main__":
    main()
