import datetime
import os
import time
import uuid
from urllib.parse import unquote, urljoin

import pymongo
import requests
from bs4 import BeautifulSoup
from fitz import fitz
from obs import ObsClient
from retry import retry

from base import BaseCore
from requests.packages.urllib3 import disable_warnings

# Silence urllib3 warnings; requests in this file are sent with verify=False.
disable_warnings()
# MongoDB collection that stores one document per collected announcement.
# NOTE(review): host, user and password are hard-coded here -- consider moving
# them to configuration / environment variables and rotating the secrets.
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').RESCenter[
    'REITsFundAnncmnt']
# Huawei Cloud OBS client used to upload the downloaded attachments.
# NOTE(review): the access key pair below is committed in source -- same concern.
obsClient = ObsClient(
    access_key_id='VEHN7D0TJ9316H8AHCAV',  # Huawei Cloud access key (AK)
    secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY',  # Huawei Cloud secret key (SK)
    server='https://obs.cn-north-1.myhuaweicloud.com'  # OBS endpoint of the bucket
)
# Project helper object; it exposes two cursor/connection pairs and a logger.
baseCore = BaseCore.BaseCore()
cursor_ = baseCore.cursor_
cnx_ = baseCore.cnx_
cursor = baseCore.cursor
cnx = baseCore.cnx
log = baseCore.getLogger()


class obsOperate():
    """Download attachments, upload them to OBS and record them in the attachment table.

    The instance keeps the database cursor/connection and the logger it is
    given; the OBS client itself is the module-level ``obsClient``.
    """

    # (connect, read) timeout in seconds for attachment downloads, so a stalled
    # server cannot block the crawl forever.
    request_timeout = (10, 120)

    # A PDF whose extracted text contains this marker is treated as having no
    # usable content: it is neither uploaded nor recorded.
    placeholder_text = 'If this message is not eventually replaced by the proper contents of the document, your PDF'

    def __init__(self, cursor_, cnx_, log):
        """Store the DB cursor, DB connection and logger used by the other methods."""
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
        }
        self.cursor_ = cursor_
        self.cnx_ = cnx_
        self.log = log

    def secrchATT(self, item_id, file_name, type_id, order_by):
        """Return the first ``(id,)`` row of ``clb_sys_attachment`` matching the keys, or None."""
        sel_sql = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s and order_by=%s '''
        self.cursor_.execute(sel_sql, (item_id, file_name, type_id, order_by))
        return self.cursor_.fetchone()

    def tableUpdate(self, retData, com_name, file_name, num, pub_time):
        """Insert one attachment row and return ``(attachment_id, full_path)``.

        Args:
            retData: dict as produced by ``uptoOBS``.
            com_name: unused; kept for interface compatibility.
            file_name: value stored in the ``name`` column.
            num: value stored in the ``order_by`` column.
            pub_time: value stored in the ``publish_time`` column.

        Raises:
            LookupError: if the row cannot be read back after the insert.
        """
        item_id = retData['item_id']
        type_id = retData['type_id']
        path = retData['path']
        full_path = retData['full_path']
        order_by = num

        Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,object_key,bucket_name,publish_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''

        # ``path`` is stored twice: once as path and once as object_key.
        values = (
            file_name, type_id, item_id, retData['group_name'], path, full_path,
            retData['category'], retData['file_size'], order_by,
            retData['status'], retData['create_by'],
            retData['create_time'], path, 'zzsn', pub_time)

        self.cursor_.execute(Upsql, values)
        self.cnx_.commit()
        self.log.info("更新完成:{}".format(Upsql))

        # Read the generated id back by its natural key.
        selects = self.secrchATT(item_id, file_name, type_id, order_by)
        if selects is None:
            raise LookupError(
                f'attachment row not found after insert: item_id={item_id}, name={file_name}, order_by={order_by}')
        return selects[0], full_path

    def getuuid(self):
        """Return a new time-based UUID (``uuid.uuid1``)."""
        return uuid.uuid1()

    def convert_size(self, size_bytes):
        """Format a byte count as a string such as ``'1.50 KB'`` (units up to TB)."""
        units = ['bytes', 'KB', 'MB', 'GB', 'TB']
        i = 0
        while size_bytes >= 1024 and i < len(units) - 1:
            size_bytes /= 1024
            i += 1
        return f"{size_bytes:.2f} {units[i]}"

    @retry(tries=5, delay=10)
    def getRes(self, file_href):
        """GET ``file_href`` and return the response.

        Raises:
            requests.HTTPError: when the status code is not 200. Any exception
                makes the ``retry`` decorator try again (5 tries, 10 s apart).
        """
        response = requests.get(file_href, headers=self.headers, timeout=self.request_timeout)
        if response.status_code != 200:
            # An explicit exception replaces the former bare ``raise``, which
            # only worked by accident (RuntimeError: no active exception).
            raise requests.HTTPError(
                f'unexpected status {response.status_code} for {file_href}', response=response)
        return response

    @retry(tries=5, delay=10)
    def sendOBS(self, file_name, response):
        """Upload the response body to bucket ``zzsn`` under ``PolicyDocument/`` and return the OBS result."""
        return obsClient.putContent('zzsn', 'PolicyDocument/' + file_name, content=response.content)

    def uptoOBS(self, file_href, item_id, file_name):
        """Download a PDF, extract its text and upload it to OBS.

        Args:
            file_href: URL of the attachment.
            item_id: value copied into the result's ``item_id``.
            file_name: unused for storage; the object is stored under a
                generated UUID name.

        Returns:
            A dict whose ``state`` is True when the file was uploaded and its
            ``path`` / ``full_path`` / ``file_size`` / ``create_time`` are
            filled, or when the PDF only contains the placeholder text (in that
            case nothing is uploaded and ``content`` holds the placeholder).
            ``state`` is False on any failure.
        """
        # NOTE(review): splitext runs on the whole URL, so a query string would
        # end up inside the extension -- confirm whether that is intended.
        category = os.path.splitext(file_href)[1]
        retData = {'state': False, 'type_id': 15, 'item_id': item_id, 'group_name': '', 'path': '',
                   'full_path': '',
                   'category': category, 'file_size': '', 'status': 1, 'create_by': 'LiuLiYuan',
                   'create_time': '', 'page_size': '', 'content': ''}
        try:
            response = self.getRes(file_href)
        except Exception:
            self.log.error('文件获取失败')
            return retData

        try:
            with fitz.open(stream=response.content, filetype='pdf') as doc:
                retData['content'] = ''.join(page.get_text() for page in doc.pages())
        except Exception:
            self.log.error(f'文件解析失败')
            return retData

        if self.placeholder_text in retData['content']:
            retData['state'] = True
            return retData

        # Content-Length is optional (e.g. chunked responses); fall back to the
        # size of the downloaded body instead of failing on int(None).
        content_length = response.headers.get('Content-Length')
        if content_length is not None and content_length.isdigit():
            file_size = int(content_length)
        else:
            file_size = len(response.content)

        file_name = str(self.getuuid()) + category
        try:
            result = self.sendOBS(file_name, response)
        except Exception:
            self.log.error(f'obs上传失败')
            return retData

        try:
            object_url = result['body']['objectUrl']
            path = object_url.split('.com')[1]
            full_path = unquote(object_url)
        except Exception as e:
            # The upload result did not carry a usable object URL; report a
            # failure rather than a success with empty paths.
            self.log.error(f'error:{e}')
            return retData

        retData['path'] = path
        retData['full_path'] = full_path
        retData['file_size'] = self.convert_size(file_size)
        retData['create_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        # Flag success only once every field has been filled.
        retData['state'] = True
        self.log.info(f'{file_name}===obs上传成功')
        return retData


def paserUrl(html, listurl):
    """Rewrite relative links of every ``<a>`` and ``<img>`` tag to absolute URLs.

    ``html`` may be markup text (parsed here with ``html.parser``) or an
    already parsed document. For each tag the ``href`` attribute is resolved
    against ``listurl``; only when a tag has no ``href`` is its ``src``
    resolved instead. The (possibly newly parsed) document is returned.
    """
    soup = BeautifulSoup(html, 'html.parser') if isinstance(html, str) else html

    for tag in soup.find_all(['a', 'img']):
        # href takes precedence; at most one attribute is rewritten per tag.
        for attr in ('href', 'src'):
            if attr in tag.attrs:
                tag[attr] = urljoin(listurl, tag[attr])
                break
    return soup


def getCodeList():
    """Fetch the SGX securities list and return the REIT entries.

    Returns:
        A list of ``[trading_code, trading_name]`` pairs, taken from the
        ``nc`` and ``n`` fields of every price record whose ``type`` is
        ``'reits'``.

    Raises:
        requests.RequestException: on network failure or timeout.
        KeyError / ValueError: when the payload does not have the expected shape.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
    }
    url = 'https://api.sgx.com/securities/v1.1?params=nc%2Cn%2Ctype%2Cls%2Cm%2Csc%2Cbl%2Csip%2Cex%2Cej%2Cclo%2Ccr%2Ccur%2Cel%2Cr%2Ci%2Ccc%2Cig%2Clf'
    # The context manager closes the response even if parsing raises; the
    # timeout keeps a stalled server from hanging the job.
    with requests.get(url, headers=headers, verify=False, timeout=60) as req:
        req.encoding = req.apparent_encoding
        prices = req.json()['data']['prices']
    return [[item['nc'], item['n']] for item in prices if item['type'] == 'reits']


def getData(code, name, obsOperate, data):
    """Collect one announcement: download its attachments and build the record to store.

    Args:
        code: trading code; also passed to ``uptoOBS`` as the item id.
        name: short name stored in the record.
        obsOperate: object providing ``uptoOBS`` and ``tableUpdate``.
        data: announcement entry; its ``url``, ``title`` and
            ``submission_date`` (``YYYYMMDD``) keys are read.

    Returns:
        The record dict for the announcement, with an extra ``pathList`` key
        when more than one attachment was stored, or ``{}`` when no attachment
        was stored (including pages without an attachment list).
    """
    placeholder = 'If this message is not eventually replaced by the proper contents of the document, your PDF'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
    }
    url = data['url']
    title = data['title'].split('::')[0].strip()
    date = datetime.datetime.strptime(data['submission_date'], '%Y%m%d')
    str_date = str(date)[:10]

    req = requests.get(url, headers=headers, verify=False, timeout=60)
    try:
        req.encoding = req.apparent_encoding
        soup = paserUrl(BeautifulSoup(req.text, 'html.parser'), url)
    finally:
        # Release the connection even when decoding or parsing fails.
        req.close()

    # Either container may be absent; treat that as "nothing to collect"
    # instead of failing with AttributeError on None.
    contentWithTag = soup.find('div', class_='announcement')
    attachment_list = None
    if contentWithTag is not None:
        attachment_list = contentWithTag.find('dl', class_='announcement-attachment-list')
    if attachment_list is None:
        log.info(f'{title}===公告附件列表未找到')
        return {}

    path_list = []
    content = []
    file_title = title + '.pdf'
    num = 1  # order_by value of the next stored attachment
    for dd in attachment_list.find_all('dd'):
        anchor = dd.find('a')
        href = anchor.get('href') if anchor is not None else None
        if not href:
            continue
        retData = obsOperate.uptoOBS(href, code, file_title)
        time.sleep(2)
        if not retData['state']:
            log.error(f'{title}===公告下载obs失败')
            continue
        # uptoOBS reports placeholder-only PDFs as successful; they are skipped here.
        if placeholder in retData['content']:
            continue
        _, full_path = obsOperate.tableUpdate(retData, 'RETIs文件', file_title, num, str_date)
        path_list.append(full_path)
        content.append(retData['content'])
        num += 1

    if not path_list:
        return {}

    dic_news = {
        'code': code,  # trading code
        'name': name,  # short name
        'title': title,  # announcement title
        'path': path_list[0],  # OBS path of the first attachment
        'href': url,  # link to the original page
        'content': content[0],  # text extracted from the first PDF
        'date': date,  # submission date
        'strDate': str_date,  # submission date as a string
        'exchange': '新加坡交易所',  # exchange
    }
    if len(path_list) > 1:
        dic_news['pathList'] = path_list  # all attachment paths
    return dic_news


def getDataJson(code):
    """Fetch the announcement list for one security code from the SGX API.

    The query covers 2019-12-04 up to today, category ``ANNC``, first page of
    at most 250 entries.

    Args:
        code: security code placed in the ``value`` query parameter.

    Returns:
        The ``data`` member of the JSON response, or an empty list when that
        member is null/empty.

    Raises:
        requests.RequestException: on network failure or timeout.
        KeyError / ValueError: when the payload does not have the expected shape.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
        'Authorizationtoken': '3ZvSQB8eN/rzx9RVgRBlUMr1Q8vf/mzVVGAAJEo67btKmiM25Tl8WAs5w3dYfM+5mQkSGylxOTCevcn7LwxJSjs5nFi4Pm6KHZjZ1XNgvJPOJ4XF9xO3NmXFvf395xd1',
    }
    now = datetime.datetime.now().strftime('%Y%m%d')
    url = f'https://api.sgx.com/announcements/v1.1/securitycode?periodstart=20191204_000000&periodend={now}_235959&cat=ANNC&value={code}&exactsearch=true&pagestart=0&pagesize=250'
    # The context manager closes the response even if parsing raises; the
    # timeout keeps a stalled server from hanging the job.
    with requests.get(url, headers=headers, verify=False, timeout=60) as req:
        req.encoding = req.apparent_encoding
        data_json = req.json()['data']
    # The caller iterates the result directly, so normalise a null payload.
    return data_json or []


def doJob(obsOperate):
    """Crawl every REIT code and store announcements that are not yet in MongoDB.

    For each code the announcement list is fetched; entries already present in
    ``db_storage`` (same code, url and exchange) are skipped, the others are
    collected with ``getData`` and inserted. Failures are logged per code or
    per announcement so that one bad item does not stop the run.

    Args:
        obsOperate: object handed through to ``getData``.
    """
    try:
        code_list = getCodeList()
    except Exception:
        log.error(f'代码列表获取失败')
        return
    for code, name in code_list:
        log.info(f'{code}===开始采集')
        try:
            data_json = getDataJson(code)
        except Exception:
            log.error(f'{code}===信息列表获取失败')
            continue
        for data_ in data_json:
            is_insert = db_storage.find_one({'code': code, 'href': data_['url'], 'exchange': '新加坡交易所'})
            if is_insert:
                log.info(f'{code}==={data_["url"]}===已采集')
                time.sleep(3)
                continue
            try:
                dic_info = getData(code, name, obsOperate, data_)
            except Exception as e:
                # One broken announcement must not abort the whole crawl; it is
                # not inserted, so a later run will pick it up again.
                log.error(f'{code}==={data_["url"]}===采集失败: {e}')
                dic_info = None
            if dic_info:
                db_storage.insert_one(dic_info)
                log.info(f'{code}==={data_["url"]}===采集成功')
            # Throttle between announcements.
            time.sleep(3)


if __name__ == '__main__':
    # A separate name keeps the class ``obsOperate`` from being rebound to an instance.
    operator = obsOperate(cursor_, cnx_, log)
    try:
        doJob(operator)
    finally:
        # Always call baseCore.close(), even when the job raises.
        baseCore.close()
