import os
import re
import time
import uuid
from urllib.parse import urljoin, unquote

from fitz import fitz
from obs import ObsClient
from retry import retry

import BaseCore
import requests
from bs4 import BeautifulSoup
from requests.packages.urllib3 import disable_warnings

# Silence urllib3 warnings; the page requests in this file are sent with verify=False.
disable_warnings()

# Shared project helper: provides the logger, the DB cursor/connection and (further down) Kafka/Redis access.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
# NOTE(review): the OBS access key and secret key are hard-coded in source below.
# Consider loading them from configuration/environment and rotating these values — confirm with the owner.
obsClient = ObsClient(
    access_key_id='VEHN7D0TJ9316H8AHCAV',  # your Huawei Cloud access key (AK)
    secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY',  # your Huawei Cloud secret key (SK)
    server='https://obs.cn-north-1.myhuaweicloud.com'  # address of your bucket
)
# Module-level aliases of the DB cursor and connection owned by baseCore.
cursor_ = baseCore.cursor_
cnx_ = baseCore.cnx_
# English month name -> two-digit month number ('January' -> '01', ..., 'December' -> '12').
enMonth = {
    month_name: f'{month_no:02d}'
    for month_no, month_name in enumerate(
        (
            'January', 'February', 'March', 'April', 'May', 'June',
            'July', 'August', 'September', 'October', 'November', 'December',
        ),
        start=1,
    )
}

# Default request headers used by the module-level page/image fetchers.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
    ),
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/webp,image/apng,*/*;q=0.8,'
        'application/signed-exchange;v=b3;q=0.7'
    ),
}

# Site label; used as the suffix of the Redis set key that records collected URLs.
webname = 'Nareit官网'


class obsOperate():
    """Download attachment files, upload them to OBS and record them in the attachment table.

    The instance holds the HTTP headers used for file downloads, the database
    cursor/connection used for ``clb_sys_attachment`` and a logger.
    """

    def __init__(self, cursor_, cnx_, log):
        """Store the DB cursor, DB connection and logger and build the download headers."""
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
            'Cache-Control': 'no-cache',
            'Pragma': 'no-cache',
            'Referer': 'https://www.reit.com/',
            'Sec-Ch-Ua': '"Microsoft Edge";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
            'Sec-Ch-Ua-Mobile': '?0',
            'Sec-Ch-Ua-Platform': '"Windows"',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'cross-site',
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
        }
        self.cursor_ = cursor_
        self.cnx_ = cnx_
        self.log = log

    def secrchATT(self, full_path):
        """Look up an attachment by its full path.

        Returns whatever ``cursor.fetchone()`` yields for
        ``select id from clb_sys_attachment where full_path=...``.
        """
        sel_sql = '''select id from clb_sys_attachment where full_path=%s '''
        # The parameters must be a real one-element tuple; ``(full_path)`` is just a string.
        self.cursor_.execute(sel_sql, (full_path,))
        return self.cursor_.fetchone()

    # Insert into the attachment table and return the attachment id
    def tableUpdate(self, retData, com_name, file_name, num, pub_time):
        """Insert one attachment row and return ``(attachment_id, full_path)``.

        Args:
            retData: dict produced by :meth:`uptoOBS`; the keys read here are
                item_id, type_id, group_name, path, full_path, category,
                file_size, status, create_by and create_time.
            com_name: accepted for call compatibility; not used by this method.
            file_name: stored in the ``name`` column.
            num: stored in the ``order_by`` column.
            pub_time: stored in the ``publish_time`` column.
        """
        full_path = retData['full_path']
        path = retData['path']
        Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,object_key,bucket_name,publish_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''

        values = (
            file_name, retData['type_id'], retData['item_id'], retData['group_name'], path, full_path,
            retData['category'], retData['file_size'], num,
            retData['status'], retData['create_by'],
            retData['create_time'], path, 'zzsn', pub_time)

        self.cursor_.execute(Upsql, values)  # insert
        self.cnx_.commit()  # commit
        self.log.info("更新完成:{}".format(Upsql))
        # Read the row back by full_path to obtain its id.
        selects = self.secrchATT(full_path)
        att_id = selects[0]
        return att_id, full_path

    def getuuid(self):
        """Return a new ``uuid.uuid1()`` value (used to build unique object names)."""
        return uuid.uuid1()

    # Format a file size
    def convert_size(self, size_bytes):
        """Format a byte count as a string with two decimals and a unit (bytes..TB)."""
        units = ['bytes', 'KB', 'MB', 'GB', 'TB']
        i = 0
        # Divide by 1024 until the value fits the unit or the largest unit is reached.
        while size_bytes >= 1024 and i < len(units) - 1:
            size_bytes /= 1024
            i += 1
        return f"{size_bytes:.2f} {units[i]}"

    @retry(tries=5, delay=10)
    def getRes(self, file_href):
        """GET ``file_href`` with the instance headers and return the response.

        Raises:
            RuntimeError: the status code is not 200 (the ``retry`` decorator
                then retries the request).
        """
        # A timeout keeps a stalled connection from blocking forever, so the retry can kick in.
        response = requests.get(file_href, headers=self.headers, timeout=(10, 120))
        if response.status_code != 200:
            # A bare ``raise`` with no active exception only works by accident
            # (it raises RuntimeError without a message); raise it explicitly.
            raise RuntimeError(f'unexpected HTTP status {response.status_code} for {file_href}')
        return response

    @retry(tries=5, delay=10)
    def sendOBS(self, file_name, response):
        """Upload ``response.content`` to bucket ``zzsn`` under ``PolicyDocument/<file_name>``."""
        result = obsClient.putContent('zzsn', 'PolicyDocument/' + file_name, content=response.content)
        return result

    def uptoOBS(self, file_href, item_id, file_title, publishDate):
        """Download a file, extract its text as PDF and upload it to OBS.

        Args:
            file_href: URL of the file to download.
            item_id: stored unchanged in the returned dict.
            file_title: accepted for call compatibility; not used here.
            publishDate: accepted for call compatibility; not used here.

        Returns:
            dict with the attachment metadata. ``state`` is True only when the
            download, the PDF text extraction and the upload all succeeded and
            the object URL could be read from the upload result. ``content``
            holds the text extracted so far.
        """
        # Take the extension from the URL without its query string / fragment,
        # so "a.pdf?x=1" yields ".pdf" rather than ".pdf?x=1".
        clean_href = file_href.split('?', 1)[0].split('#', 1)[0]
        category = os.path.splitext(clean_href)[1]
        retData = {'state': False, 'type_id': 15, 'item_id': item_id, 'group_name': '', 'path': '',
                   'full_path': '',
                   'category': category, 'file_size': '', 'status': 1, 'create_by': 'LiuLiYuan',
                   'create_time': '', 'page_size': '', 'content': ''}
        try:
            response = self.getRes(file_href)
        except Exception:
            self.log.error('文件获取失败')
            return retData

        try:
            with fitz.open(stream=response.content, filetype='pdf') as doc:
                for page in doc.pages():
                    retData['content'] += page.get_text()
        except Exception:
            self.log.error(f'文件解析失败')
            return retData

        # Measure the downloaded payload itself: the Content-Length header may
        # be absent (e.g. chunked responses), which used to crash on int(None).
        file_size = len(response.content)
        file_name = str(self.getuuid()) + category
        try:
            result = self.sendOBS(file_name, response)
        except Exception:
            self.log.error(f'obs上传失败')
            return retData

        try:
            object_url = result['body']['objectUrl']
            retData['path'] = object_url.split('.com')[1]
            retData['full_path'] = unquote(object_url)
        except Exception as e:
            self.log.error(f'error:{e}')
            return retData
        retData['file_size'] = self.convert_size(file_size)
        retData['create_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        retData['state'] = True
        return retData


def paserUrl(html, listurl):
    """Rewrite relative link targets in *html* to absolute URLs based on *listurl*.

    *html* may be markup text (parsed with ``html.parser`` first) or an
    already parsed document. Every ``<a>`` and ``<img>`` tag is visited; a tag
    with an ``href`` gets that attribute resolved, otherwise its ``src`` is
    resolved when present. The (possibly newly parsed) document is returned.
    """
    document = BeautifulSoup(html, 'html.parser') if isinstance(html, str) else html

    for tag in document.find_all(['a', 'img']):
        # href takes precedence; only one attribute per tag is rewritten.
        for attr_name in ('href', 'src'):
            if attr_name in tag.attrs:
                tag[attr_name] = urljoin(listurl, tag[attr_name])
                break
    return document


@retry(tries=5, delay=10)
def getSoup(url, timeout=30):
    """Fetch *url* and return it parsed with ``html.parser``.

    Args:
        url: page address to request (certificate verification is disabled).
        timeout: seconds passed to ``requests.get``; without it a stalled
            connection would block forever and never reach the retry.

    Returns:
        BeautifulSoup document built from the response text, decoded with the
        encoding detected from the body.
    """
    # The context manager releases the connection even if decoding/parsing raises.
    with requests.get(url, headers=headers, verify=False, timeout=timeout) as req:
        req.encoding = req.apparent_encoding
        return BeautifulSoup(req.text, 'html.parser')


@retry(tries=5, delay=10)
def getImg(url, timeout=30):
    """Download *url* and return the raw response body as bytes.

    Args:
        url: image address to request (certificate verification is disabled).
        timeout: seconds passed to ``requests.get`` so a stalled download
            fails and is retried instead of hanging.
    """
    # Close the response once the body has been read so the connection is released.
    with requests.get(url, headers=headers, verify=False, timeout=timeout) as req:
        return req.content


def getContentA(url):
    """Download every image of the article at *url* into ``./img``.

    The page is fetched, its links are made absolute, and the
    ``div.node__content`` element is located. ``<script>`` and ``<style>``
    tags inside it are removed, then each ``<img>`` is saved as
    ``./img/<alt>.jpg`` (``/`` in the name replaced by ``-``), pausing three
    seconds between downloads. Nothing is returned.
    """
    soup = paserUrl(getSoup(url), url)
    contentWithTag = soup.find('div', class_='node__content')
    if contentWithTag is None:
        # Previously this was hidden by bare excepts and then crashed on find_all.
        log.error(f'{url}===正文解析失败')
        return

    for script in contentWithTag.find_all('script'):
        script.decompose()
    for style in contentWithTag.find_all('style'):
        style.decompose()

    # The target directory may not exist on a fresh checkout.
    os.makedirs('./img', exist_ok=True)
    for img in contentWithTag.find_all('img'):
        src = img.get('src')
        if not src:
            # Nothing to download for an <img> without a source.
            continue
        # Fall back to the file name in the URL (or a random id) when alt is missing/empty.
        stem = (
            img.get('alt')
            or os.path.splitext(os.path.basename(src.split('?', 1)[0]))[0]
            or uuid.uuid4().hex
        )
        img_title = (stem + '.jpg').replace('/', '-')
        content = getImg(src)
        with open(f'./img/{img_title}', 'wb') as f:
            f.write(content)
        time.sleep(3)


def getContentB(url):
    """Placeholder handler for links that contain '.pdf'; does nothing and returns None."""
    return None


@retry(tries=5, delay=10)
def getList(timeout=30):
    """Return the ``<li>`` entries of the Nareit research listing page.

    Args:
        timeout: seconds passed to ``requests.get`` so a stalled connection
            fails and is retried instead of hanging.

    Returns:
        The ``<li>`` tags found inside ``div.paragraph--text-block__inner``.

    Raises:
        AttributeError: the expected container is not in the page (the
            ``retry`` decorator then retries the whole fetch).
    """
    url = 'https://www.reit.com/data-research/research/nareit-research'
    # The context manager releases the connection even when parsing fails below.
    with requests.get(url, headers=headers, verify=False, timeout=timeout) as req:
        req.encoding = req.apparent_encoding
        soup = BeautifulSoup(req.text, 'html.parser')
    return soup.find('div', class_='paragraph--text-block__inner').find_all('li')


def doJob():
    """Walk the Nareit research list and hand each entry to its content handler.

    Links containing '.pdf' go to ``getContentB``; all others go to
    ``getContentA``. The loop sleeps ten seconds after each processed entry.
    """
    for li in getList():
        anchor = li.find('a')
        link = anchor.get('href') if anchor is not None else None
        if not link:
            # A list item without a usable link cannot be collected; skip it instead of crashing.
            continue
        # urljoin handles root-relative, relative and already-absolute hrefs;
        # plain concatenation only worked for hrefs starting with '/'.
        href = urljoin('https://www.reit.com', link)
        if '.pdf' in href:
            getContentB(href)
        else:
            getContentA(href)
        time.sleep(10)


class Third_party():
    """Collector for the Nareit third-party research listing page."""

    def __init__(self):
        pass

    def doJob(self,obsOperate):
        """Collect one third-party research entry and publish it to Kafka.

        The listing page is fetched, one entry is parsed (title, link,
        publish date, summary), every link in the entry's detail page is
        uploaded through ``obsOperate.uptoOBS`` and recorded through
        ``obsOperate.tableUpdate``, and the assembled record is sent to the
        ``research_center_fourth`` topic. On success the detail URL is added
        to the Redis set ``REITs::<webname>``.

        Args:
            obsOperate: object providing ``uptoOBS`` and ``tableUpdate``
                (an ``obsOperate`` instance in this file).

        Returns:
            None. Parsing problems are logged and end the job early.
        """
        id_list = []
        origin = 'Nareit官网'
        url = 'https://www.reit.com/data-research/research/third-party-research'
        soup = getSoup(url)
        list_block = soup.find('div', class_='field--name-field-text')
        li_list = list_block.find_all('li') if list_block is not None else []
        num = 2
        # NOTE(review): only the second list item is processed; the loop over
        # all items appears to have been disabled — confirm whether this is intended.
        if len(li_list) < 2:
            log.error(f'{url}===列表解析失败')
            return
        li = li_list[1]
        anchor = li.find('a')
        title = anchor.text.strip() if anchor is not None else ''
        link = anchor.get('href') if anchor is not None else None
        if not title or not link:
            log.error(f'{url}===列表解析失败')
            return
        # Absolute hrefs pass through unchanged; relative ones become fetchable.
        href = urljoin(url, link)

        # The text in front of the title is expected to look like "<Month> <Year>".
        date = li.text.split(title)[0]
        years = re.findall(r'\d+', date)
        month_name = date.split(years[0])[0].strip() if years else ''
        if month_name not in enMonth:
            log.error(f'{title}===日期解析失败')
            return
        publishDate = years[0] + '-' + enMonth[month_name] + '-' + '01'
        summary = li.text.split(title)[1].strip()

        soup_ = paserUrl(getSoup(href), href)
        contentWithTag = soup_.find('div',class_='content')
        if contentWithTag is None:
            log.error(f'{title}===正文解析失败')
            return

        # Text of the most recently processed attachment; used for language detection.
        attachment_text = ''
        for a in contentWithTag.find_all('a'):
            fj_href = a.get('href')
            if not fj_href:
                # An anchor without a target cannot be downloaded.
                continue
            file_title = a.get('title')
            retData = obsOperate.uptoOBS(fj_href, '', file_title, publishDate)
            time.sleep(2)
            attachment_text = retData['content']
            if not retData['state']:
                log.error(f'{title}===研报下载obs失败')
                continue
            att_id, full_path = obsOperate.tableUpdate(retData, 'RETIs文件', file_title, num, publishDate)
            num += 1
            id_list.append(att_id)

        content = contentWithTag.text.strip()
        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        news_id = '1729021644139057153' + str(int(time.time()))
        # Fall back to the page text when no attachment text is available;
        # previously a page without links raised on the unbound retData.
        lang = baseCore.detect_language(attachment_text or content)
        dic_news = {
            'id': news_id,
            'subjectId': '1729021644139057153',
            'checkStatus': 1,
            'deleteFlag': 0,
            'topNum': 0,
            'content':content,
            'contentWithTag': str(contentWithTag),
            'createDate': time_now,
            'lang': lang,
            'origin': origin,
            'publishDate': publishDate,
            'sourceAddress': href,
            'title': title,
            'summary': summary,
            'attachmentIds': id_list,
            'sid': '1730477904990486529',
        }
        try:
            baseCore.sendkafka(dic_news, 'research_center_fourth')
            baseCore.r.sadd('REITs::' + webname, href)
            log.info(f'{title}===采集成功')
        except Exception as e:
            log.error(f'{title}===发送kafka失败==={e}')

if __name__ == '__main__':
    # Use a distinct name so the instance does not rebind (and hide) the obsOperate class.
    obs_operate = obsOperate(cursor_, cnx_, log)
    try:
        Third_party().doJob(obs_operate)
    finally:
        # Always release baseCore's resources, even when the job raises.
        baseCore.close()