import re

import fitz
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import numpy as np
from base import BaseCore
from requests.models import Response
# Shared project helper object: this script uses it to obtain the logger
# (getLogger), a proxy for each HTTP request (get_proxy), and calls close()
# on it when the script finishes.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()

# Browser-like request headers passed to every requests.get() call below.
# The Referer points at the site's own research-report listing.
# NOTE(review): 'br' is advertised in Accept-Encoding; requests only decodes
# brotli bodies when a brotli decoder package is installed — confirm that it
# is available in the runtime environment, or drop 'br'.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
    'Cache-Control': 'no-cache',
    'Pragma': 'no-cache',
    'Referer': 'https://www.cushmanwakefield.com.cn/research-report/p94.html?expert=0',
    'Sec-Ch-Ua': '"Microsoft Edge";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
    'Sec-Ch-Ua-Mobile': '?0',
    'Sec-Ch-Ua-Platform': '"Windows"',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}


def getSoup(url, timeout=30):
    """Fetch *url* and return its HTML parsed as a ``BeautifulSoup`` tree.

    The request is sent with the module-level ``headers`` and through the
    proxy returned by ``baseCore.get_proxy()``.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits forever, so a stalled proxy or server
            would hang the whole job.

    Returns:
        The parsed document (``html.parser`` backend).

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    ip = baseCore.get_proxy()
    req = requests.get(url, headers=headers, proxies=ip, timeout=timeout)
    # Decode the body with the encoding detected from its content rather
    # than the one derived from the response headers.
    req.encoding = req.apparent_encoding
    return BeautifulSoup(req.text, 'html.parser')


def getPageSize():
    """Return how many listing pages the ``expert=1`` report list spans.

    The first listing page is downloaded, the first integer found in the
    ``<dl class="sousuo_result">`` element is taken as the total number of
    reports, and that total is divided by the page length, rounding up.

    Returns:
        The number of listing pages as an ``int`` (0 when the total is 0).

    Raises:
        ValueError: if the result-count element is missing or contains no
            number.
    """
    # presumably the number of reports shown per listing page — TODO confirm
    per_page = 4
    # Alternate listing:
    # url = 'https://www.cushmanwakefield.com.cn/research-report/p1.html?expert=0'
    url = 'https://www.cushmanwakefield.com.cn/research-report/p1.html?expert=1'
    soup = getSoup(url)

    result = soup.find('dl', class_='sousuo_result')
    if result is None:
        raise ValueError(f'result count element not found on {url}')

    # Raw string: '\d' in a normal string literal is an invalid escape.
    match = re.search(r'\d+', result.text)
    if match is None:
        raise ValueError(f'no result count found on {url}')
    total = int(match.group())

    # Integer ceiling division; avoids the float round-trip of int(total / n).
    return -(-total // per_page)


def getContent(url, timeout=60):
    """Download the PDF at *url* and return the text of all its pages.

    The request is sent with the module-level ``headers`` and through the
    proxy returned by ``baseCore.get_proxy()``. The response body is handed
    to PyMuPDF as an in-memory stream; nothing is written to disk.

    Args:
        url: Address of the PDF document.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled download cannot hang the job.

    Returns:
        The concatenated ``get_text()`` output of every page, in page order
        (an empty string for a document without pages).

    Raises:
        Exception: whatever ``requests`` raises for a failed download or
            ``fitz`` raises when the payload cannot be opened as a PDF.
    """
    ip = baseCore.get_proxy()
    req = requests.get(url, headers=headers, proxies=ip, timeout=timeout)
    # The body is binary PDF data, so no text encoding is applied to it.
    with fitz.open(stream=req.content, filetype='pdf') as doc:
        return ''.join(page.get_text() for page in doc.pages())


def _downloadPdf(url):
    """Download *url* and return the raw PDF bytes.

    The payload is opened once with PyMuPDF purely as a sanity check, so a
    response that cannot be opened as a PDF raises here instead of being
    written to disk.

    Raises:
        Exception: whatever ``requests`` or ``fitz`` raise for a failed
            download or an unreadable document.
    """
    resp = requests.get(url, headers=headers, proxies=baseCore.get_proxy(), timeout=60)
    resp.raise_for_status()
    pdf_bytes = resp.content
    with fitz.open(stream=pdf_bytes, filetype='pdf'):
        pass
    return pdf_bytes


def doJob():
    """Scrape the report listing, save every report PDF and export an index.

    For each listing page (``expert=1`` list) every report card is read for
    its name, summary and PDF link. The PDF is downloaded and stored as
    ``<title>.pdf`` in the output directory; when that file name already
    exists, ``-2``, ``-3``, ... is appended until a free name is found.
    One row per successfully saved report is collected and finally written
    to an Excel workbook next to the output directory.

    Pages or cards that cannot be parsed, downloads that fail and files that
    cannot be written are logged and skipped; they do not abort the run.
    """
    save_dir = './研究咨询/戴德梁兴/行业视角-研究报告'
    columns = ['序号', '标题', '来源', '原文链接', '摘要', '附件名称', '附件连接']
    origin = '戴德梁兴'
    # Characters replaced in a report name before it is used as a file name.
    unsafe_chars = str.maketrans({'/': ' ', '|': ' ', '？': ' ', '"': '”'})

    # open(..., 'wb') does not create missing directories.
    os.makedirs(save_dir, exist_ok=True)

    data_list = []
    pageSize = getPageSize()
    for page in range(1, pageSize + 1):
        # Alternate listing:
        # url = f'https://www.cushmanwakefield.com.cn/research-report/p{page}.html?expert=0'
        url = f'https://www.cushmanwakefield.com.cn/research-report/p{page}.html?expert=1'
        soup = getSoup(url)
        list_box = soup.find('div', class_='guwen_list_box')
        if list_box is None:
            # Unexpected page layout: skip this page, keep what was collected.
            log.error(f'第{page}页===列表解析失败')
            continue

        for div in list_box.find_all('div', class_='zhuangyuan_guwen_box'):
            try:
                name = div.find('div', class_='zhuanyuan_name').text.strip()
                summary = div.find('div', class_='zhuanyuan_info').text.strip()
                href = div.find('a', class_='zhuanyuan_xinxi').get('href')
            except AttributeError:
                # One of the expected child elements is missing (find() -> None).
                log.error(f'第{page}页===条目解析失败')
                continue

            try:
                pdf_bytes = _downloadPdf(href)
            except Exception:
                log.error(f'第{page}页==={name}===连接失败')
                continue

            # Pick a file name that does not exist yet. The title is rebuilt
            # from this item's own base title on every attempt, so nothing
            # carries over from a previous item.
            base_title = name.translate(unsafe_chars)
            title = base_title
            file = f'{save_dir}/{title}.pdf'
            suffix = 2
            while os.path.isfile(file):
                log.info(f'{name}===有重名')
                title = f'{base_title}-{suffix}'
                file = f'{save_dir}/{title}.pdf'
                suffix += 1

            try:
                # The file is opened in binary mode, so bytes must be written.
                with open(file, 'wb') as f:
                    f.write(pdf_bytes)
            except OSError:
                log.error(f'第{page}页==={name}===保存失败')
                continue

            log.info(f'{name}===成功')
            # Running row number: 1 for the first saved report, 2 for the next...
            data_list.append([
                len(data_list) + 1,
                name,
                origin,
                href,
                summary,
                f'{title}\n',
                f'{href}\n',
            ])

    # Passing the column names here also works when no report was saved and
    # keeps the row number numeric instead of coercing every cell to text.
    df = pd.DataFrame(data_list, columns=columns)
    df.to_excel('./研究咨询/戴德梁兴/行业视角-研究报告.xlsx', index=False)


if __name__ == '__main__':
    # Always call baseCore.close(), even when doJob() aborts with an exception.
    try:
        doJob()
    finally:
        baseCore.close()
