import os
import time
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By

from base import BaseCore
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Shared project helper; this module uses it for logging, proxies and the Selenium driver.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
# HTTP headers passed to the requests-based fetches of www.szse.cn pages.
# They describe an AJAX (XMLHttpRequest) call and pin Host/Origin to www.szse.cn.
headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Host': 'www.szse.cn',
    'Origin': 'http://www.szse.cn',
    'Pragma': 'no-cache',
    # Referer is intentionally left disabled; the line below is kept for reference only.
    # 'Referer': 'http://www.szse.cn/lawrules/search/index.html?rulekeyword=REITs&channelCode=["rules","csrcrules","szseBussrules","memorandumServicedirect","publicadvice","lawruleSearch"]&range=content&searchtype=0',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
    'X-Request-Type': 'ajax',
    'X-Requested-With': 'XMLHttpRequest',
}


def paserUrl(html, listurl):
    """Rewrite the links of every ``<a>`` and ``<img>`` tag as absolute URLs.

    :param html: markup as a string (parsed here with ``html.parser``) or an
        already parsed document exposing ``find_all``; a parsed document is
        modified in place.
    :param listurl: base URL that relative addresses are resolved against.
    :return: the parsed document with its links rewritten.
    """
    soup = BeautifulSoup(html, 'html.parser') if isinstance(html, str) else html

    for tag in soup.find_all(['a', 'img']):
        # 'href' wins over 'src': a tag carrying both only has its href rewritten.
        attr = next((name for name in ('href', 'src') if name in tag.attrs), None)
        if attr is not None:
            tag[attr] = urljoin(listurl, tag[attr])
    return soup


def getFjContent(url):
    """Download an attachment and return its raw bytes.

    The SZSE home page is requested first on the same session (with the module
    ``headers`` and a proxy from ``baseCore``), then ``url`` is fetched.

    :param url: absolute URL of the attachment to download.
    :return: the response body as ``bytes``.
    """
    ip = baseCore.get_proxy()
    # The context manager closes the session even when one of the requests raises.
    with requests.session() as session:
        # Presumably primes the session with the site's cookies before the download.
        session.get('http://www.szse.cn/', headers=headers, proxies=ip)
        # NOTE(review): as before, this request sends neither the module headers
        # (they pin Host to www.szse.cn) nor the proxy — confirm that is intended.
        req = session.get(url)
        # No encoding detection here: only the undecoded bytes are returned.
        return req.content


def _splitFjTitle(link_text, href):
    """Split an attachment link into ``(title_stem, extension)``.

    The extension comes from the path part of ``href``. When the link text
    already ends with that extension (case-insensitively), the text's own
    spelling of it is returned so that ``stem + extension`` equals the text.
    """
    # Drop fragment and query string so they cannot end up in the extension.
    url_path = href.split('#', 1)[0].split('?', 1)[0]
    ext = os.path.splitext(url_path)[1]
    title = link_text.strip()
    if ext and title.lower().endswith(ext.lower()):
        return title[:-len(ext)], title[-len(ext):]
    return title, ext


def getContent(url, publishDate, num):
    """Fetch one policy page, save its attachments and extract its text.

    Every ``<a>`` with an ``href`` inside the ``div.des-content`` element is
    treated as an attachment: it is downloaded with ``getFjContent`` and written
    to ``./相关政策/深圳证券交易所/政策文件/`` as
    ``{num}-{publishDate}-{link text}{extension}``. When that file already
    exists, ``-{n}`` is inserted before the extension.

    :param url: address of the policy detail page.
    :param publishDate: publication date text, used in attachment file names.
    :param num: sequence number of the item, used in attachment file names.
    :return: ``(pub_hao, content, fjtitle_list, fjhref_list)`` where ``pub_hao``
        is the stripped text of the first ``<p>`` (``doJob`` stores it in the
        发文字号 column), ``content`` is the stripped text of the content element
        without ``<script>``/``<style>``, and the last two are the attachment file
        names and URLs, one per line, each line terminated by ``'\\n'``. All four
        are empty strings when the page has no ``div.des-content``.
    """
    ip = baseCore.get_proxy()
    req = requests.get(url, headers=headers, proxies=ip)
    req.encoding = req.apparent_encoding
    soup = BeautifulSoup(req.text, 'html.parser')
    soup = paserUrl(soup, 'http://www.szse.cn/')
    contentWithTag = soup.find('div', class_='des-content')
    if contentWithTag is None:
        # Unexpected page layout: report it and return an empty record instead
        # of failing with AttributeError.
        log.info(f'{url}===未找到正文内容')
        return '', '', '', ''

    fj_hrefs = []
    fj_titles = []
    num_ = 1  # counter for the "-n" suffix used when a file name is already taken
    for a in contentWithTag.find_all('a'):
        fj_href = a.get('href')
        if not fj_href:
            continue
        fj_hrefs.append(fj_href)
        stem, category = _splitFjTitle(a.text, fj_href)
        prefix = f'{num}-{publishDate}-{stem}'
        fj_title = f'{prefix}{category}'
        fjcontent = getFjContent(fj_href)
        if os.path.exists(f'./相关政策/深圳证券交易所/政策文件/{fj_title}'):
            # Insert the counter right before the extension only; a blanket
            # str.replace breaks on empty or repeated extensions.
            fj_title = f'{prefix}-{num_}{category}'
            num_ += 1
        file = f'./相关政策/深圳证券交易所/政策文件/{fj_title}'
        fj_titles.append(fj_title)
        with open(file, 'wb') as f:
            f.write(fjcontent)
        log.info(f'{fj_title}===附件下载成功')

    # Remove non-content markup before reading the text.
    for tag_name in ('script', 'style'):
        for tag in contentWithTag.find_all(tag_name):
            tag.decompose()

    first_p = contentWithTag.find('p')
    pub_hao = first_p.text.strip() if first_p is not None else ''
    content = contentWithTag.text.strip()
    fjtitle_list = ''.join(f'{name}\n' for name in fj_titles)
    fjhref_list = ''.join(f'{link}\n' for link in fj_hrefs)
    return pub_hao, content, fjtitle_list, fjhref_list


def doJob():
    """Scrape the SZSE rule search results for "REITs" and export them to Excel.

    The search page is loaded with the Selenium driver from ``baseCore``. For
    every ``article-item`` element the title, link and publication date are
    read; links containing ``.pdf`` are downloaded directly with
    ``getFjContent``, all other links are processed by ``getContent``. The
    collected rows are written to
    ``./相关政策/深圳证券交易所/深圳证券交易所政策文件.xlsx``.

    :raises selenium.common.exceptions.TimeoutException: when no
        ``article-item`` element appears within 10 seconds.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs('./相关政策/深圳证券交易所/政策文件', exist_ok=True)
    url = 'http://www.szse.cn/lawrules/search/index.html?rulekeyword=REITs&channelCode=%5B%22rules%22,%22csrcrules%22,%22szseBussrules%22,%22memorandumServicedirect%22,%22publicadvice%22,%22lawruleSearch%22%5D&range=content&searchtype=0'
    data_list = []
    driver = baseCore.buildDriver()
    try:
        driver.get(url)
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'article-item'))
        )
        div_list = driver.find_elements(By.CLASS_NAME, 'article-item')
        for num, div in enumerate(div_list):
            link = div.find_element(By.TAG_NAME, 'a')
            title = link.text.strip()
            href = link.get_attribute('href')
            publishDate = div.find_element(By.CLASS_NAME, 'pull-right').text.strip()
            writtenDate = publishDate
            origin = '深圳证券交易所'
            organ = origin
            if '.pdf' in href:
                # The result links straight to a PDF: there is no detail page to
                # parse, so the file itself is the only attachment.
                content = ''
                summary = ''
                fjtitle_list = title + '.pdf'
                fjhref_list = href
                pub_hao = ''
                fjcontent = getFjContent(href)
                file = f'./相关政策/深圳证券交易所/政策文件/{title}.pdf'
                with open(file, 'wb') as f:
                    f.write(fjcontent)
                log.info(f'{title}===附件下载成功')
            else:
                summary = div.find_element(By.CLASS_NAME, 'item-content').text.strip()
                pub_hao, content, fjtitle_list, fjhref_list = getContent(href, publishDate, num)
            data_list.append([num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary,
                              content, fjtitle_list, fjhref_list])
            log.info(f'{title}===采集成功')
    finally:
        # Always release the browser window, even when scraping fails midway.
        driver.close()
    # Going through np.array turns every cell (including 序号) into a string;
    # kept so the spreadsheet output stays unchanged.
    df = pd.DataFrame(np.array(data_list))
    df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
    df.to_excel('./相关政策/深圳证券交易所/深圳证券交易所政策文件.xlsx', index=False)


# Script entry point: run the scrape once, then close the shared BaseCore helper.
if __name__ == '__main__':
    doJob()
    # NOTE(review): presumably releases resources held by BaseCore — confirm in base.BaseCore.
    baseCore.close()
