import datetime
import os
import re
import time
from urllib.parse import urljoin, urlsplit

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from retry import retry
from selenium.webdriver.common.by import By

from base import BaseCore

# Shared project helper: supplies the logger, the proxies and the Selenium
# driver used by the functions below.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
# Browser-like request headers sent with every requests.get call in this module.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}


def getSoup(url, timeout=30):
    """Fetch *url* through a proxy and return the page as a BeautifulSoup tree.

    Args:
        url: Address of the HTML page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl indefinitely.

    Returns:
        The response body parsed with the ``html.parser`` backend.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    ip = baseCore.get_proxy()
    req = requests.get(url, headers=headers, proxies=ip, timeout=timeout)
    # Decode with the encoding detected from the body rather than the one
    # announced in the HTTP headers.
    req.encoding = req.apparent_encoding
    return BeautifulSoup(req.text, 'html.parser')


@retry(tries=3, delay=5)
def getFjContent(url, timeout=60):
    """Download the attachment at *url* and return its raw bytes.

    The call is attempted up to three times, five seconds apart, by the
    ``retry`` decorator.

    Args:
        url: Address of the file to download.
        timeout: Seconds to wait for the server on each attempt. A timeout is
            required for the retry to be useful: a connection that never
            answers would otherwise never raise and never be retried.

    Returns:
        The response body as ``bytes``.

    Raises:
        requests.RequestException: If every attempt fails or times out.
    """
    ip = baseCore.get_proxy()
    req = requests.get(url, headers=headers, proxies=ip, timeout=timeout)
    # Raw bytes are returned, so no text-encoding detection is performed.
    return req.content


def _saveAttachments(a_list, url, publishDate, num, num_):
    """Download the target of every link in *a_list* and save it to disk.

    Args:
        a_list: ``<a>`` tags to process.
        url: Address of the page the tags came from; relative links are
            resolved against it.
        publishDate: Publish date string embedded in each saved file name.
        num: Sequence number of the policy, used as file name prefix.
        num_: Running counter used to name links without text and to
            disambiguate clashing file names.

    Returns:
        ``(titles, hrefs, num_)``: the file names used, the absolute links,
        and the updated counter.
    """
    folder = './相关政策/云南省人民政府/政策文件'
    titles = []
    hrefs = []
    for a in a_list:
        fj_href = a.get('href')
        if not fj_href:
            # An anchor without a target has nothing to download.
            continue
        fj_href = urljoin(url, fj_href)
        hrefs.append(fj_href)
        fj_title = a.text.strip()
        if fj_title == '':
            fj_title = str(num_)
            num_ += 1
        # Take the extension from the URL path only, so a query string never
        # ends up in the file name.
        category = os.path.splitext(urlsplit(fj_href).path)[1]
        if category and fj_title[-len(category):].lower() != category.lower():
            fj_title = fj_title + category
        fj_title = f'{num}-{publishDate}-{fj_title}'
        # Split on the known extension length; when there is no extension the
        # whole name is the stem and the counter is simply appended.
        cut = len(fj_title) - len(category)
        stem, ext = fj_title[:cut], fj_title[cut:]
        file = f'{folder}/{fj_title}'
        while os.path.exists(file):
            fj_title = f'{stem}-{num_}{ext}'
            num_ += 1
            file = f'{folder}/{fj_title}'
        titles.append(fj_title)
        try:
            fjcontent = getFjContent(fj_href)
            with open(file, 'wb') as f:
                f.write(fjcontent)
        except Exception as e:
            # One broken link must not abort the remaining attachments.
            log.error(f'{fj_title}===附件下载失败==={e}')
        else:
            log.info(f'{fj_title}===附件下载成功')
    return titles, hrefs, num_


def getContent(url, publishDate, num):
    """Extract the body text of a policy page and download its attachments.

    The body is the first ``div.content`` element, or ``div.TRS_UEDITOR`` when
    that is absent. Every link inside the body and every link inside
    ``ul.apfile`` is treated as an attachment and saved under
    ``./相关政策/云南省人民政府/政策文件``.

    Args:
        url: Address of the policy page.
        publishDate: Publish date string embedded in attachment file names.
        num: Sequence number of the policy, used as file name prefix.

    Returns:
        ``(content, fjtitle_list, fjhref_list)``: the stripped body text, the
        attachment file names and the attachment links. The last two are
        strings with one entry per line, each entry followed by a newline.
        All three are empty strings when nothing is found.
    """
    soup = getSoup(url)
    contentWithTag = soup.find('div', class_='content')
    if not contentWithTag:
        contentWithTag = soup.find('div', class_='TRS_UEDITOR')
    if contentWithTag is None:
        log.error(f'{url}===未找到正文')
        content = ''
        body_links = []
    else:
        # Scripts and styles are not part of the readable text.
        for tag in contentWithTag.find_all(['script', 'style']):
            tag.decompose()
        content = contentWithTag.text.strip()
        body_links = contentWithTag.find_all('a')

    titles, hrefs, num_ = _saveAttachments(body_links, url, publishDate, num, 1)

    apfile = soup.find('ul', class_='apfile')
    if apfile is not None:
        more_titles, more_hrefs, num_ = _saveAttachments(
            apfile.find_all('a'), url, publishDate, num, num_)
        titles += more_titles
        hrefs += more_hrefs

    fjtitle_list = ''.join(f'{title}\n' for title in titles)
    fjhref_list = ''.join(f'{href}\n' for href in hrefs)
    return content, fjtitle_list, fjhref_list


def getData(div, num):
    """Build one spreadsheet row from a search-result element.

    Args:
        div: Selenium element of a single search result.
        num: Sequence number assigned to this result.

    Returns:
        A list ``[num, title, publishDate, origin, href, writtenDate, organ,
        pub_hao, summary, content, fjtitle_list, fjhref_list]``. Fields that
        cannot be read from the result are empty strings; ``writtenDate`` and
        ``summary`` are always empty.
    """
    pattern = r"\d{4}-\d{2}-\d{2}"
    title = div.find_element(By.CLASS_NAME, 'title').find_element(
        By.CLASS_NAME, 'fontlan').get_attribute('title').strip()
    href = div.find_element(By.CLASS_NAME, 'fontlan').get_attribute('href')
    origin = '云南省人民政府'

    # The publish date is the first yyyy-mm-dd string in the result summary.
    try:
        info_text = div.find_element(By.CLASS_NAME, 'content').text
    except Exception:
        info_text = ''
    match = re.search(pattern, info_text)
    publishDate = match.group() if match else ''

    # Issuing organ and document number are optional; when either cannot be
    # read, both are left empty.
    try:
        rows = div.find_element(By.CLASS_NAME, 'rowtab').find_elements(
            By.TAG_NAME, 'div')[0].find_elements(By.TAG_NAME, 'p')
        organ = rows[1].find_element(By.CLASS_NAME, 'txt').text.strip()
        pub_hao = rows[0].find_element(By.CLASS_NAME, 'txt').text.strip()
        if pub_hao == '无':
            pub_hao = ''
    except Exception:
        organ = ''
        pub_hao = ''

    summary = ''
    writtenDate = ''
    if '.pdf' in href.lower():
        # The result links straight to a PDF: there is no HTML body, the
        # document itself is stored as the only attachment.
        content = ''
        fjhref_list = href
        fj_title = title + '.pdf'
        fjcontent = getFjContent(fjhref_list)
        file = f'./相关政策/云南省人民政府/政策文件/{fj_title}'
        with open(file, 'wb') as f:
            f.write(fjcontent)
        log.info(f'{fj_title}===附件下载成功')
        fjtitle_list = fj_title
    else:
        content, fjtitle_list, fjhref_list = getContent(href, publishDate, num)
    return [num, title, publishDate, origin, href, writtenDate, organ, pub_hao,
            summary, content, fjtitle_list, fjhref_list]


def doJob():
    """Crawl the search results for "REITs" and export them to Excel.

    Opens the search page in the Selenium driver, walks the result tabs at
    index 3 and 4, pages through each tab, collects one row per result with
    ``getData`` and writes all rows to
    ``./相关政策/云南省人民政府/云南省人民政府政策文件.xlsx``.
    """
    os.makedirs('./相关政策/云南省人民政府/政策文件', exist_ok=True)
    data_list = []
    url = 'https://sheng.so-gov.cn/s?siteCode=5300000033&qt=REITs'
    driver = baseCore.buildDriver()
    driver.get(url)
    time.sleep(2)
    num = 1
    for tab_index in range(3, 5):
        driver.find_elements(By.XPATH, '/html/body/div/div[6]/div[2]/div[3]/ul/li')[tab_index].click()
        time.sleep(2)
        if tab_index == 3:
            driver.find_element(By.ID, 'key_place_context_id').click()
            time.sleep(2)
        # The second-to-last pagination link carries the page count; fall
        # back to a single page when it cannot be read.
        try:
            total = int(driver.find_element(By.CLASS_NAME, 'pagination').find_elements(By.TAG_NAME, 'a')[-2].text)
        except Exception:
            total = 1
        for page in range(total):
            time.sleep(2)
            div_list = driver.find_elements(By.XPATH, '//*[@id="results"]/div')
            for div in div_list:
                try:
                    data = getData(div, num)
                except Exception as e:
                    # Skip the broken result so the rows collected so far are
                    # still exported.
                    log.error(f'第{num}条===采集失败==={e}')
                    continue
                data_list.append(data)
                log.info(f'{data[1]}===采集成功')
                num += 1
            if page == total - 1:
                break
            try:
                driver.find_element(By.CLASS_NAME, 'pagination').find_element(By.CLASS_NAME, 'next').click()
            except Exception as e:
                # Without a working "next" link the same page would be
                # scraped again, so stop paging this tab.
                log.error(f'翻页失败==={e}')
                break
    # Build the frame straight from the rows: values keep their own types and
    # an empty result still yields a sheet with the header row.
    df = pd.DataFrame(
        data_list,
        columns=['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接'],
    )
    df.to_excel('./相关政策/云南省人民政府/云南省人民政府政策文件.xlsx', index=False)

if __name__ == '__main__':
    try:
        doJob()
    finally:
        # Run the shared helper's cleanup even when the crawl fails midway
        # (presumably this releases the driver and other resources it holds).
        baseCore.close()
