import requests
from bs4 import BeautifulSoup

import os
import pandas as pd
import numpy as np


# HTTP headers passed to requests.get() in getSoup(); only a desktop
# browser User-Agent string is set.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}


def getSoup(url, timeout=30):
    """Download *url* through a proxy and parse the response as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds ``requests`` waits on the connection/response before
            raising a timeout error. Without it a stalled proxy or server
            would block the whole crawl indefinitely.

    Returns:
        A ``BeautifulSoup`` document built with ``html.parser``, or the empty
        string ``''`` when the final response URL is the site's 404 page.
    """
    # NOTE(review): ``baseCore`` is not defined or imported in the visible
    # part of this file -- confirm it is provided elsewhere.
    ip = baseCore.get_proxy()
    req = requests.get(url, headers=headers, proxies=ip, timeout=timeout)
    # Callers compare the result against '' to detect a missing page.
    if req.url == 'https://www.hainan.gov.cn/hainan/xhtml/404.html':
        return ''
    # Decode with the encoding detected from the body rather than the one
    # guessed from the response headers.
    req.encoding = req.apparent_encoding
    return BeautifulSoup(req.text, 'html.parser')


def getPageSize(type):
    """Return the number of 10-item result pages of the REITs search.

    Args:
        type: Value inserted into the ``column`` query parameter of the
            search URL. (The name shadows the builtin but is kept so existing
            callers keep working.)

    Returns:
        The page count, i.e. the total hit count divided by 10 and rounded
        up, or ``0`` when the search page resolved to the 404 page
        (``getSoup`` returned ``''``).
    """
    url = f'https://www.hainan.gov.cn/s?siteCode=4600000001&searchWord=REITs&column={type}&wordPlace=0&orderBy=1&startTime=&endTime=&isSecondSearch=undefined&pageSize=10&pageNum=0&timeStamp=0&labelHN=&uc=0&checkHandle=1&strFileType=0&countKey=%200&sonSiteCode=&areaSearchFlag=&secondSearchWords=&left_right_flag=1'
    soup = getSoup(url)
    if soup == '':
        # getSoup signals a 404 with ''; calling soup.find(..., class_=...)
        # on a str would raise a misleading TypeError, so report no pages.
        return 0
    # The first <span> inside div.results-list is read as the total hit count.
    total = int(soup.find('div', class_='results-list').find('span').text.strip())
    # Integer ceiling division: 10 results per page.
    return (total + 9) // 10


def getContent(url, publishDate, num):
    """Fetch a policy detail page and extract its written date and body text.

    Args:
        url: Address of the detail page.
        publishDate: Not used by this function; kept for caller compatibility.
        num: Not used by this function; kept for caller compatibility.

    Returns:
        A 4-tuple ``(writtenDate, content, fjtitle_list, fjhref_list)``.
        ``writtenDate`` is ``''`` when it cannot be parsed. ``content`` is
        ``''`` when the page is a 404 or has no ``<div id="font">`` body
        container. The two attachment fields are always ``''`` here.
    """
    fjhref_list = ''
    fjtitle_list = ''
    soup = getSoup(url)
    if soup == '':
        return '', '', '', ''

    try:
        # Text between '成文日期：' and '标题' in the metadata box, with
        # plain spaces removed first.
        meta_text = soup.find('div', class_='zwgk_comr1').text.replace(' ', '')
        writtenDate = meta_text.split('成文日期：')[1].split('标题')[0].strip()
    except (AttributeError, IndexError):
        # Metadata box missing (find() returned None) or label not present.
        writtenDate = ''

    contentWithTag = soup.find('div', attrs={'id': 'font'})
    if contentWithTag is None:
        # Previously this path left ``content`` unbound and the return
        # statement raised UnboundLocalError. Report the URL and hand back an
        # empty body, which getData() treats as "skip this entry".
        print(url)
        return writtenDate, '', fjtitle_list, fjhref_list

    # Drop embedded scripts and styles so they do not leak into the text.
    for tag_name in ('script', 'style'):
        for tag in contentWithTag.find_all(tag_name):
            tag.decompose()

    content = contentWithTag.text.strip()
    return writtenDate, content, fjtitle_list, fjhref_list


def getData(div, num):
    """Build one output row from a search-result entry and its detail page.

    Args:
        div: BeautifulSoup tag of a single search-result item.
        num: Serial number stored as the first field of the row.

    Returns:
        A 12-item list ``[num, title, publishDate, origin, href, writtenDate,
        organ, pub_hao, summary, content, fjtitle_list, fjhref_list]``, or an
        empty list when ``getContent`` yields no body text.
    """
    link = div.find('a', class_='titlec')
    title = link.get('title').replace('\n', '').replace('\r', '').strip()
    href = link.get('href')
    publishDate = div.find('span', class_='quily-con').text.strip()
    origin = div.find('a', class_='address-con').text.strip()

    try:
        table = div.find('div', class_='search-results').find('table').text
        organ = table.split('发文机关：')[1].split('文号：')[0].strip()
        pub_hao = table.split('文号：')[1].strip()
    except (AttributeError, IndexError):
        # Table missing (find() returned None) or a label is absent; both
        # fields are blanked together, as before.
        organ = ''
        pub_hao = ''

    try:
        summary = div.find('p', class_='p-text-color').text.strip()
    except AttributeError:
        # No summary paragraph in this result item.
        summary = ''

    writtenDate, content, fjtitle_list, fjhref_list = getContent(href, publishDate, num)
    if content == '':
        return []
    return [num, title, publishDate, origin, href, writtenDate, organ, pub_hao,
            summary, content, fjtitle_list, fjhref_list]


def doJob():
    """Crawl the REITs search results and export them to an Excel workbook.

    For each search column id, every result page is fetched, each new link is
    expanded into a row via ``getData``, and the rows are written to an
    ``.xlsx`` file under ``./相关政策/海南省人民政府/``.
    """
    os.makedirs('./相关政策/海南省人民政府/政策文件', exist_ok=True)
    columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
    data_list = []
    seen_hrefs = set()  # links already exported; set gives O(1) dedup checks
    num = 1
    for column_id in (2682, 2677):
        pageSize = getPageSize(column_id)
        for page in range(pageSize):
            url = f'https://www.hainan.gov.cn/s?siteCode=4600000001&searchWord=REITs&column={column_id}&wordPlace=0&orderBy=1&startTime=&endTime=&isSecondSearch=undefined&pageSize=10&pageNum={page}&timeStamp=0&labelHN=&uc=0&checkHandle=1&strFileType=0&countKey=%200&sonSiteCode=&areaSearchFlag=&secondSearchWords=&left_right_flag=1'
            soup = getSoup(url)
            if soup == '':
                # 404 sentinel from getSoup: nothing to parse on this page.
                continue
            # The last child <div> of #showPage is discarded, as in the
            # original code (presumably a pager, not a result -- TODO confirm).
            # Slicing instead of ``del`` tolerates an empty selection.
            for div in soup.select('#showPage > div')[:-1]:
                href = div.find('a', class_='titlec').get('href')
                if href in seen_hrefs:
                    continue
                data = getData(div, num)
                if not data:
                    continue
                seen_hrefs.add(href)
                data_list.append(data)
                num += 1
                # NOTE(review): ``log`` is not defined in the visible part of
                # this file -- confirm it is provided elsewhere.
                log.info(f'{data[1]}===采集成功')
    # Passing ``columns`` to the constructor also works when no rows were
    # collected; the old np.array + ``df.columns = ...`` combination raised a
    # length-mismatch error in that case and turned every cell into a string.
    df = pd.DataFrame(data_list, columns=columns)
    # File name corrected from 江西省 (Jiangxi) to 海南省 (Hainan): this job
    # scrapes hainan.gov.cn and writes into the Hainan directory.
    df.to_excel('./相关政策/海南省人民政府/海南省人民政府政策文件.xlsx', index=False)


if __name__ == '__main__':
    # Always call baseCore.close(), even when doJob() aborts with an error.
    # NOTE(review): ``baseCore`` is not defined in the visible part of this
    # file -- confirm it is provided elsewhere.
    try:
        doJob()
    finally:
        baseCore.close()
