import datetime
import json
import time

import requests
from bs4 import BeautifulSoup
from retry import retry

from base import BaseCore
import os
import pandas as pd
import numpy as np

# Shared project helper; in this module it supplies the logger (getLogger),
# the per-request proxy settings (get_proxy) and is closed at script exit.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
# HTTP headers passed to every requests call in this module.
# NOTE(review): the X-XSRF-TOKEN value is hard-coded; presumably it was copied
# from a browser session and may expire — confirm it is still accepted.
headers = {
    'Content-Type': 'application/json',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
    'X-XSRF-TOKEN': 'eyJpdiI6InhWUlhvRWpuUUp4ejFsQ0VVb29CaFE9PSIsInZhbHVlIjoiOUp5dHJ2SVVoNWl0K0s3UVlaZGZcL3p0a0gxc09sclRVU2JZTjg3dVUyTER4WVE4Qm1Ta2dyWUJndENmMURYVmwiLCJtYWMiOiJjNGU5YTU1MTJmZmZmZjdhZjRkNDE0NmM4Y2I3OTNkMmExYmJjZGRmYTk5MGMyMmQyM2FhYjVjMjRhZTY0NjA2In0=',
}


@retry(tries=5, delay=5)
def getSoup(url):
    """Fetch ``url`` through a proxy and return the page parsed as BeautifulSoup.

    The call is retried up to 5 times (5 s apart) when it raises.

    Args:
        url: Address of the HTML page to download.

    Returns:
        A ``BeautifulSoup`` tree built with the ``html.parser`` backend.

    Raises:
        requests.RequestException: If every attempt fails (including timeouts).
    """
    ip = baseCore.get_proxy()
    # Without a timeout a stalled proxy blocks forever and @retry never fires.
    req = requests.get(url, headers=headers, proxies=ip, timeout=30)
    # Let requests guess the charset from the body instead of the HTTP header.
    req.encoding = req.apparent_encoding
    # Fixed pause after each page fetch to throttle the crawl.
    time.sleep(5)
    return BeautifulSoup(req.text, 'html.parser')


def getFjContent(url):
    """Download an attachment through a proxy and return its raw bytes.

    Args:
        url: Absolute address of the attachment.

    Returns:
        The response body as ``bytes``.

    Raises:
        requests.RequestException: On connection errors or timeouts.
    """
    ip = baseCore.get_proxy()
    # A timeout keeps one dead link from hanging the whole crawl.
    req = requests.get(url, headers=headers, proxies=ip, timeout=30)
    # The body is returned undecoded, so no charset detection is needed here
    # (running it over binary files is slow and has no effect on .content).
    return req.content


def getPageSize():
    """Return how many result pages the "REITs" policy-file search has.

    The total hit count reported by the search API is divided by the page
    length and rounded up.

    Returns:
        Number of pages as ``int`` (0 when the search has no hits).

    Raises:
        requests.RequestException: On connection errors or timeouts.
        KeyError, ValueError: If the response does not contain a numeric
            ``data.total`` field.
    """
    # presumably the API returns 12 results per page — confirm against the site
    per_page = 12
    ip = baseCore.get_proxy()
    url = 'https://search.gd.gov.cn/api/search/file'
    data_post = {"page": "1", "position": "all", "keywords": "REITs", "sort": "smart", "site_id": "2", "range": "site",
                 "recommand": 1, "gdbsDivision": "440000", "service_area": 1}
    data_post = json.dumps(data_post)
    # A timeout prevents an unresponsive endpoint from blocking the job forever.
    req = requests.post(url, headers=headers, data=data_post, proxies=ip, timeout=30)
    req.encoding = req.apparent_encoding
    total = int(req.json()['data']['total'])
    # Integer ceiling division; avoids the float round-trip of int(total / 12).
    return -(-total // per_page)


def getDataJson(url, data_post):
    """POST a search request and return the list of result records.

    Two response layouts are supported: ``data.list`` and, as a fallback,
    ``data.news.list``.

    Args:
        url: Search API endpoint.
        data_post: JSON-encoded request body.

    Returns:
        The list of result records found in the response.

    Raises:
        requests.RequestException: On connection errors or timeouts.
        KeyError, TypeError: If neither layout is present in the response.
    """
    ip = baseCore.get_proxy()
    # A timeout prevents an unresponsive endpoint from blocking the job forever.
    req = requests.post(url, headers=headers, data=data_post, proxies=ip, timeout=30)
    req.encoding = req.apparent_encoding
    # Parse the body once; both layouts live under the same "data" object.
    data = req.json()['data']
    try:
        return data['list']
    except (KeyError, TypeError):
        # Only a missing/odd "list" triggers the fallback; unrelated errors
        # (and KeyboardInterrupt) are no longer swallowed.
        return data['news']['list']


# Directory that receives every downloaded attachment.
_FJ_DIR = './相关政策/广东省人民政府/政策文件'
# Characters that would break a file name or the newline-separated title list.
_FJ_TITLE_FIX = str.maketrans({'/': '_', '\\': '_', '\r': ' ', '\n': ' '})


def _locate_content(soup):
    """Return the element holding the article body, or None if no known layout matches."""
    for selector in (
        'body > div.con > div.viewList > div.zw',
        'body > div.con > div:nth-of-type(3) > div.content > div.viewList > div.zw',
    ):
        found = soup.select(selector)
        if found:
            return found[0]
    article = soup.find('div', class_='article-content')
    if article is None:
        return None
    # Prefer the inner <center> wrapper when the page has one.
    centered = article.find('center')
    return centered if centered is not None else article


def _resolve_href(page_url, href):
    """Return ``href`` as an absolute http(s) URL, or None when it cannot be downloaded."""
    from urllib.parse import urljoin, urlparse

    href = (href or '').strip()
    if not href or href.startswith('#'):
        return None
    # urljoin handles relative paths and protocol-relative ("//host/...") links.
    absolute = urljoin(page_url, href)
    if urlparse(absolute).scheme not in ('http', 'https'):
        # javascript:, mailto:, data: and similar are not fetchable attachments.
        return None
    return absolute


def _save_attachment(fj_href, raw_title, num, publishDate, seq):
    """Download one attachment, store it under a unique name, return (name, next_seq)."""
    from urllib.parse import urlparse

    fj_title = (raw_title or '').strip().translate(_FJ_TITLE_FIX)
    if not fj_title:
        # Untitled attachments are named after the running counter.
        fj_title = str(seq)
        seq += 1
    # Take the extension from the URL path only, ignoring query/fragment.
    category = os.path.splitext(urlparse(fj_href).path)[1]
    if category and category not in fj_title:
        fj_title += category
    fj_title = f'{num}-{publishDate}-{fj_title}'
    fjcontent = getFjContent(fj_href)
    # Split name/extension explicitly: str.replace() with an empty extension
    # would insert the suffix between every character of the path.
    if category and fj_title.endswith(category):
        stem, ext = fj_title[:-len(category)], category
    else:
        stem, ext = fj_title, ''
    file = f'{_FJ_DIR}/{fj_title}'
    while os.path.exists(file):
        # Never overwrite: add counter suffixes until the name is free, and
        # keep the reported title in sync with the file actually written.
        fj_title = f'{stem}-{seq}{ext}'
        seq += 1
        file = f'{_FJ_DIR}/{fj_title}'
    with open(file, 'wb') as f:
        f.write(fjcontent)
    log.info(f'{fj_title}===附件下载成功')
    return fj_title, seq


def getContent(url, publishDate, num):
    """Download an article page, save its attachments and extract its text.

    Every ``<img>`` and ``<a>`` inside the article body is treated as an
    attachment and written to the attachment directory as
    ``{num}-{publishDate}-{title}``. Links that cannot be turned into an
    http(s) URL are skipped.

    Args:
        url: Address of the article page.
        publishDate: Publication date string, used as part of the file names.
        num: Serial number of the article, used as the file-name prefix.

    Returns:
        A tuple ``(content, fjtitle_list, fjhref_list)``: the article text and
        the newline-separated attachment file names and attachment URLs.
        All three are empty strings when the article body cannot be located.

    Raises:
        requests.RequestException: If the page or an attachment download fails.
        OSError: If an attachment cannot be written to disk.
    """
    soup = getSoup(url)
    time.sleep(2)
    contentWithTag = _locate_content(soup)
    if contentWithTag is None:
        # Unknown page layout: report it and keep the crawl going.
        log.error(f'{url}===未找到正文内容')
        return '', '', ''

    # Images first, then links, sharing one counter — same order as before.
    candidates = [(img.get('src'), img.get('alt')) for img in contentWithTag.find_all('img')]
    candidates += [(a.get('href'), a.text) for a in contentWithTag.find_all('a')]

    fj_titles = []
    fj_hrefs = []
    seq = 1
    for raw_href, raw_title in candidates:
        fj_href = _resolve_href(url, raw_href)
        if fj_href is None:
            continue
        fj_title, seq = _save_attachment(fj_href, raw_title, num, publishDate, seq)
        fj_hrefs.append(fj_href)
        fj_titles.append(fj_title)

    # Drop script/style nodes so their source does not leak into the text.
    for tag_name in ('script', 'style'):
        for tag in contentWithTag.find_all(tag_name):
            tag.decompose()

    content = contentWithTag.text.strip()
    return content, '\n'.join(fj_titles).strip(), '\n'.join(fj_hrefs).strip()


def ST(txt):
    """Strip HTML markup from ``txt`` and return the plain text.

    Args:
        txt: HTML fragment; ``None`` or an empty string is accepted.

    Returns:
        The text content of the fragment, or ``''`` for ``None``/empty input.
    """
    if not txt:
        # Nothing to parse; avoids handing None to the parser.
        return ''
    return BeautifulSoup(txt, 'lxml').text


def getData(data_, num):
    """Turn one search-result record into a row for the output table.

    Args:
        data_: Mapping for a single search hit; the keys ``title``,
            ``pub_time``, ``publisher_src``, ``url``, ``date``, ``source``,
            ``document_number`` and ``content`` are read.
        num: Serial number assigned to this record.

    Returns:
        A 12-item list: serial number, title, publish time, origin, link,
        written date, issuing organ, document number, summary, full text,
        attachment names and attachment links.
    """
    headline = ST(data_['title'])
    log.info(f'{headline}===开始采集')
    published, source_site, link = data_['pub_time'], data_['publisher_src'], data_['url']
    log.info(link)
    stamp = data_['date']
    # A falsy timestamp is passed through unchanged; otherwise format it as a date.
    written = datetime.datetime.fromtimestamp(stamp).strftime('%Y-%m-%d') if stamp else stamp
    issuer = data_['source']
    doc_number = data_['document_number']
    digest = ST(data_['content'])
    body, fj_names, fj_links = getContent(link, published, num)
    return [
        num, headline, published, source_site, link, written,
        issuer, doc_number, digest, body, fj_names, fj_links,
    ]


def doJob_1():
    """Collect every "REITs" hit from the policy-file search, page by page.

    Creates the attachment directory if it is missing, then walks all result
    pages and converts each hit with ``getData``.

    Returns:
        A tuple ``(rows, next_num)``: the collected rows and the serial
        number the next record should receive.
    """
    target_dir = './相关政策/广东省人民政府/政策文件'
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    last_page = getPageSize()
    url = 'https://search.gd.gov.cn/api/search/file'
    rows = []
    num = 1
    page = 1
    while page <= last_page:
        payload = json.dumps({
            "page": str(page), "position": "all", "keywords": "REITs", "sort": "smart",
            "site_id": "2", "range": "site",
            "recommand": 1, "gdbsDivision": "440000", "service_area": 1,
        })
        for item in getDataJson(url, payload):
            row = getData(item, num)
            rows.append(row)
            log.info(f'{row[1]}===采集成功')
            num += 1
        page += 1
    return rows, num


def doJob_2(num):
    """Collect the first page of "REITs" hits for each extra category.

    Queries the general search endpoint once per category label and converts
    each hit with ``getData``.

    Args:
        num: Serial number to assign to the first collected record.

    Returns:
        The list of collected rows.
    """
    url = 'https://search.gd.gov.cn/api/search/all'
    rows = []
    for label in ('政策解读', '计划规划'):
        payload = json.dumps({
            "label": label, "position": "all", "keywords": "REITs", "sort": "smart", "site_id": "2",
            "range": "site", "page": 1, "tag_name": label, "recommand": 1, "gdbsDivision": "440000",
            "service_area": 1,
        })
        for item in getDataJson(url, payload):
            row = getData(item, num)
            # Short pause between records.
            time.sleep(1)
            rows.append(row)
            log.info(f'{row[1]}===采集成功')
            num += 1
    return rows


def doJob():
    """Run both collection passes and write the combined table to Excel.

    The rows from ``doJob_1`` and ``doJob_2`` are concatenated and saved to
    ``./相关政策/广东省人民政府/广东省人民政府政策文件.xlsx``.
    """
    columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
    data_list, num = doJob_1()
    data_list = data_list + doJob_2(num)
    # Build the frame straight from the rows: going through np.array() forces
    # a fixed-width string array (every cell as wide as the longest article)
    # and fails on the column assignment when nothing was collected.
    df = pd.DataFrame(data_list, columns=columns)
    df.to_excel('./相关政策/广东省人民政府/广东省人民政府政策文件.xlsx', index=False)


if __name__ == '__main__':
    try:
        doJob()
    finally:
        # Always close the shared BaseCore helper, even if the crawl raised.
        baseCore.close()
