import datetime
import json
import time

import requests
from bs4 import BeautifulSoup
from retry import retry

import os
import pandas as pd
import numpy as np

import BaseCore

# Shared project helper. In this file it supplies the logger, proxies
# (get_proxy), the dedup set store (`r`, presumably Redis — confirm), the
# Kafka sender (sendkafka), attachment cleanup (deliteATT) and close().
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()

# Policy provides paserUrl and attuributefile, both called from getContent.
from reits import Policy
policy = Policy()


# Topic name passed to baseCore.sendkafka for every scraped record.
topic = 'policy'
# Site name (Guangdong Provincial People's Government); appended to
# 'REITs::' to form the key of the dedup set.
webname = '广东省人民政府'
# HTTP headers sent with every request made by this module.
headers = {
    'Content-Type': 'application/json',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
    # NOTE(review): hard-coded XSRF token, presumably captured from a browser
    # session — confirm whether it expires and needs refreshing.
    'X-XSRF-TOKEN': 'eyJpdiI6InhWUlhvRWpuUUp4ejFsQ0VVb29CaFE9PSIsInZhbHVlIjoiOUp5dHJ2SVVoNWl0K0s3UVlaZGZcL3p0a0gxc09sclRVU2JZTjg3dVUyTER4WVE4Qm1Ta2dyWUJndENmMURYVmwiLCJtYWMiOiJjNGU5YTU1MTJmZmZmZjdhZjRkNDE0NmM4Y2I3OTNkMmExYmJjZGRmYTk5MGMyMmQyM2FhYjVjMjRhZTY0NjA2In0=',
}


@retry(tries=5, delay=5)
def getSoup(url, timeout=30):
    """Download *url* through a proxy and return it parsed as BeautifulSoup.

    The ``retry`` decorator re-runs the whole call (up to 5 tries, 5 seconds
    apart) whenever it raises.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` raises.
            Without it a stalled connection blocks forever and the retry
            logic is never reached.

    Returns:
        The page parsed with the ``html.parser`` backend.
    """
    proxy = baseCore.get_proxy()
    response = requests.get(url, headers=headers, proxies=proxy, timeout=timeout)
    # Decode with the charset detected from the body, not the declared one.
    response.encoding = response.apparent_encoding
    # Throttle: pause after every download before handing the page back.
    time.sleep(5)
    return BeautifulSoup(response.text, 'html.parser')


def getFjContent(url, timeout=30):
    """Download *url* through a proxy and return the raw response body.

    Args:
        url: Address of the resource to fetch.
        timeout: Seconds to wait for the server before ``requests`` raises,
            so a stalled download cannot block the crawler forever.

    Returns:
        The undecoded response payload as ``bytes``.
    """
    proxy = baseCore.get_proxy()
    response = requests.get(url, headers=headers, proxies=proxy, timeout=timeout)
    # ``content`` is the undecoded byte payload, so running charset detection
    # over it (as was done before) only cost time and changed nothing.
    return response.content


def getPageSize(timeout=30):
    """Return how many result pages the policy-file search has for "REITs".

    Queries page 1 of the file-search API, reads ``data.total`` from the
    JSON reply and converts it to a page count.

    Args:
        timeout: Seconds to wait for the server before ``requests`` raises,
            so a stalled request cannot block the crawler forever.

    Returns:
        The number of pages, i.e. ``total`` divided by 12 and rounded up.
    """
    # NOTE(review): 12 is taken to be the API's page length — confirm.
    per_page = 12
    url = 'https://search.gd.gov.cn/api/search/file'
    query = {"page": "1", "position": "all", "keywords": "REITs", "sort": "smart", "site_id": "2", "range": "site",
             "recommand": 1, "gdbsDivision": "440000", "service_area": 1}
    proxy = baseCore.get_proxy()
    response = requests.post(url, headers=headers, data=json.dumps(query), proxies=proxy, timeout=timeout)
    response.encoding = response.apparent_encoding
    total = int(response.json()['data']['total'])
    # Integer ceiling division; avoids the float round-trip of int(total / 12).
    return (total + per_page - 1) // per_page


def getDataJson(url, data_post, timeout=30):
    """POST *data_post* to the search API and return the list of hits.

    The hits are read from ``data.list``; when that key is absent they are
    read from ``data.news.list`` instead.

    Args:
        url: Search endpoint to query.
        data_post: Request body, already serialised to a JSON string.
        timeout: Seconds to wait for the server before ``requests`` raises,
            so a stalled request cannot block the crawler forever.

    Returns:
        The list of result entries found in the reply.

    Raises:
        KeyError, TypeError: If neither location exists in the reply.
        ValueError: If the reply body is not valid JSON.
    """
    proxy = baseCore.get_proxy()
    response = requests.post(url, headers=headers, data=data_post, proxies=proxy, timeout=timeout)
    response.encoding = response.apparent_encoding
    # Parse the body once; both lookups below work on the same payload.
    payload = response.json()['data']
    try:
        return payload['list']
    except (KeyError, TypeError):
        # Only a missing/unsubscriptable 'list' triggers the fallback; a bare
        # except here would also have swallowed KeyboardInterrupt/SystemExit.
        return payload['news']['list']


def _findContentTag(soup):
    """Return the element holding the article body, or None if no known layout matches."""
    selectors = (
        'body > div.con > div.viewList > div.zw',
        'body > div.con > div:nth-of-type(3) > div.content > div.viewList > div.zw',
    )
    for selector in selectors:
        matches = soup.select(selector)
        if matches:
            return matches[0]
    article = soup.find('div', class_='article-content')
    if article is None:
        return None
    # Prefer the inner <center> wrapper when the page has one.
    center = article.find('center')
    return center if center is not None else article


def getContent(url, publishDate, num):
    """Fetch an article page and extract its body text and attachments.

    The page is downloaded with ``getSoup``, passed to ``policy.paserUrl``,
    and the body container is located by trying the known page layouts in
    order. Every link inside the body is handed to
    ``policy.attuributefile``; afterwards ``<script>`` and ``<style>``
    elements are removed before the text is extracted.

    Args:
        url: Address of the article page.
        publishDate: Publication date forwarded to ``policy.attuributefile``.
        num: Counter used as the attachment title when a link has no text,
            and forwarded to ``policy.attuributefile``.

    Returns:
        A tuple ``(content, contentWithTag, id_list)``: the stripped plain
        text, the body element itself, and the truthy attachment ids
        returned by ``policy.attuributefile``.

    Raises:
        AttributeError: If none of the known layouts is found in the page.
    """
    soup = getSoup(url)
    time.sleep(2)
    policy.paserUrl(soup, url)

    contentWithTag = _findContentTag(soup)
    if contentWithTag is None:
        # Same exception type the old lookup chain ended in, now with a
        # message that says which page could not be parsed.
        raise AttributeError(f'no content container found in {url}')

    id_list = []
    for a in contentWithTag.find_all('a'):
        fj_href = a.get('href')
        if not fj_href:
            # Anchors without a target (e.g. named anchors) have nothing to
            # download; previously they crashed os.path.splitext(None).
            continue
        fj_title = a.text.strip()
        if fj_title == '':
            fj_title = str(num)
            num += 1
        # Make sure the title carries the file extension of the link.
        category = os.path.splitext(fj_href)[1]
        if category not in fj_title:
            fj_title = fj_title + category
        att_id, _ = policy.attuributefile(fj_title, fj_href, num, publishDate)
        if att_id:
            id_list.append(att_id)

    # Drop non-content elements so they do not leak into the text.
    for script in contentWithTag.find_all('script'):
        script.decompose()
    for style in contentWithTag.find_all('style'):
        style.decompose()
    content = contentWithTag.text.strip()

    return content, contentWithTag, id_list


def ST(txt):
    """Strip HTML markup from *txt* and return the remaining plain text.

    Args:
        txt: An HTML fragment. ``None`` or an empty string is accepted so
            that a missing field in an API reply does not abort the crawl.

    Returns:
        The text content of the fragment, or ``''`` for empty input.
    """
    if not txt:
        # Nothing to parse; BeautifulSoup would raise on None.
        return ''
    return BeautifulSoup(txt, 'lxml').text


def getData(data_, num, sid):
    """Scrape one search hit and publish it to Kafka.

    Hits whose URL is already in the dedup set are skipped. Otherwise the
    article page is fetched via ``getContent``, a record is assembled and
    sent with ``baseCore.sendkafka``, and the URL is added to the dedup
    set. If publishing fails, the failure is logged and the attachments
    stored for this article are deleted again.

    Args:
        data_: One entry of the search API result list. The keys read are
            ``title``, ``pub_time``, ``publisher_src``, ``url``, ``date``,
            ``source``, ``document_number`` and ``content``.
        num: Running counter forwarded to ``getContent``.
        sid: Identifier stored in the record's ``sid`` field.

    Returns:
        None.
    """
    title = ST(data_['title'])
    log.info(f'{title}===开始采集')
    publishDate = data_['pub_time']
    origin = data_['publisher_src']
    href = data_['url']

    # De-duplicate by link.
    if baseCore.r.sismember('REITs::' + webname, href):
        return
    log.info(href)
    writtenDate = data_['date']
    if writtenDate:
        # 'date' is passed to fromtimestamp, i.e. treated as epoch seconds.
        writtenDate = datetime.datetime.fromtimestamp(writtenDate).strftime('%Y-%m-%d')
    organ = data_['source']
    pub_hao = data_['document_number']
    summary = ST(data_['content'])
    content, contentWithTag, id_list = getContent(href, publishDate, num)
    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    dic_info = {
        'attachmentIds': id_list,
        'author': '',
        'content': content,
        'contentWithTag': str(contentWithTag),
        'deleteFlag': 0,
        'id': '',
        'title': title,
        'publishDate': publishDate,
        'origin': origin,
        'sourceAddress': href,
        'writtenDate': writtenDate,
        'organ': organ,
        'topicClassification': '',
        'issuedNumber': pub_hao,
        'summary': summary,
        'createDate': time_now,
        'sid': sid,
    }
    try:
        baseCore.sendkafka(dic_info, topic)
        baseCore.r.sadd('REITs::' + webname, href)
        log.info(f'采集成功--{title}--{href}')
    except Exception as e:
        # Record why publishing failed instead of dropping the error
        # silently, then roll back the attachments stored for this article.
        log.error(f'采集失败--{title}--{href}--{e}')
        for att_id in id_list:
            baseCore.deliteATT(att_id)
    return


# Policy documents
def doJob_1(sid1):
    """Crawl every result page of the policy-file search for "REITs".

    The page count comes from ``getPageSize``; each page is requested with
    ``getDataJson`` and every hit is handed to ``getData`` together with a
    counter that keeps increasing across pages.

    Args:
        sid1: Identifier passed through to ``getData`` for each hit.
    """
    search_url = 'https://search.gd.gov.cn/api/search/file'
    last_page = getPageSize()

    counter = 1
    page = 1
    while page <= last_page:
        query = {
            "page": str(page),
            "position": "all",
            "keywords": "REITs",
            "sort": "smart",
            "site_id": "2",
            "range": "site",
            "recommand": 1,
            "gdbsDivision": "440000",
            "service_area": 1,
        }
        hits = getDataJson(search_url, json.dumps(query))
        for hit in hits:
            getData(hit, counter, sid1)
            counter += 1
        page += 1


def doJob_2(sid2):
    """Crawl the first result page of the general search for two labels.

    For each label the search API is queried once (page 1) via
    ``getDataJson`` and every hit is handed to ``getData``, pausing one
    second after each hit.

    Args:
        sid2: Identifier passed through to ``getData`` for each hit.
    """
    search_url = 'https://search.gd.gov.cn/api/search/all'
    counter = 1
    for label in ('政策解读', '计划规划'):
        query = {
            "label": label,
            "position": "all",
            "keywords": "REITs",
            "sort": "smart",
            "site_id": "2",
            "range": "site",
            "page": 1,
            "tag_name": label,
            "recommand": 1,
            "gdbsDivision": "440000",
            "service_area": 1,
        }
        hits = getDataJson(search_url, json.dumps(query))
        for hit in hits:
            getData(hit, counter, sid2)
            time.sleep(1)
            counter += 1


def doJob():
    """Run both crawls in order, each with its own fixed sid."""
    jobs = (
        (doJob_1, '1729044231736971266'),
        (doJob_2, '1729044396395048961'),
    )
    for job, sid in jobs:
        job(sid)



if __name__ == '__main__':
    # Script entry point: run the full crawl, then release whatever
    # baseCore holds. The finally clause makes sure close() also runs when
    # the crawl aborts with an exception.
    try:
        doJob()
    finally:
        baseCore.close()
