import os
import time

import requests
from bs4 import BeautifulSoup

# Project helper object. In this file it supplies proxies (get_proxy), the logger,
# Kafka delivery (sendkafka), the `r` store used for link de-duplication,
# language detection, attachment cleanup (deliteATT) and close().
import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()

# Project helper whose attuributefile() is called for every attachment link found in a page.
from reits import Policy
policy = Policy()


# Topic name handed to baseCore.sendkafka for every collected record.
topic = 'research_center_fourth'
# Site name; appended to 'REITs::' to form the key of the set of already-collected links.
webname = '中华人民共和国中央人民政府'
# Browser-like request headers shared by the HTTP calls in this file.
# NOTE(review): 'Host' is fixed to the search host, yet these headers are also sent to
# https://www.gov.cn/ in getSoup/getFjContent — confirm that is intended.
headers = {
    'Accept': 'application/json, text/plain, */*',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Host': 'sousuo.www.gov.cn',
    'Pragma': 'no-cache',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
    'sec-ch-ua': '"Microsoft Edge";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}


def getSoup(url):
    """Fetch ``url`` and return its HTML parsed into a BeautifulSoup tree.

    The site's home page is requested first on the same session (with the module
    headers and a proxy from ``baseCore.get_proxy()``), presumably so that cookies
    set there accompany the request for ``url``.

    :param url: address of the page to download.
    :return: ``BeautifulSoup`` document built with the ``html.parser`` backend.
    """
    ip = baseCore.get_proxy()
    home_url = 'https://www.gov.cn/'
    # The context manager closes the session even when a request or the parse raises.
    with requests.session() as session:
        session.get(home_url, headers=headers, proxies=ip)
        # NOTE(review): headers/proxies passed to the call above apply to that request
        # only, so this one is sent with the session defaults — confirm that is intended.
        req = session.get(url)
        req.encoding = req.apparent_encoding
        return BeautifulSoup(req.text, 'html.parser')


def getFjContent(url):
    """Download ``url`` and return the raw response body.

    The site's home page is requested first on the same session (with the module
    headers and a proxy from ``baseCore.get_proxy()``), presumably so that cookies
    set there accompany the download request.

    :param url: address of the attachment to download.
    :return: response body as ``bytes``.
    """
    ip = baseCore.get_proxy()
    home_url = 'https://www.gov.cn/'
    # The context manager closes the session even when a request raises.
    with requests.session() as session:
        session.get(home_url, headers=headers, proxies=ip)
        # The bytes are returned untouched, so no text-encoding detection is performed:
        # it cannot affect ``content`` and is costly on binary attachments.
        return session.get(url).content


def getPageSize(types):
    """Return the number of result pages of the REITs policy search for ``types``.

    Requests the first page of the search API, sums ``totalCount`` over the given
    category keys of ``searchVO.catMap``, prints the sum and converts it to a page
    count at 20 records per page.

    :param types: iterable of category keys to include (e.g. ``'gongwen'``).
    :return: page count as ``int``; 0 when the summed total is 0.
    :raises KeyError: if the response lacks one of the requested categories.
    """
    ip = baseCore.get_proxy()
    url = 'https://sousuo.www.gov.cn/search-gov/data?t=zhengcelibrary&q=REITs&timetype=timeqb&mintime=&maxtime=&sort=score&sortType=1&searchfield=&puborg=&pcodeYear=&pcodeNum=&filetype=&p=1&n=5&inpro=&dup=&orpro=&type=gwyzcwjk'
    req = requests.get(url, headers=headers, proxies=ip)
    try:
        req.encoding = req.apparent_encoding
        # Decode the body once instead of re-parsing it for every category.
        cat_map = req.json()['searchVO']['catMap']
        total = sum(int(cat_map[f'{cat}']['totalCount']) for cat in types)
    finally:
        # Release the connection even when the payload is not what we expect.
        req.close()
    print(total)
    # NOTE(review): the query string asks for n=5 records while 20 per page is
    # assumed here — confirm which page size is the intended one.
    # Integer ceiling division; avoids the float round trip of int(total / 20).
    return (total + 19) // 20


def getDataJson(page, types):
    """Fetch one page of REITs policy search results.

    :param page: page number placed in the ``p`` query parameter.
    :param types: iterable of category keys of ``searchVO.catMap`` to collect.
    :return: single list holding the ``listVO`` entries of every requested category,
        in the order of ``types``.
    :raises KeyError: if the response lacks one of the requested categories.
    """
    ip = baseCore.get_proxy()
    url = f'https://sousuo.www.gov.cn/search-gov/data?t=zhengcelibrary&q=REITs&timetype=timeqb&mintime=&maxtime=&sort=score&sortType=1&searchfield=&puborg=&pcodeYear=&pcodeNum=&filetype=&p={page}&n=5&inpro=&dup=&orpro=&type=gwyzcwjk'
    req = requests.get(url, headers=headers, proxies=ip)
    try:
        req.encoding = req.apparent_encoding
        # Decode the body once instead of re-parsing it for every category.
        cat_map = req.json()['searchVO']['catMap']
    finally:
        # Release the connection even when the payload is not what we expect.
        req.close()
    data_list = []
    for cat in types:
        data_list.extend(cat_map[f'{cat}']['listVO'])
    return data_list


def getContent(url, publishDate, num, organ, id_list):
    """Download an article page, hand its attachments to ``policy`` and extract the text.

    :param url: address of the article page.
    :param publishDate: publish date, forwarded to ``policy.attuributefile``.
    :param num: running item number, forwarded to ``policy.attuributefile``.
    :param organ: issuing body; when it is the empty string the value is read from the
        text after the "来源：" label on the page (left empty if that cannot be found).
    :param id_list: list that receives the ids of stored attachments (mutated in place).
    :return: tuple ``(content, id_list, contentWithTag, organ)`` where ``content`` is the
        stripped plain text and ``contentWithTag`` the content element with attachment
        links rewritten and ``<script>``/``<style>`` elements removed.
    :raises AttributeError: if the page has neither of the expected content containers.
    """
    soup = getSoup(url)
    if organ == '':
        try:
            source_text = soup.find('div', class_='pages-date').find('span', class_='font').text
            # split() returns a list; the publisher is the piece after the label.
            organ = source_text.split('来源：')[1].strip()
        except (AttributeError, IndexError):
            # Source line missing from the page or without the label.
            organ = ''
    contentWithTag = soup.find('div', class_='TRS_UEDITOR')
    if not contentWithTag:
        contentWithTag = soup.find('div', class_='pages_content')
    for a in contentWithTag.find_all('a'):
        fj_href = a.get('href')
        # Anchors without a target cannot point at an attachment.
        if not fj_href:
            continue
        # Links to other pages are not attachments; '.htm' also matches '.html'
        # and '.shtm' also matches '.shtml'.
        if '.htm' in fj_href or '.shtm' in fj_href:
            continue

        fj_title = a.text.strip()
        category = os.path.splitext(fj_href)[1]
        # Make sure the attachment title carries the file extension.
        if category not in fj_title:
            fj_title = fj_title + category

        # Hand the attachment to the policy helper; on success the link is
        # rewritten to the path it returns.
        att_id, full_path = policy.attuributefile(fj_title, fj_href, num, publishDate)
        if att_id:
            id_list.append(att_id)
            a['href'] = full_path

    # Drop non-textual elements so they do not leak into the extracted text.
    for tag in contentWithTag.find_all(['script', 'style']):
        tag.decompose()
    content = contentWithTag.text.strip()
    return content, id_list, contentWithTag, organ


def getData(data_, num):
    """Build the record for one search result and send it to Kafka.

    Reads ``title``, ``pubtimeStr``, ``url``, ``ptime``, ``pcode``, ``summary`` and the
    optional ``puborg`` from ``data_``, fetches the article through ``getContent`` and
    sends the assembled dict with ``baseCore.sendkafka``. On success the link is added
    to the set of collected links; on failure the attachments stored for this item are
    deleted again.

    :param data_: one entry of the search API's ``listVO`` list.
    :param num: running item number, forwarded to ``getContent``.
    :return: ``None``.
    :raises KeyError: if ``data_`` lacks one of the required keys.
    """
    id_list = []
    title = data_['title'].replace('\n', '').replace('\r', '')
    # Titles and summaries may contain markup; keep only the text.
    title = BeautifulSoup(title, 'lxml').text
    publishDate = data_['pubtimeStr'].replace('.', '-')
    origin = '国务院'
    href = data_['url']

    writtenDate = data_['ptime']
    # 'puborg' is optional; an empty value makes getContent read it from the page.
    organ = data_.get('puborg', '')
    pub_hao = data_['pcode']
    summary = BeautifulSoup(data_['summary'], 'lxml').text.strip()
    content, id_list, contentWithTag, organ = getContent(href, publishDate, num, organ, id_list)
    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    lang = baseCore.detect_language(content)
    dic_info = {
        'attachmentIds': id_list,
        'subjectId': '1729021803283533825',
        'lang': lang,
        'author': '',
        'content': content,
        'contentWithTag': str(contentWithTag),
        'deleteFlag': 0,
        'checkStatus': 1,
        'id': '1729021803283533825'+str(int(time.time())),
        'title': title,
        'publishDate': publishDate,
        'origin': origin,
        'sourceAddress': href,
        'writtenDate': writtenDate,
        'organ': organ,
        'topicClassification': '',
        'issuedNumber': pub_hao,
        'summary': summary,
        'createDate': time_now,
        'sid': '1729028548502597633'
    }
    try:
        baseCore.sendkafka(dic_info, topic)
        baseCore.r.sadd('REITs::' + webname, href)
        log.info(f'采集成功--{title}--{href}')
    except Exception as e:
        # Record why the item was dropped instead of failing silently, then remove
        # the attachments that were stored for it.
        log.error(f'采集失败--{title}--{href}--{e}')
        for att_id in id_list:
            baseCore.deliteATT(att_id)


def doJob():
    """Walk the REITs policy search results and collect every link not seen before.

    Each result whose link is neither in the persistent set of collected links nor
    already handled during this run is passed to ``getData``. The crawl pauses three
    seconds after every result that is not in the persistent set.
    """
    # Links handled during this run; a set keeps the membership test constant-time.
    seen_hrefs = set()
    collected_key = 'REITs::' + webname
    num = 1
    types = ['bumenfile', 'gongwen', 'otherfile', 'gongbao']
    # NOTE(review): fixed page count; getPageSize() exists but is not used here — confirm.
    pageSize = 7
    for page in range(1, pageSize + 1):
        for data_ in getDataJson(page, types):
            href = data_['url']
            # De-duplicate by link against the items collected by earlier runs.
            if baseCore.r.sismember(collected_key, href):
                continue
            if href not in seen_hrefs:
                getData(data_, num)
                num += 1
                seen_hrefs.add(href)
            time.sleep(3)


if __name__ == '__main__':
    # Script entry point: run the crawl, and release the helper's resources
    # even when the crawl fails part-way.
    try:
        doJob()
    finally:
        baseCore.close()
