import datetime
import time
import urllib.parse

import requests
from ClassTool import ClassTool
from BaseCore import BaseCore

# Project helpers shared by every function in this script:
#   baseTool - used below for the dedup lookup (db_storage), sendKafka and save_data.
#   baseCore - used below for the logger, uptoOBS and tableUpdate.
baseTool = ClassTool()
baseCore = BaseCore()
log = baseCore.getLogger()
# Request headers sent with every call to flk.npc.gov.cn. They reproduce a
# desktop Edge 124 XHR request (User-Agent, sec-ch-ua, X-Requested-With).
# NOTE(review): 'Accept-Encoding' advertises br and zstd; presumably the
# runtime has the matching decoders installed for requests/urllib3 - confirm,
# otherwise a compressed response body may not be decodable.
headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Encoding': 'gzip, deflate, br, zstd',
    'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
    'Connection': 'keep-alive',
    'Host': 'flk.npc.gov.cn',
    'Referer': 'https://flk.npc.gov.cn/fl.html',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0',
    'X-Requested-With': 'XMLHttpRequest',
    'sec-ch-ua': '"Chromium";v="124", "Microsoft Edge";v="124", "Not-A.Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}


def getDataJson(url):
    """Fetch one page of search results from the list API.

    Args:
        url: Fully built search URL, query string included.

    Returns:
        A ``(datasJson, totalSizes)`` tuple read from ``result.data`` and
        ``result.totalSizes`` of the JSON response.

    Raises:
        requests.RequestException: On network failure or timeout.
        KeyError: If the JSON body lacks the expected ``result`` fields.
    """
    # The timeout keeps a stalled connection from hanging the whole crawl;
    # the ``with`` block closes the response even if JSON parsing fails.
    with requests.get(url, headers=headers, timeout=30) as req:
        req.encoding = req.apparent_encoding
        # Parse the body once instead of once per field.
        result = req.json()['result']
    return result['data'], result['totalSizes']


def getPdf(id_, title, publishDate):
    """Upload the WORD attachment of one law and register it.

    Queries the detail API for ``id_``, takes the first body entry whose
    ``type`` is ``'WORD'``, uploads it through ``baseCore.uptoOBS`` and records
    it through ``baseCore.tableUpdate``.

    Args:
        id_: Identifier of the law as returned by the list API.
        title: Law title; used for logging and passed to the upload helpers.
        publishDate: Publish date passed through to ``baseCore.tableUpdate``.

    Returns:
        A one-element list holding the attachment id on success, or ``''``
        when no WORD attachment exists or the upload reports failure.

    Raises:
        requests.RequestException: On network failure or timeout.
        KeyError: If the detail response lacks the expected fields.
    """
    url = 'https://flk.npc.gov.cn/api/detail'
    payload = {'id': id_}
    # Timeout prevents an unresponsive server from blocking the crawl; the
    # ``with`` block closes the response even when parsing raises.
    with requests.post(url, headers=headers, data=payload, timeout=30) as req:
        req.encoding = req.apparent_encoding
        datasJson = req.json()['result']['body']

    # First WORD entry wins; '' means the law has no WORD attachment.
    href = next(
        ('https://wb.flk.npc.gov.cn' + item['path']
         for item in datasJson if item['type'] == 'WORD'),
        '',
    )
    if not href:
        log.error(f'{title}===附件链接获取失败')
        return ''

    retData = baseCore.uptoOBS(href, '1699', title)
    if not retData['state']:
        # Upload failed; the caller logs the failure.
        return ''

    att_id, _full_path = baseCore.tableUpdate(retData, '国务院文件', title, 0, publishDate)
    return [att_id]


def getDic(title, office, publishDate, expiry, type, timeliness, href, id_):
    """Build the record for one law after storing its attachment.

    The attachment is fetched first via ``getPdf``; when that yields nothing
    the failure is logged and ``''`` is returned instead of a record.

    Args:
        title: Law title (also stored as the plain-text content).
        office: Enacting body, stored under ``organ``.
        publishDate: Publication date of the law.
        expiry: Value stored under ``implementDate``.
        type: Value stored under ``legalPrecedenceHierarchy``.
        timeliness: Value stored under ``effectiveness``.
        href: Link to the original page.
        id_: Identifier used to look up the attachment.

    Returns:
        The record dict, or ``''`` if the attachment could not be obtained.
    """
    attachment_ids = getPdf(id_, title, publishDate)
    if not attachment_ids:
        log.error(f'{title}===附件下载失败')
        return ''

    created_at = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    # Related label: id, display name and marker.
    policy_label = {
        'relationId': "1788847783801794562",
        'relationName': "国资国企法律法规",
        'labelMark': "policy",
    }
    return {
        'attachmentIds': attachment_ids,  # attachment ids
        'author': '',  # author
        'content': title,  # body text without tags
        'contentWithTag': '',  # body text with tags
        'createDate': created_at,  # creation time
        'deleteFlag': 0,  # deletion flag (0 = default, 1 = deleted)
        'id': '',
        'labels': [policy_label],
        'origin': '',  # publishing authority
        'organ': office,  # issuing / enacting body
        'topicClassification': '',  # policy document category
        'issuedNumber': '',  # document number
        'publishDate': publishDate,  # publication date
        'writtenDate': None,  # date written
        'implementDate': expiry,  # implementation date
        'sid': '1788838266435284993',  # information source id
        'sourceAddress': href,  # link to the original page
        'summary': '',  # summary
        'title': title,  # title
        'legalPrecedenceHierarchy': type,  # legal hierarchy level
        'effectiveness': timeliness,  # effectiveness status
    }


# Maps the list API's ``status`` code to the label stored as ``effectiveness``.
_STATUS_LABELS = {
    '1': '有效',
    '5': '已修改',
    '9': '已废止',
    '3': '尚未生效',
}
# Number of results requested per page of the list API.
_PAGE_SIZE = 10


def doJob():
    """Crawl every search keyword page by page and publish new records.

    For each keyword the first page is fetched to learn the total result
    count. Every item on every page is then skipped if its URL is already
    stored, and otherwise turned into a record with ``getDic``, sent with
    ``baseTool.sendKafka`` and saved with ``baseTool.save_data`` when the send
    succeeds.
    """
    searchList = ['国有资产', '国资', '国有企业', '企业', '公司']
    for search in searchList:
        search_ = urllib.parse.quote(search)
        # Everything except the paging parameters, which are appended per page.
        base_url = f'https://flk.npc.gov.cn/api/?type=&fgbt={search_}&searchType=title%3Baccurate%3B1&sortTr=f_bbrq_s%3Bdesc&gbrqStart=&gbrqEnd=&sxrqStart=&sxrqEnd='
        datasJson, totalSizes = getDataJson(f'{base_url}&page=1&size={_PAGE_SIZE}')
        # Integer ceiling division: true division would produce a float that
        # range() rejects whenever the total is a multiple of the page size.
        totalPage = (totalSizes + _PAGE_SIZE - 1) // _PAGE_SIZE
        for page in range(1, totalPage + 1):
            if page != 1:
                # Page 1 was already fetched above to obtain the total.
                datasJson, _ = getDataJson(f'{base_url}&page={page}&size={_PAGE_SIZE}')
            for dataJson in datasJson:
                id_ = dataJson['id']
                title = dataJson['title']
                office = dataJson['office']
                publishDate = dataJson['publish']
                expiry = dataJson['expiry']
                hierarchy = dataJson['type']
                status = dataJson['status']
                # Look the label up per item so an unknown status can neither
                # crash nor silently inherit the previous item's label.
                timeliness = _STATUS_LABELS.get(status, '')
                if status not in _STATUS_LABELS:
                    log.error(f'{title}===未知状态码:{status}')
                href = dataJson['url'].replace('./', 'https://flk.npc.gov.cn/')
                is_href = baseTool.db_storage.find_one({'网址': href})
                if is_href:
                    log.info(f'{title}===已采集')
                    continue
                dic = getDic(title, office, publishDate, expiry, hierarchy, timeliness, href, id_)
                if dic:
                    flag = baseTool.sendKafka(dic)
                    if flag:
                        baseTool.save_data(dic)
                else:
                    log.error(f'{title}==={href}===获取失败')
                # Pause between items that triggered requests.
                time.sleep(2)



# Run the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    doJob()
