import json
import os
import time
from random import choice
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter

# Project-local helper; in this file it is used for the de-duplication lookup
# (`db_storage.find_one`), link rewriting (`paserUrl`), and publishing/saving
# scraped records (`sendKafka`, `save_data`).
from ClassTool import ClassTool
baseTool = ClassTool()

# Project-local helper; in this file it supplies proxies (`get_proxy`),
# attachment handling (`uptoOBS`, `tableUpdate`) and the shared logger.
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()

# State Council documents (国务院文件)
# Upper bound (seconds) for every HTTP call below. Without a timeout,
# requests waits indefinitely on a stalled connection or proxy.
_REQUEST_TIMEOUT = 30

# Substrings that mark an <a href> as a downloadable attachment.
# NOTE(review): 'xls' has no leading dot, unlike the other markers, so any
# URL containing "xls" matches - kept as in the original; confirm intent.
_ATTACHMENT_MARKERS = (
    '.pdf', '.docx', '.doc', 'xls', '.zip', '.rar', '.ppt',
    '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR',
)


def _build_search_payload(category_id, page_no):
    """Return the JSON request body (as a string) for the search endpoint.

    :param category_id: value sent in ``childrenInfoIds``, e.g. ``"1108"``.
    :param page_no: 1-based number of the result page to request.
    """
    return json.dumps({
        "code": "18122f54c5c",
        "thirdPartyCode": "thirdparty_code_107",
        "thirdPartyTableId": 30,
        "resultFields": ["pub_url", "maintitle", "fwzh", "cwrq", "publish_time"],
        "trackTotalHits": "true",
        "searchFields": [{"fieldName": "maintitle", "searchWord": ""}],
        "isPreciseSearch": 0,
        "sorts": [{"sortField": "publish_time", "sortOrder": "DESC"}],
        "childrenInfoIds": [[category_id]],
        "pageSize": 20,
        "pageNo": page_no,
    })


def _post_search(session, url, headers, category_id, page_no):
    """POST one search request through a proxy and return the decoded JSON."""
    response = session.post(
        url=url,
        headers=headers,
        data=_build_search_payload(category_id, page_no),
        verify=False,
        proxies=baseCore.get_proxy(),
        timeout=_REQUEST_TIMEOUT,
    )
    return json.loads(response.text)


def _get_page_count(session, url, headers, category_id):
    """Return the number of result pages reported for ``category_id``.

    Currently unused: ``get_content1`` only crawls the first page. Kept so
    that full pagination can be re-enabled without rediscovering the
    response layout.
    """
    result = _post_search(session, url, headers, category_id, 1)
    return result['result']['data']['pager']['pageCount']


def _get_page_list(session, url, headers, category_id, page_no):
    """Return the list of result items on page ``page_no`` of a category."""
    result = _post_search(session, url, headers, category_id, page_no)
    return result['result']['data']['list']


def _fetch_policy_detail(href, headers):
    """Download and parse one document page.

    :returns: ``(pub_org, child_type, content_tag)`` - issuing organ, topic
        classification and the tag holding the document body - or ``None``
        when the site reports that the page does not exist or was deleted.
    :raises Exception: on network errors or when the page does not have the
        expected layout; the caller treats any failure as "skip this item".
    """
    response = requests.get(url=href, headers=headers, verify=False,
                            timeout=_REQUEST_TIMEOUT)
    response.encoding = response.apparent_encoding
    html = response.text
    if '您访问的页面不存在或已删除' in html:
        return None

    soup = BeautifulSoup(html, 'html.parser')
    soup = baseTool.paserUrl(soup, href)

    # The metadata lives in the first <tbody>; it is cut out of the raw markup.
    source = str(soup.find_all('tbody')[0])
    pub_org = source.split('<td><b>发文机关：</b></td>')[1].split('<td>')[1].split('</td>')[0]
    child_type = source.split('<td class="w340 zcwj_ztfl">')[1].split('</td>')[0]

    content_tag = soup.find('div', class_='wrap mxxgkwrap mxxgkwrap_gwywj').find(
        'table', class_='border-table noneBorder pages_content')

    # Drop the "scan the QR code" widget when the page has one.
    qr_block = content_tag.find('div', attrs={'id': 'div_div'})
    if qr_block is not None:
        qr_block.decompose()
    return pub_org, child_type, content_tag


def _upload_attachments(content_tag, href, order, pub_time):
    """Hand every attachment link in ``content_tag`` to ``baseCore`` and rewrite it.

    Each ``<a>`` whose href contains one of ``_ATTACHMENT_MARKERS`` is passed
    to ``baseCore.uptoOBS`` / ``baseCore.tableUpdate``; on success the link in
    the soup is pointed at the returned path.

    :param content_tag: tag holding the document body (modified in place).
    :param href: URL of the document page, used only for logging.
    :param order: running item counter forwarded to ``tableUpdate``.
    :param pub_time: publish time forwarded to ``tableUpdate``.
    :returns: list of attachment ids returned by ``tableUpdate``.
    """
    attachment_ids = []
    for link in content_tag.find_all('a'):
        try:
            file_href = link['href']
        except KeyError as e:
            # Anchor without an href attribute - nothing to download.
            log.info(f'---{href}--------{e}-------')
            continue
        if not any(marker in file_href for marker in _ATTACHMENT_MARKERS):
            continue

        file_name = link.text.strip()
        category = os.path.splitext(file_href)[1]
        # Make sure the stored name carries the file extension.
        if category not in file_name:
            file_name = file_name + category

        retData = baseCore.uptoOBS(file_href, '1766', file_name)
        if not retData['state']:
            continue
        att_id, full_path = baseCore.tableUpdate(retData, '国务院文件', file_name, order, pub_time)
        attachment_ids.append(att_id)
        # Point the link at the stored copy. The original literal was
        # 'http:obs.ciglobal.cn/' (missing '//'), which is not a valid URL.
        link['href'] = 'http://obs.ciglobal.cn/' + str(full_path)
    return attachment_ids


def get_content1():
    """Crawl the newest State Council documents from www.gov.cn and publish them.

    For each document category the first result page (20 items, newest
    first) of the site's search API is fetched. Items whose URL is already
    present in ``baseTool.db_storage`` are skipped. For the others the detail
    page is downloaded and parsed, attachments are handed to ``baseCore``,
    and the resulting record is passed to ``baseTool.sendKafka`` and, if
    that returns a truthy value, to ``baseTool.save_data``.

    Failures are logged and skipped: a broken detail page skips that item, a
    failure while listing or processing a category skips the rest of that
    category. Returns ``None``.
    """
    start_time = time.time()
    num = 0
    # athenaAppKey / athenaAppName headers are required to pass the site's validation.
    athenaAppKeys = [
        'ZfaiEpAY%2B%2FYj5RjJrDfj2dn%2BcS4WRoxcLidI68z5l6SH8WE8CXSVP7QkJNIhy%2Bng4mZcwCuOFKNUvj%2FH6mR7sx0sCwgkIAfq4XNfHY6Fy7fxQ0NWm%2Fx7rmB5ow5OPdW5NMdI2RzURAFCVA9aIV4a1W8TFOjbQWYOukxUFtVJibU%3D',
        'XU4ULRMBYHbE0I2fNNgCBxYTAg5Dk%2FPUEN9XeKy4OOAPdcZ6DW%2BrsopeI1gUwPmq3Y%2FtZJhH3NXiXqH5RxBAoYO231FHjPaMxD6QaMlA2BUeOFxmGKnuvJnIby1k6RmrCFd6IoXSImm5RFgcVed%2FvL6Qie2o6BAkRaEUHAitK18%3D',
        'IFrNB5NkaDApRpF09SoT4fVBUoi7gRF2prj4EHk8eIVSEc1yYPpAZWVDMnqc2lVmeaQcNvrvZffA2kPVdvqjUHV5lGPJccRK3epnJ5Xx3xwIfTG7iIgrjFlqK1I93E0SIP6wyZJu42ksnF3nJdZ31sLEDCBeLi3pkggFtIEIQsg%3D',
        'Nkgtgnyd%2B6jfdlclssI8FB9xRTQDdWzreONdqvta2aKZMRlhWoHhdj6L%2BQRyD8InaLWJC1zCSOkIy5b%2BjjZTg80t2jPu%2F1ifcRnboIj8%2BDIYWNSxMu%2Fdxze7oPtPo6sR08%2B3tQOE3ZntyFsGT44vCpa6DgK8ee3C5S58lanYXuI%3D',
        'Zcko%2F7%2F2f2EuUmKpXbWnK3JtZtVy4trUNyE2JA5jVIw2r1oxTXVNZy8KQDmnOPDfyazdOrH6VYaJWloE4MukMK4VloB%2BRy6QhEaUvm%2Fsp4Enzl7doEk%2B1sZ1Y2iUd5REIhJQ%2Bp%2BB5iJEeNTmlQuRzYU3kOjDYtXftuehRTNKiXk%3D',
        'Y2guFVvdtqMPhx5s9xThqdkvbe5hPaTlV7BYhcDuK7l%2BaXUqUMUHdim3uzn9IRlbHUtOLmRk6tfPEFM%2B8vzGDvI8U48acQ8Ff6MsfOGxShrQ7kW4tr4NaoE1sBW3PNkWj1Z0K6JzSXmAS2C1zVchTUYzTlfk62ghIeDtIPsPa6s%3D',
        'a1drgLsStJotfBqHp1cFQg4lTJMMbgkTjVgCv34uy4Q%2BQ86DNEdc%2Fst0dZTUWFttuyXKNIH8%2FPYSSk465lXIn4wfuG4GuZLUk6wQo5PHNCUP0%2FvIL63IUxT0DCMo7lbsPq0ncdh4aiVswJe%2F6LM9U1m9OoaNGbeIUOl%2FxIOrMnE%3D',
        'SGwQuPLZq2UzfaBPSwcR8DGZa4Ckh3Amp%2Bc1tMBFsMp%2Fh7Qn%2B9nspxdI3CW9S5LlfxYQmfa%2F%2B%2BJdH%2BBnxt0ILiCA4o9TUOxx27MhN9b4CLZnD8ZJ6sOwMszdFToDAD7hE21%2FzCzxhPNzPbyMXPpeMdi6sY0O2Sd85PLDtlZv%2FYQ%3D',
        'CbvDEoIrP1%2BgMOuRJFhJNUGhzHBGnwdI6lIVG1ns1ZaLTlGRXLRgMjh9nBwLGMLTZwlPskklMbygvfA4P5UGhGT%2FpqKFkZne%2FAzTK8U6oJMo5%2FNAczbHhKwG7gdepIiiI7CgeNDtP8kurkcxnVS2KA1CLo8CVzmMlLHRmMPI8ag%3D']
    athenaAppKey = choice(athenaAppKeys)
    # Headers for the search API.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
        'athenaAppKey': athenaAppKey,
        'athenaAppName': "%E5%9B%BD%E7%BD%91%E6%90%9C%E7%B4%A2",
        'Content-Type': 'application/json;charset=UTF-8',
    }
    # Headers for the plain HTML detail pages.
    headers_ = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    }

    # Module-wide retry default of requests; the sessions below mount
    # adapters with an explicit max_retries=3.
    requests.adapters.DEFAULT_RETRIES = 5

    url = 'https://sousuoht.www.gov.cn/athena/forward/486B5ABFBAD0FF5743F5E82E007EF04DDD6388E7989E9EC9CC7B84917AC81A5F'
    # [document-number prefix, category id sent as childrenInfoIds]
    result_list = [['国令', "1108"], ['国发', "1107"], ['国函', "1106"], ['国发明电', "1105"], ['国办发', "1104"],
                   ['国办函', "1103"],
                   ['国办发明电', "1102"], ['其他', "1101"]]
    for category_name, category_id in result_list:
        # The context manager closes the session (and its pooled connections)
        # when the category is finished, whether it succeeded or failed.
        with requests.session() as s:
            s.mount('https://', HTTPAdapter(max_retries=3))
            s.mount('http://', HTTPAdapter(max_retries=3))
            # NOTE(review): requests.Session does not appear to define a
            # keep_alive attribute, so this likely has no effect - kept as-is.
            s.keep_alive = False
            # Only the first (newest) page of each category is crawled.
            pageNo = 1
            try:
                try:
                    page_list = _get_page_list(s, url, headers, category_id, pageNo)
                except Exception:
                    # One retry after dropping the pooled connections.
                    s.close()
                    page_list = _get_page_list(s, url, headers, category_id, pageNo)

                for page in page_list:
                    title = page['maintitle']  # title
                    pub_time1 = page['publish_time']  # publish time
                    pub_time2 = page['cwrq']  # date the document was written
                    pub_code = page['fwzh']  # document number
                    href = page['pub_url']  # document URL

                    # Skip documents that were already collected.
                    if baseTool.db_storage.find_one({'网址': href}):
                        num += 1
                        log.info('已采集----------跳过')
                        time.sleep(0.5)
                        continue

                    try:
                        detail = _fetch_policy_detail(href, headers_)
                        if detail is None:
                            # Page does not exist or was deleted.
                            continue
                        pub_org, child_type, contentWithTag = detail
                        content = contentWithTag.text  # body without markup
                        time.sleep(0.5)
                        id_list = _upload_attachments(contentWithTag, href, num, pub_time1)
                    except Exception:
                        log.error(f'{title}...{href}...获取内容失败')
                        continue

                    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    # Record handed to baseTool.sendKafka / save_data.
                    dic_news = {
                        'attachmentIds': id_list,  # attachment ids
                        'author': '',  # author
                        'content': content,  # body without markup
                        'contentWithTag': str(contentWithTag),  # body with markup
                        'createDate': time_now,  # creation time
                        'deleteFlag': 0,  # deleted flag (0 = default, 1 = deleted)
                        'id': '',
                        # related label id / label name / label mark
                        'labels': [{'relationId': "1766", 'relationName': "国务院文件", 'labelMark': "policy"}],
                        'origin': '中华人民共和国中央人民政府',  # publishing authority
                        'organ': pub_org,  # issuing organ
                        'topicClassification': child_type,  # topic classification
                        'issuedNumber': pub_code,  # document number
                        'publishDate': pub_time1,  # publish time
                        'writtenDate': pub_time2,  # date the document was written
                        'sid': '1697458829758697473',  # information source id
                        'sourceAddress': href,  # original link
                        'summary': '',  # summary
                        'title': title  # title
                    }
                    flag = baseTool.sendKafka(dic_news)
                    if flag:
                        baseTool.save_data(dic_news)
                    num += 1
            except Exception:
                # Listing failed twice, or processing an item raised outside
                # the per-item handler: give up on this category only.
                log.error(f'{category_name}...第{pageNo}页获取列表失败')
    end_time = time.time()
    log.info(f'共抓取国务院文件{num}条数据，共耗时{end_time - start_time}')

# Run one crawl when this file is executed as a script (not on import).
if __name__ == "__main__":
    get_content1()