import os
import re
import time
from pyquery import PyQuery as pq
import requests
from bs4 import BeautifulSoup

from ClassTool import ClassTool
# Shared helper instance. The code in this file reads its `headers` and
# `db_storage` attributes and calls its `paserUrl`, `sendKafka` and
# `save_data` methods.
baseTool = ClassTool()

from BaseCore import BaseCore
# Core service instance. The code in this file calls `getLogger`, `uptoOBS`
# and `tableUpdate` on it.
baseCore = BaseCore()
# Logger used for the progress / skip messages emitted below.
log = baseCore.getLogger()

# State-owned Assets Supervision and Administration Commission of the
# State Council (SASAC) - policy releases
def get_content3():
    """Crawl the SASAC policy-release listing and forward every new article.

    The first listing page is always processed (``partOne``).  ``partTwo``
    walks the numbered listing pages and is left disabled for incremental
    runs.  For each listed article that is not yet found in
    ``baseTool.db_storage`` the detail page is fetched, its attachments are
    uploaded through ``baseCore``, and the assembled record is passed to
    ``baseTool.sendKafka`` and, when that reports success, to
    ``baseTool.save_data``.

    A failure while handling one article or one listing page is logged and
    the crawl continues with the next one.

    Returns:
        None.
    """
    site_root = 'http://www.sasac.gov.cn'
    list_root = f'{site_root}/n2588035/n2588320/n2588335'
    index_url = f'{list_root}/index.html'
    # Without a timeout a stalled connection would block the crawl forever.
    request_timeout = 30
    # Substrings (matched case-insensitively) that mark a link as an
    # attachment; '.doc' also covers '.docx', '.xls' covers '.xlsx', etc.
    attachment_markers = ('.pdf', '.doc', '.xls', '.zip', '.rar', '.ppt')
    # Bracket pairs that, together with '号', identify a document number.
    bracket_pairs = (('〔', '〕'), ('[', ']'), ('【', '】'))

    def getPage():
        """Return the page count used by ``partTwo`` as its exclusive bound.

        NOTE: the value is hard-coded.  An earlier, disabled lookup read it
        from the text of the ``#pag_2603340`` element with the pattern
        ``总页数:(.*)``; since its result was never used, the listing page is
        no longer downloaded here.
        """
        return 17

    def dropNode(node):
        """Remove ``node`` from the parsed tree when it exists."""
        if node is not None:
            node.decompose()

    def sendContent(href, headers, title, pub_time, num):
        """Fetch one article, upload its attachments and emit its record.

        Args:
            href: Absolute URL of the article detail page.
            headers: HTTP headers used for the detail-page request.
            title: Article title taken from the listing.
            pub_time: Publication date string taken from the listing.
            num: Running item number, passed through to
                ``baseCore.tableUpdate``.

        Returns:
            None.  The function returns early when the page has no content
            container or the extracted text is empty.
        """
        id_list = []
        resp_href = requests.request("GET", href, headers=headers, verify=False,
                                     timeout=request_timeout)
        resp_href.encoding = resp_href.apparent_encoding
        soup = BeautifulSoup(resp_href.text, 'lxml')
        soup = baseTool.paserUrl(soup, href)

        doc_href = soup.find('div', class_='zsy_content')
        if doc_href is None:
            log.info(f'----{href}----{title}----content container not found----')
            return

        # Issuing organisation: the text between the two labels in the title bar.
        org = ''
        org_nodes = doc_href.select('.zsy_cotitle')
        if org_nodes:
            org_match = re.search('文章来源：(.*?)发布时间：', str(org_nodes[0]))
            if org_match:
                org = org_match.group(1).strip()

        contentWithTag = doc_href.find('div', class_='zsy_comain')
        if contentWithTag is None:
            log.info(f'----{href}----{title}----article body not found----')
            return

        # Strip page widgets from the body.  Not every page carries all of
        # them, so each one is removed only when present; the lookups run one
        # after another so a node removed with its parent is never touched.
        qr_nodes = contentWithTag.select('#qr_container')
        dropNode(qr_nodes[0] if qr_nodes else None)
        dropNode(contentWithTag.find('div', attrs={'id': 'div_div'}))
        dropNode(contentWithTag.find('div', class_='related'))
        dropNode(contentWithTag.find('div', class_='jiathis_style_24x24'))

        # Document number: the first paragraph containing '号' plus a bracket pair.
        pub_hao = ''
        for paragraph in contentWithTag.findAll('p'):
            text = str(paragraph.text)
            if '号' in text and any(left in text and right in text
                                    for left, right in bracket_pairs):
                try:
                    # Keep what follows the first '日' and precedes '自'.
                    pub_hao = text.split('日')[1].split('自')[0].strip()
                except IndexError:
                    # No '日' in the paragraph: use the whole paragraph.
                    pub_hao = text.strip()
                break
        # Anything longer than 15 characters is not accepted as a number.
        if len(pub_hao) > 15:
            pub_hao = ''

        content = contentWithTag.text
        if content in ('', 'None'):
            log.info(f'----{href}----{title}----内容为空----')
            return

        for file in contentWithTag.find_all('a'):
            try:
                file_href = file['href']
            except KeyError as e:
                log.info(f'---{href}--------{e}-------')
                continue
            lowered_href = file_href.lower()
            if not any(marker in lowered_href for marker in attachment_markers):
                continue
            file_name = file.text.strip()
            category = os.path.splitext(file_href)[1]
            if category not in file_name:
                file_name = file_name + category
            retData = baseCore.uptoOBS(file_href, '1642', file_name)
            if not retData['state']:
                continue
            try:
                att_id, full_path = baseCore.tableUpdate(
                    retData, '国务院国资委', file_name, num, pub_time)
                id_list.append(att_id)
            except Exception as e:
                log.info(f'---{href}--------attachment not recorded: {e!r}-------')
                continue
            # Point the link inside the kept markup at the stored copy.  The
            # previous prefix 'http:zzsn.luyuen.com/' lacked the '//' after
            # the scheme and therefore produced a malformed URL.
            file['href'] = 'http://zzsn.luyuen.com/' + str(full_path)

        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        # Record handed to baseTool.sendKafka / baseTool.save_data.
        dic_news = {
            'attachmentIds': id_list,  # attachment ids
            'author': '',  # author
            'content': content,  # body text without tags
            'contentWithTag': str(contentWithTag),  # body markup with tags
            'createDate': time_now,  # creation time
            'deleteFlag': 0,  # deletion flag (0 = default, 1 = deleted)
            'id': '',  #
            # related label id / related label name / related label mark
            'labels': [{'relationId': "1642", 'relationName': "国务院国资委", 'labelMark': "policy"}],
            'origin': org,  # publishing authority
            'organ': org,  # issuing authority
            'topicClassification': '',  # policy document classification
            'issuedNumber': pub_hao,  # document number
            'publishDate': pub_time,  # publication time
            'writtenDate': None,  # date written
            'sid': '1697458829758697473',  # information source id
            'sourceAddress': href,  # original link
            'summary': '',  # summary
            'title': title  # title
        }
        flag = baseTool.sendKafka(dic_news)
        if flag:
            baseTool.save_data(dic_news)

    def collectArticle(href, title, pub_time, num):
        """Run ``sendContent`` for one listed article unless already stored.

        Returns:
            bool: True when ``sendContent`` ran to completion; False when the
            article was skipped as already collected or its handling failed.
        """
        try:
            if baseTool.db_storage.find_one({'网址': href}):
                log.info('已采集----------跳过')
                return False
            sendContent(href, baseTool.headers, title, pub_time, num)
        except Exception as e:
            # One broken article must not abort the rest of the listing.
            log.info(f'----{href}----{title}----failed to collect: {e!r}----')
            return False
        return True

    def partTwo():
        """Crawl the numbered listing pages ``index_2603340_<page>.html``."""
        start_time = time.time()
        num = 0
        count = 0
        for page in range(1, getPage()):
            url = f'{list_root}/index_2603340_{page}.html'
            try:
                href_resp = requests.request("GET", url, headers=baseTool.headers,
                                             verify=False, timeout=request_timeout)
                resp_text = href_resp.content.decode('UTF-8')
            except Exception as e:
                log.info(f'----{url}----failed to fetch listing page: {e!r}----')
                continue
            # Every entry follows a '<li>' marker; the text before the first
            # marker is discarded.
            for li in resp_text.split('<li>')[1:]:
                try:
                    href_ = li.split('<a href="')[1].split('" target=')[0]
                    title = li.split('title="')[1].split('">')[0]
                    pub_time = li.split('<span>[')[1].split(']</span>')[0]
                except IndexError:
                    # The chunk does not have the expected entry layout.
                    continue
                href = f'{site_root}{href_.replace("../../..", "")}'
                collected = collectArticle(href, title, pub_time, num)
                num += 1
                if collected:
                    count += 1
        end_time = time.time()
        log.info(f'共抓取国资委文件{count}条数据，耗时{end_time - start_time}')

    def partOne():
        """Crawl the first listing page (``index.html``)."""
        start_time = time.time()
        num = 0
        count = 0
        try:
            # GET request; SSL verification is disabled.
            href_resp = requests.request("GET", index_url, headers=baseTool.headers,
                                         verify=False, timeout=request_timeout)
            resp_text = href_resp.content.decode('UTF-8')
            doc_items = pq(resp_text)('.zsy_conlist li').items()
            time.sleep(1)
            for doc_item in doc_items:
                # Pull the link, title and date out of the list entry.
                try:
                    href_ = doc_item('a').attr('href')
                    if href_ is None:
                        continue
                    href = f'{site_root}{href_.replace("../../..", "")}'
                    title = doc_item('a').attr('title')
                    pub_time = doc_item('span').text().replace('[', '').replace(']', '')
                except Exception:
                    continue
                collected = collectArticle(href, title, pub_time, num)
                num += 1
                if collected:
                    count += 1
        except Exception as e:
            # Report the failure instead of silently dropping it.
            log.info(f'----{index_url}----failed to crawl listing page: {e!r}----')
        end_time = time.time()
        log.info(f'共抓取国资委文件{count}条数据，耗时{end_time - start_time}')

    partOne()
    # Keep partTwo() commented out for incremental runs.
    # partTwo()


# Script entry point: run the SASAC policy crawl when executed directly.
if __name__ == '__main__':
    get_content3()