import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq

# Project-local helper. This module uses its `headers`, `db_storage`,
# `paserUrl`, `sendKafka` and `save_data` members.
from ClassTool import ClassTool
baseTool = ClassTool()

# Project-local helper. This module uses its `getLogger`, `uptoOBS` and
# `tableUpdate` members; `log` is the logger shared by the crawlers below.
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()

# Yunnan
def yun_nan():
    """Crawl policy documents from the Yunnan SASAC site (gzw.yn.gov.cn).

    Runs two crawlers in sequence, ``yun_nan1`` and ``yun_nan2``. Each one
    walks a paginated list, skips links already present in
    ``baseTool.db_storage``, downloads the detail page, uploads linked
    attachments through ``baseCore.uptoOBS`` / ``baseCore.tableUpdate``,
    and finally sends the record with ``baseTool.sendKafka`` and stores it
    with ``baseTool.save_data``.

    Failures on a single list page or a single document are logged and
    skipped so that one bad page does not stop the whole crawl.
    """
    # Imported locally so this function does not depend on a change to the
    # file-level import block.
    from urllib.parse import urljoin

    # Seconds to wait for the site before giving up on a request; without a
    # timeout a stalled connection would block the crawl forever.
    request_timeout = 30
    # Substrings that mark a link as a downloadable attachment. They are
    # compared against the lower-cased href, so any letter case matches.
    attachment_markers = ('.doc', '.pdf', '.xls', '.zip', '.rar', '.ppt')

    def _fetch(url):
        """GET *url* with the shared crawler headers and a timeout."""
        return requests.get(url=url, headers=baseTool.headers, verify=False,
                            timeout=request_timeout)

    def _log_empty(href, title):
        """Log that the document at *href* has no usable content."""
        log.info(f'-----{href}----{title}----内容为空-----')

    def _upload_attachments(content_tag, order, pub_time):
        """Upload attachments linked inside *content_tag*; return their ids.

        Each successfully uploaded link has its ``href`` rewritten in place
        to point at the stored copy. A failing attachment is logged and
        skipped; it never aborts the surrounding document.
        """
        att_ids = []
        for link in content_tag.find_all('a'):
            link_href = link.get('href')
            if not link_href:
                continue
            lowered = link_href.lower()
            if not any(marker in lowered for marker in attachment_markers):
                continue
            file_name = link.text
            category = os.path.splitext(link_href)[1]
            # Make sure the stored file name carries the file extension.
            if category not in file_name:
                file_name = file_name + category
            try:
                # Upload the attachment to the file server.
                retData = baseCore.uptoOBS(link_href, '1679', file_name)
                if not retData['state']:
                    continue
                att_id, full_path = baseCore.tableUpdate(
                    retData, '云南省国资委', file_name, order, pub_time)
            except Exception as e:
                log.error(f'attachment upload failed: {link_href} ({e!r})')
                continue
            att_ids.append(att_id)
            # Point the link at the uploaded copy.
            # NOTE(review): this prefix has no '//' after 'http:'. It is kept
            # byte-identical because downstream consumers may rely on it -
            # confirm whether 'http://obs.ciglobal.cn/' was intended.
            link['href'] = 'http:obs.ciglobal.cn/' + str(full_path)
        return att_ids

    def _build_news(title, href, content, content_tag, pub_hao, pub_time, att_ids):
        """Assemble the record dict that is sent to Kafka."""
        return {
            'attachmentIds': att_ids,
            'author': '',
            'content': content,
            'contentWithTag': str(content_tag),
            'createDate': time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
            'deleteFlag': 0,
            'id': '',
            'labels': [{'relationId': "1679", 'relationName': "云南省国资委",
                        'labelMark': "policy"}],
            'origin': '',
            'organ': '',
            'topicClassification': '',
            'issuedNumber': pub_hao,
            'publishDate': pub_time,
            'writtenDate': None,
            'sid': '1697458829758697473',
            'sourceAddress': href,
            'summary': '',
            'title': title
        }

    def _publish(dic_news):
        """Send *dic_news* to Kafka; on success store it and return True."""
        if not baseTool.sendKafka(dic_news):
            return False
        baseTool.save_data(dic_news)
        log.info(dic_news['title'])
        return True

    def yun_nan1():
        """Crawl the first five pages of the c100093 ``zfxxgk_gkgz`` list.

        URLs noted by the original author (the trailing numbers are
        presumably page counts - unconfirmed):
        http://gzw.yn.gov.cn/yngzw/c100093/zfxxgk_gkgz.shtml  9
        http://gzw.yn.gov.cn/yngzw/c100040/zfxxgk_list.shtml  1
        """
        num = 0
        count = 0
        start_time = time.time()
        content_selector = '#gknbxq_container > div > div.zfxxgk-content.zfxxgk_content'
        for page_no in range(1, 6):
            if page_no == 1:
                url = 'http://gzw.yn.gov.cn/yngzw/c100093/zfxxgk_gkgz.shtml'
            else:
                url = f'http://gzw.yn.gov.cn/yngzw/c100093/zfxxgk_gkgz_{page_no}.shtml'
            try:
                # The context manager closes the response on every path.
                with _fetch(url) as resp:
                    doc_resp = pq(resp.content)
            except Exception as e:
                log.error(f'failed to load list page {url} ({e!r})')
                continue
            for doc_item in doc_resp('.gkgz_list_content li').items():
                raw_href = doc_item('a').attr('href')
                if not raw_href:
                    # A list entry without a link cannot be crawled.
                    continue
                # Resolves relative links and leaves absolute ones untouched.
                href = urljoin(url, raw_href)
                if baseTool.db_storage.find_one({'网址': href}):
                    num += 1
                    continue
                try:
                    id_list = []
                    if '.shtml' in href:
                        with _fetch(href) as href_resp:
                            href_resp.encoding = href_resp.apparent_encoding
                            href_text = href_resp.text
                        doc_href = BeautifulSoup(href_text, 'html.parser')
                        # Convert relative paths to absolute paths.
                        doc_href = baseTool.paserUrl(doc_href, href)
                        title_nodes = doc_href.select(content_selector + ' > h2')
                        content_nodes = doc_href.select(content_selector)
                        if not title_nodes or not content_nodes:
                            log.error(f'unexpected page layout, skipped: {href}')
                            continue
                        title = title_nodes[0].text.strip()
                        # The issued number is the text inside the first pair
                        # of parentheses of the first paragraph; a document
                        # without one is kept with an empty number.
                        pub_hao = ''
                        number_nodes = doc_href.select(content_selector + ' > p')
                        if number_nodes:
                            parts = str(number_nodes[0].text).split('(')
                            if len(parts) > 1:
                                pub_hao = parts[1].split(')')[0].replace('\n', '')
                        contentwithTag = content_nodes[0]
                        content = contentwithTag.text
                        if not content:
                            _log_empty(href, title)
                            continue
                        id_list = _upload_attachments(contentwithTag, num, '')
                    elif 'display' in href:
                        continue
                    else:
                        # The list entry links straight to a non-HTML
                        # resource: there is no detail page to parse, so the
                        # title comes from the list entry and the body is
                        # empty. The link itself is kept as sourceAddress.
                        title = doc_item('a').text().strip()
                        content = ''
                        contentwithTag = ''
                        pub_hao = ''
                    # TODO: fields sent to Kafka
                    dic_news = _build_news(title, href, content, contentwithTag,
                                           pub_hao, None, id_list)
                    if _publish(dic_news):
                        num += 1
                        count += 1
                except Exception as e:
                    log.error(f'failed to process {href} ({e!r})')
        end_time = time.time()
        log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')

    def yun_nan2():
        """Crawl the first three pages of the c100095 ``zfxxgk_list`` list."""
        num = 0
        count = 0
        start_time = time.time()
        for page_no in range(1, 4):
            if page_no == 1:
                url = 'http://gzw.yn.gov.cn/yngzw/c100095/zfxxgk_list.shtml'
            else:
                url = f'http://gzw.yn.gov.cn/yngzw/c100095/zfxxgk_list_{page_no}.shtml'
            try:
                with _fetch(url) as res:
                    # Decode the raw bytes as UTF-8 directly instead of
                    # round-tripping res.text through ISO-8859-1, which
                    # fails whenever requests picks a different encoding.
                    page_text = res.content.decode("utf-8")
                soup = BeautifulSoup(page_text, 'html.parser')
            except Exception as e:
                log.error(f'failed to load list page {url} ({e!r})')
                continue
            list_node = soup.find('ul', attrs={'class': 'zfxxgk-nr-cnet'})
            if list_node is None:
                log.error(f'list container not found on {url}')
                continue
            for li in list_node.find_all('li'):
                link = li.find('a')
                date_node = li.find('span')
                if link is None or date_node is None or not link.get('href'):
                    # An incomplete entry must not abort the rest of the page.
                    continue
                title = str(link.text).strip()
                pub_time = str(date_node.text).replace(' ', '').replace('\n', '')
                href = urljoin(url, link.get('href').replace(' ', ''))
                if baseTool.db_storage.find_one({'网址': href}):
                    num += 1
                    continue
                if '.shtml' not in href:
                    # Only HTML detail pages are handled by this crawler.
                    continue
                try:
                    with _fetch(href) as res_:
                        page_text_ = res_.content.decode("utf-8")
                    detail = BeautifulSoup(page_text_, 'html.parser')
                    # Convert relative paths to absolute paths.
                    detail = baseTool.paserUrl(detail, href)
                    # The issued number is the joined span text of the first
                    # 'MsoNormal' paragraph, accepted only when it contains
                    # one of the bracket characters used in such numbers.
                    pub_hao = ''
                    number_node = detail.find('p', attrs={'class': 'MsoNormal'})
                    if number_node is not None:
                        candidate = ''.join(
                            str(span.text) for span in number_node.find_all('span'))
                        if '﹝' in candidate or '﹞' in candidate:
                            pub_hao = candidate
                    contentwithTag = detail.find('div', attrs={'class': 'zfxxgk-right'})
                    if contentwithTag is None or not contentwithTag.text:
                        _log_empty(href, title)
                        continue
                    content = contentwithTag.text
                    id_list = _upload_attachments(contentwithTag, num, pub_time)
                    # TODO: fields sent to Kafka
                    dic_news = _build_news(title, href, content, contentwithTag,
                                           pub_hao, pub_time, id_list)
                    if _publish(dic_news):
                        count += 1
                        num += 1
                except Exception as e:
                    log.error(f'failed to process {href} ({e!r})')
        end_time = time.time()
        log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')

    yun_nan1()
    yun_nan2()


# Script entry point: run the Yunnan crawl only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    yun_nan()