import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq

# Shared project helper instance. gui_zhou() reads `headers` and `db_storage`
# from it and calls its `paserUrl`, `sendKafka` and `save_data` methods.
from ClassTool import ClassTool
baseTool = ClassTool()

# Shared project helper instance. gui_zhou() calls its `uptoOBS` and
# `tableUpdate` methods for attachments; `log` is the logger it hands out.
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()

# Guizhou
# Lower-case extension markers; a link whose href contains one of these
# (case-insensitively) is treated as an attachment and re-hosted.
_ATTACHMENT_EXTS = ('.doc', '.pdf', '.xls', '.zip', '.rar', '.ppt')


def _list_page_url(page):
    """Return the URL of list page *page*; page 0 has no numeric suffix."""
    if page == 0:
        return 'http://gzw.guizhou.gov.cn/zwgk/xxgkml/zcwj/alist.html'
    return f'http://gzw.guizhou.gov.cn/zwgk/xxgkml/zcwj/alist_{page}.html'


def _field_between(text, start_label, end_label):
    """Return the stripped text between *start_label* and *end_label*.

    Raises IndexError when *start_label* does not occur in *text*.
    """
    return text.split(start_label)[1].split(end_label)[0].strip()


def _parse_pub_meta(pub_result):
    """Extract (pub_time, pub_source, pub_hao, topic) from the header text.

    Raises IndexError when one of the expected labels is missing, which the
    caller treats as "skip this document".
    """
    # Split on the bare label and strip afterwards, so the field is found
    # whether or not a space follows the colon.
    raw_date = _field_between(pub_result, '发文日期:', '文号:')
    # Drop the trailing day marker instead of turning it into a space;
    # otherwise the result carries a double space before the time part.
    date_part = raw_date.replace('年', '-').replace('月', '-').replace('日', '').strip()
    pub_time = date_part + ' 00:00:00'

    pub_source = _field_between(pub_result, '发布机构:', '发文日期:')
    pub_hao = _field_between(pub_result, '文号:', '是否有效:')
    topic = _field_between(pub_result, '信息分类:', '发布机构:')
    # The page uses '无' as a placeholder for an absent value.
    if pub_source == '无':
        pub_source = ''
    if pub_hao == '无':
        pub_hao = ''
    return pub_time, pub_source, pub_hao, topic


def _rehost_attachments(content_tag, num, pub_time):
    """Upload attachment links inside *content_tag* and rewrite their hrefs.

    Every <a> whose href contains a known attachment extension is uploaded
    with baseCore.uptoOBS and registered with baseCore.tableUpdate; its href
    is then replaced in place. Returns the list of attachment ids.
    """
    id_list = []
    for link in content_tag.find_all('a'):
        link_href = link.get('href')
        if not link_href:
            continue
        lowered = link_href.lower()
        if not any(ext in lowered for ext in _ATTACHMENT_EXTS):
            continue
        file_name = link.text.strip()
        category = os.path.splitext(link_href)[1]
        if category not in file_name:
            file_name = file_name + category
        # Upload the attachment to the file server.
        retData = baseCore.uptoOBS(link_href, '1694', file_name)
        if not retData['state']:
            continue
        att_id, full_path = baseCore.tableUpdate(retData, '贵州省国资委', file_name, num, pub_time)
        id_list.append(att_id)
        # Point the link at the re-hosted copy. The scheme separator was
        # previously written as 'http:' without '//', giving an invalid URL.
        link['href'] = 'http://obs.ciglobal.cn/' + str(full_path)
    return id_list


def _process_detail(href, title, num, timeout):
    """Fetch one detail page, publish it, and return True when it was saved.

    Returns False when the page is a 404, has no text content, or the Kafka
    send reports failure. Parsing or network errors propagate to the caller.
    """
    detail_resp = requests.get(url=href, headers=baseTool.headers, verify=False, timeout=timeout)
    if '404 Not Found' in detail_resp.text:
        return False
    doc_href = pq(detail_resp.content)
    # Issuing organisation.
    organ = doc_href('#NewsArticleSource').text()
    # The header block embeds an inline script; strip its boilerplate so only
    # the labelled metadata text remains.
    pub_result = doc_href('.xxgk_xl_top').text().replace('var str = ""; var str_1 = "', '').replace(
        '"; if (str == "") { document.write(str_1); } else { document.write(str); }', '')
    pub_time, pub_source, pub_hao, topicClassification = _parse_pub_meta(pub_result)

    contentWithTag = doc_href('#Zoom').children()
    contentWithTag = BeautifulSoup(str(contentWithTag), 'html.parser')
    contentWithTag = baseTool.paserUrl(contentWithTag, href)

    # Plain text is captured before attachment links are rewritten.
    content = contentWithTag.text.strip()
    if not content:
        log.info(f'-----{href}----{title}----内容为空-----')
        return False

    id_list = _rehost_attachments(contentWithTag, num, pub_time)

    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    # Fields sent to Kafka.
    dic_news = {
        'attachmentIds': id_list,
        'author': '',
        'content': content,
        'contentWithTag': str(contentWithTag),
        'createDate': time_now,
        'deleteFlag': 0,
        'id': '',
        'labels': [{'relationId': "1694", 'relationName': "贵州省国资委", 'labelMark': "policy"}],
        'origin': pub_source,
        'organ': organ,
        'topicClassification': topicClassification,
        'issuedNumber': pub_hao,
        'publishDate': pub_time,
        'writtenDate': None,
        'sid': '1697458829758697473',
        'sourceAddress': href,
        'summary': '',
        'title': title,
    }
    if not baseTool.sendKafka(dic_news):
        return False
    baseTool.save_data(dic_news)
    log.info(title)
    return True


def gui_zhou(page_count=11, timeout=30):
    """Crawl policy documents from the Guizhou SASAC site and publish them.

    Walks the list pages under
    http://gzw.guizhou.gov.cn/zwgk/xxgkml/zcwj/ (11 pages by default), skips
    documents whose URL is already present in baseTool.db_storage, and for
    each remaining document parses its metadata, re-hosts attachments, sends
    the record to Kafka and stores it.

    The crawl is best-effort: a failure on one list page or one document is
    logged and the crawl moves on.

    Args:
        page_count: Number of list pages to visit, starting from page 0.
        timeout: Per-request timeout in seconds passed to requests.get, so a
            stalled connection cannot block the crawl indefinitely.

    Note:
        The running counter ``num`` is incremented both for documents that
        are skipped as already stored and for documents newly saved; it is
        also the value passed to baseCore.tableUpdate for attachments.
    """
    num = 0
    start_time = time.time()
    for page in range(page_count):
        url = _list_page_url(page)
        try:
            list_resp = requests.get(url=url, headers=baseTool.headers, verify=False, timeout=timeout)
            for doc_item in pq(list_resp.content)('.c').items():
                href = doc_item('a').attr('href')
                if not href:
                    # A list entry without a link cannot be fetched.
                    continue
                if baseTool.db_storage.find_one({'网址': href}):
                    num += 1
                    continue
                try:
                    title = doc_item('a').text().strip()
                    if _process_detail(href, title, num, timeout):
                        num += 1
                except Exception as e:
                    # Per-document boundary: log and continue with the next one.
                    log.error(f'failed to process detail page {href}: {e!r}')
        except Exception as e:
            # Per-page boundary: log and continue with the next list page.
            log.error(f'failed to process list page {url}: {e!r}')
    end_time = time.time()
    log.info(f'共抓取{num}条数据,共耗时{end_time - start_time}')

# Run the Guizhou crawl only when this file is executed as a script.
if __name__ == "__main__":
    gui_zhou()