import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq

# Shared project helper instance. The scraper in this file reads its
# `headers`, queries `db_storage`, and calls `paserUrl`, `sendKafka` and
# `save_data` on it.
from ClassTool import ClassTool
baseTool = ClassTool()

# Shared project core instance. The scraper in this file calls `uptoOBS` and
# `tableUpdate` on it for attachments, and obtains its logger from it.
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()

# Chongqing

# Seconds to wait on a single HTTP request before giving up, so one stalled
# connection cannot hang the whole crawl.
_CQ_REQUEST_TIMEOUT = 30

# Lower-cased substrings that mark a link as a downloadable attachment.
_CQ_ATTACHMENT_MARKERS = ('.pdf', '.doc', 'xlsx', '.xls', '.zip', '.rar', '.ppt')


def _cq_resolve_href(list_url, raw_href):
    """Turn a link taken from a listing page into an absolute article URL."""
    if raw_href.startswith(('http://', 'https://')):
        # Already absolute: prefixing the listing base would corrupt it.
        return raw_href
    if '../' in raw_href:
        return list_url.split('zcwj/index')[0] + raw_href.replace('../', '')
    return list_url.split('index')[0] + raw_href.replace('./', '')


def _cq_parse_meta(table_text):
    """Split the flattened metadata table text into its labelled fields.

    Raises IndexError when an expected label is missing, and AttributeError
    when *table_text* is not a string.
    """
    flat = table_text.replace(' ', '')
    return {
        'pub_time': flat.split('[发布日期]')[1].strip() + ' 00:00:00',
        'pub_hao': flat.split('[发文字号]')[1].split('[主题分类]')[0].strip(),
        'topicClassification': flat.split('[主题分类]')[1].split('[体裁分类]')[0].strip(),
        'origin': flat.split('[发布机构]')[1].split('[成文日期]')[0].strip(),
        'writtenDate': flat.split('[成文日期]')[1].split('[发布日期]')[0].strip(),
    }


def _cq_extract_article(page_html, href):
    """Return ``(meta, content_tag)`` for one article page.

    The primary layout has a ``.zwxl-table`` metadata table and a
    ``div.zwxl-article`` body. When either part is unusable, the metadata is
    left empty and ``div.zwxl-content`` is used as the body instead.
    *content_tag* is None when neither container exists.
    """
    doc = pq(page_html)
    table_text = doc('.zwxl-table').text()
    # Build the BeautifulSoup tree up front so that the fallback layout below
    # also works on a soup object rather than on the PyQuery document.
    soup = BeautifulSoup(str(doc), 'html.parser')
    # Convert relative paths into absolute ones.
    soup = baseTool.paserUrl(soup, href)
    # Drop the "scan QR code" and "share" widgets; each is removed on its own
    # so a missing one does not leave the other in place.
    for attrs in ({'id': 'div_div'}, {'class_': 'bdsharebuttonbox'}):
        widget = soup.find('div', **attrs)
        if widget is not None:
            widget.decompose()

    article_tag = soup.find('div', class_='zwxl-article')
    if article_tag is not None:
        try:
            return _cq_parse_meta(table_text), article_tag
        except (IndexError, AttributeError):
            pass  # metadata table absent or incomplete: use the fallback layout

    empty_meta = {
        'pub_time': None,
        'pub_hao': '',
        'topicClassification': '',
        'origin': '',
        'writtenDate': None,
    }
    return empty_meta, soup.find('div', class_='zwxl-content')


def _cq_upload_attachments(content_tag, order, pub_time):
    """Upload the attachments linked from *content_tag* and return their ids.

    Each successfully uploaded link has its ``href`` rewritten in place to the
    stored file's address. A failing attachment is logged and skipped.
    """
    att_ids = []
    for link in content_tag.find_all('a'):
        link_href = link.get('href')
        if not link_href:
            continue
        lowered = link_href.lower()
        if not any(marker in lowered for marker in _CQ_ATTACHMENT_MARKERS):
            continue
        file_name = link.text
        suffix = os.path.splitext(link_href)[1]
        if suffix not in file_name:
            file_name = file_name + suffix
        try:
            # Upload the attachment to the file server.
            ret_data = baseCore.uptoOBS(link_href, '1693', file_name)
            if not ret_data['state']:
                continue
            att_id, full_path = baseCore.tableUpdate(ret_data, '重庆市国资委', file_name, order,
                                                     pub_time)
        except Exception as exc:
            log.error(f'attachment upload failed: {link_href}: {exc}')
            continue
        att_ids.append(att_id)
        # Point the link at the stored copy ("http://" was previously
        # written without the double slash, which is not a valid URL).
        link['href'] = 'http://obs.ciglobal.cn/' + str(full_path)
    return att_ids


def _cq_build_news(href, title, order):
    """Fetch one article and return its Kafka payload, or None if it has no body."""
    page_html = requests.get(url=href, headers=baseTool.headers, verify=False,
                             timeout=_CQ_REQUEST_TIMEOUT).content
    meta, content_tag = _cq_extract_article(page_html, href)
    content = content_tag.text if content_tag is not None else ''
    if not content:
        log.info(f'-----{href}----{title}----内容为空-----')
        return None
    # A fresh id list per article, so one article never inherits the
    # attachments of the articles processed before it.
    id_list = _cq_upload_attachments(content_tag, order, meta['pub_time'])
    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    # TODO: fields sent to Kafka
    return {
        'attachmentIds': id_list,
        'author': '',
        'content': content,
        # Serialised after the attachment links were rewritten.
        'contentWithTag': str(content_tag),
        'createDate': time_now,
        'deleteFlag': 0,
        'id': '',
        'labels': [{'relationId': "1693", 'relationName': "重庆市国资委",
                    'labelMark': "policy"}],
        'origin': meta['origin'],
        'organ': '',
        'topicClassification': meta['topicClassification'],
        'issuedNumber': meta['pub_hao'],
        'publishDate': meta['pub_time'],
        'writtenDate': meta['writtenDate'],
        'sid': '1697458829758697473',
        'sourceAddress': href,
        'summary': '',
        'title': title
    }


def chong_qing():
    """Crawl policy documents from the Chongqing SASAC site (gzw.cq.gov.cn).

    Walks the first four listing pages under ``zwgk_191/fdzdgknr/zcwj/zcwj/``.
    Every linked article whose URL is not already stored is fetched, its
    attachments are uploaded, and the resulting record is sent to Kafka and
    saved. Failures are logged and the crawl moves on to the next article or
    page. Takes no arguments and returns None.
    """
    num = 0
    count = 0
    start_time = time.time()
    for page in range(0, 4):
        if page == 0:
            url = 'http://gzw.cq.gov.cn/zwgk_191/fdzdgknr/zcwj/zcwj/index.html'
        else:
            url = 'http://gzw.cq.gov.cn/zwgk_191/fdzdgknr/zcwj/zcwj/index_{}.html'.format(page)
        try:
            resp_text = requests.get(url=url, headers=baseTool.headers, verify=False,
                                     timeout=_CQ_REQUEST_TIMEOUT).content
            listing = pq(resp_text)
            anchors = [anchor
                       for container in listing('.zsj-fr-main').items()
                       for anchor in container('a').items()]
        except Exception as exc:
            log.error(f'listing page failed: {url}: {exc}')
            continue

        for anchor in anchors:
            raw_href = anchor.attr('href')
            if not raw_href:
                # An anchor without a target cannot be an article link.
                continue
            title = anchor.text().strip()
            href = _cq_resolve_href(url, raw_href)
            try:
                if baseTool.db_storage.find_one({'网址': href}):
                    num += 1
                    continue
                dic_news = _cq_build_news(href, title, num)
                if dic_news is None:
                    continue
                if baseTool.sendKafka(dic_news):
                    baseTool.save_data(dic_news)
                    log.info(title)
                    count += 1
                    num += 1
            except Exception as exc:
                # Best effort: one broken article must not stop the crawl.
                log.error(f'article failed: {href}: {exc}')
    end_time = time.time()
    log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')

# Script entry point: run the Chongqing crawl when this file is executed directly.
if __name__ == '__main__':
    chong_qing()