import datetime
import os
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq

# Project-local helper object; this script uses its `headers`, `db_storage`,
# `paserUrl`, `sendKafka` and `save_data` members.
from ClassTool import ClassTool
baseTool = ClassTool()

# Project-local core utilities; this script uses its `getLogger`, `uptoOBS`
# and `tableUpdate` members.
from BaseCore import BaseCore
baseCore = BaseCore()
# Logger shared by the scraper functions in this file.
log = baseCore.getLogger()

# Xinjiang
# Seconds to wait for any single HTTP request before giving up.
_REQUEST_TIMEOUT = 60
# Lower-case substrings that mark a link target as a downloadable attachment.
_ATTACHMENT_MARKERS = ('.pdf', '.doc', '.xls', 'xlsx', '.zip', '.rar', '.ppt')
# Prefix put in front of the stored path when attachment links are rewritten.
_ATTACHMENT_URL_PREFIX = 'http://zzsn.luyuen.com/'
# Matches text that starts with "新国" or "国资" and ends at the nearest "号".
_ISSUED_NUMBER_RE = re.compile(r'(新国.{1,}?号)|(国资.{1,}?号)')
# Matches longer than this are treated as false positives and discarded.
_MAX_ISSUED_NUMBER_LEN = 20


def _fetch_bytes(url):
    """Download *url* and return the raw response body as bytes."""
    response = requests.get(url=url, headers=baseTool.headers, verify=False,
                            timeout=_REQUEST_TIMEOUT)
    try:
        return response.content
    finally:
        response.close()


def _fetch_text(url):
    """Download *url* and return its body decoded with the detected encoding."""
    response = requests.get(url=url, headers=baseTool.headers, verify=False,
                            timeout=_REQUEST_TIMEOUT)
    try:
        response.encoding = response.apparent_encoding
        return response.text
    finally:
        response.close()


def _is_attachment(file_href):
    """Return True if *file_href* contains a known attachment marker."""
    lowered = file_href.lower()
    return any(marker in lowered for marker in _ATTACHMENT_MARKERS)


def _extract_issued_number(content):
    """Return the first issued number found in *content*, or '' if none.

    The whole match is used, so a hit on either alternative of the pattern is
    returned. Matches longer than ``_MAX_ISSUED_NUMBER_LEN`` are discarded.
    """
    match = _ISSUED_NUMBER_RE.search(content)
    if match is None:
        return ''
    issued_number = match.group(0)
    if len(issued_number) > _MAX_ISSUED_NUMBER_LEN:
        return ''
    return issued_number


def _upload_attachments(soup, relation_id, relation_name, num, publish_date):
    """Upload every attachment linked from *soup* and rewrite its href.

    Links without an href, links that do not look like attachments, and
    uploads whose result has a falsy ``state`` are skipped.

    Returns the list of attachment ids returned by ``baseCore.tableUpdate``.
    """
    attachment_ids = []
    for link in soup.find_all('a'):
        file_href = link.get('href')
        if not file_href or not _is_attachment(file_href):
            continue
        file_name = link.text.strip()
        # Make sure the stored file name carries the extension of the link.
        category = os.path.splitext(file_href)[1]
        if category not in file_name:
            file_name = file_name + category
        ret_data = baseCore.uptoOBS(file_href, relation_id, file_name)
        if not ret_data['state']:
            continue
        att_id, full_path = baseCore.tableUpdate(ret_data, relation_name, file_name, num,
                                                 publish_date)
        attachment_ids.append(att_id)
        # Point the link in the article body at the stored copy.
        link['href'] = _ATTACHMENT_URL_PREFIX + str(full_path)
    return attachment_ids


def _publish_article(soup, href, title, publish_date, origin, relation_id, relation_name, num):
    """Upload attachments, build the news record, send it to Kafka and save it.

    *soup* is the article body as returned by ``baseTool.paserUrl``. *num* is
    forwarded unchanged to ``baseCore.tableUpdate``.

    Returns True when the record was sent and saved, False when the article
    has no text or ``baseTool.sendKafka`` reported failure.
    """
    # Attachments are handled first because the upload rewrites hrefs inside
    # the soup, and the rewritten markup is what gets published.
    attachment_ids = _upload_attachments(soup, relation_id, relation_name, num, publish_date)
    content_with_tag = str(soup.prettify())
    content = soup.text
    if not content:
        log.info(f'-----{href}----{title}----内容为空-----')
        return False
    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    # Fields of the record sent to Kafka.
    dic_news = {
        'attachmentIds': attachment_ids,
        'author': '',
        'content': str(content),
        'contentWithTag': str(content_with_tag),
        'createDate': time_now,
        'deleteFlag': 0,
        'id': '',
        'labels': [{'relationId': relation_id, 'relationName': relation_name, 'labelMark': "policy"}],
        'origin': origin,
        'organ': "",
        'topicClassification': "",
        'issuedNumber': _extract_issued_number(content),
        'publishDate': publish_date,
        'writtenDate': None,
        'sid': '1697458829758697473',
        'sourceAddress': href,
        'summary': '',
        'title': title
    }
    if not baseTool.sendKafka(dic_news):
        return False
    baseTool.save_data(dic_news)
    return True


def xin_jiang():
    """Scrape policy documents from the two Xinjiang sources and publish them.

    Runs ``xin_jiang1`` (gzw.xinjiang.gov.cn) and then ``xin_jiang_jsbt``
    (gyzc.xjbt.gov.cn). Each walks a fixed range of list pages, fetches every
    article, uploads its attachments, sends the record to Kafka and saves it.
    A failure on a list page or on a single article is logged and skipped so
    that the rest of the crawl continues.
    """
    def xin_jiang1():
        """Scrape list pages 1-9 of gzw.xinjiang.gov.cn/gzw/zcwj."""
        relation_id = '1682'
        relation_name = '新疆维吾尔自治区国资委'
        site = 'http://gzw.xinjiang.gov.cn'
        num = 0
        count = 0
        start_time = time.time()
        for page in range(1, 10):
            if page == 1:
                url = f'{site}/gzw/zcwj/list_tj.shtml'
            else:
                url = f'{site}/gzw/zcwj/list_tj_{page}.shtml'
            try:
                doc_resp = pq(_fetch_bytes(url))
                doc_items = list(doc_resp('.list.pt20 li').items())
            except Exception as e:
                log.error(f'-----{url}----failed to load list page: {e}-----')
                continue
            for doc_item in doc_items:
                href = ''
                try:
                    title = doc_item('a').attr('title').strip()
                    href = site + doc_item('a').attr('href')
                    if '/gzw/zcwj/' not in href:
                        continue
                    # Skip articles that are already stored.
                    if baseTool.db_storage.find_one({'网址': href}):
                        num += 1
                        continue
                    # Decode the raw bytes as UTF-8 instead of relying on the
                    # encoding requests guessed from the response headers.
                    doc_href = BeautifulSoup(_fetch_bytes(href).decode('utf-8'), 'html.parser')
                    date_text = doc_href.find('span', attrs={'class': 'date'}).text
                    # Strip before appending the seconds so that surrounding
                    # whitespace cannot end up inside the timestamp.
                    publish_date = date_text.replace('日期：', '').replace('∶', ':').strip() + ':00'
                    origin = doc_href.find('span', attrs={'class': 'from'}).text.replace('来源：', '').strip()
                    content_with_tag = str(doc_href.select('#NewsContent')[0])
                    soup = baseTool.paserUrl(content_with_tag, href)
                    if _publish_article(soup, href, title, publish_date, origin,
                                        relation_id, relation_name, num):
                        num += 1
                        count += 1
                except Exception as e:
                    log.error(f'-----{href or url}----failed to process article: {e}-----')
        end_time = time.time()
        log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')

    def xin_jiang_jsbt():
        """Scrape list pages 1-5 of gyzc.xjbt.gov.cn/xxgk/zcfg."""
        relation_id = '1698'
        relation_name = '新疆生产建设兵团国资委'
        site = 'http://gyzc.xjbt.gov.cn'
        num = 0
        count = 0
        start_time = time.time()
        for page in range(1, 6):
            if page == 1:
                url = f'{site}/xxgk/zcfg/'
            else:
                url = f'{site}/xxgk/zcfg/index_{page}.shtml'
            try:
                doc_resp = pq(_fetch_bytes(url))
                doc_items = list(doc_resp('.article').items())
            except Exception as e:
                log.error(f'-----{url}----failed to load list page: {e}-----')
                continue
            for doc_item in doc_items:
                href = ''
                try:
                    title = doc_item('a').text().strip()
                    publish_date = doc_item('.time.pull-right').text().strip() + ' 00:00:00'
                    href = doc_item('a').attr('href')
                    if 'http' not in href:
                        href = site + href
                    # NOTE(review): unlike xin_jiang1, this source does not look
                    # the URL up in baseTool.db_storage before fetching, so
                    # every run re-sends every article - confirm that this is
                    # intentional.
                    detail = BeautifulSoup(_fetch_text(href), 'html.parser')
                    pub_result = detail.find('div', attrs={'class': 'title_info'}).text
                    origin = pub_result.split('信息来源：')[1].split('编辑：')[0].strip()
                    body = detail.find('div', attrs={'id': 'detail'})
                    if body is None:
                        # Without this guard str(None) would be published as
                        # the article content.
                        log.info(f'-----{href}----{title}----article body not found-----')
                        continue
                    soup = baseTool.paserUrl(str(body), href)
                    if _publish_article(soup, href, title, publish_date, origin,
                                        relation_id, relation_name, num):
                        num += 1
                        count += 1
                except Exception as e:
                    log.error(f'-----{href or url}----failed to process article: {e}-----')
        end_time = time.time()
        log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')

    xin_jiang1()
    xin_jiang_jsbt()

# Run the Xinjiang scrapers when this file is executed as a script.
if __name__ == "__main__":
    xin_jiang()