import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq

from ClassTool import ClassTool
# Shared project helper. This file uses its `headers`, `paserUrl`,
# `db_storage.find_one`, `sendKafka` and `save_data` members.
baseTool = ClassTool()

from BaseCore import BaseCore
# Shared project core. This file uses `uptoOBS` and `tableUpdate` for
# attachments, and `getLogger` for the module-level logger below.
baseCore = BaseCore()
log = baseCore.getLogger()

# Anhui: crawler for the Anhui provincial SASAC site (gzw.ah.gov.cn)
# Substrings that mark an <a href> as a downloadable attachment. The check is a
# plain, case-sensitive substring match, so each spelling is listed explicitly.
_AN_HUI_ATTACHMENT_MARKERS = (
    '.pdf', '.docx', '.doc', 'xlsx', '.zip', '.rar', '.ppt',
    '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR',
)

# Selector prefix shared by every metadata cell of a detail page's index table.
_AN_HUI_INDEX_TABLE = 'div[class="div_table_suoyin"]>table>tbody>'


def _an_hui_fetch_text(url, verify=True, detect_encoding=True):
    """Download *url* with the shared crawler headers and return its body text.

    Args:
        url: Address to fetch.
        verify: Forwarded to ``requests.get`` (TLS certificate verification).
        detect_encoding: When true, decode using ``apparent_encoding`` instead
            of the encoding requests picked by default.

    Returns:
        The decoded response body.

    Raises:
        requests.RequestException: If the request itself fails.
    """
    resp = requests.get(url=url, headers=baseTool.headers, verify=verify)
    try:
        if detect_encoding:
            resp.encoding = resp.apparent_encoding
        return resp.text
    finally:
        # Close on every path, including when decoding raises.
        resp.close()


def _an_hui_upload_attachments(soup, order_num, publish_date):
    """Upload the attachments linked from *soup* and rewrite their links in place.

    Args:
        soup: Parsed article body; its ``<a>`` tags are scanned and mutated.
        order_num: Passed through to ``baseCore.tableUpdate``.
        publish_date: Passed through to ``baseCore.tableUpdate``.

    Returns:
        The ids returned by ``baseCore.tableUpdate`` for each uploaded file.
    """
    attachment_ids = []
    for link in soup.find_all('a'):
        try:
            file_href = link['href']
        except KeyError:
            # Anchor without an href: nothing to download.
            continue
        if not any(marker in file_href for marker in _AN_HUI_ATTACHMENT_MARKERS):
            continue
        file_name = link.text.strip()
        category = os.path.splitext(file_href)[1]
        # Make sure the stored name carries the file extension.
        if category not in file_name:
            file_name = file_name + category
        ret_data = baseCore.uptoOBS(file_href, '1688', file_name)
        if not ret_data['state']:
            continue
        att_id, full_path = baseCore.tableUpdate(ret_data, '安徽省国资委', file_name, order_num, publish_date)
        attachment_ids.append(att_id)
        # Point the link at the uploaded copy.
        # NOTE(review): the prefix has no '//' after 'http:' - confirm whether
        # downstream consumers rely on this exact string before correcting it.
        link['href'] = 'http:zzsn.luyuen.com/' + str(full_path)
    return attachment_ids


def _an_hui_store_article(detail_doc, href, fields, order_num):
    """Build the article payload from a detail page, send it and save it.

    Args:
        detail_doc: PyQuery document of the detail page.
        href: Detail page address; stored as ``sourceAddress``.
        fields: Dict with the keys ``title``, ``origin``, ``organ``,
            ``topicClassification``, ``issuedNumber``, ``publishDate`` and
            ``writtenDate``.
        order_num: Passed through to the attachment upload.

    Returns:
        True when the payload was accepted by ``baseTool.sendKafka`` and then
        saved; False when the body was empty or sending was rejected.
    """
    title = fields['title']
    publish_date = fields['publishDate']
    body_soup = baseTool.paserUrl(str(detail_doc('div[id="zoom"]')), href)
    attachment_ids = _an_hui_upload_attachments(body_soup, order_num, publish_date)
    # Serialise only after the attachment links have been rewritten.
    content_with_tag = str(body_soup.prettify())
    content = body_soup.text
    if not content:
        log.info(f'-----{href}----{title}----内容为空-----')
        return False
    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    # Field set sent to Kafka.
    dic_news = {
        'attachmentIds': attachment_ids,
        'author': '',
        'content': str(content),
        'contentWithTag': str(content_with_tag),
        'createDate': time_now,
        'deleteFlag': 0,
        'id': '',
        'labels': [{'relationId': "1688", 'relationName': "安徽省国资委", 'labelMark': "policy"}],
        'origin': fields['origin'],
        'organ': fields['organ'],
        'topicClassification': fields['topicClassification'],
        'issuedNumber': fields['issuedNumber'],
        'publishDate': publish_date,
        'writtenDate': fields['writtenDate'],
        'sid': '1697458829758697473',
        'sourceAddress': href,
        'summary': '',
        'title': title
    }
    if not baseTool.sendKafka(dic_news):
        return False
    baseTool.save_data(dic_news)
    return True


def an_hui():
    """Crawl policy documents from the Anhui SASAC site (gzw.ah.gov.cn).

    Runs two listing crawls one after the other. For every listed article
    whose address is not already found via ``baseTool.db_storage``, the detail
    page is fetched, its attachments are uploaded, and the article is sent to
    Kafka and saved. A failure on one article or one listing page is logged
    and the crawl moves on to the next one.
    """

    def an_hui1():
        """Crawl listing pages 1-3 of category ``catId=6717051``."""
        num = 0
        count = 0
        start_time = time.time()
        for page in range(1, 4):
            url = f'http://gzw.ah.gov.cn/site/label/8888?IsAjax=1&dataType=html&_=0.4981381464472001&labelName=publicInfoList&siteId=6788071&pageSize=15&pageIndex={page}&action=list&isDate=false&dateFormat=yyyy%E5%B9%B4MM%E6%9C%88dd%E6%97%A5&length=15&organId=7031&type=4&catIds=&catId=6717051&cId=&result=&title=&fileNum=&keyWords=&file=%2Fxxgk%2FpublicInfoList_newest2020_zc'
            try:
                listing = pq(_an_hui_fetch_text(url, verify=False, detect_encoding=False))
                for row in listing('tr[class="xxgk_nav_con"]').items():
                    href = row('a').attr('href')
                    if baseTool.db_storage.find_one({'网址': href}):
                        # Already stored; still counted in the running number.
                        num += 1
                        continue
                    try:
                        detail_html = _an_hui_fetch_text(href, verify=False)
                        detail_doc = pq(str(baseTool.paserUrl(detail_html, href)))
                        # All metadata for this listing comes from the detail page.
                        fields = {
                            'title': detail_doc(
                                _AN_HUI_INDEX_TABLE + 'tr:nth-child(4)>td[class="pmingcheng"]').text(),
                            'topicClassification': detail_doc(
                                _AN_HUI_INDEX_TABLE + 'tr:nth-child(1)>td:nth-child(4)').text(),
                            'writtenDate': detail_doc(
                                _AN_HUI_INDEX_TABLE + 'tr:nth-child(2)>td:nth-child(4)').text(),
                            'organ': detail_doc(
                                _AN_HUI_INDEX_TABLE + 'tr:nth-child(3)>td:nth-child(2)').text(),
                            'issuedNumber': detail_doc(
                                _AN_HUI_INDEX_TABLE + 'tr:nth-child(3)>td:nth-child(4)').text(),
                            'origin': detail_doc(
                                'div[class="wzfbxx"]>span[class="res"]').text().replace('信息来源：', ''),
                            'publishDate': detail_doc(
                                'div[class="wzfbxx"]>span[class="fbsj"]').text().replace('发布日期：', ''),
                        }
                        if _an_hui_store_article(detail_doc, href, fields, num):
                            num += 1
                            count += 1
                    except Exception as e:
                        # Best effort: one broken article must not stop the page.
                        log.error(f'an_hui1 detail page failed: {href} - {e}')
            except Exception as e:
                # Best effort: one broken listing page must not stop the crawl.
                log.error(f'an_hui1 listing page failed: {url} - {e}')
        end_time = time.time()
        log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')

    def an_hui2():
        """Crawl listing pages 1-24 of categories ``catIds=43793891,43793901``."""
        num = 0
        count = 0
        start_time = time.time()
        for page in range(1, 25):
            url = f'http://gzw.ah.gov.cn/site/label/8888?_=0.5237800193505848&labelName=publicInfoList&siteId=6788071&pageSize=15&pageIndex={page}&action=list&isDate=false&dateFormat=yyyy%E5%B9%B4MM%E6%9C%88dd%E6%97%A5&length=15&organId=7031&type=4&catIds=43793891%2C43793901&catId=&cId=&result=&title=&fileNum=&keyWords=&file=%2Fxxgk%2FpublicInfoList_newest2020_zc'
            try:
                listing_html = _an_hui_fetch_text(url)
                listing = pq(str(baseTool.paserUrl(listing_html, url)))
                for tr in listing('tr[class="xxgk_nav_con"]'):
                    row = pq(tr)
                    # Title, document number and dates are taken from the
                    # listing row here, not from the detail page's index table.
                    title = row('td[class="info"]>a').text()
                    href = row('td[class="info"]>a').attr('href')
                    # NOTE(review): the document number reads the same "fwrq"
                    # cell as the publish date - confirm the intended column.
                    pub_hao = row('td[class="fwrq"]').text()
                    written_date = row('td[class="cwrq"]').text()
                    publish_date = row('td[class="fwrq"]').text()
                    if baseTool.db_storage.find_one({'网址': href}):
                        continue
                    try:
                        detail_html = _an_hui_fetch_text(href, verify=False)
                        detail_doc = pq(str(baseTool.paserUrl(detail_html, href)))
                        fields = {
                            'title': title,
                            'topicClassification': detail_doc(
                                _AN_HUI_INDEX_TABLE + 'tr:nth-child(1)>td:nth-child(4)').text(),
                            'writtenDate': written_date,
                            'organ': detail_doc(
                                _AN_HUI_INDEX_TABLE + 'tr:nth-child(3)>td:nth-child(2)').text(),
                            'issuedNumber': pub_hao,
                            'origin': detail_doc(
                                'div[class="wzfbxx"]>span[class="res"]').text().replace('信息来源：', ''),
                            'publishDate': publish_date,
                        }
                        if _an_hui_store_article(detail_doc, href, fields, num):
                            num += 1
                            count += 1
                    except Exception as e:
                        # Best effort: one broken article must not stop the page.
                        log.error(f'an_hui2 detail page failed: {href} - {e}')
            except Exception as e:
                # Best effort: one broken listing page must not stop the crawl.
                log.error(f'an_hui2 listing page failed: {url} - {e}')
        end_time = time.time()
        log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')

    an_hui1()
    an_hui2()


# Run the Anhui crawl when this file is executed as a script.
if __name__ == "__main__":
    an_hui()