import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq

# Project-local utility object. This file reads its shared request headers and
# uses its browser-driver, dedup-storage, HTML-parsing and Kafka/save helpers.
from ClassTool import ClassTool
baseTool = ClassTool()

# Project-local core object. This file uses it to obtain the logger and to
# upload attachments (uptoOBS / tableUpdate).
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()

# Tianjin (天津) SASAC policy-document crawler.

# CSS selector of the <ul> that holds the article links on every list page.
_TJ_LIST_SELECTOR = '#content > div.mainContent > div > div.mBd > ul'
# Substrings that mark a link as a downloadable attachment (case-sensitive,
# matched anywhere in the href).
_TJ_ATTACHMENT_MARKERS = (
    '.pdf', '.docx', '.doc', 'xlsx', '.zip', '.rar', '.ppt',
    '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR',
)
# Seconds to wait for a list page before giving up on it.
_TJ_REQUEST_TIMEOUT = 30


def _tj_normalize_date(raw):
    """Convert ``YYYY年MM月DD日`` to ``YYYY-MM-DD``.

    Text that is empty or in another format is returned stripped instead of
    raising, so a page without the metadata box can still reach the
    fallback-layout handling.
    """
    text = (raw or '').strip()
    try:
        return datetime.datetime.strptime(text, "%Y年%m月%d日").strftime("%Y-%m-%d")
    except ValueError:
        return text


def _tj_fetch_list_anchors(url):
    """Download one list page and return the ``<a>`` tag of each ``<li>`` entry.

    Raises whatever ``requests`` raises on network failure, and ``IndexError``
    when the list container is missing from the page.
    """
    baseTool.headers['Accept-Language'] = 'zh-CN,zh;q=0.9'
    req = requests.get(url=url, headers=baseTool.headers, verify=False,
                       timeout=_TJ_REQUEST_TIMEOUT)
    # Decode the raw bytes directly: round-tripping req.text through
    # ISO-8859-1 fails as soon as requests detects the real charset.
    page = BeautifulSoup(req.content.decode('utf-8', errors='replace'), 'html.parser')
    doc_items = page.select(_TJ_LIST_SELECTOR)[0]
    anchors = (li.find('a') for li in doc_items.find_all('li'))
    # Entries without a usable link are skipped instead of aborting the page.
    return [a for a in anchors if a is not None and a.get('href')]


def _tj_resolve_sjwj(list_url, raw_href):
    """Build the absolute detail URL for an entry of the ``sjwj`` section."""
    if 'ZTZL' in raw_href:
        return raw_href.replace('../../../', 'https://sasac.tj.gov.cn/')
    if './' in raw_href:
        return raw_href.replace('./', 'https://sasac.tj.gov.cn/ZWGK1142/zcwj/sjwj/')
    return raw_href


def _tj_resolve_wjwj(list_url, raw_href):
    """Build the detail URL for a ``wjwj`` entry; ``None`` means "skip it"."""
    if 'http:' in raw_href:
        return None
    return list_url.split('index')[0] + raw_href.replace('./', '')


def _tj_resolve_gjjwj(list_url, raw_href):
    """Build the absolute detail URL for an entry of the ``gjjwj`` section."""
    if 'http' in raw_href:
        return raw_href
    if '../../../' in raw_href:
        raw_href = raw_href.replace('../../../', 'https://sasac.tj.gov.cn/')
    return raw_href.replace('./', 'https://sasac.tj.gov.cn/ZWGK1142/zcwj/gjjwj/')


def _tj_apply_article_layout(doc, fields):
    """Overwrite *fields* from the plain-article layout (no metadata box)."""
    fields['title'] = doc('div[class="common-content-mainTitle"]').text()
    fields['issuedNumber'] = doc('div[class="common-content-subTitle"]').text()
    fields['origin'] = doc('div[class="property"]>span:nth-child(1)').text().replace(
        '文章来源：', '').strip()
    fields['publishDate'] = doc('div[class="property"]>span:nth-child(2)').text().replace(
        '发布时间：', '').strip()
    doc('div[id="articlePlayer"]').remove()
    fields['contentWithTag'] = doc('div[id="zoom"]')


def _tj_extract_top_container(doc, written_selector, publish_selector):
    """Read the metadata box (``div.top-container``) and body of a detail page."""
    row = 'div[class="top-container"]>div:nth-child({})>:nth-child(2)'
    fields = {
        'title': doc(row.format(1)).text(),
        'organ': doc(row.format(3)).text(),
        'issuedNumber': doc(row.format(4)).text(),
        'topicClassification': doc(row.format(5)).text(),
        'writtenDate': _tj_normalize_date(doc(written_selector).text()),
        'publishDate': _tj_normalize_date(doc(publish_selector).text()),
        'origin': '',
        'contentWithTag': doc('div[id="xlrllt"]'),
    }
    if not fields['title']:
        _tj_apply_article_layout(doc, fields)
    return fields


def _tj_extract_sjwj(doc):
    """Field extractor for ``sjwj`` detail pages."""
    # Drop the player widget so it does not end up in the stored body.
    doc('div[id="articlePlayer"]').remove()
    row = 'div[class="top-container"]>div:nth-child({})>:nth-child(2)'
    return _tj_extract_top_container(doc, row.format(6), row.format(7))


def _tj_extract_wjwj(doc):
    """Field extractor for ``wjwj`` detail pages."""
    return _tj_extract_top_container(doc, 'div[id="content_cwrq"]', 'div[id="content_fbrq"]')


def _tj_extract_gjjwj(doc):
    """Field extractor for ``gjjwj`` detail pages (``table.bd1`` metadata)."""
    cell = 'table[class="bd1"]>tbody>tr:nth-child({})>td:nth-child({})'
    fields = {
        'title': doc(cell.format(3, 2)).text(),
        'organ': doc(cell.format(2, 2)).text(),
        'issuedNumber': doc(cell.format(4, 2)).text(),
        'topicClassification': doc(cell.format(1, 4)).text(),
        'writtenDate': doc(cell.format(2, 4)).text(),
        'publishDate': doc(cell.format(4, 4)).text(),
        'origin': '',
        'contentWithTag': doc('div[id="UCAP-CONTENT"]'),
    }
    if fields['title']:
        return fields

    _tj_apply_article_layout(doc, fields)
    if fields['title']:
        return fields

    # Third layout: nested tables inside the "内容文本区" region.
    region = doc('div[aria-label="内容文本区"]')
    region_soup = BeautifulSoup(str(region), 'html.parser')
    info_list = region_soup.find('tbody').find('tbody').find('tr').find_all('table')
    # NOTE(review): the first table is taken to hold the headline; previously
    # the tag was fetched but never used, leaving the title empty — confirm.
    fields['title'] = ' '.join(info_list[0].get_text().split())
    fields['organ'] = info_list[2].find('span', id="laiyuan").text
    fields['publishDate'] = info_list[2].find_all('td', class_="hui12")[-1].text
    fields['contentWithTag'] = info_list[-1]
    return fields


def _tj_collect_attachments(soup, num, publishDate):
    """Upload every attachment linked from *soup* and rewrite its href.

    Returns the list of attachment ids reported by ``baseCore.tableUpdate``.
    Links whose upload reports a falsy ``state`` are left untouched.
    """
    id_list = []
    for link in soup.find_all('a'):
        file_href = link.get('href')
        if not file_href:
            continue
        if not any(marker in file_href for marker in _TJ_ATTACHMENT_MARKERS):
            continue
        file_name = link.text.strip()
        category = os.path.splitext(file_href)[1]
        if category not in file_name:
            file_name = file_name + category
        retData = baseCore.uptoOBS(file_href, '1683', file_name)
        if not retData['state']:
            continue
        att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num, publishDate)
        id_list.append(att_id)
        # NOTE(review): the prefix lacks "//" after "http:"; kept unchanged
        # because stored links may rely on it — confirm before correcting.
        link['href'] = 'http:obs.ciglobal.cn/' + str(full_path)
    return id_list


def _tj_build_news(href, extract_detail, num, drop_unused_attachment_list):
    """Load one detail page and build its Kafka payload.

    Returns the payload dict, or ``None`` when the page has no body text.
    """
    # NOTE(review): a driver is requested per article and never closed here;
    # whether getDriver() reuses one instance is not visible from this file.
    driver = baseTool.getDriver()
    driver.get(href)
    time.sleep(2)  # give the page time to finish rendering
    page_soup = baseTool.paserUrl(driver.page_source, href)
    doc = pq(str(page_soup))

    fields = extract_detail(doc)
    writtenDate = fields['writtenDate'] or None
    publishDate = fields['publishDate']
    if not publishDate:
        # Looked up on the full document; a sub-selection never contains <meta>.
        publishDate = doc('meta[name="PubDate"]').attr('content')

    soup = baseTool.paserUrl(str(fields['contentWithTag']), href)
    id_list = _tj_collect_attachments(soup, num, publishDate)
    if drop_unused_attachment_list and not id_list:
        # Must be removed from the soup that is serialised below; removing it
        # from the PyQuery document at this point would have no effect.
        for leftover in soup.find_all('ul', class_='qt-attachments-list'):
            leftover.decompose()

    contentWithTag = str(soup.prettify())
    content = soup.text
    if not content:
        log.info(f'-----{href}----{fields["title"]}----内容为空-----')
        return None

    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    return {
        'attachmentIds': id_list,
        'author': '',
        'content': str(content),
        'contentWithTag': str(contentWithTag),
        'createDate': time_now,
        'deleteFlag': 0,
        'id': '',
        'labels': [{'relationId': "1683", 'relationName': "天津市国资委", 'labelMark': "policy"}],
        'origin': fields['origin'],
        'organ': fields['organ'],
        'topicClassification': fields['topicClassification'],
        'issuedNumber': fields['issuedNumber'],
        'publishDate': publishDate,
        'writtenDate': writtenDate,
        'sid': '1697458829758697473',
        'sourceAddress': href,
        'summary': '',
        'title': fields['title']
    }


def _tj_crawl_section(list_urls, resolve_href, extract_detail,
                      drop_unused_attachment_list=False):
    """Crawl every list page of one section and publish the new articles.

    A failure on one article or one list page is logged and the crawl moves
    on; it never propagates to the caller.
    """
    num = 0    # articles seen so far (already stored + newly saved)
    count = 0  # articles newly saved in this run
    start_time = time.time()
    for url in list_urls:
        try:
            for anchor in _tj_fetch_list_anchors(url):
                href = resolve_href(url, str(anchor.get('href')))
                if href is None:
                    continue
                if baseTool.db_storage.find_one({'网址': href}):
                    num += 1
                    continue
                try:
                    dic_news = _tj_build_news(href, extract_detail, num,
                                              drop_unused_attachment_list)
                    if dic_news is None:
                        continue
                    if baseTool.sendKafka(dic_news):
                        baseTool.save_data(dic_news)
                        num += 1
                        count += 1
                except Exception as e:
                    log.error(f'tian_jin: failed to process {href}: {e!r}')
        except Exception as e:
            log.error(f'tian_jin: failed to process list page {url}: {e!r}')
    end_time = time.time()
    log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')


def tian_jin():
    """Crawl the ``sjwj``, ``wjwj`` and ``gjjwj`` policy sections of sasac.tj.gov.cn.

    For every article not yet present in ``baseTool.db_storage`` the detail
    page is loaded through the browser driver, its attachments are uploaded,
    and the record is sent to Kafka and saved. Takes no arguments and returns
    nothing; per-section totals are written to the log.
    """
    sjwj_pages = ['http://sasac.tj.gov.cn/ZWGK1142/zcwj/sjwj/'] + [
        f'https://sasac.tj.gov.cn/ZWGK1142/zcwj/sjwj/index_{page}.html'
        for page in range(1, 3)
    ]
    wjwj_pages = ['http://sasac.tj.gov.cn/ZWGK1142/zcwj/wjwj/index.html'] + [
        f'http://sasac.tj.gov.cn/ZWGK1142/zcwj/wjwj/index_{page}.html'
        for page in range(1, 5)
    ]
    gjjwj_pages = [
        'https://sasac.tj.gov.cn/ZWGK1142/zcwj/gjjwj/index.html',
        'https://sasac.tj.gov.cn/ZWGK1142/zcwj/gjjwj/index_1.html',
    ]

    _tj_crawl_section(sjwj_pages, _tj_resolve_sjwj, _tj_extract_sjwj)
    _tj_crawl_section(wjwj_pages, _tj_resolve_wjwj, _tj_extract_wjwj,
                      drop_unused_attachment_list=True)
    _tj_crawl_section(gjjwj_pages, _tj_resolve_gjjwj, _tj_extract_gjjwj)

# Run the Tianjin crawl when this file is executed as a script.
if __name__ == "__main__":
    tian_jin()