import os
import time

from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Shared scraping helper. hu_bei() reads driver_path, chromr_bin and db_storage
# from this instance and calls its paserUrl, sendKafka and save_data methods.
from ClassTool import ClassTool
baseTool = ClassTool()

# Shared core helper. It supplies the module logger as well as the uptoOBS and
# tableUpdate calls that hu_bei() uses when handling attachments.
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()

# Substrings that mark a link as a downloadable attachment. Matching is a
# case-sensitive substring test against the href, exactly as listed (note that
# 'xlsx' has no leading dot and lower-case '.xls' is not in the list).
_HU_BEI_ATTACHMENT_MARKERS = (
    '.pdf', '.docx', '.doc', 'xlsx', '.zip', '.rar', '.ppt',
    '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR',
)


# Hubei
def hu_bei(chromr_bin=None):
    """Crawl the Hubei SASAC normative-documents listing and publish new items.

    Opens the listing page in Chrome, collects the link of every ``<li>`` in
    the ``ulList`` element, skips links already present in
    ``baseTool.db_storage``, and for each remaining page extracts the article
    metadata and body, uploads linked attachments through ``baseCore``, sends
    the assembled record with ``baseTool.sendKafka`` and, when that succeeds,
    stores it with ``baseTool.save_data``.

    Failures on an individual article page are logged and the crawl moves on
    to the next link. Failures on the listing page propagate to the caller,
    but the browser is always shut down first.

    Args:
        chromr_bin: Currently unused; kept so existing callers keep working.
            The Chrome binary location is taken from ``baseTool.chromr_bin``.

    Returns:
        None.
    """
    # `num` counts both skipped (already stored) and newly saved links and is
    # handed to baseCore.tableUpdate; `count` counts only newly saved records.
    num = 0
    count = 0
    start_time = time.time()
    url = 'http://gzw.hubei.gov.cn/zfxxgk/zc/gfxwj/'

    chrome_options = webdriver.ChromeOptions()
    # Headless mode is currently disabled:
    # chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.binary_location = baseTool.chromr_bin
    # `options=` is the supported keyword; the older `chrome_options=` spelling
    # is deprecated and rejected by newer Selenium 4 releases.
    driver = webdriver.Chrome(service=Service(baseTool.driver_path), options=chrome_options)

    try:
        driver.get(url)
        time.sleep(2)
        ul = driver.find_element(By.ID, 'ulList')
        li_list = ul.find_elements(By.TAG_NAME, 'li')
        time.sleep(1)
        # Collect every href up front: navigating away would invalidate the
        # listing page's element handles.
        hrefs = [
            li.find_element(By.TAG_NAME, 'a').get_attribute('href')
            for li in li_list
        ]

        for href in hrefs:
            # Already stored: count it and move on.
            if baseTool.db_storage.find_one({'网址': href}):
                num += 1
                continue
            try:
                driver.get(href)
                time.sleep(2)
                dhtml = driver.page_source
                if len(dhtml) < 400:
                    # Suspiciously short page: reload once and re-read the
                    # source so the retry result is what actually gets parsed.
                    driver.get(href)
                    time.sleep(2)
                    dhtml = driver.page_source

                doc = pq(dhtml)
                article = doc('div[class="article"]')
                adoc = pq(article)
                title = adoc('h2').text()
                publishDate = adoc('div[class="info"]>span:nth-child(1)').text()
                origin = adoc('div[class="info"]>span:nth-child(3)').text()
                organ = ''
                topicClassification = adoc('td[bfdi="93"]').text()
                issuedNumber = adoc('td[bfdi="101"]').text()
                writtenDate = adoc('td[bfdi="98"]').text()

                # Drop the "attachments:" paragraph and the QR-code widget
                # before the body is extracted.
                adoc('p:contains("附件：")').remove()
                adoc('div[class="hbgov-qrcode-content"]').remove()

                contentWithTag = adoc('div[class="article-box"]')
                soup = baseTool.paserUrl(str(contentWithTag), href)
                fu_jian_soup = soup.find_all('a')

                id_list = []
                for link in fu_jian_soup:
                    try:
                        file_href = link['href']
                    except KeyError:
                        # Anchor without an href attribute.
                        continue
                    if not any(marker in file_href for marker in _HU_BEI_ATTACHMENT_MARKERS):
                        continue
                    file_name = link.text.strip()
                    category = os.path.splitext(file_href)[1]
                    # Make sure the stored name carries the file extension.
                    if category not in file_name:
                        file_name = file_name + category
                    retData = baseCore.uptoOBS(file_href, '1675', file_name)
                    if not retData['state']:
                        continue
                    att_id, full_path = baseCore.tableUpdate(
                        retData, '湖北省国资委', file_name, num, publishDate)
                    id_list.append(att_id)
                    # Point the link in the stored HTML at the uploaded copy.
                    # NOTE(review): the prefix lacks '//' after 'http:'; left
                    # unchanged because downstream consumers may match on it —
                    # confirm and fix separately.
                    link['href'] = 'http:zzsn.luyuen.com/' + str(full_path)

                contentWithTag = str(soup.prettify())
                if not contentWithTag and not fu_jian_soup:
                    continue
                content = soup.text
                if not content:
                    log.info(f'-----{href}----{title}----内容为空-----')
                    continue

                time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                # Record sent to Kafka.
                dic_news = {
                    'attachmentIds': id_list,
                    'author': '',
                    'content': str(content),
                    'contentWithTag': str(contentWithTag),
                    'createDate': time_now,
                    'deleteFlag': 0,
                    'id': '',
                    'labels': [{'relationId': "1675", 'relationName': "湖北省国资委", 'labelMark': "policy"}],
                    'origin': origin,
                    'organ': organ,
                    'topicClassification': topicClassification,
                    'issuedNumber': issuedNumber,
                    'publishDate': publishDate,
                    'writtenDate': writtenDate,
                    'sid': '1697458829758697473',
                    'sourceAddress': href,
                    'summary': '',
                    'title': title
                }
                # Persist locally only after the Kafka send has succeeded.
                if baseTool.sendKafka(dic_news):
                    baseTool.save_data(dic_news)
                    num += 1
                    count += 1
            except Exception as e:
                # Best-effort crawl: one bad page must not stop the run, but
                # the failure has to be visible in the log.
                log.error(f'-----{href}----failed to process: {e!r}-----')
    finally:
        # quit() ends the whole browser session (window and driver process),
        # including when the listing page itself failed to load or parse.
        driver.quit()

    end_time = time.time()
    log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# Run the Hubei crawl when this file is executed directly as a script.
if __name__ == "__main__":
    hu_bei()