import os
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service

from base import BaseCore
import time
from selenium.webdriver import Firefox
from selenium import webdriver



# Project helper object; this module takes its logger and its request
# proxies (baseCore.get_proxy()) from it.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()

# Browser identity string shared by both header sets below.
_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
)

# Minimal header set: used for the search API and for attachment downloads.
headers = {'User-Agent': _USER_AGENT}

# Full browser-like header set for www.hubei.gov.cn pages (used by getSoup).
# NOTE: the Cookie value is hard-coded.
headers_ = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Cookie': 'token=db51d0a6-06e1-49f6-8e4f-8cec52c47bec; uuid=db51d0a6-06e1-49f6-8e4f-8cec52c47bec;',
    'Host': 'www.hubei.gov.cn',
    'Pragma': 'no-cache',
    'Referer': 'http://www.hubei.gov.cn/site/hubei/search.html',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': _USER_AGENT,
}


def getSoup(url):
    """Download ``url`` and return it parsed as an HTML tree.

    The request is sent with the ``headers_`` header set through a proxy
    obtained from ``baseCore.get_proxy()``. The body is decoded with the
    encoding detected from its content (``apparent_encoding``) and parsed
    with BeautifulSoup's ``'html.parser'``.
    """
    response = requests.get(url, headers=headers_, proxies=baseCore.get_proxy())
    # Decode with the encoding guessed from the body itself.
    response.encoding = response.apparent_encoding
    return BeautifulSoup(response.text, 'html.parser')


def getFjContent(url):
    """Download an attachment and return its body as raw bytes.

    The request is sent with the minimal ``headers`` set through a proxy
    obtained from ``baseCore.get_proxy()``.

    Args:
        url: Address of the attachment (in this module: an image ``src``).

    Returns:
        The undecoded response body (``bytes``).
    """
    ip = baseCore.get_proxy()
    req = requests.get(url, headers=headers, proxies=ip)
    # No encoding detection here: ``content`` is the undecoded byte payload,
    # so guessing a text encoding for it would only cost time.
    return req.content


def getDataJson(page):
    """Fetch one page of the policy-document search and return its records.

    The search URL is fixed except for ``pageNumber``; it requests ten
    results per page for the keyword ``REITs``, ordered by ``PUBDATE``
    descending. The request uses the minimal ``headers`` set and a proxy
    from ``baseCore.get_proxy()``.

    Args:
        page: Page number inserted into the ``pageNumber`` query parameter.

    Returns:
        The value stored under ``page`` -> ``content`` in the JSON response.
    """
    # Kept as one literal: the bracketed filter names must be sent verbatim.
    search_url = f'http://www.hubei.gov.cn/igs/front/search/list.html?index=hb-govdoc&type=govdoc&pageNumber={page}&pageSize=10&filter[AVAILABLE]=true&filter[fileNum-like]=&filter[Effectivestate]=&filter[fileYear]=&filter[fileYear-lte]=&filter[Subjectclass]=&filter[CateGory]=&filter[DOCTITLE,DOCCONTENT,fileNum-or]=REITs&code=872801132c71495bbe5a938f6acff5aa&siteId=50&filter[SITEID]=54&orderProperty=PUBDATE&orderDirection=desc&6LDjm9Ls=0MADqxalqEiunxfMA3PwdIsvIxiRRQzDxXUAXPlbOXcZq0Rg0iIRTAWPM5NCpsIcnfs9rjzmAOc6t7j5dB4VBmMHY3KtuQHQ6bnSkbepFXgB0I.UuQKzMa5IqQB19wRAMEmnB7VYU4cW'
    proxy = baseCore.get_proxy()
    response = requests.get(search_url, headers=headers, proxies=proxy)
    # The JSON text is decoded with the encoding detected from the body.
    response.encoding = response.apparent_encoding
    payload = response.json()
    return payload['page']['content']


def getContent(driver, url, num, save_dir='./相关政策/湖北省人民政府/政策文件'):
    """Open an article page and collect its publish time, text and images.

    Every ``<img>`` inside the article body is downloaded with
    ``getFjContent`` and written into ``save_dir`` under the name
    ``{num}-{date}-{title}{ext}``, where ``date`` is the first ten
    characters of the publish time and ``title`` is the image's ``title``
    attribute (or a running number when it has none). An existing file is
    never overwritten: a ``-N`` suffix is inserted before the extension
    until the name is free.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        url: Address of the article page.
        num: Running number of the article; used as file-name prefix.
        save_dir: Directory the images are written to; created if missing.

    Returns:
        Tuple ``(publishDate, content, fjtitle_list, fjhref_list)``. The
        last two are newline-separated strings holding the saved file names
        and the image URLs.

    Raises:
        IndexError: If the meta element's text lacks the '发布时间：' marker.
        Errors from the element lookups and from the download propagate
        unchanged.
    """
    from urllib.parse import urlsplit

    driver.get(url)
    # Fixed pause before querying the DOM — presumably to let the page
    # finish rendering.
    time.sleep(5)
    meta_text = driver.find_element(By.CLASS_NAME, 'hbgov-article-meta-time').text
    publishDate = meta_text.split('发布时间：')[1].strip()
    contentWithTag = driver.find_element(By.CLASS_NAME, 'hbgov-article-content')

    os.makedirs(save_dir, exist_ok=True)
    fj_titles = []
    fj_hrefs = []
    # Running counter: names untitled images and disambiguates duplicates.
    num_ = 1
    for img in contentWithTag.find_elements(By.TAG_NAME, 'img'):
        fj_href = img.get_attribute('src')
        if not fj_href:
            # Without a source URL there is nothing to download.
            continue
        fj_hrefs.append(fj_href)

        fj_title = img.get_attribute('title')
        if not fj_title:
            # Covers both a missing attribute (None) and an empty one.
            fj_title = str(num_)
            num_ += 1

        # Take the extension from the URL path only, so a query string or
        # fragment never leaks into the file name.
        category = os.path.splitext(urlsplit(fj_href).path)[1]
        if category not in fj_title:
            fj_title = fj_title + category
        fj_title = f'{num}-{publishDate[:10]}-{fj_title}'

        # Split once into stem + extension so the duplicate suffix always
        # goes in front of the extension (and an empty extension is safe).
        if category and fj_title.endswith(category):
            stem, suffix = fj_title[:-len(category)], category
        else:
            stem, suffix = fj_title, ''
        file = f'{save_dir}/{fj_title}'
        while os.path.exists(file):
            fj_title = f'{stem}-{num_}{suffix}'
            num_ += 1
            file = f'{save_dir}/{fj_title}'

        fjcontent = getFjContent(fj_href)
        with open(file, 'wb') as f:
            f.write(fjcontent)
        # Record the name actually written to disk.
        fj_titles.append(fj_title)
        log.info(f'{fj_title}===附件下载成功')

    content = contentWithTag.text.strip()
    fjtitle_list = '\n'.join(fj_titles).strip()
    fjhref_list = '\n'.join(fj_hrefs).strip()
    return publishDate, content, fjtitle_list, fjhref_list


def getData(driver, data_, num):
    """Turn one search-result record into a spreadsheet row.

    Reads the title, site name, file number, publish date, publisher, first
    highlighted content snippet and document URL from ``data_``, then loads
    the document page through ``getContent`` for the remaining fields.

    Args:
        driver: Selenium WebDriver handed on to ``getContent``.
        data_: One record of the search response (mapping).
        num: Running number of the record; becomes the first cell.

    Returns:
        A 12-item list: number, title, publish time, source, link, written
        date, issuing organ, file number, summary, body text, attachment
        names, attachment links.
    """
    doc_title = data_['DOCTITLE']
    site_name = data_['SITENAME']
    file_number = data_['fileNum']
    written_on = data_['PUBDATE']
    issuing_organ = data_['publisher']
    # Only the first highlighted snippet is kept as the summary.
    digest = data_['highlight']['DOCCONTENT'][0]
    link = data_['DOCPUBURL']

    page_fields = getContent(driver, link, num)
    published_on, body_text, attachment_names, attachment_links = page_fields

    return [
        num,
        doc_title,
        published_on,
        site_name,
        link,
        written_on,
        issuing_organ,
        file_number,
        digest,
        body_text,
        attachment_names,
        attachment_links,
    ]


def doJob():
    """Scrape the first two search-result pages and export them to Excel.

    Starts a Firefox WebDriver, walks result pages 1 and 2 from
    ``getDataJson``, converts every record with ``getData`` (which also
    downloads the embedded images) and writes all rows to
    ``./相关政策/湖北省人民政府/湖北省人民政府政策文件.xlsx``.

    The browser session is always shut down, even when scraping fails, and
    an empty result still produces a sheet with the header row.
    """
    save_dir = './相关政策/湖北省人民政府/政策文件'
    columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']

    # Create the output folders before a browser is launched.
    os.makedirs(save_dir, exist_ok=True)

    service = Service(r'F:\spider\firefox\geckodriver.exe')
    options = Options()
    options.set_preference("general.useragent.override", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
    driver = webdriver.Firefox(options=options, service=service)

    data_list = []
    try:
        num = 1
        for page in range(1, 3):
            for data_ in getDataJson(page):
                data = getData(driver, data_, num)
                data_list.append(data)
                log.info(f'{data[1]}===采集成功')
                num += 1
    finally:
        # quit() ends the whole WebDriver session; close() would only close
        # the current window and would be skipped on an exception.
        driver.quit()

    # Build the frame straight from the rows: this keeps each cell's own
    # type and still yields the right columns when no rows were collected.
    df = pd.DataFrame(data_list, columns=columns)
    df.to_excel('./相关政策/湖北省人民政府/湖北省人民政府政策文件.xlsx', index=False)

#
if __name__ == '__main__':
    # Always call baseCore.close(), even when the job aborts with an
    # exception.
    try:
        doJob()
    finally:
        baseCore.close()
