import os
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
import time
from selenium.webdriver import Firefox
from selenium import webdriver

import BaseCore

# Shared project helper instance. In this file it is used for logging, proxies
# (get_proxy), the ``r`` client (sismember/sadd - presumably Redis; confirm),
# Kafka publishing (sendkafka), language detection and attachment cleanup.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()

from reits import Policy
# Project helper; used below through paserUrl() and attuributefile().
policy = Policy()


# Kafka topic that collected records are sent to (see baseCore.sendkafka calls).
topic = 'research_center_fourth'
# Site name; also used as the suffix of the 'REITs::<webname>' de-duplication key.
webname = '湖北省人民政府'
# Minimal header set, used for the search API and for attachment downloads.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}

# Full browser-like header set, used by getSoup() for HTML pages.
# NOTE(review): the Cookie below is a hard-coded token/uuid pair captured from a
# browser session - presumably it expires; confirm whether it must be refreshed.
headers_ = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Cookie': 'token=db51d0a6-06e1-49f6-8e4f-8cec52c47bec; uuid=db51d0a6-06e1-49f6-8e4f-8cec52c47bec;',
    'Host': 'www.hubei.gov.cn',
    'Pragma': 'no-cache',
    'Referer': 'http://www.hubei.gov.cn/site/hubei/search.html',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}


def getSoup(url):
    """Download ``url`` through a proxy and return the parsed HTML document.

    The request uses the browser-like ``headers_`` and a proxy obtained from
    ``baseCore.get_proxy()``.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``BeautifulSoup`` object built from the response text with the
        built-in ``html.parser``.
    """
    response = requests.get(url, headers=headers_, proxies=baseCore.get_proxy())
    # Decode with the charset detected from the body instead of the one
    # announced in the HTTP headers.
    response.encoding = response.apparent_encoding
    return BeautifulSoup(response.text, 'html.parser')


def getFjContent(url):
    """Download a file through a proxy and return its raw bytes.

    Args:
        url: Address of the file to download.

    Returns:
        bytes: The undecoded response body (``Response.content``).
    """
    ip = baseCore.get_proxy()
    req = requests.get(url, headers=headers, proxies=ip)
    # No ``req.encoding = req.apparent_encoding`` here: ``content`` is the raw
    # body and is not affected by the text encoding, so running charset
    # detection over a (possibly large, binary) payload was wasted work.
    return req.content


def getDataJson(page):
    """Fetch one page of search results for the keyword ``REITs``.

    The query asks the site's search endpoint for government documents,
    10 per page, ordered by ``PUBDATE`` descending.

    Args:
        page: Page number substituted into the ``pageNumber`` query parameter.

    Returns:
        The value stored under ``['page']['content']`` of the JSON response
        (iterated by the caller as a sequence of result records).
    """
    search_url = f'http://www.hubei.gov.cn/igs/front/search/list.html?index=hb-govdoc&type=govdoc&pageNumber={page}&pageSize=10&filter[AVAILABLE]=true&filter[fileNum-like]=&filter[Effectivestate]=&filter[fileYear]=&filter[fileYear-lte]=&filter[Subjectclass]=&filter[CateGory]=&filter[DOCTITLE,DOCCONTENT,fileNum-or]=REITs&code=872801132c71495bbe5a938f6acff5aa&siteId=50&filter[SITEID]=54&orderProperty=PUBDATE&orderDirection=desc&6LDjm9Ls=0MADqxalqEiunxfMA3PwdIsvIxiRRQzDxXUAXPlbOXcZq0Rg0iIRTAWPM5NCpsIcnfs9rjzmAOc6t7j5dB4VBmMHY3KtuQHQ6bnSkbepFXgB0I.UuQKzMa5IqQB19wRAMEmnB7VYU4cW'
    response = requests.get(search_url, headers=headers, proxies=baseCore.get_proxy())
    # Set the detected charset before the body is decoded as JSON.
    response.encoding = response.apparent_encoding
    payload = response.json()
    return payload['page']['content']


def getContent(driver, url, num):
    """Render an article page and extract its publish date, text and images.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Address of the article page.
        num: Sequence number forwarded to ``policy.attuributefile`` for every
            image that is stored.

    Returns:
        tuple: ``(publishDate, content, contentWithTag, id_list)`` where
        ``publishDate`` is the first 10 characters following '发布时间：' in
        the meta block, ``content`` is the stripped article text,
        ``contentWithTag`` is the BeautifulSoup tag of the article body and
        ``id_list`` holds the ids returned for the stored images.

    Raises:
        AttributeError: If the meta-time or article-content element is not
            present in the page.
        IndexError: If the meta block does not contain '发布时间：'.
    """
    driver.get(url)
    # Fixed wait after navigation before the DOM snapshot is taken.
    time.sleep(5)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    policy.paserUrl(soup, url)
    meta_text = soup.find(class_='hbgov-article-meta-time').text
    publishDate = meta_text.split('发布时间：')[1].strip()[:10]
    time.sleep(2)
    contentWithTag = soup.find(class_='hbgov-article-content')

    id_list = []
    num_ = 1
    # Select <img> tags by name. The previous ``find_all(class_='img')``
    # matched elements whose CSS class is "img", not image tags.
    for img in contentWithTag.find_all('img'):
        # BeautifulSoup tags expose attributes via ``get``; ``get_attribute``
        # is Selenium API and is not callable on a bs4 Tag.
        fj_href = img.get('src')
        if not fj_href:
            # Nothing to download for an image without a source.
            continue
        fj_title = img.get('title') or ''

        if fj_title == '':
            # Untitled images are named by a running counter.
            fj_title = str(num_)
            num_ += 1
        category = os.path.splitext(fj_href)[1]
        if category not in fj_title:
            fj_title = fj_title + category
        att_id, full_path = policy.attuributefile(fj_title, fj_href, num, publishDate)
        if att_id:
            id_list.append(att_id)
            # Point the image at the stored copy; ``src`` is the attribute an
            # <img> is loaded from (``href`` has no effect on it).
            img['src'] = full_path
    content = contentWithTag.text.strip()

    return publishDate, content, contentWithTag, id_list


def getData(driver, data_, num):
    """Turn one search hit into a record and publish it to Kafka.

    The hit is skipped when its URL is already a member of the
    ``'REITs::' + webname`` set. Otherwise the article page is scraped with
    ``getContent``, the record is sent to ``topic`` and the URL is added to
    the set.

    Args:
        driver: Selenium WebDriver passed through to ``getContent``.
        data_: One search result record; the keys ``DOCTITLE``, ``SITENAME``,
            ``fileNum``, ``PUBDATE``, ``publisher``, ``highlight`` and
            ``DOCPUBURL`` are read.
        num: Sequence number passed through to ``getContent``.

    Returns:
        None.
    """
    title = data_['DOCTITLE']
    origin = data_['SITENAME']
    pub_hao = data_['fileNum']
    writtenDate = str(data_['PUBDATE'])[:10]
    organ = data_['publisher']
    summary = data_['highlight']['DOCCONTENT'][0]
    href = data_['DOCPUBURL']
    redis_key = 'REITs::' + webname
    # Deduplicate by link: skip URLs that were already collected.
    if baseCore.r.sismember(redis_key, href):
        return
    publishDate, content, contentWithTag, id_list = getContent(driver, href, num)

    contentWithTag_str = str(contentWithTag)
    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    lang = baseCore.detect_language(content)
    dic_info = {
        'attachmentIds': id_list,
        'subjectId': '1729315113088765953',
        'lang': lang,
        'author': '',
        'content': content,
        'contentWithTag': contentWithTag_str,
        'deleteFlag': 0,
        'checkStatus': 1,
        'id': '1729315113088765953' + str(int(time.time())),
        'title': title,
        'publishDate': publishDate,
        'origin': origin,
        'sourceAddress': href,
        'writtenDate': writtenDate,
        'organ': organ,
        'topicClassification': '',
        'issuedNumber': pub_hao,
        # The highlighted fragment carries <em> markers around the match.
        'summary': summary.replace('</em>', '').replace('<em>', ''),
        'createDate': time_now,
        'sid': '1729044085724860418'
    }
    try:
        baseCore.sendkafka(dic_info, topic)
    except Exception as e:
        # The record was not delivered: report it and roll back the
        # attachments stored for it.
        log.error(f'采集失败--{title}--{href}--{e}')
        for att_id in id_list:
            baseCore.deliteATT(att_id)
        return
    try:
        baseCore.r.sadd(redis_key, href)
    except Exception as e:
        # The record is already sent, so its attachments must be kept; only
        # the de-duplication marker is missing.
        log.error(f'去重标记写入失败--{title}--{href}--{e}')
    log.info(f'采集成功--{title}--{href}')


def doJob():
    """Crawl the first two result pages and process every hit.

    Starts a Firefox WebDriver with an overridden user agent, walks search
    result pages 1 and 2 via ``getDataJson`` and hands each record to
    ``getData``. The browser is always shut down afterwards.
    """
    # service = Service(r'D:/soft/geckodriver.exe')
    service = Service(r'F:\spider\firefox\geckodriver_1.exe')
    options = Options()
    options.set_preference("general.useragent.override", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
    driver = webdriver.Firefox(options=options, service=service)

    try:
        # Running counter over all records; passed on as the sequence number.
        num = 1
        for page in range(1, 3):
            for data_ in getDataJson(page):
                getData(driver, data_, num)
                num += 1
    finally:
        # quit() ends the whole session and the driver process; close() only
        # closes the current window. Doing it in ``finally`` keeps a failed
        # crawl from leaving a browser behind.
        driver.quit()

#
if __name__ == '__main__':
    try:
        doJob()
    finally:
        # Release BaseCore resources even when the crawl aborts with an error.
        baseCore.close()
    # Manual debugging snippet (kept for reference): saves every image of one
    # article to ./<n>.png.
    # service = Service(r'F:\spider\firefox\geckodriver.exe')
    # options = Options()
    # options.set_preference("general.useragent.override", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
    # driver = webdriver.Firefox(options=options, service=service)
    # driver.get('http://www.hubei.gov.cn/zfwj/ezf/202208/t20220801_4245008.shtml')
    # time.sleep(5)
    # contentWithTag = driver.find_element(By.CLASS_NAME,'hbgov-article-content')
    # img_list = contentWithTag.find_elements(By.TAG_NAME,'img')
    # num = 1
    # for img in img_list:
    #     fj_href = img.get_attribute('src')
    #     fjcontent = getFjContent(fj_href)
    #     with open(f'./{num}.png','wb') as f:
    #         f.write(fjcontent)
    #     num += 1
