import time
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
import BaseCore

# Shared project helper. In this script it supplies the logger, the `r`
# client used for link de-duplication (sismember/sadd -- presumably Redis,
# confirm in BaseCore) and the Kafka publisher (`sendkafka`).
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()

# Policy provides the Selenium driver factory (`createDriver`) used by doJob().
from reits import Policy
policy = Policy()


# Kafka topic that collected records are published to.
topic = 'policy'
# Site name; also used as the suffix of the de-duplication set key.
webname = '辽宁省人民政府'
# Headers for the plain `requests` fetches of detail pages. The key must be
# spelled 'User-Agent' (hyphen): with an underscore it is sent as an unrelated
# custom header and the default python-requests agent is still used.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
}


def getContent(url):
    """Fetch a policy detail page and extract its body and document number.

    Args:
        url: Absolute URL of the detail page.

    Returns:
        A ``(contentWithTag, pub_hao)`` tuple: the ``div.zfwj_detail`` element
        parsed by BeautifulSoup, and the stripped text of its ``p.wjh`` child
        (``''`` when the page has no such paragraph).

    Raises:
        ValueError: If the page has no ``div.zfwj_detail`` container.
        requests.RequestException: On network failure or timeout.
    """
    # Bound the request so one unresponsive page cannot hang the whole crawl.
    req = requests.get(url, headers=headers, timeout=30)
    # Let requests guess the charset from the body rather than trusting the
    # response headers.
    req.encoding = req.apparent_encoding
    soup = BeautifulSoup(req.text, 'html.parser')
    contentWithTag = soup.find('div', class_='zfwj_detail')
    if contentWithTag is None:
        raise ValueError(f'zfwj_detail container not found: {url}')

    # Not every page is guaranteed to carry a document-number paragraph.
    number_tag = contentWithTag.find('p', class_='wjh')
    pub_hao = number_tag.text.strip() if number_tag is not None else ''

    return contentWithTag, pub_hao


def _parsePublishDate(raw):
    """Turn a label such as '...时间：2023年11月27日' into '2023-11-27'.

    Raises:
        ValueError: If the '时间：' marker is missing from ``raw``.
    """
    parts = raw.split('时间：')
    if len(parts) < 2:
        raise ValueError(f'publish date marker not found in {raw!r}')
    return parts[1].replace('年', '-').replace('月', '-').replace('日', '').strip()


def doJob():
    """Search the site for 'REITs' and publish every new result to Kafka.

    Opens the search page in a Selenium driver, ticks the last filter option,
    submits the keyword and walks the result cards currently rendered (no
    pagination is performed). For each card whose link is not yet in the
    de-duplication set, the detail page is fetched via ``getContent``, a
    record is sent to the ``topic`` Kafka topic and the link is added to the
    set. A failure on one card is logged and skipped so the remaining cards
    are still processed. The driver is always shut down on exit.
    """
    url = 'https://www.ln.gov.cn/search/pcRender?pageId=7b2aa485f97e40e4a0b4b635f36eda6c'
    dedup_key = 'REITs::' + webname
    driver = policy.createDriver()
    try:
        driver.get(url)
        time.sleep(1)
        # Tick the last option in the 'conFl_con' filter panel before searching.
        filter_links = driver.find_element(By.CLASS_NAME, 'conFl_con').find_elements(By.TAG_NAME, 'a')
        filter_links[-1].find_element(By.TAG_NAME, 'label').click()
        time.sleep(1)
        driver.find_element(By.CLASS_NAME, 'search_inps').send_keys('REITs')
        driver.find_element(By.CLASS_NAME, 'search_btns').click()
        time.sleep(1)

        for div in driver.find_elements(By.CLASS_NAME, 'searchMod'):
            href = ''
            try:
                link = div.find_element(By.TAG_NAME, 'a')
                title = link.text.replace('\n', '').strip()
                href = link.get_attribute('href')
                # De-duplicate on the link.
                if baseCore.r.sismember(dedup_key, href):
                    continue
                summary = div.find_element(By.CLASS_NAME, 'txtCon').find_element(
                    By.TAG_NAME, 'a').text.replace('\n', '').strip()
                publishDate = _parsePublishDate(div.find_element(By.CLASS_NAME, 'dates').text)
                contentWithTag, pub_hao = getContent(href)
            except Exception as e:
                # One malformed card or unreachable page must not abort the run.
                log.error(f'解析失败--{href}--{e}')
                continue

            dic_info = {
                'attachmentIds': [],
                'author': '',
                'content': contentWithTag.text.strip(),
                'contentWithTag': str(contentWithTag),
                'deleteFlag': 0,
                'checkStatus': 1,
                'id': '',
                'title': title,
                'publishDate': publishDate,
                'origin': '辽宁省人民政府',
                'sourceAddress': href,
                'writtenDate': None,
                'organ': '',
                'topicClassification': '',
                'issuedNumber': pub_hao,
                'summary': summary,
                'createDate': time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                'sid': '1729042213737967618'
            }
            try:
                baseCore.sendkafka(dic_info, topic)
                # Mark the link as seen only after the record was sent.
                baseCore.r.sadd(dedup_key, href)
                log.info(f'采集成功--{title}--{href}')
            except Exception as e:
                # Surface the failure instead of dropping it silently; the link
                # stays unmarked so a later run can retry it.
                log.error(f'采集失败--{title}--{href}--{e}')
                continue
    finally:
        # quit() ends the whole browser session; close() only closes the
        # current window and was skipped entirely when an exception occurred.
        driver.quit()


if __name__ == '__main__':
    try:
        doJob()
    finally:
        # Always call baseCore.close(), even when the crawl raises.
        baseCore.close()
