#coding=utf-8
import os
import time

import requests
from bs4 import BeautifulSoup
import BaseCore

# Shared project helper. This module uses it for logging (getLogger), proxies
# (get_proxy), the dedup set store exposed as baseCore.r (presumably Redis),
# message delivery (sendkafka), attachment cleanup (deliteATT) and close().
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()

# Project helper whose attuributefile() is called for every attachment link
# found in a document (see getContent).
from reits import Policy
policy = Policy()


# Topic name passed to baseCore.sendkafka for every collected record.
topic = 'policy'
# Site name; also the suffix of the dedup set key ('REITs::' + webname).
webname = '黑龙江省人民政府'
# HTTP headers sent with every request made by this module.
# NOTE(review): the Token value is hard-coded — confirm it is not a secret and
# does not expire.
headers = {
    'Content-Type': 'application/x-www-form-urlencoded',
    'Token': '9a9ff46e-f534-43b8-bad1-063d80af7e51',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}


def getDataJson(keyword='REITs', start=0, length=20, timeout=30):
    """Query the policy search endpoint and return the list of hits.

    Args:
        keyword: Search text sent as the ``filter.all`` form field.
        start: Value of the ``start`` form field (offset of the first hit).
        length: Value of the ``length`` form field (number of hits requested).
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the job indefinitely.

    Returns:
        The value stored under ``['content']['content']`` of the JSON reply.

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
        ValueError: If the response body is not valid JSON.
        KeyError: If the reply lacks the expected ``content`` keys.
    """
    ip = baseCore.get_proxy()
    url = 'https://www.hlj.gov.cn/znwd/policy/policy/policy/home/public/policyWikipedia?_method=get'
    data_post = {
        'sort': 'smartIndex',
        'order': 'asc',
        'start': str(start),
        'length': str(length),
        'filter.all': keyword,
    }
    req = requests.post(url, headers=headers, data=data_post, proxies=ip, timeout=timeout)
    req.encoding = req.apparent_encoding
    data_json = req.json()['content']['content']
    return data_json


def getSoup(url, timeout=30):
    """Fetch a detail endpoint and parse the HTML embedded in its JSON reply.

    The request is sent with the module-level ``headers`` and, unlike the
    other fetch helpers in this module, without a proxy.

    Args:
        url: Endpoint that answers with JSON holding ``content.html``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A BeautifulSoup tree (lxml parser) built from ``['content']['html']``.

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
        ValueError: If the response body is not valid JSON.
        KeyError: If the reply lacks the expected keys.
    """
    req = requests.get(url, headers=headers, timeout=timeout)
    req.encoding = req.apparent_encoding
    # Decode the body once and reuse it instead of parsing the JSON repeatedly.
    payload = req.json()
    return BeautifulSoup(payload['content']['html'], 'lxml')


def getFjContent(url, timeout=30):
    """Download an attachment through a proxy and return its raw bytes.

    Args:
        url: Address of the file to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
    """
    ip = baseCore.get_proxy()
    req = requests.get(url, headers=headers, proxies=ip, timeout=timeout)
    # No charset detection here: it only influences text decoding, and the
    # raw bytes returned below are unaffected by the response encoding.
    return req.content


def getContent(num, title, publishDate, summary, id, pub_hao, organ,type):
    """Collect one policy document, register its attachments and publish it.

    The document is skipped when its page URL is already in the dedup set.
    Otherwise the detail HTML is fetched, every non-HTML link is handed to
    ``policy.attuributefile`` and rewritten to the path it returns, script and
    style tags are stripped, and the resulting record is sent via
    ``baseCore.sendkafka``. If sending (or recording the URL) fails, the
    attachments registered for this document are deleted again.

    Args:
        num: Sequence number forwarded to ``policy.attuributefile``.
        title: Document title.
        publishDate: Publication date; stored and forwarded for attachments.
        summary: Short summary stored with the record.
        id: Site-side document id used to build the page and detail URLs.
        pub_hao: Issued number stored as ``issuedNumber``.
        organ: Issuing organ; for '政策解读' items it is used as ``origin``
            and the stored ``organ`` is left empty.
        type: Item type name; '政策解读' selects the interpretation endpoint.

    Returns:
        None.
    """
    id_list = []
    url = f'https://www.hlj.gov.cn/znwd/policy/#/readDetails?id={id}'
    writtenDate = None
    if type == '政策解读':
        origin = organ
        organ = ''
        href_ = f'https://www.hlj.gov.cn/znwd/policy/policy/policy/ctrl/public/chatPolicyResolve/{id}'
    else:
        origin = '黑龙江省人民政府'
        href_ = f'https://www.hlj.gov.cn/znwd/policy/policy/policy/ctrl/public/chatPolicyFile/findById/{id}'
    # Deduplicate by link: skip documents whose page URL is already recorded.
    dedup_key = 'REITs::' + webname
    if baseCore.r.sismember(dedup_key, url):
        return
    soup = getSoup(href_)
    try:
        for a in soup.find_all('a'):
            href = a.get('href')
            # Anchors without a target cannot point at an attachment; skipping
            # them keeps one bare <a> from aborting the remaining links.
            if not href:
                continue
            if '.html' in href or '.shtml' in href or '.htm' in href:
                continue

            category = os.path.splitext(href)[1]
            fj_title = a.text.strip().replace('<', '').replace('>', '')
            # Make sure the attachment name carries the file extension.
            if category not in fj_title:
                fj_title = fj_title + category
            att_id, full_path = policy.attuributefile(fj_title, href, num, publishDate)
            if att_id:
                id_list.append(att_id)
                a['href'] = full_path
    except Exception as e:
        # Best effort: attachments registered before the failure are kept.
        log.error(f'{title}====={e}')
    # Strip non-content tags; a failure on one kind must not block the other.
    for tag_name in ('script', 'style'):
        try:
            for tag in soup.find_all(tag_name):
                tag.decompose()
        except Exception as e:
            log.error(f'{title}====={e}')
    content = soup.text.strip()
    contentWithTag_str = str(soup)
    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    dic_info = {
        'attachmentIds': id_list,
        'author': '',
        'content': content,
        'contentWithTag': contentWithTag_str,
        'deleteFlag': 0,
        'checkStatus': 1,
        'id': '',
        'title': title,
        'publishDate': publishDate,
        'origin': origin,
        'sourceAddress': url,
        'writtenDate': writtenDate,
        'organ': organ,
        'topicClassification': '',
        'issuedNumber': pub_hao,
        'summary': summary,
        'createDate': time_now,
        'sid': '1729042585839841281'
    }
    try:
        baseCore.sendkafka(dic_info, topic)
        baseCore.r.sadd(dedup_key, url)
        log.info(f'采集成功--{title}--{url}')
    except Exception as e:
        log.error(f'{title}====={e}')
        # Roll back the attachments registered for this document.
        for att_id in id_list:
            baseCore.deliteATT(att_id)


def doJob():
    """Fetch the search hits and process each one with getContent.

    Hits are numbered from 1 in the order returned by getDataJson. The
    optional fields ``writtenText`` and ``unitShowName`` default to an empty
    string when absent. The loop pauses three seconds after every hit.

    Raises:
        KeyError: If a hit lacks one of the required fields ``title``,
            ``date``, ``content``, ``dataId`` or ``typeName``.
    """
    for num, data_ in enumerate(getDataJson(), start=1):
        title = data_['title']
        publishDate = data_['date']
        summary = data_['content']
        data_id = data_['dataId']
        type_name = data_['typeName']
        pub_hao = data_.get('writtenText', '')
        organ = data_.get('unitShowName', '')
        getContent(num, title, publishDate, summary, data_id, pub_hao, organ, type_name)
        # Throttle requests to the site between documents.
        time.sleep(3)


if __name__ == "__main__":
    # Always release baseCore, even when the job aborts with an exception.
    try:
        doJob()
    finally:
        baseCore.close()
