"""
中证智能财讯
"""
import json
import sys
import time
from obs import ObsClient
import fitz
import requests
from bs4 import BeautifulSoup
from retry import retry
from selenium.webdriver.common.by import By
from selenium import webdriver
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
# Shared project helper; the code below uses it for logging, random user
# agents, UUIDs, size formatting and task-log recording.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
# Huawei Cloud OBS client used by getOBSres() to upload content.
# NOTE(review): the access key and secret key are hard-coded in source;
# consider loading them from configuration/environment and rotating them.
obsClient = ObsClient(
    access_key_id='VEHN7D0TJ9316H8AHCAV',  # your Huawei Cloud access key (AK)
    secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY',  # your Huawei Cloud secret key (SK)
    server='https://obs.cn-north-1.myhuaweicloud.com'  # endpoint address of your bucket
)
def create_driver(path=r'D:\soft\msedgedriver.exe'):
    """Start a Microsoft Edge WebDriver session with a maximized window.

    Args:
        path: Filesystem path of the ``msedgedriver`` executable. Defaults to
            the location that was previously hard-coded, so existing callers
            that pass no argument behave as before.

    Returns:
        The ``webdriver.Edge`` instance. The caller is responsible for calling
        ``quit()`` on it when finished.
    """
    # Capabilities are passed as a plain dict instead of an EdgeOptions object.
    # NOTE(review): the ``executable_path``/``capabilities`` keyword arguments
    # are the older Selenium constructor style - confirm the pinned Selenium
    # version still accepts them.
    options = {
        "browserName": "MicrosoftEdge",
        "ms:edgeOptions": {
            "extensions": [],
            "args": ["--start-maximized"],  # run with a maximized window
        },
    }

    driver = webdriver.Edge(executable_path=path, capabilities=options)
    return driver


@retry(tries=3, delay=1)
def getOBSres(pathType, name, response):
    """Upload the body of ``response`` to the ``zzsn`` bucket.

    The object key is ``<pathType>/<name>``. The ``retry`` decorator re-runs
    the upload (3 tries, 1 second delay) when it raises.

    Args:
        pathType: Leading part of the object key.
        name: File name appended after ``pathType`` and a slash.
        response: Object whose ``content`` attribute holds the bytes to store.

    Returns:
        Whatever ``obsClient.putContent`` returns for the upload.
    """
    object_key = f'{pathType}/' + name
    return obsClient.putContent('zzsn', object_key, content=response.content)

def uptoOBS(pdf_url, name_pdf, type_id, social_code, pathType, taskType, start_time,create_by):
    """Download a PDF, upload it to OBS and extract its text.

    Args:
        pdf_url: URL the PDF is downloaded from.
        name_pdf: Not used by this function; kept for interface compatibility.
        type_id: Stored in the result as ``type_id``.
        social_code: Stored in the result as ``item_id`` and passed to
            ``baseCore.recordLog`` when filling the result fails.
        pathType: Leading part of the OBS object key.
        taskType: Passed to ``baseCore.recordLog`` when filling the result fails.
        start_time: Start timestamp used to compute the elapsed time logged on
            failure.
        create_by: Stored in the result as ``create_by``.

    Returns:
        dict: The result record. ``state`` is True only when the download,
        the upload and the PDF parsing all succeeded and every field was
        filled in; otherwise ``state`` stays False.
    """
    retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
               'full_path': '',
               'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': create_by,
               'create_time': '', 'page_size': '', 'content': ''}
    headers = {'User-Agent': baseCore.getRandomUserAgent()}

    # Up to three download attempts. ``response`` stays None when all of them
    # fail so the failure is reported as a download error, not an OBS error.
    response = None
    for _ in range(3):
        try:
            # verify=False is kept from the original behaviour.
            # NOTE(review): this disables TLS certificate verification.
            candidate = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
            # Do not upload an HTTP error page as if it were the PDF.
            candidate.raise_for_status()
            response = candidate
            break
        except requests.RequestException:
            time.sleep(3)
    if response is None:
        log.error(f'文件下载失败: {pdf_url}')
        return retData

    # Content-Length is optional; fall back to the size of the downloaded body
    # instead of treating a missing header as a failed download.
    content_length = response.headers.get('Content-Length')
    if content_length and content_length.strip().isdigit():
        file_size = int(content_length)
    else:
        file_size = len(response.content)

    name = str(baseCore.getuuid()) + '.pdf'
    now_time = time.strftime("%Y-%m")
    try:
        # getOBSres takes (pathType, name, response); the month folder is
        # folded into the path part so the call matches that signature.
        result = getOBSres(f'{pathType}/{now_time}', name, response)
    except Exception as e:
        log.error(f'OBS发送失败: {e}')
        return retData

    page_size = 0
    try:
        with fitz.open(stream=response.content, filetype='pdf') as doc:
            page_size = doc.page_count
            retData['content'] = ''.join(page.get_text() for page in doc.pages())
    except Exception:
        log.error(f'文件损坏')
        return retData

    if page_size < 1:
        # PDF parsing produced no pages.
        return retData

    try:
        object_url = result['body']['objectUrl']
        retData['path'] = object_url.split('.com')[1]
        retData['full_path'] = object_url
        retData['file_size'] = baseCore.convert_size(file_size)
        retData['create_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        retData['page_size'] = page_size
        # Mark success last so a failure above never yields state=True.
        retData['state'] = True
    except Exception as e:
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')

    return retData


def _zzcx_process_news(driver, headers, page, news, info_code):
    """Render one list record in the browser, parse it and send it to Kafka.

    Args:
        driver: WebDriver instance used to open the article page.
        headers: HTTP headers used for the plain ``requests`` fetch.
        page: Number of the list page the record came from (for logging).
        news: One record of the list response; ``title`` and ``manuscriptId``
            are read from it.
        info_code: Prefix of the set key the article URL is added to.
    """
    title = news['title']
    news_url = 'https://zzcx.cs.com.cn/app/zzb/detail?id=' + news['manuscriptId']

    # Open the article in the browser and screenshot the parent of #line.
    driver.get(news_url)
    div_ = driver.find_element(By.ID, 'line')
    div = div_.find_element(By.XPATH, '..')
    # The screenshot and rendered HTML are not consumed yet; they belong to
    # the unfinished TODO below.
    image_data = div.screenshot_as_base64
    # TODO: upload the screenshot to OBS and replace the tag with its link.
    # NOTE(review): this call passes no arguments and ignores the result;
    # BaseCore.uptoOBS is not visible here - confirm its signature and finish
    # the TODO before relying on this function.
    baseCore.uptoOBS()
    html = driver.page_source

    news_req = requests.get(url=news_url, headers=headers, timeout=20)
    news_soup = BeautifulSoup(news_req.content, 'html.parser')
    detail_info = news_soup.find('div', class_='subTitle___svblj')
    contentWithTag = news_soup.find('div', class_='editable___1EtCQ editor-editable')
    div_list = detail_info.find_all('div') if detail_info is not None else []
    if contentWithTag is None or len(div_list) < 2:
        # The expected nodes are missing; skip instead of failing on None.
        log.error(f'{page}--{title}--{news_url}--页面解析失败')
        return

    origin = div_list[0].text
    publishDate = div_list[1].text
    content = contentWithTag.text
    result_dict = {
        'id': '',
        'sid': '1751787750127857666',
        'title': title,
        'organ': origin,
        'origin': '国务院国有资产监督管理委员会',
        'source': 16,
        'content': content,
        # A bs4 Tag is not JSON serializable; send its HTML markup as text.
        'contentWithTag': str(contentWithTag),
        'publishDate': publishDate,
        'sourceAddress': news_url,
    }
    log.info(f'{page}--{title}--{news_url}')
    # NOTE(review): ``KafkaProducer`` and ``r`` (presumably a Redis client)
    # are not imported or defined in the visible file - confirm where they
    # are expected to come from.
    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
    try:
        producer.send("crawlerInfo",
                      json.dumps(result_dict, ensure_ascii=False).encode('utf8'))
        r.sadd(info_code + '-test', news_url)
        log.info('发送kafka成功！')
    except Exception as e:
        log.error(e)
    finally:
        producer.close()


def zzcx():
    """Crawl the zzcx.cs.com.cn article list and publish each article.

    The list endpoint is queried once to read the page count, then every page
    is requested and each record on it is handed to ``_zzcx_process_news``.
    One browser session is shared by all articles and is always closed when
    the crawl ends, including when it ends with an exception.
    """
    url = 'https://zzcx.cs.com.cn/dist/publishManuscript/listES'
    info_code = 'IN-20240129-0001'
    # No fixed Content-Length: requests computes it from each POST body, and a
    # hard-coded value would be sent unchanged on the body-less GET requests.
    headers = {
        'Accept': 'application/json',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Content-Type': 'application/json;charset=UTF-8',
        'Cookie': 'zycna=VEwasVGF9akBAXuVA58n9CJm',
        'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Origin': 'https://zzcx.cs.com.cn',
        'Referer': 'https://zzcx.cs.com.cn/app/zzb/list?spm=0.0.0.0.wjnSUZ'
    }
    payload = json.dumps({"pageNo": 1, "pageSize": 15, "statusList": [0], "keyword": ""})
    result_json = requests.post(url=url, data=payload, headers=headers, timeout=20).json()
    print(result_json)
    pages = int(result_json['data']['pages'])

    driver = create_driver()
    try:
        for page in range(1, pages + 1):
            payload_page = json.dumps({"pageNo": page, "pageSize": 15, "statusList": [0], "keyword": ""})
            datas = requests.post(url=url, data=payload_page, headers=headers, timeout=20)
            records = datas.json()['data']['records']
            for news in records:
                _zzcx_process_news(driver, headers, page, news, info_code)
    finally:
        # Always close the browser so no Edge/driver process is left behind.
        driver.quit()
if __name__ == "__main__":
    zzcx()