"""
    Collector for domestic company announcements published on Sina Finance.
"""
from datetime import datetime
import json
import re
import time

import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
from retry import retry
from base.BaseCore import BaseCore

# Task label handed to baseCore.uptoOBS and baseCore.recordLog by the functions below.
taskType = '企业公告/新浪财经/国内'
# Shared project helper: supplies the logger, database handles and utility calls used in this file.
baseCore = BaseCore()
log = baseCore.getLogger()
# Connection/cursor pair used by ifInstert and insertMysql.
cnx = baseCore.cnx
cursor = baseCore.cursor
# Second connection/cursor pair exposed by BaseCore; not referenced by the code visible in this file.
cnx_ = baseCore.cnx_
cursor_ = baseCore.cursor_
# NOTE(review): presumably a Redis client (only a commented-out r.rpush call refers to it) - confirm.
r = baseCore.r
# HTTP headers sent with every page request made by getrequests.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
    'Cache-Control': 'no-cache',
    'Pragma': 'no-cache'
}
# Regex for YYYY-MM-DD dates; doJob uses it to pull publish dates out of the listing markup.
pattern = r"\d{4}-\d{2}-\d{2}"
# Path prefix handed to baseCore.uptoOBS.
pathType = 'QYNotice/'

def ifInstert(social_code, pdf_url):
    """Look up a previously recorded Sina Finance announcement.

    Queries ``brpa_source_article`` for a row whose ``social_credit_code`` and
    ``source_address`` equal the given values (origin '新浪财经', type '1').

    Args:
        social_code: Value compared against ``social_credit_code``.
        pdf_url: Value compared against ``source_address``.

    Returns:
        Whatever ``cursor.fetchone()`` yields for the query; callers treat a
        truthy result as "already collected".
    """
    query = '''select social_credit_code,source_address from brpa_source_article where social_credit_code = %s and source_address = %s and origin='新浪财经' and type='1' '''
    params = (social_code, pdf_url)
    cursor.execute(query, params)
    return cursor.fetchone()


@retry(tries=3, delay=1)
def sendKafka(dic_news):
    """Publish one announcement record to the ``researchReportTopic`` Kafka topic.

    The record is serialised as UTF-8 JSON (non-ASCII characters kept as-is).
    The call blocks up to 10 seconds for the broker acknowledgement and prints
    the returned metadata. Any exception propagates to ``@retry``, which
    re-runs the whole function up to three times.

    Args:
        dic_news: JSON-serialisable dict describing the announcement.

    Raises:
        Exception: whatever the Kafka client or ``json.dumps`` raises once the
            retries are exhausted.
    """
    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
    try:
        payload = json.dumps(dic_news, ensure_ascii=False).encode('utf8')
        kafka_result = producer.send("researchReportTopic", payload)
        print(kafka_result.get(timeout=10))
    finally:
        # A producer is created per call (and per retry); close it so its
        # sockets and background sender thread are not leaked.
        producer.close()


def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time, com_name, num):
    """Upload one announcement file, register it as an attachment and send it to Kafka.

    Steps: fetch ``pdf_url`` and read the download link from the
    ``table#allbulletin`` / ``tr.gray`` anchor, check the target path with
    ``baseCore.obsexist``, upload through ``baseCore.uptoOBS``, store the
    attachment with ``baseCore.tableUpdate`` and finally publish the record
    with ``sendKafka``.

    Args:
        pdf_url: URL of the page that carries the download link.
        pdf_name: Announcement title; also used as the file name.
        social_code: Company social credit code.
        year: Publication year (string).
        pub_time: Publication time string.
        start_time: ``time.time()`` value from the start of the job, used for
            elapsed-time logging.
        com_name: Company name, forwarded to ``baseCore.tableUpdate``.
        num: Forwarded unchanged to ``baseCore.tableUpdate``.

    Returns:
        True when the record was sent to Kafka, False when any step failed or
        was skipped.
    """
    # NOTE(review): doJob already passes the "下载公告" link taken from the
    # detail page, yet this function fetches that URL again and expects the
    # detail-page table in it - confirm which URL is meant to arrive here.
    soup = getrequests(pdf_url)
    table = soup.find('table', attrs={'id': 'allbulletin'})
    row = table.find('tr', class_='gray') if table else None
    link = row.find('a') if row else None
    if link is None or not link.get('href'):
        # Previously this surfaced as an AttributeError on None.
        log.error(f'{pdf_url}===没有公告下载地址')
        return False
    pdf_url = link.get('href')

    now_time = time.strftime("%Y-%m")
    file_path = 'XLCJNotice/' + now_time + '/' + pdf_name + '.pdf'
    # NOTE(review): a falsy obsexist() result aborts the upload; the exact
    # meaning of its return value is defined in BaseCore - confirm.
    if not baseCore.obsexist(file_path):
        return False

    # Upload to the object store; retData carries 'state' and 'content'.
    retData = baseCore.uptoOBS(pdf_url, pdf_name, 8, social_code, pathType, taskType, start_time, 'LiuLiYuan')
    if not retData['state']:
        log.info('====pdf解析失败====')
        return False

    # Record the attachment; a falsy id means the insert did not succeed.
    att_id = baseCore.tableUpdate(retData, com_name, year, pdf_name, num, pub_time)
    if not att_id:
        return False

    dic_news = {
        'attachmentIds': att_id,
        'author': '',
        'content': retData['content'],
        'contentWithTag': '',
        'createDate': time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
        'deleteFlag': '0',
        'id': '',
        'keyWords': '',
        'lang': 'zh',
        'origin': '新浪财经',
        'publishDate': pub_time,
        'sid': '1684032033495392257',
        'sourceAddress': pdf_url,  # link to the original document
        'summary': '',
        'title': pdf_name,
        'type': 3,
        'socialCreditCode': social_code,
        'year': year
    }

    # Hand the record over to Kafka; only this call is guarded.
    try:
        sendKafka(dic_news)
    except Exception as e:
        dic_result = {
            'success': 'false',
            'message': '操作失败',
            'code': '204',
            'e': e
        }
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, 'Kafka操作失败')
        log.info(dic_result)
        return False

    log.info({
        'success': 'true',  # was misspelled 'ture'
        'message': '操作成功',
        'code': '200',
    })
    return True


# Normalise a date/time string to the "%Y-%m-%d %H:%M:%S" format
def format_time(time_str):
    """Normalise a timestamp string to ``"%Y-%m-%d %H:%M:%S"``.

    Accepts either a full ``YYYY-MM-DD HH:MM:SS`` value or a bare
    ``YYYY-MM-DD`` date (which gets a ``00:00:00`` time). Fields that are not
    zero-padded are accepted and padded in the result.

    Args:
        time_str: The timestamp text to normalise.

    Returns:
        The timestamp formatted as ``"%Y-%m-%d %H:%M:%S"``.

    Raises:
        ValueError: if ``time_str`` matches neither accepted layout.
    """
    full_format = "%Y-%m-%d %H:%M:%S"
    try:
        parsed = datetime.strptime(time_str, full_format)
    except ValueError:
        # Not a full timestamp: fall back to a date-only value.
        parsed = datetime.strptime(time_str, "%Y-%m-%d")
    return parsed.strftime(full_format)


# Fetch a page and return its parsed HTML
@retry(tries=3, delay=1)
def getrequests(url, timeout=30):
    """Fetch ``url`` through a proxy and return the page parsed by BeautifulSoup.

    The request uses the module-level ``headers`` and a proxy obtained from
    ``baseCore.get_proxy()``. The response text is decoded with the encoding
    detected from its content. Exceptions propagate to ``@retry``, which
    attempts the call up to three times.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled proxy would block the job indefinitely and the
            retry decorator would never get a chance to run.

    Returns:
        A ``BeautifulSoup`` document built with the ``html.parser`` backend.
    """
    ip = baseCore.get_proxy()
    req = requests.get(url, headers=headers, proxies=ip, timeout=timeout)
    req.encoding = req.apparent_encoding
    return BeautifulSoup(req.text, 'html.parser')


# Insert the collected record into the database
@retry(tries=3, delay=1)
def insertMysql(social_code, link, pub_time):
    """Record a collected announcement in ``brpa_source_article``.

    Inserts ``social_code`` and ``link`` with origin '新浪财经' and type '1';
    ``create_time`` is filled by the database via ``now()``. The transaction is
    committed immediately. Exceptions propagate to ``@retry`` (three attempts).

    Args:
        social_code: Company social credit code.
        link: Source address stored for later de-duplication.
        pub_time: Kept for interface compatibility. The statement has no
            column for it, so it is not bound.
    """
    insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,create_time) values(%s,%s,%s,%s,now())'''
    # Exactly one value per %s placeholder. The previous version bound
    # pub_time as a fifth value, which made every execute() call fail.
    params = (social_code, link, '新浪财经', '1')
    cursor.execute(insert_sql, params)
    cnx.commit()


def doJob():
    """Collect every announcement page of one company from Sina Finance.

    For the (currently hard-coded) social credit code this walks the paginated
    announcement list, opens each announcement's detail page to find its
    "下载公告" download link, hands the file to ``GetContent`` and, on success,
    records the announcement with ``insertMysql`` so later runs skip it.

    Behaviour notes:
        * Returns immediately when the company has no stock code.
        * Stops when the listing reports no data, when a page has no entries,
          or when the pager no longer offers "下一页".
        * De-duplication is keyed on the detail-page URL (``href``), the same
          value that ``insertMysql`` stores.
    """
    # TODO: the code is fixed for now; the queue-driven loop
    # (baseCore.redicPullData) is disabled.
    start_time = time.time()
    social_code = '91440300192185379H'
    data = baseCore.getInfomation(social_code)
    gpdm = data[3]
    com_name = data[1]
    short_name = data[4]
    log.info(f'{social_code}==={gpdm}===开始采集')
    if not gpdm:
        # Without a stock code the listing URL cannot be built.
        log.error(f'{social_code}===股票代码为空')
        return
    page = 1
    # NOTE(review): num is forwarded to GetContent unchanged for every
    # announcement - confirm whether it is meant to be incremented.
    num = 1
    while True:
        url = f'https://vip.stock.finance.sina.com.cn/corp/view/vCB_AllBulletin.php?stockid={gpdm}&Page={page}'
        print(url)
        soup = getrequests(url)
        # TODO: detection of an IP ban ('拒绝访问' in the page) is currently disabled.
        div_flg = soup.find('div', class_='tagmain').text
        if '暂时没有数据！' in div_flg:
            log.info(f"{social_code}==={gpdm}===没有公告")
            # Leave the loop; otherwise the same page is requested forever.
            break

        ul = soup.find('div', class_='datelist').find('ul')
        a_list = ul.find_all('a')
        if not a_list:
            # An empty listing means there is nothing further to page through.
            break
        # Dates are matched positionally with the anchors of the same list.
        time_list = re.findall(pattern, str(ul))
        for i, anchor in enumerate(a_list):
            name_pdf = anchor.text.strip()
            if name_pdf == '':
                continue
            href = anchor.get('href')
            if 'http' not in href:
                href = 'https://vip.stock.finance.sina.com.cn' + href
            # Check with the same key insertMysql stores (href), and do it
            # before the detail page is downloaded.
            if ifInstert(social_code, href):
                log.info(f'{href}===已采集')
                continue
            soup_href = getrequests(href)
            th = soup_href.find('table', attrs={'id': 'allbulletin'}).find_all('th')[0]
            if '下载公告' not in th.text:
                log.error(f"{href}===没有公告下载地址")
                continue
            pdf_url = th.find('a', attrs={'target': '_blank'}).get('href')
            pub_time = format_time(time_list[i])
            year = pub_time[:4]
            if not GetContent(pdf_url, name_pdf, social_code, year, pub_time, start_time, com_name, num):
                continue
            log.info(f'{short_name}==============解析传输操作成功')
            state = 1
            takeTime = baseCore.getTimeCost(start_time, time.time())
            baseCore.recordLog(social_code, taskType, state, takeTime, href, '成功')
            # Only after the Kafka send succeeded is the record persisted.
            try:
                insertMysql(social_code, href, pub_time)
            except Exception as e:
                log.info(f"{social_code}==={href}===数据库插入失败")
                # Keep the cause visible instead of swallowing it.
                log.error(f"{social_code}==={href}==={e!r}")
            else:
                log.info(f"{social_code}==={href}===数据库插入成功")

        next_flg = soup.select('#con02-7 > table > tr')[1].select('div')[2].text
        if '下一页' not in next_flg:
            break
        page += 1
    # log.info(
    #     f'{social_code}==={gpdm}===企业整体耗时{baseCore.getTimeCost(start_time, time.time())}===成功{num_ok}条,失败{num_error}条')


# Run the collector once when the file is executed as a script.
if __name__ == '__main__':
    doJob()
