# -*- coding: utf-8 -*-
"""
    从数据库中读取年报缺失年份，采集对应网站上的年报，存在两种情况，标题中有年份，标题中无年份。
    如果标题中有年份的话，按照原方式命名，有年份的应该都已经采过，跳过不插入更新
    如果标题中无年份的话，则解析正文内容，正则表达式匹配年份，
    采集一条，state 加1 如果报错的话就将state改为100，单独处理。

"""

import json
from datetime import datetime

from kafka import KafkaProducer
from base.BaseCore import BaseCore

baseCore = BaseCore()
import requests, re, time, pymysql, fitz
from bs4 import BeautifulSoup as bs
from selenium import webdriver

# chromedriver = "D:/chrome/chromedriver.exe"
# browser = webdriver.Chrome(chromedriver)

# Chrome options for the single browser instance shared by the whole script.
opt = webdriver.ChromeOptions()
opt.add_argument(
        'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36')

opt.add_argument("--ignore-certificate-errors")
opt.add_argument("--ignore-ssl-errors")
# add_experimental_option keeps one value per option name, so both switches
# have to be passed in a single list; a second call with the same name would
# replace the first and silently drop 'enable-automation'.
opt.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
opt.add_experimental_option('useAutomationExtension', False)
opt.binary_location = r'D:/Google/Chrome/Application/chrome.exe'
chromedriver = r'D:/cmd100/chromedriver.exe'
browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
log = baseCore.getLogger()
requests.adapters.DEFAULT_RETRIES = 3
# Connection and cursor for the "11" database.
cnx = baseCore.cnx_
cursor = baseCore.cursor_
# Connection and cursor for the "144" database.
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor


# Default HTTP headers for plain `requests` calls.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36",
}

def clean_text(text):
    """
        Strip markup from ``text`` and collapse consecutive newlines.

        The input is parsed as HTML and reduced to its text content; tab and
        carriage-return characters are removed, then every run of newline
        characters is squeezed into a single newline.
    :param text: HTML (or plain text) to clean
    :return: the cleaned text
    """
    plain = bs(text, 'html.parser').get_text()
    # Drop tabs and carriage returns first so that "\r\n" sequences turn into
    # bare newlines before the runs are collapsed.
    stripped = plain.replace('\t', '').replace('\r', '')
    return re.sub('\n+', '\n', stripped)

def _record_task_log(social_code, state, source_url, message):
    """Write one task-log entry through ``baseCore.recordLog``.

    The elapsed time is measured from the module-level ``start_time`` and the
    entry is tagged with the module-level ``taskType``; both are set by the
    ``__main__`` loop.
    """
    takeTime = baseCore.getTimeCost(start_time, time.time())
    baseCore.recordLog(social_code, taskType, state, takeTime, source_url, message)


def _extract_report_year(title, soup_2, publish_year):
    """Return the report year as a 4-digit string.

    Tried in order: the year in the link title, a "<year>年年度报告" phrase in
    one of the first four lines of the report body, and finally the
    publication year minus one.
    """
    title_match = re.search(r'(\d{4})\s*年', title)
    if title_match:
        return title_match.group(1)

    content_node = soup_2.find('div', {'id': 'content'})
    if content_node is not None:
        # Only the first few lines are inspected: that is where the report
        # heading is expected to be.
        for line in clean_text(content_node.text).split('\n')[:4]:
            body_match = re.search(r'(\d{4})\s*年年度报告', line)
            if body_match:
                return body_match.group(1)

    # Nothing usable in title or body: fall back to the year before the
    # publication date.
    return str(publish_year - 1)


def _build_pdf_name(com_name, year, link_text):
    """Return the file name used for the uploaded PDF.

    A missing company name (``None``, ``''`` or the literal ``'null'``) makes
    the name fall back to the link text of the report.
    """
    if com_name and com_name != 'null':
        return f"{com_name}：{year}年年度报告.pdf".replace('*', '')
    return link_text + '.pdf'


def _send_report_to_kafka(dic_news):
    """Publish ``dic_news`` as JSON to ``researchReportTopic``.

    The producer is always closed, also when sending fails; any exception
    from the Kafka client is propagated to the caller.
    """
    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'],max_request_size=1024*1024*20)
    try:
        kafka_result = producer.send("researchReportTopic",
                                     json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
        # Block until the broker acknowledges the message (or 10 s pass).
        print(kafka_result.get(timeout=10))
    finally:
        producer.close()


def spider_annual_report(dict_info, num):
    """Scrape, upload and publish the annual reports of one company.

    The annual-report listing page of the stock is opened in the shared
    module-level ``browser``. For every listed report that is not an H-share
    announcement the detail page is opened, the PDF link and publication date
    are read, and the report year is determined. Reports whose
    (``social_code``, year) pair is already in ``clb_sys_attachment`` are
    skipped; the others are uploaded via ``baseCore.uptoOBS``, registered via
    ``baseCore.tableUpdate`` and sent to Kafka.

    Relies on the module-level names ``browser``, ``baseCore``, ``cursor``,
    ``log``, ``start_time``, ``taskType`` and ``pathType``.

    :param dict_info: dict with the keys ``social_code``, ``com_name`` and
        ``code`` (the stock code used in the listing URL)
    :param num: sequence number passed to ``baseCore.tableUpdate``; it is
        incremented locally for every uploaded report
    :return: ``True`` when the listing was processed, ``False`` when a
        database write or the Kafka send failed (processing stops there),
        ``None`` when the listing page contains no report list
    """
    social_code = dict_info['social_code']
    com_name = dict_info['com_name']
    code = dict_info['code']
    url_1 = f'https://vip.stock.finance.sina.com.cn/corp/go.php/vCB_Bulletin/stockid/{code}/page_type/ndbg.phtml'

    browser.get(url_1)
    time.sleep(3)  # give the page time to finish loading
    soup = bs(browser.page_source, 'html.parser')
    date_list = soup.find('div', {'class': 'datelist'})
    if date_list is None:
        log.info(f'{social_code}.........年度报告列表为空')
        _record_task_log(social_code, 0, '', '年度报告列表为空')
        return None

    for link in date_list.find_all('a'):
        link_text = link.text
        if 'H股公告' in link_text:
            continue
        href = link.get('href')
        if not href:
            # An anchor without a target cannot be followed.
            continue
        year_url = 'https://vip.stock.finance.sina.com.cn' + href
        browser.get(year_url)
        time.sleep(5)  # give the detail page time to finish loading
        soup_2 = bs(browser.page_source, 'html.parser')

        header_cell = soup_2.find('th', {'style': 'text-align:center'})
        download_link = header_cell.find('a') if header_cell is not None else None
        pdf_url = download_link.get('href') if download_link is not None else None
        if not pdf_url:
            # TODO: some pages have no download link but do contain the body text.
            log.error(f'{social_code}....{year_url}....无下载链接')
            _record_task_log(social_code, 0, year_url, '无下载链接')
            continue

        # Publication date: the text after "公告日期:" in the head cell.
        head_cell = soup_2.find('td', {'class': 'head'})
        try:
            pub_time = head_cell.text.split('公告日期:')[1].strip()
            date_object = datetime.strptime(pub_time, "%Y-%m-%d")
        except (AttributeError, IndexError, ValueError):
            # One malformed page must not abort the whole run.
            log.error(f'{social_code}....{year_url}....公告日期解析失败')
            _record_task_log(social_code, 0, year_url, '公告日期解析失败')
            continue
        datetime_string = date_object.strftime("%Y-%m-%d %H:%M:%S")

        year = _extract_report_year(link_text, soup_2, date_object.year)
        name_pdf = _build_pdf_name(com_name, year, link_text)

        sel_sql = '''select item_id,year from clb_sys_attachment where item_id = %s and year = %s and type_id="1" '''
        cursor.execute(sel_sql, (social_code, int(year)))
        if cursor.fetchone():
            log.info(f'com_name:{com_name}、{year}已存在')
            continue

        # Upload the file to the OBS server.
        retData = baseCore.uptoOBS(pdf_url,name_pdf,1,social_code,pathType,taskType,start_time,'XueLingKun')
        if not retData['state']:
            log.info(f'====pdf解析失败====')
            continue
        num = num + 1
        # NOTE(review): the origin label says Xueqiu although the pages are
        # fetched from Sina Finance - confirm which one is intended.
        origin = '雪球网'
        try:
            att_id = baseCore.tableUpdate(retData,com_name,year,name_pdf,num,pub_time,origin)
            content = retData['content']
            _record_task_log(social_code, 1, year_url, '成功')
        except Exception as e:
            _record_task_log(social_code, 0, year_url, f'数据库传输失败 - --{e}')
            return False

        # Send the record to Kafka.
        lang = baseCore.detect_language(content)
        if lang == 'cn':
            lang = 'zh'
        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        # The title is the file name without its ".pdf" extension.
        title = name_pdf[:-len('.pdf')] if name_pdf.endswith('.pdf') else name_pdf
        dic_news = {
            'attachmentIds': att_id,
            'author': '',
            'content': content,
            'contentWithTag': '',
            'createDate': time_now,
            'deleteFlag': '0',
            'id': '',
            'keyWords': '',
            'lang': lang,
            'origin': origin,
            'publishDate': datetime_string,
            'sid': '1684032033495392257',
            'sourceAddress': year_url,  # link to the original page
            'summary': '',
            'title': title,
            'type': 1,
            'socialCreditCode': social_code,
            'year': year
        }
        try:
            _send_report_to_kafka(dic_news)
        except Exception as e:
            dic_result = {
                'success': 'false',
                'message': '操作失败',
                'code': '204',
                'e': e
            }
            _record_task_log(social_code, 0, pdf_url, 'Kafka操作失败')
            log.info(dic_result)
            return False

        dic_result = {
            'success': 'true',
            'message': '操作成功',
            'code': '200',
        }
        log.info(dic_result)

        time.sleep(2)
    return True


#state1
if __name__ == '__main__':
    # Driver loop: look up one company, normalise its stock code and collect
    # its annual reports. `taskType`, `pathType` and `start_time` are read as
    # module-level globals by spider_annual_report.
    num = 0
    taskType = '企业年报/雪球网'
    pathType = 'QYYearReport/'
    try:
        while True:
            start_time = time.time()
            # Fetch the company to process.
            # NOTE(review): the credit code is hard-coded, so this loop keeps
            # processing the same company forever. The queue-based lookup
            # below is disabled - confirm whether it should be restored.
            # social_code = baseCore.redicPullData('AnnualEnterprise:gnshqy_socialCode')
            social_code = '913412007050444417'
            if not social_code or social_code == 'None':
                # Nothing to do right now; wait before asking again.
                time.sleep(20)
                continue
            dic_info = baseCore.getInfomation(social_code)
            count = dic_info[16]
            code = dic_info[3]
            com_name = dic_info[4]
            log.info(f'====开始采集====={social_code}====')
            if code is None:
                exception = '股票代码为空'
                state = 0
                takeTime = baseCore.getTimeCost(start_time, time.time())
                baseCore.recordLog(social_code, taskType, state, takeTime, '', exception)
                continue
            # Stock codes are six characters long; left-pad shorter ones with zeros.
            code = code.rjust(6, '0')
            dict_info = {
                'social_code':social_code,
                'com_name':com_name,
                'code':code,
            }
            if spider_annual_report(dict_info,num):
                count += 1
                runType = 'AnnualReportCount'
                baseCore.updateRun(social_code, runType, count)
    finally:
        # The loop above only ends through an exception or Ctrl-C, so the
        # cleanup has to live in `finally` to be reachable at all.
        try:
            cnx_.close()
            # Release the resources held by baseCore.
            baseCore.close()
        finally:
            # Always shut the Chrome instance down, even if closing the
            # database handles fails.
            browser.quit()



