"""
证监会公告采集，只能按照搜索企业来采，从上市库里拿企业数据，sys_enterprise_ipo_copy1
craw_state:已采集过表示为True,未采集表示为0，拿取数据表示为ing，解析失败表示为400
update_state：为1 表示需要更新，用来增量循环
如何统计出来该报告采到了没有，dt_error库统计失败的信息
"""
import json
import re
import time

import fitz
import pymysql
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
from datetime import datetime
from base import BaseCore
# from fdfs_client.client import get_tracker_conf, Fdfs_client

# Shared project helper; this script takes its logger, database handles and
# task bookkeeping calls from it.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()

# Connection/cursor pair used below for the brpa_source_article queries.
cnx = baseCore.cnx
cursor = baseCore.cursor

# Second connection/cursor pair used below for the clb_sys_attachment queries.
cnx_ = baseCore.cnx_
cursor_ = baseCore.cursor_
# tracker_conf = get_tracker_conf('./client.conf')
# client = Fdfs_client(tracker_conf)

# Task label passed to baseCore.recordLog for every record this script writes.
taskType = '企业公告/证监会'


def secrchATT(item_id, name, type_id):
    """Look up an existing attachment row in ``clb_sys_attachment``.

    Runs a parameterized query on the module-level ``cursor_`` that matches
    on ``item_id``, ``name`` and ``type_id`` and selects only the ``id``
    column.

    Returns:
        Whatever ``cursor_.fetchone()`` yields for the query; callers treat a
        falsy result as "no such attachment" and read the id from index 0.
    """
    query = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s '''
    params = (item_id, name, type_id)
    cursor_.execute(query, params)
    return cursor_.fetchone()


# Insert a row into the attachment table and return the attachment id.
def tableUpdate(retData, com_name, year, pdf_name, num):
    """Ensure an attachment row exists for ``pdf_name`` and return its id.

    Args:
        retData: mapping describing the uploaded file; the keys read here are
            item_id, type_id, group_name, path, full_path, category,
            file_size, status, create_by, page_size and create_time.
        com_name: company name, used only in the "already exists" log line.
        year: value stored in the ``year`` column.
        pdf_name: attachment name, stored in ``name`` and used for the lookup.
        num: value stored in the ``order_by`` column.

    Returns:
        The ``id`` of the existing row, or of the row that was just inserted.
    """
    # Read every field up front so a missing key fails before any DB access.
    (item_id, type_id, group_name, path, full_path, category, file_size,
     status, create_by, page_size, create_time) = (
        retData[key] for key in (
            'item_id', 'type_id', 'group_name', 'path', 'full_path',
            'category', 'file_size', 'status', 'create_by', 'page_size',
            'create_time',
        )
    )
    order_by = num

    existing = secrchATT(item_id, pdf_name, type_id)
    if existing:
        log.info(f'com_name:{com_name}已存在')
        return existing[0]

    Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
    row = (
        year, pdf_name, type_id, item_id, group_name, path, full_path,
        category, file_size, order_by, status, create_by, create_time,
        page_size,
    )
    cursor_.execute(Upsql, row)  # insert
    cnx_.commit()  # commit
    log.info(f"更新完成:{Upsql}")

    # Re-query to obtain the id of the row that was just written.
    inserted = secrchATT(item_id, pdf_name, type_id)
    return inserted[0]

def RequestUrl(url, payload, social_code,start_time):
    """POST ``payload`` to ``url`` and return the parsed HTML page.

    The request is attempted up to three times. It uses the module-level
    ``headers`` dict, which is defined in the ``__main__`` section of this
    script.

    Args:
        url: listing page URL to post the search form to.
        payload: form fields sent as the POST body.
        social_code: company identifier, used only for failure bookkeeping.
        start_time: ``time.time()`` value used to compute the elapsed time
            recorded on failure.

    Returns:
        A ``BeautifulSoup`` document on HTTP 200. Otherwise the empty string
        ``''`` (callers compare against ``''``), after recording the failure
        through ``baseCore.recordLog``.
    """
    # ip = get_proxy()[random.randint(0, 3)]

    # Stays None when every attempt raises, so the failure path below can run
    # instead of hitting an unbound local.
    response = None
    for attempt in range(3):
        try:
            response = requests.post(url=url, headers=headers, data=payload)  # ,proxies=ip)
            response.encoding = response.apparent_encoding
            break
        except Exception as e:
            log.error(f"request请求异常----{attempt}-----{e}")

    if response is not None and response.status_code == 200:
        return BeautifulSoup(response.text, 'html.parser')

    # Request failed (no response at all, or a non-200 status): record it.
    log.error(f'请求失败:{url}')
    state = 0
    takeTime = baseCore.getTimeCost(start_time, time.time())
    baseCore.recordLog(social_code, taskType, state, takeTime, url, '请求失败')
    return ''

def getUrl(code, url_parms, Catagory2_parms):
    """Build the search URL and POST form for a stock code.

    The exchange is chosen from the first character of ``code``:

    * ``0``/``2``/``3`` - Shenzhen, uses index 1 of the parameter lists
    * ``6``/``9``       - Shanghai, uses index 0
    * ``4``/``8``       - Beijing Stock Exchange, uses index 2

    Args:
        code: stock code string.
        url_parms: per-exchange column ids used in the URL path.
        Catagory2_parms: per-exchange values for the ``selCatagory2`` field.

    Returns:
        ``{'code', 'url', 'Catagory2', 'payload'}`` for a recognised code.
        An empty dict when the code is empty, its prefix is not recognised,
        or the parameter lists have no entry for that exchange (for example
        a column that the Beijing exchange does not have).
    """
    if not code:
        return {}

    prefix = code[0]
    # Each exchange's form carries a slightly different set of trailing
    # (empty) fields; keep them exactly as the site's own forms send them.
    if prefix in ('0', '2', '3'):      # Shenzhen
        index = 1
        extra_fields = ('selBoardCode0', 'selBoardCode')
    elif prefix in ('6', '9'):         # Shanghai
        index = 0
        extra_fields = ('selCatagory3', 'selBoardCode0', 'selBoardCode')
    elif prefix in ('4', '8'):         # Beijing Stock Exchange
        index = 2
        extra_fields = ()
    else:
        return {}

    # The requested column may not exist for this exchange.
    if index >= len(url_parms) or index >= len(Catagory2_parms):
        return {}

    url = f'http://eid.csrc.gov.cn/{url_parms[index]}/index_f.html'
    Catagory2 = Catagory2_parms[index]

    # POST form fields; prodType carries the stock code.
    payload = {
        'prodType': f'{code}',
        'prodType2': '代码/简称/拼音缩写 ',
        'keyWord': '',
        'keyWord2': '关键字',
        'startDate': '',
        'startDate2': '请输入开始时间',
        'endDate': '',
        'endDate2': '请输入结束时间',
        'selCatagory2': f'{Catagory2}',
    }
    for field in extra_fields:
        payload[field] = ''

    return {
        'code': code,
        'url': url,
        'Catagory2': Catagory2,
        'payload': payload,
    }


def InsterInto(short_name, social_code, pdf_url):
    """Register an announcement in ``brpa_source_article`` unless it exists.

    A row counts as existing when ``social_credit_code`` and
    ``source_address`` match, with origin '证监会' and type '1'.

    Args:
        short_name: company short name, used only in the "already exists"
            message.
        social_code: value for the ``social_credit_code`` column.
        pdf_url: value compared with and stored in ``source_address``.

    Returns:
        ``True`` when a new row was inserted and committed. ``False`` when
        the row already existed or the insert failed (the failure is
        recorded through ``baseCore.recordLog``).

    Note:
        The failure path reads the module-level ``start_time`` that the
        ``__main__`` loop sets for the company being processed.
    """
    sel_sql = '''select social_credit_code,source_address from brpa_source_article where social_credit_code = %s and source_address = %s and origin='证监会' and type='1' '''
    cursor.execute(sel_sql, (social_code, pdf_url))
    if cursor.fetchone():
        print(f'com_name:{short_name}、{pdf_url}已存在')
        return False

    # Insert the record into the database.
    insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,create_time) values(%s,%s,%s,%s,now())'''
    list_info = (
        social_code,
        pdf_url,
        '证监会',
        '1',
    )
    try:
        cursor.execute(insert_sql, list_info)
        cnx.commit()
        return True
    except Exception as e:
        # Log the cause instead of swallowing it, then record the failure.
        log.error(f'数据库传输失败----{e}')
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '数据库传输失败')
        return False


def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_name,num):
    """Upload one PDF, register it as an attachment and publish it to Kafka.

    Steps:
        1. ``baseCore.upLoadToServe`` uploads/parses the PDF; a falsy
           ``retData['state']`` aborts with ``False``.
        2. ``tableUpdate`` stores the attachment row and returns its id.
        3. The announcement record is sent to the ``researchReportTopic``
           Kafka topic.

    Args:
        pdf_url: download URL of the PDF, also sent as ``sourceAddress``.
        pdf_name: announcement title / attachment name.
        social_code: company identifier.
        year: publication year.
        pub_time: publication date string sent as ``publishDate``.
        start_time: ``time.time()`` value used for elapsed-time bookkeeping.
        com_name: company name, forwarded to ``tableUpdate`` for logging.
        num: base ordering index; ``num + 1`` is stored as ``order_by``.

    Returns:
        ``True`` when the Kafka send was acknowledged, ``False`` otherwise.
    """
    # Upload to the file server.
    retData = baseCore.upLoadToServe(pdf_url,8,social_code)
    if not retData['state']:
        log.info(f'====pdf解析失败====')
        return False

    # Store the attachment in the attachment table.
    num = num + 1
    att_id = tableUpdate(retData,com_name,year,pdf_name,num)
    content = retData['content']

    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    dic_news = {
        'attachmentIds': att_id,
        'author': '',
        'content': content,
        'contentWithTag': '',
        'createDate': time_now,
        'deleteFlag': '0',
        'id': '',
        'keyWords': '',
        'lang': 'zh',
        'origin': '证监会',
        'publishDate': pub_time,
        'sid': '1684032033495392257',
        'sourceAddress': pdf_url,  # link to the original document
        'summary': '',
        'title': pdf_name,
        'type': 3,
        'socialCreditCode': social_code,
        'year': year
    }
    # print(dic_news)
    # Send the record through Kafka for persistence.
    producer = None
    try:
        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
        kafka_result = producer.send("researchReportTopic", json.dumps(dic_news, ensure_ascii=False).encode('utf8'))

        print(kafka_result.get(timeout=10))

        dic_result = {
            'success': 'ture',
            'message': '操作成功',
            'code': '200',
        }
        print(dic_result)
        return True
    except Exception as e:
        dic_result = {
            'success': 'false',
            'message': '操作失败',
            'code': '204',
            'e': e
        }
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, 'Kafka操作失败')
        print(dic_result)
        return False
    finally:
        # A producer is created per call, so release it here; otherwise every
        # announcement would leave a producer (and its connections) open.
        if producer is not None:
            try:
                producer.close(timeout=10)
            except Exception as e:
                log.error(f'Kafka producer关闭失败----{e}')


def _parse_download_onclick(onclick):
    """Split a ``downloadPdf1('url','name','date',...)`` onclick value.

    Returns ``(pdf_url, name_pdf, pub_time)``. Raises ``IndexError`` when the
    value does not contain at least three quoted arguments.
    """
    prefix = 'downloadPdf1('
    args = onclick.strip()
    # Remove the call prefix explicitly; str.strip(prefix) would treat the
    # prefix as a set of characters rather than a leading substring.
    if args.startswith(prefix):
        args = args[len(prefix):]
    pdf_url = args.split(',')[0].strip('\'')
    quoted_parts = args.split('\',')
    name_pdf = quoted_parts[1].strip('\'')
    pub_time = quoted_parts[2].strip('\'')
    return pdf_url, name_pdf, pub_time


# Collect announcements for one company and one column.
def SpiderByZJH(url, payload, dic_info, start_time,num):  # dic_info: basic company info read from the database
    """Collect every announcement listed for one company under ``url``.

    Posts the search form, reads the total hit count to work out the number
    of result pages (15 rows per page), then walks each page. Each row is
    registered through ``InsterInto``; new rows are downloaded and published
    through ``GetContent``.

    Args:
        url: first listing page (``.../index_f.html``) of the column.
        payload: POST form fields for the search.
        dic_info: company record; indexes used here are 1 (company name),
            2 (social credit code), 3 (stock code) and 4 (short name).
        start_time: ``time.time()`` value used for elapsed-time bookkeeping.
        num: ordering base forwarded to ``GetContent``.

    Returns:
        None. Outcomes are reported through the logger and
        ``baseCore.recordLog``.
    """
    okCount = 0
    errorCount = 0
    social_code = dic_info[2]
    short_name = dic_info[4]
    com_name = dic_info[1]
    # Same value the __main__ loop keeps in its global ``code``; reading it
    # here avoids depending on that global.
    code = dic_info[3]

    soup = RequestUrl(url, payload, social_code, start_time)
    if soup == '':
        return

    # Check whether the search returned anything at all.
    no_data = soup.find('div', class_='con')
    if no_data is not None and no_data.text == '没有查询到数据':
        state = 1
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, url, '')
        return

    # Read the total number of hits from the pagination bar.
    pages_div = soup.find('div', class_='pages')
    pages_ul = pages_div.find('ul', class_='g-ul') if pages_div is not None else None
    numbers = re.findall(r'\d+', pages_ul.text) if pages_ul is not None else []
    if not numbers:
        log.error(f'{short_name}====={code}===========未获取到页数信息')
        return

    total = int(numbers[0])
    Maxpage = (total + 14) // 15  # ceiling division, 15 rows per page
    log.info(f'{short_name}====={code}===========一共{total}条,{Maxpage}页')

    # The first page and the following pages use different URLs.
    for i in range(1, Maxpage + 1):
        log.info(f'==========正在采集第{i}页=========')
        if i == 1:
            # Already fetched above to read the total; no need to ask again.
            page_soup = soup
        else:
            # http://eid.csrc.gov.cn/101811/index_3_f.html
            href = url.split('index')[0] + f'index_{i}_f.html'
            page_soup = RequestUrl(href, payload, social_code, start_time)
            if page_soup == '':
                continue

        table_div = page_soup.find('div', id='txt')
        if table_div is None:
            log.error(f'{short_name}====={code}===========第{i}页未找到公告列表')
            continue

        # Skip the header row.
        for tr in table_div.find_all('tr')[1:]:
            td_list = tr.find_all('td')
            try:
                pdf_url, name_pdf, pub_time = _parse_download_onclick(td_list[2]['onclick'])
            except (IndexError, KeyError) as e:
                errorCount += 1
                log.error(f'{short_name}====={code}===========第{i}页公告行解析失败----{e}')
                continue
            year = pub_time[:4]

            # Register the announcement in the database.
            # NOTE(review): the announcement title is passed in the position
            # InsterInto names ``pdf_url`` (stored as source_address). Kept
            # as-is because existing rows are deduplicated on that value;
            # confirm whether the URL was intended.
            insert = InsterInto(short_name, social_code, name_pdf)
            if not insert:
                log.info(f'======={short_name}========{code}===已存在')
                continue

            # Download and parse the PDF, then publish it.
            log.info(f'======={short_name}========{code}===插入公告库成功')
            result = GetContent(pdf_url, name_pdf, social_code, year, pub_time, start_time,com_name,num)

            if result:
                okCount = okCount + 1
                log.info(f'{short_name}==============解析传输操作成功')
                state = 1
                takeTime = baseCore.getTimeCost(start_time, time.time())
                baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '')
            else:
                errorCount += 1
                log.error(f'{short_name}=============解析或传输操作失败')

    log.info(f'{short_name}====={code}===========成功{okCount}条,失败{errorCount}条')

if __name__ == '__main__':
    # Worker entry point: pull company codes from Redis forever and collect
    # the issuance announcements and temporary reports for each of them.
    num = 0
    # Request headers for eid.csrc.gov.cn; RequestUrl reads this module-level
    # name.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Length': '380',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Cookie': 'acw_tc=01c6049e16908026442931294e4d0b65d95e3ba93ac19993d151844ac6',
        'Host': 'eid.csrc.gov.cn',
        'Origin': 'http://eid.csrc.gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'http://eid.csrc.gov.cn/101111/index_1_f.html',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
    }

    # Headers for static.sse.com.cn; not referenced elsewhere in this file.
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': 'ba17301551dcbaf9_gdp_user_key=; gdp_user_id=gioenc-4c21c93a%2Ccdgd%2C5c8b%2Cc32e%2C8g0229546a17; ba17301551dcbaf9_gdp_session_id_dc777856-a24e-4008-a8a6-af88d75bae2b=true; ba17301551dcbaf9_gdp_sequence_ids={%22globalKey%22:3%2C%22VISIT%22:2%2C%22PAGE%22:2}; acw_tc=71dbb29c16908906086793104e8117f44af84d756f68927c202e9a70b1',
        'Host': 'static.sse.com.cn',
        'Pragma': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
    }

    # Shanghai http://eid.csrc.gov.cn/101111/index.html
    # Shenzhen http://eid.csrc.gov.cn/101811/index.html
    # Beijing  http://eid.csrc.gov.cn/102611/index.html
    # Paged column URL pattern: http://eid.csrc.gov.cn/101811/index_3_f.html

    # Issuance/listing announcements; the Beijing exchange has no such column.
    url_parms = ['101110', '101810']
    Catagory2_parms = ['9603', '10057']
    # Temporary reports.
    url_parms_ls = ['101112', '101812', '102612']
    Catagory2_parms_ls = ['9605', '10059', '10163']

    dic_parms = {}
    try:
        # Read stock code, short name and social credit code for each company.
        while True:
            start_time = time.time()
            # Fetch the next company to process.
            social_code = baseCore.redicPullData('NoticeEnterpriseFbs:gnqy_socialCode')
            # social_code = '9110000071092841XX'
            # Nothing queued in Redis: wait and poll again.
            if social_code is None:
                time.sleep(20)
                continue
            dic_info = baseCore.getInfomation(social_code)
            count = dic_info[16]

            # Pick the listing URLs from the stock code: codes starting with
            # 0/2/3 are Shenzhen, 6/9 Shanghai, 4/8 Beijing.
            code = dic_info[3]
            short_name = dic_info[4]
            com_name = dic_info[1]
            dic_parms = getUrl(code, url_parms, Catagory2_parms)
            dic_parms_ls = getUrl(code, url_parms_ls, Catagory2_parms_ls)

            # getUrl yields nothing usable when a column does not exist for
            # the exchange, so treat the two columns independently.
            if not dic_parms and not dic_parms_ls:
                log.info(f'{code}==========={short_name},{com_name},无可采集栏目')
                continue

            if dic_parms:
                start_time_cj = time.time()
                SpiderByZJH(dic_parms["url"], dic_parms["payload"], dic_info, start_time,num)
                log.info(f'{code}==========={short_name},{com_name},发行公告,耗时{baseCore.getTimeCost(start_time_cj, time.time())}')

            if dic_parms_ls:
                start_time_ls = time.time()
                SpiderByZJH(dic_parms_ls['url'], dic_parms_ls['payload'], dic_info, start_time,num)
                log.info(f'{code}==========={short_name},{com_name},临时报告,耗时{baseCore.getTimeCost(start_time_ls, time.time())}')

            # UpdateInfoSql(retData,retData_ls,social_code)
            # log.info(f'{code}================更新成功')
            end_time = time.time()
            log.info(f'{short_name} ---- 该企业耗时 ---- {baseCore.getTimeCost(start_time, end_time)}-----------')
            count += 1
            runType = 'NoticeReportCount'
            baseCore.updateRun(code, runType, count)
    finally:
        # The loop never ends on its own; release resources when it is
        # interrupted or fails.
        cursor.close()
        cnx.close()
        # cursor_.close()
        # cnx_.close()
        baseCore.close()
