
from fdfs_client.client import get_tracker_conf, Fdfs_client

from bs4 import BeautifulSoup
import requests, re, time, pymysql,  fitz
import urllib3
from base import BaseCore

# Silence urllib3's InsecureRequestWarning for the whole process.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Shared project helper; this file uses its getTimeCost, recordLog, redicPullData,
# getInfomation, updateRun and close methods.
baseCore = BaseCore.BaseCore()
# conn = cx_Oracle.connect('cis/ZZsn9988_1qaz@114.116.91.1:1521/orcl')
# NOTE(review): database host, user and password are hard-coded here (and in the
# commented-out Oracle line above). They should come from configuration or the
# environment and be rotated, since they are now part of the source history.
# Module-wide MySQL connection and cursor, shared by tableUpdate and SpiderByZJH.
cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
cursor_ = cnx.cursor()

# FastDFS client built from the tracker settings in ./client.conf (path is relative
# to the current working directory); used to upload the downloaded PDFs.
tracker_conf = get_tracker_conf('./client.conf')
client = Fdfs_client(tracker_conf)

# Task label passed to baseCore.recordLog for every log entry written by this script.
taskType = '企业年报/证监会'

# def get_proxy():
#     cursor = cnx_ip.cursor()
#     sql = "select proxy from clb_proxy"
#     cursor.execute(sql)
#     proxy_lists = cursor.fetchall()
#     ip_list = []
#     for proxy_ in proxy_lists:
#         ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
#     proxy_list = []
#     for str_ip in ip_list:
#         str_ip_list = str_ip.split('-')
#         proxyMeta = "http://%(host)s:%(port)s" % {
#             "host": str_ip_list[0],
#             "port": str_ip_list[1],
#         }
#         proxy = {
#             "HTTP": proxyMeta,
#             "HTTPS": proxyMeta
#         }
#         proxy_list.append(proxy)
#     return proxy_list

def RequestUrl(url, payload, item_id, start_time, timeout=30):
    """POST ``payload`` to ``url`` and return the parsed HTML page.

    The request is sent with the module-level ``headers`` dict, which is
    defined in the ``__main__`` section of this file.

    Args:
        url: Listing page URL to post the search form to.
        payload: Form fields sent as the request body.
        item_id: Identifier written to the run log when the request fails.
        start_time: ``time.time()`` value the elapsed time is measured from.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``BeautifulSoup`` document on HTTP 200, otherwise the empty string
        ``''`` (callers test ``soup == ''``). Every failure is recorded through
        ``baseCore.recordLog`` with state 0.
    """
    # ip = get_proxy()[random.randint(0, 3)]
    try:
        response = requests.post(url=url, headers=headers, data=payload, timeout=timeout)  # ,proxies=ip)
    except requests.RequestException as e:
        # Connection errors and timeouts follow the same failure contract as a
        # non-200 answer instead of killing the long-running worker loop.
        print('请求失败:', e)
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(item_id, taskType, 0, takeTime, url, '请求失败')
        return ''

    response.encoding = response.apparent_encoding
    if response.status_code != 200:
        # Request failed: report the status and body, then log the failure.
        print('请求失败:', response.status_code, response.text)
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(item_id, taskType, 0, takeTime, url, '请求失败')
        return ''

    return BeautifulSoup(response.text, 'html.parser')


def tableUpdate(year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status,
                create_by, create_time, page_size):
    """Insert one row into ``clb_sys_attachment`` unless it is already there.

    A row counts as existing when a record with the same ``item_id`` and
    ``year`` is found; in that case only a message is printed. Otherwise the
    fourteen arguments are inserted as one row and the transaction is
    committed on the module-level connection ``cnx``.

    Returns:
        None.
    """
    lookup_sql = '''select item_id from clb_sys_attachment where item_id = %s and year = %s'''
    cursor_.execute(lookup_sql, (item_id, year))
    if cursor_.fetchone():
        # Already stored for this item and year: nothing to write.
        print(f'{name_pdf},{year}已存在')
        return

    insert_sql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
    row = (year, name_pdf, type_id, item_id, group_name, path, full_path, category,
           file_size, order_by, status, create_by, create_time, page_size)

    cursor_.execute(insert_sql, row)  # insert the new attachment row
    cnx.commit()  # make it durable
    print("更新完成:{}".format(insert_sql))

# Collect annual-report PDFs from the listing pages
_ZJH_YEAR_IN_NAME = re.compile(r'(\d{4})\s*年')
_ZJH_PAGE_SIZE = 15  # rows per listing page, used to derive the page count


def _zjh_record_log(item_id, start_time, url, state, message):
    """Write one run-log entry with the time elapsed since ``start_time``."""
    takeTime = baseCore.getTimeCost(start_time, time.time())
    baseCore.recordLog(item_id, taskType, state, takeTime, url, message)


def _zjh_parse_onclick(onclick):
    """Return the arguments of a ``downloadPdf1('url','name','date',...)`` handler.

    The text between the first ``(`` and the last ``)`` is split on commas and
    each piece is stripped of whitespace and single quotes. An argument that
    itself contains a comma is therefore split in two.
    """
    start = onclick.find('(')
    end = onclick.rfind(')')
    inner = onclick[start + 1:end] if end > start else onclick[start + 1:]
    return [piece.strip().strip('\'') for piece in inner.split(',')]


def _zjh_report_year(name_pdf, pub_date):
    """Return the report year as a string, or None when it cannot be derived.

    The year is taken from the first ``NNNN年`` in the PDF name; when the name
    has none, it falls back to the publication year minus one.
    """
    found = _ZJH_YEAR_IN_NAME.search(name_pdf)
    if found:
        return found.group(1)
    try:
        return str(int(pub_date[:4]) - 1)
    except ValueError:
        return None


def _zjh_fetch_pdf(pdf_url, attempts=3, timeout=60):
    """Download ``pdf_url`` and count its pages; returns ``(content, page_count)``.

    Any failure (network or PDF parsing) is printed and retried after three
    seconds. ``(None, 0)`` is returned when every attempt failed.
    """
    for _ in range(attempts):
        try:
            content = requests.request("GET", pdf_url, timeout=timeout).content
            with fitz.open(stream=content, filetype='pdf') as doc:
                return content, doc.page_count
        except Exception as e:
            print(e)
            time.sleep(3)
    return None, 0


def _zjh_upload_pdf(content, attempts=3):
    """Upload ``content`` through the FastDFS client; returns its result or ``''``."""
    for _ in range(attempts):
        try:
            return client.upload_by_buffer(content, file_ext_name='pdf')
        except Exception as e:
            print(e)
            time.sleep(3)
    return ''


def SpiderByZJH(url, payload, dic_info, num, start_time):
    """Collect one company's annual reports and store them as attachments.

    The listing at ``url`` is walked page by page. Every row whose type is
    '年报' and whose name does not contain '摘要' is downloaded, its pages are
    counted, the file is uploaded through the FastDFS client and a row is
    written with ``tableUpdate``. Each outcome is recorded in the run log.

    Args:
        url: URL of the first listing page; later pages are derived from it by
            replacing everything from 'index' with ``index_<n>_f.html``.
        payload: Form fields posted with every listing request.
        dic_info: Company record indexed by position; element 2 is used as the
            attachment ``item_id`` and element 4 as the short name in messages.
        num: ``order_by`` value for the first stored report. It is incremented
            locally for each further report and is not handed back to the caller.
        start_time: ``time.time()`` value the logged durations are measured from.

    Returns:
        None.
    """
    item_id = dic_info[2]
    short_name = dic_info[4]

    first_page = RequestUrl(url, payload, item_id, start_time)
    if first_page == '':
        return

    # The first number in the pager text is taken as the total row count.
    pager = first_page.find('div', class_='pages')
    pager_list = pager.find('ul', class_='g-ul') if pager is not None else None
    counts = re.findall(r'\d+', pager_list.text) if pager_list is not None else []
    if not counts:
        print(f'==={short_name}===获取页数失败')
        _zjh_record_log(item_id, start_time, url, 0, '获取页数失败')
        return
    max_page = (int(counts[0]) + _ZJH_PAGE_SIZE - 1) // _ZJH_PAGE_SIZE

    sel_sql = '''select item_id,year from clb_sys_attachment where item_id = %s and year = %s'''

    for page_no in range(1, max_page + 1):
        if page_no == 1:
            # Page 1 was already fetched to read the pager; reuse it.
            soup = first_page
        else:
            # e.g. http://eid.csrc.gov.cn/101811/index_3_f.html
            href = url.split('index')[0] + f'index_{page_no}_f.html'
            soup = RequestUrl(href, payload, item_id, start_time)
            if soup == '':
                continue

        table_box = soup.find('div', id='txt')
        if table_box is None:
            continue

        # The first row is skipped, as in the original parsing (header row).
        for tr in table_box.find_all('tr')[1:]:
            td_list = tr.find_all('td')
            if len(td_list) < 5:
                continue
            if td_list[4].text.strip() != '年报':
                continue

            onclick = td_list[2].get('onclick')
            if not onclick:
                continue
            args = _zjh_parse_onclick(onclick)
            if len(args) < 2:
                continue
            pdf_url, name_pdf = args[0], args[1]
            if '摘要' in name_pdf:
                continue

            pub_date = args[2] if len(args) > 2 else ''
            year = _zjh_report_year(name_pdf, pub_date)
            if year is None:
                print(f'==={short_name}、{name_pdf}===年份解析失败')
                _zjh_record_log(item_id, start_time, pdf_url, 0, '年份解析失败')
                continue

            # Skip reports that are already stored for this company and year.
            cursor_.execute(sel_sql, (item_id, year))
            if cursor_.fetchone():
                print(f'com_name:{short_name}、{year}已存在')
                continue

            resp_content, page_size = _zjh_fetch_pdf(pdf_url)
            if page_size < 1:
                # Download or PDF parsing failed on every attempt.
                print(f'==={short_name}、{year}===pdf解析失败')
                _zjh_record_log(item_id, start_time, pdf_url, 0, 'pdf解析失败')
                continue

            result = _zjh_upload_pdf(resp_content)
            if result == '':
                _zjh_record_log(item_id, start_time, pdf_url, 0, '上传服务器失败')
                continue
            if not ('Remote file_id' in str(result) and 'Uploaded size' in str(result)):
                _zjh_record_log(item_id, start_time, pdf_url, 0, '采集失败')
                continue

            full_path = bytes.decode(result['Remote file_id'])
            path = full_path.replace('group1', '')
            create_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            try:
                # item_id stays the value read from dic_info[2] above, so the row
                # is stored under the same key the duplicate check queried.
                tableUpdate(year, name_pdf, '1', item_id, 'group1', path, full_path,
                            'pdf', result['Uploaded size'], num, 1, 'XueLingKun', create_time, page_size)
                _zjh_record_log(item_id, start_time, pdf_url, 1, '')
            except Exception as e:
                print(e)
                _zjh_record_log(item_id, start_time, pdf_url, 0, '数据库传输失败')
            num = num + 1
            time.sleep(2)


def getUrl(code, url_parms, Catagory2_parms):
    """Build the listing URL and search form for a stock code.

    The exchange is chosen from the first character of ``code``:
    0/2/3 -> Shenzhen (index 1), 6/9 -> Shanghai (index 0),
    4/8 -> Beijing (index 2). The index selects the column id from
    ``url_parms`` and the category id from ``Catagory2_parms``.

    Args:
        code: Stock code as a string.
        url_parms: Column ids ordered Shanghai, Shenzhen, Beijing.
        Catagory2_parms: Category ids in the same order.

    Returns:
        A dict with the keys 'code', 'url', 'Catagory2' and 'payload', or
        None when the code is empty, its first character matches no exchange,
        or the parameter lists have no entry for the exchange.
    """
    if not code:
        return None

    prefix = code[0]
    if prefix in ('0', '2', '3'):
        # Shenzhen
        index, extra_fields = 1, ('selBoardCode0', 'selBoardCode')
    elif prefix in ('6', '9'):
        # Shanghai
        index, extra_fields = 0, ('selCatagory3', 'selBoardCode0', 'selBoardCode')
    elif prefix in ('4', '8'):
        # Beijing
        index, extra_fields = 2, ()
    else:
        return None

    try:
        url = f'http://eid.csrc.gov.cn/{url_parms[index]}/index_f.html'
        Catagory2 = Catagory2_parms[index]
    except (IndexError, KeyError):
        return None

    # Form fields shared by all exchanges; prodType carries the stock code.
    payload = {
        'prodType': f'{code}',
        'prodType2': '代码/简称/拼音缩写 ',
        'keyWord': '',
        'keyWord2': '关键字',
        'startDate': '',
        'startDate2': '请输入开始时间',
        'endDate': '',
        'endDate2': '请输入结束时间',
        'selCatagory2': f'{Catagory2}',
    }
    # Exchange-specific empty fields, appended in the order the forms use them.
    for field in extra_fields:
        payload[field] = ''

    return {
        'code': code,
        'url': url,
        'Catagory2': Catagory2,
        'payload': payload,
    }


if __name__ == '__main__':
    # Request headers for the listing pages. RequestUrl reads this dict as a
    # module-level global, so the name must stay ``headers``.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Length': '380',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Cookie': 'acw_tc=01c6049e16908026442931294e4d0b65d95e3ba93ac19993d151844ac6',
        'Host': 'eid.csrc.gov.cn',
        'Origin': 'http://eid.csrc.gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'http://eid.csrc.gov.cn/101111/index_1_f.html',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
    }

    # Headers for static.sse.com.cn; not referenced anywhere in the visible file.
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': 'ba17301551dcbaf9_gdp_user_key=; gdp_user_id=gioenc-4c21c93a%2Ccdgd%2C5c8b%2Cc32e%2C8g0229546a17; ba17301551dcbaf9_gdp_session_id_dc777856-a24e-4008-a8a6-af88d75bae2b=true; ba17301551dcbaf9_gdp_sequence_ids={%22globalKey%22:3%2C%22VISIT%22:2%2C%22PAGE%22:2}; acw_tc=71dbb29c16908906086793104e8117f44af84d756f68927c202e9a70b1',
        'Host': 'static.sse.com.cn',
        'Pragma': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
    }

    # Column ids and category ids, ordered Shanghai, Shenzhen, Beijing:
    # http://eid.csrc.gov.cn/101111/index.html, .../101811/index.html, .../102611/index.html
    # Paged listing URLs look like http://eid.csrc.gov.cn/101811/index_3_f.html
    url_parms = ['101111', '101811', '102611']
    Catagory2_parms = ['9604', '10058', '10162']

    num = 1
    try:
        while True:
            start_time = time.time()
            # Pull the next company identifier; wait and poll again when none is queued.
            social_code = baseCore.redicPullData('AnnualEnterprise:gnqy_socialCode')
            if not social_code or social_code == 'None':
                time.sleep(20)
                continue

            # Company record indexed by position: [3] stock code, [4] short name,
            # [15] run counter (as used below and in SpiderByZJH).
            dic_info = baseCore.getInfomation(social_code)
            count = dic_info[15]

            # Codes starting with 0/2/3 are Shenzhen, 6/9 Shanghai, 4/8 Beijing.
            code = dic_info[3]
            dic_parms = getUrl(code, url_parms, Catagory2_parms)
            if not dic_parms:
                # getUrl can return None; subscripting it would stop the worker.
                print(f'{dic_info[4]} ---- 股票代码无法匹配交易所 ---- {code}')
                takeTime = baseCore.getTimeCost(start_time, time.time())
                baseCore.recordLog(social_code, taskType, 0, takeTime, '', '股票代码无法匹配交易所')
                continue

            SpiderByZJH(dic_parms["url"], dic_parms["payload"], dic_info, num, start_time)
            end_time = time.time()
            print(f'{dic_info[4]} ---- 该企业耗时 ---- {end_time - start_time}')
            count += 1
            runType = 'AnnualReportCount'
            baseCore.updateRun(social_code, runType, count)
    finally:
        # The loop only ends through an exception or an interrupt, so the
        # shared resources are released here; the cursor goes before its connection.
        cursor_.close()
        cnx.close()
        baseCore.close()



