import re
import time

import numpy as np
import pandas as pd
import requests
import os
import json
from base import BaseCore

# Shared project helper; this file uses it for logging (getLogger), proxy
# selection (get_proxy) and final cleanup (close).
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
# Default request headers imitating a browser AJAX call issued from the SZSE
# fund-notice page (see the Referer / X-Requested-With values). Used by the
# announcement-list requests and by the PDF download request.
headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Content-Type': 'application/json',
    'Host': 'www.szse.cn',
    'Origin': 'http://www.szse.cn',
    'Pragma': 'no-cache',
    'Referer': 'http://www.szse.cn/disclosure/fund/notice/index.html',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
    'X-Request-Type': 'ajax',
    'X-Requested-With': 'XMLHttpRequest',
}
# Announcement-list endpoint; queried with a JSON POST body by getPageSize()
# and getDataList().
url = 'http://www.szse.cn/api/disc/announcement/annList'


# Fetch the list of fund codes
def getCodeList(timeout=30):
    """Return the security codes found in the SZSE product-list report.

    The report is requested with ``selectJjlb`` set to the URL-encoded value
    "基础设施基金" and is read from ``response[0]['data']``. Each row's
    ``sys_key`` field is an HTML fragment; the code is the text inside its
    first ``<u>...</u>`` element.

    Args:
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            raises instead of blocking the crawl forever.

    Returns:
        A list of code strings, in the order the report lists them. Rows
        whose ``sys_key`` contains no ``<u>`` element are logged and skipped.
    """
    # This endpoint needs its own Referer (the product-list page), so it uses
    # a local header set instead of the module-level one.
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/json',
        'Host': 'www.szse.cn',
        'Pragma': 'no-cache',
        'Referer': 'http://www.szse.cn/market/product/list/all/index.html',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
        'X-Request-Type': 'ajax',
        'X-Requested-With': 'XMLHttpRequest',
    }
    url = 'http://www.szse.cn/api/report/ShowReport/data?SHOWTYPE=JSON&CATALOGID=1105&TABKEY=tab1&selectJjlb=%E5%9F%BA%E7%A1%80%E8%AE%BE%E6%96%BD%E5%9F%BA%E9%87%91'
    ip = baseCore.get_proxy()
    req = requests.get(url, headers=headers, proxies=ip, timeout=timeout)
    req.encoding = req.apparent_encoding
    data_list = req.json()[0]['data']

    code_pattern = re.compile(r'<u>(.*?)</u>')
    code_list = []
    for data_ in data_list:
        match = code_pattern.search(data_['sys_key'])
        if match is None:
            # One malformed row should not abort the whole crawl.
            log.error(f"no <u> code found in sys_key: {data_['sys_key']!r}")
            continue
        code_list.append(match.group(1))
    return code_list


# Get the total number of pages
def getPageSize(id, timeout=30):
    """Return how many 50-item pages of announcements exist for one security.

    Posts a page-1 query to the announcement-list endpoint and derives the
    page count from the ``announceCount`` field of the JSON response.

    Args:
        id: Security code to query (formatted into the ``stock`` filter). The
            name shadows the builtin but is kept for caller compatibility.
        timeout: Seconds passed to ``requests.post`` so a stalled connection
            raises instead of blocking the crawl forever.

    Returns:
        The number of pages as an int; 0 when ``announceCount`` is 0.
    """
    page_len = 50  # must match the "pageSize" sent to the API
    data_post = json.dumps({
        "seDate": ["", ""],
        "stock": [f"{id}"],
        "channelCode": ["fundinfoNotice_disc"],
        "pageSize": page_len,
        "pageNum": 1,
    })
    ip = baseCore.get_proxy()
    req = requests.post(url, headers=headers, data=data_post, proxies=ip, timeout=timeout)
    req.encoding = req.apparent_encoding
    total = int(req.json()['announceCount'])
    # Integer ceiling division: avoids the float round-trip of int(total / 50).
    return (total + page_len - 1) // page_len


# Fetch the JSON announcement records
def getDataList(id, page, timeout=30):
    """Return one page of announcement records for a security.

    Posts a query for up to 50 items to the announcement-list endpoint and
    returns the ``data`` field of the JSON response unchanged.

    Args:
        id: Security code to query (formatted into the ``stock`` filter). The
            name shadows the builtin but is kept for caller compatibility.
        page: 1-based page number sent as ``pageNum``.
        timeout: Seconds passed to ``requests.post`` so a stalled connection
            raises instead of blocking the crawl forever.

    Returns:
        The value of the response's ``data`` key.
    """
    data_post = json.dumps({
        "seDate": ["", ""],
        "stock": [f"{id}"],
        "channelCode": ["fundinfoNotice_disc"],
        "pageSize": 50,
        "pageNum": page,
    })
    ip = baseCore.get_proxy()
    req = requests.post(url, headers=headers, data=data_post, proxies=ip, timeout=timeout)
    req.encoding = req.apparent_encoding
    return req.json()['data']


# Fetch the PDF file as raw bytes
def getContent(href, timeout=30):
    """Download *href* through a proxy and return the response body as bytes.

    Args:
        href: URL of the file to download.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            raises instead of blocking the crawl forever.

    Returns:
        The raw response body (``Response.content``). The HTTP status code is
        not checked here, so an error response body is returned as-is.
    """
    ip = baseCore.get_proxy()
    req = requests.get(href, headers=headers, proxies=ip, timeout=timeout)
    # No encoding is set: .content is the undecoded byte stream, and reading
    # apparent_encoding would run charset detection over the whole binary file
    # for nothing.
    return req.content


def _download_pdf(href, file):
    """Download *href* and store it at *file* without leaving partial output."""
    content = getContent(href)
    # Write to a sibling temp file and rename it into place, so an interrupted
    # write can never be mistaken for an already-collected announcement.
    tmp_file = file + '.part'
    with open(tmp_file, 'wb') as f:
        f.write(content)
    os.replace(tmp_file, file)


def doJob():
    """Download every fund announcement PDF and write an Excel index of them.

    For each code from getCodeList(), walks all announcement pages, records
    one index row per announcement, and saves the PDF under
    ``./市场板块/基金公告_2/<code>-<name>/<title>-<date>.pdf``. Announcements
    whose file already exists are indexed but not downloaded again. A failed
    download or write is logged and the crawl continues. The index is written
    to ``./市场板块/深圳交易所基金公告_2.xlsx`` at the end.
    """
    # NOTE(review): title/name are used verbatim in paths; characters that are
    # illegal in file names will make the save fail (logged below) — confirm
    # whether sanitising is wanted.
    base_dir = './市场板块/基金公告_2'
    os.makedirs(base_dir, exist_ok=True)
    columns = ['证券代码', '证券简称', '公告标题', '发布时间', '公告网址', '来源', '来源网址']
    info_list = []
    for code in getCodeList():
        pageSize = getPageSize(code)
        for page in range(1, pageSize + 1):
            for data in getDataList(code, page):
                title = data['title']
                name = data['secName'][0]
                pub_time = data['publishTime']
                href = 'http://www.szse.cn/api/disc/info/download?id=' + data['id']
                info_list.append(
                    [code, name, title, pub_time, href, '深圳交易所', 'http://www.szse.cn/disclosure/index.html']
                )

                folder = f'{base_dir}/{code}-{name}'
                os.makedirs(folder, exist_ok=True)
                file = f'{folder}/{title}-{pub_time[:10]}.pdf'
                # Check before downloading: nothing was fetched for this item,
                # so there is no request to throttle and no need to sleep.
                if os.path.exists(file):
                    log.info(f'{title}===已采集')
                    continue
                try:
                    _download_pdf(href, file)
                    log.info(f'{title}===成功')
                except Exception as e:
                    # Best effort: report the reason and move on so one bad
                    # file does not lose the rest of the crawl or the index.
                    log.error(f'第{page}页==={title}===失败==={e}')
                # Throttle between download requests.
                time.sleep(2)
    # Passing columns to the constructor also works for an empty result,
    # where assigning 7 names to a 0-column frame would raise.
    df = pd.DataFrame(info_list, columns=columns)
    df.to_excel('./市场板块/深圳交易所基金公告_2.xlsx', index=False)


if __name__ == '__main__':
    try:
        doJob()
    finally:
        # Always call baseCore.close(), even when doJob() raises.
        baseCore.close()
