import json
import re
import time

import numpy as np
import pandas as pd
import requests
import os
from base import BaseCore

# Request headers sent with queries to the SSE JSONP API (host query.sse.com.cn).
headers = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Host': 'query.sse.com.cn',
    'Pragma': 'no-cache',
    'Referer': 'http://www.sse.com.cn/',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
    ),
}

# Shared project helper object; this module uses it for proxies and logging.
baseCore = BaseCore.BaseCore()

# Module-wide logger obtained from the helper.
log = baseCore.getLogger()


# Fetch JSON data
def getJson(url, timeout=30):
    """Fetch a JSONP response from *url* and return the decoded JSON payload.

    The request is sent through a proxy obtained from ``baseCore.get_proxy()``
    using the module-level ``headers``.

    Args:
        url: Address of a JSONP endpoint whose body looks like
            ``callback({...})``.
        timeout: Timeout in seconds handed to ``requests.get`` so that a
            stalled connection raises instead of hanging the crawl.

    Returns:
        The object produced by ``json.loads`` on the text between the first
        ``(`` and the last ``)`` of the response body.

    Raises:
        requests.RequestException: On connection errors, timeouts or a
            non-2xx HTTP status.
        ValueError: If the body contains no parenthesised payload, or the
            payload is not valid JSON (``json.JSONDecodeError`` is a
            ``ValueError`` subclass).
    """
    ip = baseCore.get_proxy()
    req = requests.get(url, headers=headers, proxies=ip, timeout=timeout)
    try:
        # Fail early on HTTP errors rather than trying to parse an error page.
        req.raise_for_status()
        req.encoding = req.apparent_encoding
        text = req.text
    finally:
        # Release the connection even when the status check or decoding fails.
        req.close()
    # Raw string avoids the invalid "\(" escape; DOTALL lets the payload span
    # several lines.
    match = re.search(r'\((.*)\)', text, re.DOTALL)
    if match is None:
        raise ValueError(f'no JSONP payload found in response from {url}')
    return json.loads(match.group(1))


# Get the total number of pages
def getTotal():
    """Return the page count reported by the SSE REITs bulletin query.

    Requests page 1 of the listing through ``getJson`` and converts the
    ``pageHelp.pageCount`` field of the payload to ``int``.
    """
    # Current Unix time in whole seconds, sent as the "_" query parameter.
    stamp = int(time.time())
    first_page_url = (
        'http://query.sse.com.cn/commonSoaQuery.do?jsonCallBack=jsonpCallback42283'
        '&sqlId=REITS_BULLETIN&isPagination=true&fundCode=&startDate=&endDate='
        '&pageHelp.pageSize=25&pageHelp.cacheSize=1&pageHelp.pageNo=1'
        f'&pageHelp.beginPage=1&pageHelp.endPage=5&_={stamp}'
    )
    page_help = getJson(first_page_url)['pageHelp']
    return int(page_help['pageCount'])


# Get the basic information of the PDF files
def getDataList(page):
    """Return the announcement records listed on result page *page*.

    Each record is a list ``[title, pub_time, href, name, code]`` built from
    the ``title``, ``sseDate``, ``url``, ``fundExtAbbr`` and ``securityCode``
    fields of one entry in the payload's ``result`` list. ``href`` is the
    entry's ``url`` with backslashes removed, prefixed with
    ``http://www.sse.com.cn``.
    """

    def to_record(entry):
        # Fields are read in the same order as before, so a missing key
        # surfaces as the same KeyError.
        name, title = entry['fundExtAbbr'], entry['title']
        pub_time, code = entry['sseDate'], entry['securityCode']
        href = 'http://www.sse.com.cn' + entry['url'].replace('\\', '')
        return [title, pub_time, href, name, code]

    # Current Unix time in whole seconds, sent as the "_" query parameter.
    stamp = int(time.time())
    query_url = (
        'http://query.sse.com.cn/commonSoaQuery.do?jsonCallBack=jsonpCallback42283'
        '&sqlId=REITS_BULLETIN&isPagination=true&fundCode=&startDate=&endDate='
        '&pageHelp.pageSize=25&pageHelp.cacheSize=1'
        f'&pageHelp.pageNo={page}&pageHelp.beginPage={page}&pageHelp.endPage=5&_={stamp}'
    )
    entries = getJson(query_url)['result']
    return [to_record(entry) for entry in entries]


# Get the PDF file as a byte stream
def getContent(href, timeout=30):
    """Download the file at *href* and return its raw bytes.

    The request is sent through a proxy obtained from ``baseCore.get_proxy()``
    with headers targeting the ``www.sse.com.cn`` host.

    Args:
        href: Absolute URL of the file to download.
        timeout: Timeout in seconds handed to ``requests.get`` so that a
            stalled download raises instead of hanging the crawl.

    Returns:
        The response body as ``bytes``.

    Raises:
        requests.RequestException: On connection errors, timeouts or a
            non-2xx HTTP status.
    """
    ip = baseCore.get_proxy()
    # Local headers: same browser profile as the module-level ones, but for
    # the www.sse.com.cn host that serves the files.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Host': 'www.sse.com.cn',
        'Pragma': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
    }
    req = requests.get(href, headers=headers, proxies=ip, timeout=timeout)
    try:
        # Without this check an HTTP error page would be returned to the
        # caller and stored as if it were the PDF.
        req.raise_for_status()
        # ``content`` is the undecoded body, so no text encoding is set here;
        # charset detection over a whole binary file would be wasted work.
        return req.content
    finally:
        # Release the connection even when the status check fails.
        req.close()


def _safeName(text):
    """Return *text* with path separators replaced by ``_``.

    Values taken from the remote listing are used as single path components;
    a separator inside them would otherwise be read as an extra directory
    level and make the save fail.
    """
    separators = os.sep + (os.altsep or '')
    return text.translate({ord(ch): '_' for ch in separators})


def _savePdf(folder, filename, content):
    """Write the bytes *content* to ``folder/filename``, creating *folder* if missing."""
    os.makedirs(folder, exist_ok=True)
    with open(f'{folder}/{filename}', 'wb') as f:
        f.write(content)


def doJob():
    """Download every listed announcement PDF and write an Excel index.

    Walks all result pages reported by ``getTotal``. For each record returned
    by ``getDataList`` it:

    * appends a row to the index (this happens before the download, so the
      index also lists announcements whose download or save failed);
    * downloads the file with ``getContent``;
    * stores it as ``./公告_2/{code}-{name}/{title}-{pub_time}.pdf``. A file
      with the same title and date is overwritten.

    Download and save failures are logged and the crawl continues with the
    next record. Finally the index is written to ``./上海交易所信息披露.xlsx``.
    """
    columns = ['公募REITs代码', '扩位简称', '公告标题', '披露日期', '公告网址', '来源', '来源网址']
    data_list = []
    total = getTotal()
    for page in range(1, total + 1):
        for title, pub_time, href, name, code in getDataList(page):
            data_list.append([code, name, title, pub_time, href, '上海交易所', 'http://www.sse.com.cn/reits/announcements/'])
            try:
                content = getContent(href)
            except Exception as e:
                # Best-effort crawl: record the cause and move on.
                # ``Exception`` (not a bare except) lets Ctrl-C and
                # SystemExit stop the job.
                log.error(f'第{page}页==={title}===连接失败: {e}')
                continue
            folder = './公告_2/' + _safeName(f'{code}-{name}')
            filename = _safeName(f'{title}-{pub_time}.pdf')
            try:
                _savePdf(folder, filename, content)
            except (OSError, ValueError) as e:
                # OSError covers filesystem failures; ValueError covers paths
                # that open() rejects outright (e.g. an embedded NUL).
                log.error(f'第{page}页==={title}===保存失败: {e}')
            else:
                log.info(f'{title}===成功')
    # Passing the column names to the constructor also works when no records
    # were collected; assigning them afterwards to an empty frame raises.
    df = pd.DataFrame(data_list, columns=columns)
    df.to_excel('./上海交易所信息披露.xlsx', index=False)


if __name__ == '__main__':
    try:
        doJob()
    finally:
        # Always close the shared helper, even when the crawl aborts with an
        # exception.
        baseCore.close()
