import json
import time

import requests
from bs4 import BeautifulSoup
from base.BaseCore import BaseCore
# Shared project helper instance; later in this script its writerToExcel()
# method is called to persist each scraped record.
baseCore = BaseCore()
# Logger obtained from the helper; used for all progress messages below.
log = baseCore.getLogger()
from urllib import parse

headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Cookie': 'qgqp_b_id=92f470109c2462c6c6aa5115d15f7b35; emshistory=%5B%22sz007sz%22%5D; qRecords=%5B%7B%22name%22%3A%22%u5929%u98CE%u8BC1%u5238%22%2C%22code%22%3A%22SH601162%22%7D%5D; HAList=ty-1-601766-%u4E2D%u56FD%u4E2D%u8F66%2Cty-116-03690-%u7F8E%u56E2-W%2Cty-0-002828-%u8D1D%u80AF%u80FD%u6E90%2Cty-1-601162-%u5929%u98CE%u8BC1%u5238%2Cty-0-000001-%u5E73%u5B89%u94F6%u884C%2Cty-0-002070-%u4F17%u548C%u9000%2Cty-1-600723-%u9996%u5546%u80A1%u4EFD%2Cty-0-300106-%u897F%u90E8%u7267%u4E1A%2Cty-116-00992-%u8054%u60F3%u96C6%u56E2%2Cty-0-300362-%u5929%u7FD4%u9000; st_si=06778540617554; st_pvi=44810095342512; st_sp=2023-07-18%2013%3A55%3A09; st_inirUrl=https%3A%2F%2Fwww.baidu.com%2Flink; st_sn=1; st_psi=20230901152305423-113300304201-4533354410; st_asi=delete',
'Host': 'data.eastmoney.com',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.62',
'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
# Output workbook that every scraped report record is written to.
file_name = '研报--产业链.xlsx'

# Search settings for the Eastmoney research-report query.
_KEYWORD = '产业链'
_PAGE_SIZE = 10
_SEARCH_API = 'https://search-api-web.eastmoney.com/search/jsonp'
# Seconds to wait on any single HTTP request before giving up.
_REQUEST_TIMEOUT = 30


def _build_search_url(page):
    """Return the search-API URL for one page of research-report results.

    Args:
        page: Page index to request from the search API.

    Returns:
        The full URL with the URL-quoted JSON ``param`` payload and a
        millisecond timestamp as the ``_`` query argument.
    """
    param = {
        "uid": "",
        "keyword": _KEYWORD,
        "type": ["researchReport"],
        "client": "web",
        "clientVersion": "curr",
        "clientType": "web",
        "param": {
            "researchReport": {
                "client": "web",
                "pageSize": _PAGE_SIZE,
                "pageIndex": page,
            }
        },
    }
    # Serialise as real, compact JSON. str(dict) would produce Python repr
    # (single quotes), and stripping spaces afterwards could also damage
    # values that legitimately contain spaces.
    param_json = json.dumps(param, ensure_ascii=False, separators=(',', ':'))
    param_url = parse.quote(param_json)
    t = int(time.time() * 1000)
    return f'{_SEARCH_API}?cb=&param={param_url}&_={t}'


def _fetch_report_list(page):
    """Fetch one page of search results and return the report entries.

    Args:
        page: Page index to request from the search API.

    Returns:
        The list found under ``result.researchReport`` in the response, or
        an empty list when that part of the response is missing or empty.

    Raises:
        requests.RequestException: The HTTP request failed or timed out.
        ValueError: The response body was not valid JSON.
    """
    resp = requests.get(_build_search_url(page), timeout=_REQUEST_TIMEOUT)
    body = resp.text.strip()
    # With an empty ``cb`` the payload comes back wrapped as "(...)"; only
    # strip the wrapper when it is actually there.
    if body.startswith('(') and body.endswith(')'):
        body = body[1:-1]
    result = json.loads(body).get('result') or {}
    return result.get('researchReport') or []


def _fetch_report_detail(one_news):
    """Download one report's detail page and build its output record.

    Args:
        one_news: A search-result entry; its ``stockName``, ``title``,
            ``date`` and ``code`` fields are read.

    Returns:
        A dict with the company name, cleaned title, body text, attachment
        link, publish date and tagged HTML of the body, or ``None`` when the
        page could not be fetched or lacks the expected elements.
    """
    com_name = one_news['stockName']
    # The search API marks keyword hits with <em> tags; drop them.
    title = str(one_news['title']).replace('<em>', '').replace('</em>', '')
    date = one_news['date'][:10]
    code = one_news['code']
    href = f'https://data.eastmoney.com/report/zw_stock.jshtml?infocode={code}'

    try:
        # NOTE(review): verify=False disables TLS certificate checking; kept
        # from the original script, but confirm it is really required.
        req = requests.get(href, headers=headers, verify=False,
                           timeout=_REQUEST_TIMEOUT)
    except requests.RequestException as err:
        log.error(f'-----{href}----- request failed: {err}')
        return None

    soup = BeautifulSoup(req.content, 'html.parser')
    content = soup.find('div', class_='newsContent')
    try:
        # The attachment link is taken from the fifth <span> of the
        # report-infos block.
        pdf_url = (soup.find('div', class_='report-infos')
                   .find_all('span')[4].find('a')['href'])
    except (AttributeError, IndexError, TypeError, KeyError):
        # A missing block, span, anchor or href means this page cannot be used.
        log.info(f'-----{href}-----')
        return None
    if content is None:
        # Without the body element there is nothing to store.
        log.info(f'-----{href}-----')
        return None

    return {
        '公司名称': com_name,
        '标题': title,
        '正文': content.text.strip(),
        '附件链接': pdf_url,
        '发布时间': date,
        'contentwithTag': content.prettify(),
    }


for page in range(100, 101):
    log.info(f'----开始采集第{page}页-------')
    try:
        list_all = _fetch_report_list(page)
    except (requests.RequestException, ValueError) as err:
        # One bad page should not abort the whole run.
        log.error(f'search request for page {page} failed: {err}')
        continue

    # enumerate keeps the item number in step even when an item is skipped.
    for num, one_news in enumerate(list_all, start=1):
        log.info(f'---------开始采集第{num}条---------')
        dic_info = _fetch_report_detail(one_news)
        if dic_info is None:
            continue
        log.info(dic_info['附件链接'])
        # Each record is written on its own, as a single-element list.
        baseCore.writerToExcel([dic_info], file_name)
