import re
from datetime import datetime
from elasticsearch import Elasticsearch
import redis
import requests
from bs4 import BeautifulSoup
from base import BaseCore

# Project-local helper object; used below to obtain the logger.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
# Elasticsearch client used by updateaunn() to apply document updates.
# NOTE(review): host and credentials are hard-coded in source — consider
# loading them from configuration or the environment instead.
es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300)
# Name of the index whose documents this script updates.
index_name = 'researchreportdata'


def get_news(news_url, ip_dic):
    """Download a filing page and return it parsed as HTML.

    The request is sent with browser-like headers (the ``Host`` header is
    fixed to ``www.sec.gov``) through the proxies given in ``ip_dic``.

    Args:
        news_url: URL of the page to download.
        ip_dic: Proxy mapping passed to ``requests.get`` as ``proxies``.

    Returns:
        A ``BeautifulSoup`` document when the server answers with HTTP 200,
        otherwise the empty string ``''``. Network-level failures (timeouts,
        proxy/connection errors) are reported the same way, so callers can
        treat any falsy result as "fetch failed" and retry.
    """
    header = {
        'Host': 'www.sec.gov',
        'Connection': 'keep-alive',
        'sec-ch-ua': '"Not/A)Brand";v="99", "Google Chrome";v="115", "Chromium";v="115"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-User': '?1',
        'Sec-Fetch-Dest': 'document',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cookie': '_gid=GA1.2.385814648.1694135927; _ga_300V1CHKH1=GS1.1.1694135927.6.1.1694136598.0.0.0; _ga=GA1.1.733439486.1693211261; _4c_=%7B%22_4c_s_%22%3A%22dZJbj9owEIX%2FCvJDngj4EowTKaqqVKq20vbe7SMK9pBYC3HkGLwU8d9rQ%2Bh2V61fEn9z5vjInhPyLXSoIDzPCOMcYyHwFD3CcUDFCVmt4ueACqRqlinOcMprxtOsZos0ZwpSIYQUQi0WFDCaoqfgtcQ4F0vKCRX0PEWqu3lYUDDopnupE5xSHnS6d6MwpGEsx8Ez4%2BKmJYTzK4nam2WN%2Flm3%2FmZ1Kyxyxl9KIwnS3r4%2B9b9S2Y%2FSE5JGQTie5DMiZjjdDCGH%2BxVIJuI19NaovXQrd%2ByjzMN6MqjHUFBw0BJWXivXXvopfqYt6KZ1EeOLi4rZEAl%2FXnfK%2BNdtI%2F3TlrOoXVvjB4idVWvNDiaELAI24UXRz0tHDGthA9ZeZK1z%2FVDM59772QBy1pjDXDY6XetufjVLQTW1fSPNrq%2B7Y%2Fnh832yq51sy8HV1g2p165NNnoL3X5XJt9c7aBMKrPvnD2G%2FV1VJruj8R3YEp7kdq8gqaXTpisbcKNryDRoF29rzDCCMItXll7Zg45UTb5XXwP%2F%2BBf5Un26H9H7t6sfd%2B%2FCZslYxvJM8Fl8XkpIGEt0vr5umHlKaR5WFqbMuS0qBM9wXOfz%2BTc%3D%22%7D'
    }

    try:
        # NOTE(review): verify=False disables TLS certificate validation;
        # kept as in the original because the request goes through a proxy.
        response = requests.get(url=news_url, headers=header, verify=False, proxies=ip_dic, timeout=30)
    except requests.RequestException as e:
        # Timeouts and proxy/connection errors follow the same failure
        # contract as a non-200 answer instead of aborting the whole run.
        print('请求失败:', e)
        return ''

    if response.status_code != 200:
        # Request failed: report the status and body, signal failure to caller.
        print('请求失败:', response.status_code, response.text)
        return ''

    # Request succeeded: hand back the parsed document.
    return BeautifulSoup(response.content, 'html.parser')


def updateaunn(index_name, id, publishDate: str = None, year:str = None, title:str = None):
    """Send a partial-document update to Elasticsearch and log the response.

    ``publishDate`` is always written. ``year`` and ``title`` are written
    only when ``title`` is truthy; otherwise ``year`` is ignored.

    Args:
        index_name: Index containing the document.
        id: Identifier of the document to update.
        publishDate: New value for the ``publishDate`` field.
        year: New value for the ``year`` field (used only with ``title``).
        title: New value for the ``title`` field.
    """
    # Start from the field that is always updated, then add the optional pair.
    fields = {'publishDate': publishDate}
    if title:
        fields['year'] = year
        fields['title'] = title

    response = es.update(index=index_name, id=id, body={'doc': fields})
    log.info('更新结果:%s' % response)


def resolve_fiscal_year(raw_date, title):
    """Normalise the date text of a filing and derive the matching year/title.

    Args:
        raw_date: Date text in English "Month day, year" form, e.g.
            ``"January 31, 2023"``. Surrounding whitespace is ignored.
        title: Report title containing a ``<4 digits>年`` year marker.

    Returns:
        A tuple ``(publish_date, new_year, new_title)`` where ``publish_date``
        is formatted as ``YYYY-MM-DD``. When the month is December the year
        needs no correction: ``new_year`` is ``None`` and ``new_title`` is the
        unchanged ``title``. Otherwise ``new_year`` is the previous calendar
        year (as a string) and ``new_title`` has its first year marker
        replaced by that year.

    Raises:
        ValueError: If ``raw_date`` cannot be parsed, or a year correction is
            needed but ``title`` contains no ``<4 digits>年`` marker.
    """
    parsed = datetime.strptime(raw_date.strip(), '%B %d, %Y')
    publish_date = parsed.strftime('%Y-%m-%d')
    if parsed.month == 12:
        return publish_date, None, title

    new_year = str(parsed.year - 1)
    numbers = re.findall(r'\d{4}年', title)
    if not numbers:
        raise ValueError('no year marker found in title: %r' % title)
    return publish_date, new_year, title.replace(numbers[0], new_year + '年')


def main():
    """Drain the Redis queue and fix the publish date/year of each report.

    Each queue entry has the form ``id|title|url|year|socialCode``, e.g.
    ``23101317365|MARSH & MCLENNAN COMPANIES, INC.:2021年年度报告|https://...htm|2021|ZZSN230711140539905``.
    Entries whose page cannot be fetched are pushed to
    ``NianbaoUS:request_error``; entries that fail during parsing or updating
    are pushed to ``NianbaoUS:upodate_error``; successes are recorded in
    ``NianbaoUS:success``.
    """
    # NOTE(review): connection details are hard-coded — consider configuration.
    r = redis.Redis(host='114.116.90.53', port=6380, password='clbzzsn', db=6)
    # Loop-invariant proxy settings, built once instead of once per item.
    ip_dic = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
    max_attempts = 10

    while True:
        item = r.lpop('NianbaoUS:id')
        if not item:
            break
        item = item.decode()

        report_id, title, news_url, year, socialCode = item.split("|")[:5]

        # Bounded retry: give up after max_attempts failed downloads instead
        # of looping forever on a URL that never succeeds.
        result = ''
        for _ in range(max_attempts):
            result = get_news(news_url, ip_dic)
            if result:
                break
        if not result:
            # Record the report whose page could not be fetched.
            r.lpush('NianbaoUS:request_error', item)
            continue

        try:
            raw_date = result.find('ix:nonnumeric', attrs={'format': 'ixt:date-monthname-day-year-en'}).text
            print(raw_date)
            publishDate, new_year, title = resolve_fiscal_year(raw_date, title)
            if new_year is not None:
                # Year differs from the stored one: update date, year and title.
                updateaunn(index_name, report_id, publishDate, new_year, title)
            else:
                # Year is already correct; only the date needs updating.
                updateaunn(index_name=index_name, id=report_id, publishDate=publishDate)

            # Record the successfully updated report.
            r.lpush('NianbaoUS:success', f"{report_id}|{title}|{publishDate}|{socialCode}|{year}")
        except Exception as e:
            # Top-level boundary: keep draining the queue, but leave a trace
            # of what went wrong. The key spelling is kept as-is because
            # other consumers may read this list.
            log.error('update failed for %s: %s' % (item, e))
            r.lpush('NianbaoUS:upodate_error', item)


if __name__ == "__main__":
    main()