"""
打开SEC网址——【FILINGS】——【Company Filing】——输入证券代码——选10-K和20-F为年报
1. 根据美股代码 拿到企业对应的cik
2. 根据cik 拼接链接拿到json数据
3. 遍历json数组文件 拼接详情链接
4. 解析详情文章 通过kafka发送数据

"""
import json
import re
import time
from urllib.parse import urljoin

from base.BaseCore import BaseCore
baseCore = BaseCore()
import requests
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from bs4 import BeautifulSoup
from kafka import KafkaProducer
# from selenium import webdriver
from base.BaseCore import BaseCore
# NOTE(review): BaseCore is already imported and instantiated a few lines
# above; this second instantiation rebinds the name — confirm whether the
# duplicate is intentional or a copy/paste leftover.
baseCore = BaseCore()
# Shared module-level handles used by the functions below.
log = baseCore.getLogger()
# cnx_ is used for commit() and cursor_ for execute()/fetchone() further down;
# presumably a DB-API connection/cursor pair owned by BaseCore — TODO confirm.
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
def paserUrl(html,listurl):
    """Make the link targets of ``<a>`` and ``<img>`` tags absolute.

    Args:
        html: Parsed document exposing ``find_all``; its tags must expose
            ``attrs`` and item access (a BeautifulSoup tree in this script).
        listurl: URL the document was fetched from, used as the join base.

    Returns:
        The same ``html`` object, modified in place.
    """
    for tag in html.find_all(['a', 'img']):
        # ``href`` wins: ``src`` is only rewritten when the tag has no href.
        for attr in ('href', 'src'):
            if attr in tag.attrs:
                tag[attr] = urljoin(listurl, tag[attr])
                break
    return html

def get_news(news_url,ip_dic):
    """Download one filing document from www.sec.gov and parse it.

    Args:
        news_url: Absolute URL of the filing's primary document.
        ip_dic: Proxy mapping in ``requests`` format. Currently unused (the
            request is sent without proxies); kept so callers keep working.

    Returns:
        A ``BeautifulSoup`` tree of the response body on HTTP 200, otherwise
        an empty string after the failure has been recorded.

    Raises:
        requests.RequestException: network errors are not caught here.

    Note:
        The failure branch reads the module-level names ``start_time``,
        ``social_code`` and ``taskType`` assigned by the ``__main__`` loop.
    """
    header = {
        'Host': 'www.sec.gov',
        'Connection': 'keep-alive',
        'sec-ch-ua': '"Not/A)Brand";v="99", "Google Chrome";v="115", "Chromium";v="115"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-User': '?1',
        'Sec-Fetch-Dest': 'document',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cookie': '_gid=GA1.2.385814648.1694135927; _ga_300V1CHKH1=GS1.1.1694135927.6.1.1694136598.0.0.0; _ga=GA1.1.733439486.1693211261; _4c_=%7B%22_4c_s_%22%3A%22dZJbj9owEIX%2FCvJDngj4EowTKaqqVKq20vbe7SMK9pBYC3HkGLwU8d9rQ%2Bh2V61fEn9z5vjInhPyLXSoIDzPCOMcYyHwFD3CcUDFCVmt4ueACqRqlinOcMprxtOsZos0ZwpSIYQUQi0WFDCaoqfgtcQ4F0vKCRX0PEWqu3lYUDDopnupE5xSHnS6d6MwpGEsx8Ez4%2BKmJYTzK4nam2WN%2Flm3%2FmZ1Kyxyxl9KIwnS3r4%2B9b9S2Y%2FSE5JGQTie5DMiZjjdDCGH%2BxVIJuI19NaovXQrd%2ByjzMN6MqjHUFBw0BJWXivXXvopfqYt6KZ1EeOLi4rZEAl%2FXnfK%2BNdtI%2F3TlrOoXVvjB4idVWvNDiaELAI24UXRz0tHDGthA9ZeZK1z%2FVDM59772QBy1pjDXDY6XetufjVLQTW1fSPNrq%2B7Y%2Fnh832yq51sy8HV1g2p165NNnoL3X5XJt9c7aBMKrPvnD2G%2FV1VJruj8R3YEp7kdq8gqaXTpisbcKNryDRoF29rzDCCMItXll7Zg45UTb5XXwP%2F%2BBf5Un26H9H7t6sfd%2B%2FCZslYxvJM8Fl8XkpIGEt0vr5umHlKaR5WFqbMuS0qBM9wXOfz%2BTc%3D%22%7D'
    }
    response = requests.get(url=news_url,headers=header,verify=False,timeout=30)
    if response.status_code != 200:
        # Request failed: print the status and record the failure.
        print('请求失败:', response.status_code, response.text)
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        # Record the URL that actually failed. The module-level ``url`` used
        # here before is the CIK search endpoint, not the filing address.
        baseCore.recordLog(social_code, taskType, state, takeTime, news_url, '请求失败')
        return ''
    return BeautifulSoup(response.content,'html.parser')


def spider(com_name,cik,up_okCount):
    """Collect the 10-K / 20-F filings of one company and publish them.

    Fetches ``https://data.sec.gov/submissions/CIK{cik}.json``, walks the
    ``filings.recent`` arrays, and for every 10-K / 20-F entry that is not yet
    in ``brpa_source_article`` downloads the primary document, sends it to the
    ``researchReportTopic`` Kafka topic and records the URL in the table.

    Args:
        com_name: Company name used to build the article title.
        cik: CIK string; interpolated as-is into the submissions URL and the
            Archives path.
        up_okCount: Starting value of the "newly stored" counter.

    Returns:
        ``up_okCount`` plus the number of filings that were sent and recorded
        during this call. Existing callers that ignore the result are
        unaffected.

    Note:
        Reads the module-level names ``social_code``, ``start_time`` and
        ``taskType`` assigned by the ``__main__`` loop, plus the shared
        ``log``, ``cursor_``, ``cnx_`` and ``baseCore`` handles.
    """
    header = {
        'Host':'data.sec.gov',
        'Connection':'keep-alive',
        'Pragma':'no-cache',
        'Cache-Control':'no-cache',
        'sec-ch-ua':'"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
        'sec-ch-ua-mobile':'?0',
        'sec-ch-ua-platform':'"Windows"',
        'Upgrade-Insecure-Requests':'1',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Sec-Fetch-Site':'none',
        'Sec-Fetch-Mode':'navigate',
        'Sec-Fetch-User':'?1',
        'Sec-Fetch-Dest':'document',
        'Accept-Encoding':'gzip, deflate, br',
        'Accept-Language':'zh-CN,zh;q=0.9',
        'Cookie':'_4c_=%7B%22_4c_s_%22%3A%22fVPLbtswEPwVg2fLJimSonwrUqDoIS1apO0xYMi1JcSRBIqx4hr%2B9%2B5act6pLiKHM8PR7urAhgoathKmzFWhpZFG2Dm7hX3PVgcW60CvHVsx4Zz2XOiMB6czJUXIrHZlBrAuxFob73PP5uwBvQoupNJalIXUxznz3eRxYL4NQF7lQtgFz9Y9KtJfRJTluOxiG%2B59uk77jmgD3Mz6cIsHAXa1h%2BuhDqkifW7ME1pBvakSwoWxhHaRKLga6ia0w2vVhD6qjCoRvYnt0AMpL6rY3sFMCCK3WAb256SgrBHWEOOJhru%2BThSzB7%2FYtLsJwNKNWDZiv2tCw%2Bzq4ifi354hPy6%2BX05QRxXOcbFtvduSKTZlzr58uv719fMpellqxctcLk6dMqUUVLD7uMXTKqWuXy2XwzAspjBLCBsXlz246Ktx7du7zjX7EUItNHRpFwMFB5%2FqthmD4%2F4q1psNxEtIVYsTgHsXamK4LVWYiBEC9PWGYgYqI%2B5uU9s9wsdxFjCtNsIYrqXEXifMa43i9BzH7z6NRv7E1kZyYXnxlj32KKPaQvMfqX0rDbA%2BD7IFl6t1YTLBwWaqUDIrC5Nn%2FMaALVTgXjj20lNK855nc7Z8Voun%2BbcoKxTy6i5NxKl3luc8z19yCSHu2dKxd8%2FjcLY6HyhFP%2BzDK4RqG1%2Ff%2BgH1ePwH%22%7D; _ga_300V1CHKH1=GS1.1.1694142118.3.0.1694142118.0.0.0; _ga=GA1.2.1399540932.1693469210; _gid=GA1.2.1824845345.1694142136; ak_bmsc=CB437E1B69906A01E58692EFBAA8A225~000000000000000000000000000000~YAAQ8BQgFyY6AFaKAQAAbKy9chWzUG2FvPYSvQ1oaw2RdgKemipNBxwFJPC71bps8Pe4B7LG80Yn8Gg+yVD84WX1d+lVZqdaPr8pbsd3N8NWzwiWUcN7PSoKK1Ej/G2WgOv8Nl0s2E8E8x/5XVYtGyFwKSl5mUGNsfsL4WYI++6imjaYHtyTDxtmKhvnWHMwXCMiJgqvRCr9yf5CeXKJuhpRrSZV/GZa8qlDr5PmF1LPu2RKv1jNRfLqq+BKaO4jKN8ETA0RUxhvXEpI1cc0bxFp9t/mD6iTVhzbxJ17qiBn9DLPcXoX1yheRONu9M//SyeHfETezU2RagRHONIPZXB2oN/8Qlu+Rjz9NIZk532RTj0qCSRu48EH8nmYFcwvGXb8YNhotygum3P+ELZSCzlgolFBQp+qciKBTsuJ3JL99/HMDHO9OyheN5yw6RH/hu6/xVW95acmV925q/yjoXITR+mcZWkrH4iRncHGQmwWQR+d+pNqeBYUNNm2; bm_sv=2C2708DF01ED851C6C481514DDA7F381~YAAQ8BQgF409AFaKAQAAFsm9chW4u/u6J8XhmAzFpGSqZr1ktVU8veuhu+tJ9h+G3Lf52nquY6mUDlkG1ZBMRAkAB3WCPBGWiKSbGR6sB29QOE9LOosBZKzL742Z5a0k6rOWyoByvjl75i7j68RIqGt0h87YwwLLqnH6gx6H0uqCkg+J405BKwHjvVhnQOF3eAD5CCbaJY5GQdS8bKDjOaX7e1WVr5aqdlNdEciyrs9hxhPZSPLLXuCFIDH+~1'
    }
    ip_dic = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
    # Production URL (a fixed test CIK used to be substituted here).
    url_json = f'https://data.sec.gov/submissions/CIK{cik}.json'

    # Fetch the submissions index, retrying up to 4 times on network errors.
    req = None
    for _ in range(4):
        try:
            req = requests.get(url=url_json,headers=header,proxies=ip_dic,verify=False,timeout=30)
            break
        except requests.RequestException:
            time.sleep(2)

    data = None
    if req is not None:
        try:
            data = req.json()
        except ValueError as e:
            # Body was not JSON (e.g. an HTML error page).
            log.error(f'{social_code}----{url_json}----{e}')
    if data is None:
        # Every attempt failed or the body was unusable: requeue the company.
        baseCore.rePutIntoR('AnnualEnterprise:usqy_socialCode',social_code)
        return up_okCount

    info = data['filings']['recent']
    form_type_list = info['form']
    accessionNumber_list = info['accessionNumber']
    primaryDocument_list = info['primaryDocument']
    filingDate_list = info['filingDate']

    for idx, form in enumerate(form_type_list):
        # Only annual reports are collected.
        if form not in ('10-K', '20-F'):
            continue
        # The index must be formatted into the message; passing it as a second
        # positional argument makes logging try "%"-formatting and fail.
        log.info(f'{form}----{idx}')
        accessionNumber = accessionNumber_list[idx]
        # Publication date of the filing.
        filingDate = filingDate_list[idx]
        year = filingDate[:4]
        news_url = ('https://www.sec.gov/Archives/edgar/data/' + cik + '/'
                    + accessionNumber.replace('-','') + '/' + primaryDocument_list[idx])

        # Skip filings that were already stored for this company.
        try:
            sel_sql = '''select social_credit_code from brpa_source_article where source_address = %s and social_credit_code=%s and type='1' '''
            cursor_.execute(sel_sql, (news_url, social_code))
        except Exception as e:
            print(e)
        selects = cursor_.fetchone()
        if selects:
            log.info(f'{cik}-----{social_code}----{news_url}:已经存在')
            # "continue" suits a full crawl: older filings are still visited.
            continue

        soup = get_news(news_url,ip_dic)
        if not soup:
            continue
        # Convert relative links to absolute ones.
        soup = paserUrl(soup,news_url)
        content = soup.text.strip()

        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        title = f'{com_name}:{year}年年度报告'

        log.info(f'---{title}----采集完成----发送数据----')
        dic_news = {
            'attachmentIds': '',
            'author': '',
            'content': content,
            'contentWithTag': str(soup),
            'createDate': time_now,
            'deleteFlag': '0',
            'id': '',
            'keyWords': '',
            'lang': 'zh',
            'origin': 'SEC美国证券交易委员会',
            'publishDate': filingDate,
            'sid': '1684032033495392257',
            'sourceAddress': news_url,  # link to the original document
            'summary': '',
            'title': title,
            'type': 1,
            # NOTE(review): sent empty although social_code is known — confirm
            # this is what the consumer expects.
            'socialCreditCode': '',
            'year': year
        }
        # Publish the record through Kafka, then remember the URL in the DB.
        producer = None
        try:
            producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'],compression_type='gzip',batch_size=1638400,linger_ms=1,buffer_memory=33445532*2,max_request_size=8388608)
            kafka_result = producer.send("researchReportTopic",
                                         json.dumps(dic_news, ensure_ascii=False).encode('utf8'))

            log.info(kafka_result.get(timeout=10))

            dic_result = {
                'success': 'ture',
                'message': '操作成功',
                'code': '200',
            }
            log.info(dic_result)
            try:
                insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,create_time) values(%s,%s,%s,%s,now())'''
                list_info = [
                    social_code,
                    news_url,
                    'SEC',
                    '1',
                ]
                cursor_.execute(insert_sql, tuple(list_info))
                cnx_.commit()
                # Count the filing only once it has actually been recorded.
                up_okCount = up_okCount + 1
                log.info(f'{social_code}----{news_url}:新增一条')
            except Exception as e:
                log.error(f'传输失败:{social_code}----{news_url}-----{e}')
                state = 0
                takeTime = baseCore.getTimeCost(start_time, time.time())
                baseCore.recordLog(social_code, taskType, state, takeTime, news_url, '数据库传输失败')
        except Exception as e:
            dic_result = {
                'success': 'false',
                'message': '操作失败',
                'code': '204',
                'e': e
            }
            log.info(f'{dic_result}---{e}')
        finally:
            # A producer is created per filing; close it so its background
            # thread and sockets are released instead of piling up.
            if producer is not None:
                producer.close()

    return up_okCount

def getrequest(social_code,url,headers,data):
    """POST a search payload through the local proxy and return the JSON reply.

    Args:
        social_code: Company identifier, used only when recording a failure.
        url: Endpoint to POST to.
        headers: Request headers.
        data: Already-serialised request body.

    Returns:
        The decoded JSON body on HTTP 200, otherwise an empty string after the
        failure has been recorded.

    Raises:
        requests.RequestException: network errors and timeouts are not caught.

    Note:
        The failure branch reads the module-level names ``start_time`` and
        ``taskType`` assigned by the ``__main__`` loop.
    """
    ip_dic = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
    # Without a timeout a stalled proxy or server would block this call
    # forever; 30 s matches the other requests in this file.
    response = requests.post(url=url, headers=headers, data=data ,proxies=ip_dic, timeout=30)
    response.encoding = response.apparent_encoding
    if response.status_code != 200:
        # Request failed: print the status and record the failure.
        print('请求失败:', response.status_code, response.text)
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, url, '请求失败')
        return ''
    return response.json()

def getCIK(social_code,code):
    """Resolve a ticker symbol to its CIK via the search endpoint.

    Posts ``code`` to the module-level ``url`` with the module-level
    ``headers`` (both assigned by the ``__main__`` block) and picks the first
    hit whose ``_source.tickers`` list contains exactly ``code``.

    Args:
        social_code: Company identifier used for logging and ``updateCIK``.
        code: Ticker symbol to look up.

    Returns:
        The CIK left-padded with zeros to 10 digits, or ``''`` when the
        request failed, returned no hits, or no hit matched the ticker.
    """
    cik = ''
    payload = {"keysTyped":f"{code}","narrow":True}
    data = json.dumps(payload)
    result = getrequest(social_code,url,headers,data)
    # getrequest returns '' on a non-200 reply; treat that as "no hits"
    # instead of failing on ''['hits'].
    tickers = result['hits']['hits'] if result else []
    if not tickers:
        log.error(f'{code}....{social_code}....无hits')
        return cik
    # Pick the hit that belongs to this company by matching the ticker.
    for ticker in tickers:
        try:
            symbols = ticker['_source']['tickers'].split(', ')
        except (KeyError, TypeError, AttributeError):
            # Hit without a usable tickers field.
            continue
        if code not in symbols:
            continue
        cik = ticker['_id']
        if len(cik) < 10:
            cik = format(int(cik),'0>10d')
        # Persist and stop for every match; previously an id that was already
        # 10 characters long was returned without being stored.
        baseCore.updateCIK(social_code,cik)
        break
    if cik == '':
        log.error(f'{code}....{social_code}....无CIK')
    else:
        log.info(f'{code}....{social_code}....cik为{cik}')
    return cik


if __name__ == '__main__':
    # ``headers``, ``url``, ``taskType``, ``start_time`` and ``social_code``
    # are read as module globals by the functions above, so they must keep
    # these names and stay at module level.
    # NOTE(review): 'accept-encoding', 'accept-language' and 'user-agent'
    # look like they lost their commas — confirm the values are intended.
    headers = {
        'authority': 'efts.sec.gov',
        'method': 'POST',
        'path': '/LATEST/search-index',
        'scheme': 'https',
        'accept': '*/*',
        'accept-encoding': 'gzip deflate br',
        'accept-language': 'zh-CNzh;q=0.9en;q=0.8',
        'content-length': '34',
        'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'origin': 'https://www.sec.gov',
        'referer': 'https://www.sec.gov/',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-site',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/80.0.3987.116 Safari/537.36'
    }
    url = 'https://efts.sec.gov/LATEST/search-index'
    num = 0
    taskType = '企业年报/SEC'
    while True:
        start_time = time.time()
        # The queue pull is disabled and a fixed company id is used instead:
        # social_code = baseCore.redicPullData('AnnualEnterprise:usqy_socialCode')
        social_code = 'ZZSN230912210643024'
        # Nothing to process (None, '' or the literal 'None'): wait and retry.
        if not social_code or social_code == 'None':
            time.sleep(20)
            continue

        dic_info = baseCore.getInfomation(social_code)
        com_name, code, cik, count = dic_info[1], dic_info[3], dic_info[13], dic_info[15]

        # Work out whether the company can be processed; ``failure`` carries
        # the reason that gets recorded when it cannot.
        failure = None
        if code is None:
            failure = '股票代码为空'
        elif cik is None:
            cik = getCIK(social_code,code)
            if cik == '':
                failure = 'cik为空'
        if failure is not None:
            baseCore.recordLog(social_code, taskType, 0,
                               baseCore.getTimeCost(start_time, time.time()), '', failure)
            continue

        up_okCount = 0
        try:
            spider(com_name,cik,up_okCount)
        except Exception as e:
            log.error(f'{social_code}----{e}--')
        # Single pass only while the company id above is hard-coded.
        break







