#coding=utf-8

import datetime
import json
import time

import pymysql
import requests
from kafka import KafkaProducer

from smart_extractor import SmartExtractor
from bs4 import BeautifulSoup
from gne import GeneralNewsExtractor
from langid import langid
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from base import BaseCore
# Shared project helper from the local `base` package; it supplies the logger
# and the `r` client used by the rest of this module.
basecore = BaseCore.BaseCore()
log = basecore.getLogger()
# `r` is used below through rpush()/lpop() on the 'tradview_ticker' list —
# presumably a Redis client; confirm against BaseCore.
r = basecore.r


def reqmsg(url):
    """Request a JSON endpoint and return the decoded payload.

    The call goes through the local proxy at 127.0.0.1:1080 with TLS
    verification switched off, and is attempted at most three times.

    :param url: endpoint to GET.
    :return: the decoded JSON of the first attempt that yields a truthy
             payload; otherwise the result of the last attempt (``''`` when
             that attempt raised).
    """
    request_headers = {
        'Connection': 'keep-alive',
        'sec-ch-ua-mobile': '?0',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        'sec-ch-ua-platform': '"Windows"',
        'Accept': '*/*',
        'Origin': 'https://cn.tradingview.com',
        'Sec-Fetch-Site': 'same-site',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Dest': 'empty',
        'Referer': 'https://cn.tradingview.com/',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }
    local_proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}

    searchmsg = ''
    for _attempt in range(3):
        try:
            searchmsg = requests.get(
                url=url,
                headers=request_headers,
                timeout=10,
                proxies=local_proxy,
                verify=False,
            ).json()
        except Exception as err:
            # Network error or non-JSON body: record it and try again.
            searchmsg = ''
            log.info(f'{url}---请求失败--{err}')
            continue
        if searchmsg:
            log.info(f'{url}---请求成功')
            break
    return searchmsg

def reqDetailmsg(url):
    """Download an article page from cn.tradingview.com as text.

    Sends browser-like headers (including a fixed session cookie) through the
    local proxy at 127.0.0.1:1080 with TLS verification switched off, and
    makes at most three attempts.

    :param url: article page URL.
    :return: the response text of the first attempt that returns a non-empty
             body, or ``''`` when no attempt produced one.
    """
    request_headers = {
        'Host': 'cn.tradingview.com',
        'Connection': 'keep-alive',
        'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-User': '?1',
        'Sec-Fetch-Dest': 'document',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        # NOTE(review): captured browser session cookie, hard-coded.
        'Cookie': 'cookiePrivacyPreferenceBannerProduction=notApplicable; cookiesSettings={"analytics":true,"advertising":true}; _ga=GA1.1.153931157.1696599356; will_start_trial=1; device_t=MzBfV0F3OjA.5HeDqPHu8F5Ux85y2Bi3xCC-liNchYNYW1zUgqB5E4s; sessionid=rcy2dho7lh83k6tasjy4jjatig31tbdf; sessionid_sign=v1:K9a7nKtEZ3MWrJqUgqr9ZaVHrjlepGyPAoGrDmq2DiM=; _gcl_au=1.1.557075741.1696651024; png=f403f4d2-d955-4385-b59c-f2d74f7ec679; etg=f403f4d2-d955-4385-b59c-f2d74f7ec679; cachec=f403f4d2-d955-4385-b59c-f2d74f7ec679; tv_ecuid=f403f4d2-d955-4385-b59c-f2d74f7ec679; _ga_YVVRYGL0E0=deleted; __gads=ID=b0fa0efe8c0ccdc3:T=1696647286:RT=1696916773:S=ALNI_MaPEozJ_doJikuSMJ0r5yFDU3j_Mw; __gpi=UID=00000c59f5923a81:T=1696647286:RT=1696916773:S=ALNI_Ma-WnwGckO3mzIStdpHv1jmEDMMvA; _sp_ses.cf1a=*; _sp_id.cf1a=8a315f91-7829-4ad7-bf4b-151a217809dd.1696599355.14.1696924687.1696916773.00da5df6-3641-4999-a8cf-e2d01afa79e7; _ga_YVVRYGL0E0=GS1.1.1696924315.18.1.1696924691.38.0.0',
    }
    local_proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}

    htmltext = ''
    for _attempt in range(3):
        try:
            htmltext = requests.get(
                url=url,
                headers=request_headers,
                timeout=10,
                proxies=local_proxy,
                verify=False,
            ).text
        except Exception as err:
            # Request failed: record it and try again.
            htmltext = ''
            log.info(f'{url}---详情请求失败--{err}')
            continue
        if htmltext:
            log.info(f'{url}---详情请求成功')
            break
    return htmltext

def paserList(searchmsg,social_code):
    """Process a headlines payload: extract and publish every new story.

    For each entry of ``searchmsg['items']`` the story URL is built, checked
    against the database with selectLinkMsg(), and — if not collected yet —
    its body is extracted with extractorMsg(), sent with sendToKafka() and
    recorded with saveLinkMsg(). A failure on one item is logged and does not
    stop the remaining items.

    :param searchmsg: decoded headlines response. Anything that is not a dict
                      (reqmsg() returns ``''`` when every attempt failed) or
                      that has no items is ignored.
    :param social_code: social credit code of the company the headlines
                        belong to.
    :return: None.
    """
    # Guard against the empty-string "request failed" result and other
    # unexpected payload shapes instead of raising on searchmsg['items'].
    if not isinstance(searchmsg, dict):
        return
    items=searchmsg.get('items') or []
    for item in items:
        try:
            title=item['title']
            storyPath='https://cn.tradingview.com'+item['storyPath']
            published=getFormatedate(item['published'])

            # Skip stories that were already collected for this company.
            if selectLinkMsg(storyPath,social_code):
                log.info(f'{social_code}---{storyPath}---数据已采集过')
                continue
        except Exception as e:
            log.info(f'列表解析失败----{e}')
            continue

        # The TradingView story page (not the publisher's own link) is the
        # address that gets fetched and stored.
        sourceAddress=storyPath
        try:
            content,contentWithTag=extractorMsg(sourceAddress,title)
            if not content:
                log.info(f'内容抽取失败----{sourceAddress}')
                continue
            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            detailmsg={
                'content': content,
                'contentWithTag': contentWithTag,
                'createDate': time_now,
                'publishDate': published,
                'sourceAddress': sourceAddress,  # original article link
                'summary': '',
                'title': title,
                'socialCreditCode': social_code,
                'year': published[:4]
            }
            sendToKafka(detailmsg)
            saveLinkMsg(sourceAddress,social_code)
            log.info(f'信息发生kafka成功----{sourceAddress}')
        except Exception as e:
            log.info(f'{social_code}____{sourceAddress}详情采集异常{e}')

def getFormatedate(timestamp):
    """Render a Unix timestamp (seconds) as a local-time ``YYYY-MM-DD`` string.

    :param timestamp: seconds since the epoch, as accepted by
                      ``datetime.datetime.fromtimestamp``.
    :return: the calendar date in the machine's local timezone.
    """
    return datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d')

def createDriver():
    """Start a Chrome WebDriver session using the locally installed binaries.

    Both the chromedriver executable and the Chrome binary are taken from
    hard-coded Windows paths.

    :return: a ``selenium.webdriver.Chrome`` instance; the caller is
             responsible for calling ``quit()`` on it.
    """
    chrome_driver =r'C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe'
    service = Service(chrome_driver)
    chrome_options = webdriver.ChromeOptions()
    chrome_options.binary_location = r'D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe'
    # To route the browser through a proxy, add e.g.
    #   chrome_options.add_argument('--proxy-server=http://127.0.0.1:8080')
    # `options=` is the keyword that pairs with `service=`; the older
    # `chrome_options=` keyword is deprecated and rejected by newer Selenium 4
    # releases.
    driver = webdriver.Chrome(service=service,options=chrome_options)
    return driver

def extractorMsg(url,title):
    """Fetch one article page and extract its body.

    The page is downloaded with reqDetailmsg(); when that returns nothing the
    page is loaded in a Chrome session from createDriver() instead.
    Extraction strategies are then tried in order until one yields text:

    1. the TradingView article-body CSS selector (only for the page obtained
       through reqDetailmsg()),
    2. SmartExtractor, configured with the language detected from ``title``,
    3. GeneralNewsExtractor via paserDetail().

    :param url: article URL.
    :param title: article title; used only for language detection.
    :return: ``(content, contentWithTag)`` — the extracted text and its
             tagged counterpart. ``content`` is an empty string when every
             strategy failed.
    """
    content=''
    contentWithTag=''
    lang=detect_language(title)
    sm=SmartExtractor(lang)
    raw_html=reqDetailmsg(url)
    if raw_html:
        # Strategy 1: the known article-body container on TradingView pages.
        try:
            soup=BeautifulSoup(raw_html,'html.parser')
            tdoc=soup.select('div[class="body-KX2tCBZq body-pIO_GYwT content-pIO_GYwT"]')[0]
            content=tdoc.text
            contentWithTag=str(tdoc)
        except Exception as e:
            log.info(f'抽取失败！！{e}')
    else:
        # Plain HTTP download failed: load the page in a real browser.
        driver=createDriver()
        try:
            driver.get(url)
            time.sleep(3)  # fixed wait before reading the page source
            raw_html=driver.page_source
        finally:
            # Always shut the browser down; otherwise every fallback call
            # leaves a Chrome/chromedriver process behind.
            driver.quit()

    if not content:
        # Strategy 2: SmartExtractor.
        try:
            article=sm.extract_by_html(raw_html)
            content=article.cleaned_text
            contentWithTag=article.text
        except Exception as e:
            log.info(f'抽取失败！！{e}')

    if not content:
        # Strategy 3: GeneralNewsExtractor. paserDetail() returns {} on
        # failure, which surfaces here as a KeyError and is logged.
        try:
            article_content=paserDetail(raw_html,url)
            content=article_content['content']
            contentWithTag=article_content['body_html']
        except Exception as e:
            log.info(f'抽取失败！！{e}')

    if content:
        log.info('抽取成功')
    return content,contentWithTag

# Generic article extraction (GeneralNewsExtractor)
def paserDetail(detailhtml,detailurl):
    """Run GeneralNewsExtractor over a page.

    :param detailhtml: HTML source of the page.
    :param detailurl: URL of the page, passed to the extractor as ``host``.
    :return: the extractor's result (requested with ``with_body_html=True``),
             or an empty dict when extraction raised.
    """
    try:
        extractor = GeneralNewsExtractor()
        article_content = extractor.extract(detailhtml,host=detailurl,with_body_html=True)
    except Exception as e:
        # `except Exception` (not a bare except) so KeyboardInterrupt and
        # SystemExit still propagate; the reason is logged instead of lost.
        log.info(f'GeneralNewsExtractor failed for {detailurl}: {e}')
        article_content={}

    return article_content
def detect_language(html):
    """Guess the language of a piece of (possibly HTML) text.

    Markup is stripped with BeautifulSoup and the remaining text is
    classified with langid.

    :param html: text or HTML fragment.
    :return: the language code reported by ``langid.classify``.
    """
    plain_text = BeautifulSoup(html, "html.parser").get_text()
    # classify() yields (language, confidence); only the language is needed.
    language, _score = langid.classify(plain_text)
    return language

def conn144():
    """Open a new connection to the ``caiji`` MySQL database.

    :return: ``(conn, cursor)``. Nothing is closed here; the callers in this
             file close both objects themselves.
    """
    # NOTE(review): host and credentials are hard-coded in source.
    settings = {
        'host': '114.115.159.144',
        'port': 3306,
        'user': 'caiji',
        'passwd': 'zzsn9988',
        'db': 'caiji',
        'charset': 'utf8',
    }
    connection = pymysql.Connect(**settings)
    return connection, connection.cursor()

def getStockFromSql():
    """Load all tickers from MySQL and queue them for crawling.

    Reads ``ticker, exchange, xydm`` from ``mgzqyjwyh_list`` and, for every
    row, pushes ``'<EXCHANGE>:<ticker>_<xydm>'`` onto the ``tradview_ticker``
    list of ``r``.

    :return: list of the entries that were pushed successfully.
    """
    conn,cursor=conn144()
    try:
        cursor.execute("SELECT ticker,exchange,xydm FROM mgzqyjwyh_list ")
        gn_result = cursor.fetchall()
    finally:
        # Release the database resources even if the query fails.
        cursor.close()
        conn.close()

    itemList=[]
    for item in gn_result:
        try:
            ticker=item[0]
            exchange=str(item[1]).upper()
            xydm=item[2]
            # Plain concatenation on purpose: a non-string ticker/xydm
            # (e.g. NULL) raises here and the row is skipped.
            param=exchange+':'+ticker+'_'+xydm
            r.rpush('tradview_ticker', param)
            itemList.append(param)
        except Exception as e:
            log.info(f'failed to queue ticker row {item}: {e}')

    return itemList

def sendToKafka(detailmsg):
    """Publish one article record to the ``researchReportTopic`` Kafka topic.

    :param detailmsg: dict providing ``content``, ``contentWithTag``,
                      ``createDate``, ``publishDate``, ``sourceAddress``,
                      ``title``, ``socialCreditCode`` and ``year``.
    :return: None.

    NOTE(review): send errors are only logged, so the caller cannot tell
    whether the record was delivered.
    """
    # Key order is kept as-is because it determines the JSON field order.
    record = {
        'attachmentIds': '',
        'author': '',
        'content': detailmsg['content'],
        'contentWithTag': detailmsg['contentWithTag'],
        'createDate': detailmsg['createDate'],
        'deleteFlag': '0',
        'id': '',
        'keyWords': '',
        'lang': 'zh',
        'origin': 'Tradingview',
        'publishDate': detailmsg['publishDate'],
        'sid': '1711619846545776641',
        'sourceAddress': detailmsg['sourceAddress'],  # original article link
        'summary': '',
        'title': detailmsg['title'],
        'type': 2,
        'socialCreditCode': detailmsg['socialCreditCode'],
        'year': detailmsg['year']
    }
    # A producer is created (and closed) for every single message.
    kafka_client = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
    try:
        pending = kafka_client.send(
            "researchReportTopic",
            json.dumps(record, ensure_ascii=False).encode('utf8'),
        )
        # Block up to 10 seconds for the broker acknowledgement and log it.
        log.info(pending.get(timeout=10))
    except Exception as err:
        log.info(f"发生kafka失败{err}")
    finally:
        kafka_client.close()
# Save the collected link to the database
def saveLinkMsg(link,social_code):
    """Record that ``link`` has been collected for ``social_code``.

    Inserts a row into ``brpa_source_article`` with origin ``'Tradingview'``
    and type ``'2'``. Failures are logged, not raised.

    :param link: article URL that was collected.
    :param social_code: social credit code of the related company.
    :return: None.
    """
    conn,cursor=conn144()
    try:
        insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,publish_time,create_time) values(%s,%s,%s,%s,%s,now())'''
        # NOTE(review): publish_time is filled with the current time, not the
        # article's own publish date.
        time_format = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        cursor.execute(insert_sql, (social_code, link, 'Tradingview', '2', time_format))
        # Commit only after a successful insert, and inside the try so a
        # failing commit is logged and cannot skip the cleanup below.
        conn.commit()
    except Exception as e:
        log.info(f'{link}插入库中失败{e}')
    finally:
        try:
            cursor.close()
        finally:
            conn.close()

# Check whether the link has already been stored
def selectLinkMsg(link,social_code):
    """Tell whether ``link`` is already recorded for ``social_code``.

    Looks for a ``brpa_source_article`` row of type ``'2'`` matching both
    values.

    :param link: article URL to look up.
    :param social_code: social credit code of the related company.
    :return: True when a matching row exists; False when none exists or the
             lookup itself failed (the failure is logged).
    """
    conn,cursor=conn144()
    already_stored=False
    lookup_sql = '''select social_credit_code from brpa_source_article where source_address = %s and social_credit_code=%s and type='2' '''
    try:
        cursor.execute(lookup_sql, (link, social_code))
        if cursor.fetchone():
            log.info(f'-----{social_code}----{link}:已经存在')
            already_stored=True
    except Exception as err:
        log.info(f'查询数据是否在库中失败{err}')
    finally:
        conn.commit()
        cursor.close()
        conn.close()
    return already_stored


if __name__ == '__main__':
    # Imported here because only the script entry point builds query strings.
    from urllib.parse import quote

    # Worker loop. Entries on the 'tradview_ticker' list have the form
    # 'EXCHANGE:TICKER_socialCode' (see getStockFromSql(), which fills it).
    idle_seconds = 10
    while True:
        try:
            tradview_ticker=r.lpop('tradview_ticker')
            if not tradview_ticker:
                # Queue is empty: wait instead of polling in a tight loop.
                time.sleep(idle_seconds)
                continue

            tradviewticker = tradview_ticker.decode(errors='ignore')
            parts = tradviewticker.split('_')
            if len(parts) < 2 or not parts[0] or not parts[1]:
                log.info(f'skipping malformed queue entry: {tradviewticker}')
                continue
            ticker_param=parts[0]
            social_code=parts[1]

            # Use the parsed symbol (URL-encoded, e.g. NASDAQ%3AAAPL), not the
            # raw bytes value popped from the queue, which would render as
            # "b'...'" and still carry the '_socialCode' suffix.
            symbol = quote(ticker_param, safe='')
            url=f'https://news-headlines.tradingview.com/v2/headlines?client=web&lang=zh-Hans&symbol={symbol}'
            searchmsg=reqmsg(url)
            if not searchmsg:
                # All request attempts failed; move on to the next ticker.
                log.info(f'no headlines payload for {ticker_param}, skipping')
                continue
            paserList(searchmsg,social_code)
        except Exception as e:
            # Any unexpected error (e.g. the queue client failing) stops the
            # worker.
            log.info(f'redis中获取企业信息为空{e}')
            break







