# Yahoo Finance crawler: collects enterprise news (press releases) per company
import json
import time
import pymysql
from kafka import KafkaProducer
from selenium.webdriver.common.by import By
import sys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

from base import BaseCore
from base.smart import smart_extractor

import urllib3
# Suppress urllib3's InsecureRequestWarning output for the whole process.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Shared project helper; the code below uses it for drivers, logging, queue
# access, company lookup and run bookkeeping.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
# Client handle exposed by BaseCore; used below via `rpush`
# (presumably a Redis client -- confirm in BaseCore).
r = baseCore.r

# Task label passed to baseCore.recordLog with every record
# (the literal reads "enterprise news / Yahoo Finance").
taskType = '企业动态/雅虎财经'
# NOTE(review): `smart` is not referenced anywhere in the visible code.
smart =smart_extractor.SmartExtractor('cn')

# NOTE(review): scroll() assigns a local with the same name, so this
# module-level value is never read in the visible code.
last_url = ''
# Fetch the details of a single news article
def getZx(xydm, url, title, cnx, path):
    """Scrape one article, store it in MySQL and publish it to Kafka.

    Args:
        xydm: Social credit code of the enterprise the article belongs to.
        url: Address of the article page.
        title: Article title as read from the listing page.
        cnx: Open pymysql connection used for the insert and the id lookup.
        path: Chromedriver path handed to ``baseCore.buildDriver``.

    Returns:
        ``''`` on success, otherwise one of the error labels
        ``'获取正文失败'``, ``'数据库传输失败'``, ``'数据id获取失败'`` or
        ``'Kafka操作失败'``. The function never raises.
    """
    start_time_content = time.time()
    try:
        author, pub_time, content = _fetchArticle(url, path)

        # Values are bound as query parameters, so they must NOT be
        # quote-escaped by hand (doubling "'" would corrupt the stored text).
        list_info = (
            xydm,
            title,
            '',
            content,
            pub_time,
            url,
            '雅虎财经',
            author,
            '2',
            'zh',
        )

        try:
            insert_sql = '''insert into brpa_source_article(social_credit_code,title,summary,content,publish_date,source_address,origin,author,type,lang) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
            # Use the connection that was passed in rather than a global cursor.
            with cnx.cursor() as cur:
                cur.execute(insert_sql, list_info)
            cnx.commit()
        except Exception:
            log.error("保存数据库失败")
            try:
                # Leave the connection in a clean state for the next article.
                cnx.rollback()
            except Exception:
                # Best effort: the failure is already reported to the caller.
                pass
            return '数据库传输失败'
        log.info(f"文章耗时，耗时{baseCore.getTimeCost(start_time_content, time.time())}")

        try:
            # Look the row up with the same key (xydm) that was just inserted.
            sel_sql = "select article_id from brpa_source_article where source_address = %s and social_credit_code = %s"
            with cnx.cursor() as cur:
                cur.execute(sel_sql, (url, xydm))
                row = cur.fetchone()
            article_id = row[0]
            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            dic_news = {
                'attachmentIds': article_id,
                'author': '',
                'content': content,
                'contentWithTag': content,
                'createDate': time_now,
                'deleteFlag': '0',
                'id': '',
                'keyWords': '',
                # NOTE(review): the DB row above is stored with lang 'zh' while
                # the Kafka message says 'en' -- confirm which is intended.
                'lang': 'en',
                'origin': '雅虎财经',
                'publishDate': pub_time,
                'sid': '1684032033495392257',
                'sourceAddress': url,  # link to the original article
                'summary': '',
                'title': title,
                'type': 2,
                'socialCreditCode': xydm,
                'year': pub_time[:4]
            }
        except Exception:
            log.info(f'传输失败:{xydm}----{url}')
            return '数据id获取失败'

        try:
            _sendToKafka(dic_news)
        except Exception as e:
            dic_result = {
                'success': 'false',
                'message': '操作失败',
                'code': '204',
                'e': e
            }
            log.error(dic_result)
            return 'Kafka操作失败'

        dic_result = {
            'success': 'true',
            'message': '操作成功',
            'code': '200',
        }
        log.info(dic_result)
        return ''
    except Exception:
        log.error("获取正文失败")
        return '获取正文失败'


def _fetchArticle(url, path):
    """Load *url* in a dedicated browser and return (author, pub_time, content).

    ``pub_time`` is the ``datetime`` attribute of the article's ``<time>``
    element with ``T`` replaced by a space and cut to 19 characters;
    ``content`` is the outer HTML of the ``caas-body`` element. The browser
    is shut down on every exit path so a failed extraction cannot leak it.
    """
    driverContent = baseCore.buildDriver(path)
    try:
        driverContent.get(url)
        try:
            # Click the "collapse-button" element when the page has one
            # (presumably expands the full text); absence is not an error.
            driverContent.find_element(By.CLASS_NAME, "collapse-button").click()
        except Exception:
            pass
        time.sleep(0.5)

        authorElement = driverContent.find_element(By.CLASS_NAME, "caas-author-byline-collapse")
        timeElement = driverContent.find_element(
            By.CLASS_NAME, "caas-attr-time-style").find_element(By.TAG_NAME, "time")
        content = driverContent.find_element(By.CLASS_NAME, "caas-body").get_attribute('outerHTML')

        author = authorElement.text.strip()
        pub_time = timeElement.get_attribute("datetime").strip().replace("T", " ")[0:19]
        return author, pub_time, content
    finally:
        try:
            # quit() ends the whole driver session, not just the window.
            driverContent.quit()
        except Exception:
            log.error("关闭浏览器失败")


def _sendToKafka(dic_news):
    """Publish *dic_news* as UTF-8 JSON and wait up to 10s for the broker ack."""
    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
    try:
        kafka_result = producer.send("researchReportTopic",
                                     json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
        print(kafka_result.get(timeout=10))
    finally:
        # A producer is created per article, so it must be released each time.
        producer.close()


def selectUrl(news_url, xydm, cur=None):
    """Return the stored rows matching an article URL for one enterprise.

    Args:
        news_url: Article address compared against ``source_address``.
        xydm: Social credit code compared against ``social_credit_code``.
        cur: DB-API cursor to run the query on. When omitted, the
            module-level ``cursor`` created by the script entry point is
            used, which keeps existing two-argument calls working.

    Returns:
        Whatever ``cur.fetchall()`` yields; empty when the URL has not been
        stored for this enterprise yet.
    """
    if cur is None:
        # Fall back to the global created under ``if __name__ == "__main__"``.
        cur = cursor
    sel_sql = '''select social_credit_code from brpa_source_article where source_address = %s and social_credit_code=%s '''
    cur.execute(sel_sql, (news_url, xydm))
    return cur.fetchall()


def getLastUrl():
    """Return the link of the last item currently rendered in the news list.

    Reads the ``summaryPressStream-0-Stream`` container through the
    module-level ``driver`` and tries two XPath layouts for the headline
    anchor of the last ``<li>``.

    Raises:
        IndexError: when the list contains no items.
        Exception: whatever Selenium raises when the container or the
            anchor cannot be found under either layout.
    """
    news_div = driver.find_element(By.ID, 'summaryPressStream-0-Stream')
    news_lis = news_div.find_elements(By.XPATH, "./ul/li")
    if not news_lis:
        # Fail with a clear message instead of an index error from the fallback.
        raise IndexError("news list is empty")
    last_li = news_lis[-1]
    try:
        url = last_li.find_element(By.XPATH, "./div[1]/div[1]/div[2]/h3[1]/a").get_attribute("href").strip().replace("'", "''")
    except Exception:
        # Second layout: the headline sits in the first inner div.
        # `except Exception` (not bare) so Ctrl-C / SystemExit still propagate.
        url = last_li.find_element(By.XPATH, "./div[1]/div[1]/div[1]/h3[1]/a").get_attribute("href").strip().replace("'", "''")
    return url

def scroll(xydm, name, gpdm):
    """Keep scrolling the listing page down until no new items appear.

    After each scroll the link of the last list item is read via
    ``getLastUrl()``; the loop stops when that link is the same as on the
    previous pass, or when it cannot be read at all (logged as an error).

    Args:
        xydm: Social credit code; not used by the current implementation.
        name: Enterprise name, used in the error log line only.
        gpdm: Stock code, used in the error log line only.
    """
    scroll_js = "var q=document.documentElement.scrollTop=100000"
    previous_url = ''
    while True:
        driver.execute_script(scroll_js)
        # Give the page a moment to append further items.
        time.sleep(1)
        try:
            current_url = getLastUrl()
        except Exception as e:
            log.error(f"{name}--{gpdm}--获取不到最后一条链接")
            break
        if current_url == previous_url:
            # Nothing new was loaded by the last scroll.
            break
        previous_url = current_url

# Re-queue an enterprise whose crawl failed by pushing its code back into Redis
def rePutIntoR(item):
    """Push *item* back onto the list this crawler pulls its work from.

    Uses ``r.rpush`` on the key ``NewsEnterprise:gwqy_socialCode``, the same
    key the main loop reads from.
    """
    queue_key = 'NewsEnterprise:gwqy_socialCode'
    r.rpush(queue_key, item)


if __name__ == "__main__":
    # Worker loop: pull a social credit code from the queue, open the
    # company's Yahoo Finance press-release page, and store/publish every
    # article that is not in the database yet. Runs until interrupted.
    path = r'F:\spider\115\chromedriver.exe'
    driver = baseCore.buildDriver(path)
    # NOTE(review): database credentials are hard-coded in source; they
    # should be moved to configuration / environment.
    cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
    # selectUrl() reads this module-level name when no cursor is passed.
    cursor = cnx.cursor()
    try:
        while True:

            # Fetch the next social credit code from the queue.
            social_code = baseCore.redicPullData('NewsEnterprise:gwqy_socialCode')

            # Nothing queued right now: wait and poll again.
            if not social_code or social_code == 'None':
                time.sleep(20)
                continue
            data = baseCore.getInfomation(social_code)
            name = data[1]
            enname = data[5]
            gpdm = data[3]
            if 'HK' in str(gpdm):
                # Five-digit HK codes lose their first character before lookup.
                tmp_g = str(gpdm).split('.')[0]
                if len(tmp_g) == 5:
                    gpdm = str(gpdm)[1:]
            xydm = data[2]

            # Number of runs recorded so far for this enterprise.
            count = data[17]
            start_time = time.time()
            if not gpdm:
                # Covers both '' and None; a missing code cannot form a URL.
                log.error(f"{name}--股票代码为空 跳过")
                exception = '股票代码为空'
                state = 0
                takeTime = baseCore.getTimeCost(start_time, time.time())
                baseCore.recordLog(xydm, taskType, state, takeTime, '', exception)
                continue
            try:
                url = f"https://finance.yahoo.com/quote/{gpdm}/press-releases?p={gpdm}"
                driver.get(url)
                try:
                    WebDriverWait(driver, 15).until(EC.visibility_of_element_located((By.ID, 'summaryPressStream-0-Stream')))
                    news_div = driver.find_element(By.ID, 'summaryPressStream-0-Stream')
                    # Probe for at least one link; raises when the list is empty.
                    news_div.find_element(By.TAG_NAME, 'a')
                except Exception as e:
                    log.error(f"{name}--{gpdm}--没找到新闻元素")
                    exception = '没找到新闻元素'
                    state = 0
                    takeTime = baseCore.getTimeCost(start_time, time.time())
                    baseCore.recordLog(xydm, taskType, state, takeTime, url, exception)
                    continue
                try:
                    scroll(xydm, name, gpdm)
                except Exception as e:
                    print(e)
                    log.error(f"{name}--{gpdm}--拖拽出现问题")
                news_lis = news_div.find_elements(By.XPATH, "./ul/li")
                log.info(f"{name}--{gpdm}--{len(news_lis)}条信息")

                # Set to 1 when the site connection appears to be lost mid-list.
                flag = 0
                for i, news_li in enumerate(news_lis):
                    try:
                        try:
                            a_ele = news_li.find_element(By.XPATH, "./div[1]/div[1]/div[2]/h3[1]/a")
                        except Exception:
                            a_ele = news_li.find_element(By.XPATH, "./div[1]/div[1]/div[1]/h3[1]/a")
                    except Exception as e:
                        if news_li.is_displayed():
                            log.error(f"{name}--{gpdm}--{i}----a标签没找到")
                            exception = 'a标签没找到'
                            state = 0
                            takeTime = baseCore.getTimeCost(start_time, time.time())
                            baseCore.recordLog(xydm, taskType, state, takeTime, url, exception)
                            continue
                        else:
                            log.error(f"{name}--{gpdm}--{i}----与网站断开连接")
                            # Put back exactly the value that was pulled.
                            rePutIntoR(social_code)
                            time.sleep(300)
                            flag = 1
                            break
                    # NOTE(review): the quote-doubling on the URL is kept because
                    # the value is also passed to baseCore.recordLog, whose SQL
                    # handling is not visible here -- confirm before removing.
                    news_url = a_ele.get_attribute("href").strip().replace("'", "''")
                    if not news_url.startswith("https://finance.yahoo.com"):
                        continue
                    # Skip articles already stored for this enterprise.
                    selects = selectUrl(news_url, xydm)
                    if selects:
                        log.error(f"{name}--{gpdm}--网址已经存在----{news_url}")
                        exception = '网址已存在'
                        state = 0
                        takeTime = baseCore.getTimeCost(start_time, time.time())
                        baseCore.recordLog(xydm, taskType, state, takeTime, news_url, exception)
                        # Incremental runs would `break` here; full runs continue.
                        continue
                    # The title is bound as a query parameter in getZx, so it
                    # must not be quote-escaped by hand.
                    title = a_ele.text.strip()
                    exception = getZx(xydm, news_url, title, cnx, path)
                    state = 1 if exception == '' else 0
                    takeTime = baseCore.getTimeCost(start_time, time.time())
                    baseCore.recordLog(xydm, taskType, state, takeTime, news_url, exception)
                    log.info(f"{name}--{gpdm}--{i}----{news_url}")

                if flag == 1:
                    continue
                log.info(f"{name}--{gpdm}--企业整体，耗时{baseCore.getTimeCost(start_time, time.time())}")

                # Record one more completed run for this enterprise.
                runType = 'NewsRunCount'
                count += 1
                baseCore.updateRun(social_code, runType, count)
            except Exception as e:
                # `except Exception` (not bare) so Ctrl-C / SystemExit can stop
                # the worker instead of being treated as a lost connection.
                rePutIntoR(social_code)
                state = 0
                takeTime = baseCore.getTimeCost(start_time, time.time())
                baseCore.recordLog(xydm, taskType, state, takeTime, '', '远程主机强迫关闭了一个现有的连接。')
                log.error(f"{name}--{gpdm}--{e}")
                log.info(f"-------{name}--{gpdm}---'远程主机强迫关闭了一个现有的连接。'--------")
                log.info('===========连接已被关闭========等待重新连接===========')
                try:
                    driver.quit()
                except Exception:
                    # The old session may already be dead; a fresh one follows.
                    pass
                driver = baseCore.buildDriver(path)
                time.sleep(5)
                continue
    finally:
        # Release resources on every exit path (the loop itself never ends).
        try:
            driver.quit()
        except Exception:
            pass
        cursor.close()
        cnx.close()
        baseCore.close()
