from bs4 import BeautifulSoup
import requests, time, json
import redis,random
from kafka import KafkaProducer
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options
from selenium.webdriver.edge.service import Service
from apscheduler.schedulers.blocking import BlockingScheduler
import sys
sys.path.append("../../base")
from base import BaseCore
log = BaseCore.BaseCore().getLogger()

def create_driver():
    """Build an Edge WebDriver routed through the local proxy.

    The browser is started with the "enable-automation" switch excluded and
    the Blink ``AutomationControlled`` feature disabled, and all traffic is
    sent through the HTTP proxy on 127.0.0.1:1080.

    Returns:
        The ``webdriver.Edge`` instance; the caller is responsible for
        quitting it.
    """
    # Only the 'http' entry is used below; 'https' is kept for reference.
    proxies = {
        'https': 'https://127.0.0.1:1080',
        'http': 'http://127.0.0.1:1080',
    }

    options = Options()
    # Drop the "enable-automation" switch from the browser command line.
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    for flag in (
        # Disable the Blink runtime feature that marks the session as automated.
        '--disable-blink-features=AutomationControlled',
        '--proxy-server=%s' % proxies['http'],
    ):
        options.add_argument(flag)

    return webdriver.Edge(
        service=Service(r'D:\soft\msedgedriver.exe'),
        options=options,
    )

def create_google():
    """Build a Chrome WebDriver from the locally installed driver and browser.

    The browser is started with GPU acceleration disabled, the
    "enable-automation" switch excluded and the Blink ``AutomationControlled``
    feature disabled. Both the chromedriver and the Chrome binary locations
    are hard-coded local paths.

    Returns:
        The ``webdriver.Chrome`` instance; the caller is responsible for
        quitting it.
    """
    # The module-level ``Service`` name is the Edge service class, so the
    # Chrome-specific service is imported here under its own name.
    from selenium.webdriver.chrome.service import Service as ChromeService

    driver_path = r'D:\cmd100\chromedriver.exe'
    chrome_bin = r'D:\Google\Chrome\Application\chrome.exe'

    chrome_options = webdriver.ChromeOptions()
    # chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.binary_location = chrome_bin

    # ``options=`` replaces the deprecated ``chrome_options=`` keyword and
    # matches how create_driver() configures Edge.
    driver = webdriver.Chrome(service=ChromeService(driver_path), options=chrome_options)
    return driver

def login():
    """Sign in on the Dow Jones SSO page in a fresh Chrome session.

    The flow is: open the sign-in URL, type the user name, click through to
    the password step, type the password, submit, then read the cookies.
    Fixed sleeps are used between steps to let each page render.

    Returns:
        tuple: ``(cookies, driver)`` where ``cookies`` is the list returned
        by ``driver.get_cookies()`` and ``driver`` is the still-open Chrome
        session. The caller owns the driver and must quit it.

    Raises:
        Exception: whatever Selenium raises when navigation fails or an
        element cannot be found. The browser is quit before re-raising so a
        failed login does not leave a Chrome process behind.
    """
    driver = create_google()
    # NOTE(review): credentials are hard-coded in source; consider loading
    # them from configuration instead.
    un = 'zhk2058@163.com'
    pw = 'ZZM205899'
    try:
        driver.get(
            "https://sso.accounts.dowjones.com/login-page?client_id=5hssEAdMy0mJTICnJNvC9TXEw3Va7jfO&redirect_uri=https%3A%2F%2Fcn.wsj.com%2Fclient%2Fauth&response_type=code&scope=openid%20idp_id%20roles%20tags%20email%20given_name%20family_name%20uuid%20djid%20djUsername%20djStatus%20trackid%20prts%20updated_at%20created_at%20offline_access&ui_locales=zh-tw-x-cwsj-27-2&nonce=beaaad3a-6919-4893-8198-c3769d6d54af&state=73NKOEQds-P9ZH7w.ie3C279-7mV69dSbgfC_fu7R0sZqMkGovzhN3NJbUfU&resource=https%253A%252F%252Fcn.wsj.com%252F&protocol=oauth2&client=5hssEAdMy0mJTICnJNvC9TXEw3Va7jfO#!/signin")
        time.sleep(5)

        # Step 1: user name, then the button that advances to the password form.
        driver.find_element(By.XPATH, "//div/input[@name = 'username']").send_keys(un)
        driver.find_element(By.XPATH, '//*[@id="basic-login"]/div[1]/form/div[5]/div[1]/button[2]').click()
        time.sleep(3)

        # Step 2: password, then the submit button of the password form.
        driver.find_element(By.ID, "password-login-password").send_keys(pw)
        driver.find_element(By.XPATH, '//*[@id="password-login"]/div/form/div[5]/button').click()
        time.sleep(3)

        cookies = driver.get_cookies()
    except Exception:
        # Do not leak the browser process when any step of the login fails.
        driver.quit()
        raise
    return cookies, driver

def parser_content(href, driver):
    """Open ``href`` in ``driver`` and extract the article body.

    Args:
        href: URL of the article page.
        driver: Selenium WebDriver used to load the page.

    Returns:
        tuple: ``(text, tag)`` where ``tag`` is the ``div.article-content``
        element found by BeautifulSoup and ``text`` is its text. When no such
        element exists the page is refreshed once, '封号' is logged, and
        ``(None, None)`` is returned.
    """
    driver.get(href)
    time.sleep(2)

    page = BeautifulSoup(driver.page_source, 'html.parser')
    body = page.find('div', class_='article-content')
    if body is not None:
        return body.text, body

    # No article body on the page: refresh, wait, and report the failure.
    driver.refresh()
    time.sleep(3)
    log.info('封号')
    return None, None

# (key fragment, sid, info_code, origin) per WSJ channel; matched in this order.
_WSJ_CHANNELS = (
    ('WSJ:NewsInfo_sy', '1780483604239781890', 'IN-20240417-0078', '华尔街日报中文网-首页'),
    ('WSJ:NewsInfo_world', '1780484012605607937', 'IN-20240417-0081', '华尔街日报中文网-国际'),
    ('WSJ:NewsInfo_china', '1780484750069108737', 'IN-20240417-0084', '华尔街日报中文网-中国'),
    ('WSJ:NewsInfo_markets', '1780489030450884609', 'IN-20240417-0085', '华尔街日报中文网-金融市场'),
    ('WSJ:NewsInfo_economy', '1780489531269484545', 'IN-20240417-0086', '华尔街日报中文网-经济'),
    ('WSJ:NewsInfo_business', '1780489708428496897', 'IN-20240417-0087', '华尔街日报中文网-商业'),
)
# Used when no fragment above occurs in the key name.
_WSJ_DEFAULT_CHANNEL = ('1775455062911447042', 'IN-20240403-0041', '华尔街日报中文网-科技')
# Hash fields every queued item must carry to build the Kafka message.
_WSJ_REQUIRED_FIELDS = ('newsUrl', 'publishDate', 'title', 'summary')


def _resolve_channel(key_name):
    """Return ``(sid, info_code, origin)`` for a decoded Redis key name."""
    for fragment, sid, info_code, origin in _WSJ_CHANNELS:
        if fragment in key_name:
            return sid, info_code, origin
    return _WSJ_DEFAULT_CHANNEL


def getData(key):
    """Drain queued news items from Redis and forward them to Kafka.

    Every Redis key starting with ``key`` is read as a hash, deleted, checked
    against the per-channel set of already-sent URLs, and, if new, published
    as JSON to the ``crawlerInfo`` Kafka topic. After a successful send the
    URL is added to that set.

    Relies on the module globals ``r`` (item queue) and ``r_2`` (sent-URL
    sets) created in the ``__main__`` block, and on the module logger ``log``.

    Args:
        key: Prefix of the Redis keys to process.

    Returns:
        bool: always ``True``.
    """
    # One producer is created lazily and reused for the whole pass instead of
    # one per item, and it is always closed on the way out.
    producer = None
    try:
        for item_key in r.scan_iter(f"{key}*"):
            sid, info_code, origin = _resolve_channel(item_key.decode())

            fields = r.hgetall(item_key)
            decode_fields = {k.decode(): v.decode() for k, v in fields.items()}

            # NOTE(review): the item is removed before it is sent, so a failed
            # send drops it. Kept as in the original; confirm this is intended.
            r.delete(item_key)
            print(f"删除成功{item_key}")

            # An incomplete hash used to raise KeyError and stop the whole
            # polling loop; skip just that item instead.
            missing = [name for name in _WSJ_REQUIRED_FIELDS if name not in decode_fields]
            if missing:
                log.info(f'缺少字段，跳过：{item_key}、{missing}')
                continue

            newsUrl = decode_fields['newsUrl']
            # Skip URLs that were already collected for this channel.
            try:
                if r_2.sismember(info_code, newsUrl):
                    log.info('信息已采集入库过')
                    continue
            except Exception as e:
                # Previously swallowed silently; keep skipping but record why.
                log.info(e)
                log.info(f'查重失败，跳过：{newsUrl}')
                continue

            dic_news = {
                'content': '',
                'contentWithTag': '',
                'id': '',
                'summary': decode_fields['summary'],
                'origin': origin,
                'publishDate': decode_fields['publishDate'],
                'sid': sid,
                'sourceAddress': newsUrl,
                'title': decode_fields['title'],
                'source': '16',
                'type': ''
            }

            # Publish the record; failures are logged and the loop continues.
            try:
                if producer is None:
                    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
                kafka_result = producer.send("crawlerInfo",
                                             json.dumps(dic_news, ensure_ascii=False).encode('utf8'))

                log.info(kafka_result.get(timeout=10))
                dic_result = {
                    'success': 'ture',
                    'message': '操作成功',
                    'code': '200',
                }
                log.info(dic_result)
                # Remember the URL only after the broker acknowledged the send.
                r_2.sadd(info_code, newsUrl)
            except Exception as e:
                log.info(e)
                log.info(f'传输失败：{dic_news["title"]}、{dic_news["publishDate"]}')
    finally:
        if producer is not None:
            producer.close()
    return True


if __name__ == '__main__':
    # getData() reads ``r`` and ``r_2`` as module globals, so both Redis
    # handles are bound here under exactly those names.
    redis_conn = {'host': '114.116.90.53', 'port': 6380, 'password': 'clbzzsn'}
    r = redis.Redis(db=6, **redis_conn)      # queued news items
    r_2 = redis.Redis(db=5, **redis_conn)    # sets of already-sent URLs
    key = 'WSJ:NewsInfo'

    # A disabled experiment used to live here as commented-out code: it called
    # login() or loaded cookies from 'wsj_cookie.txt', added them to a driver
    # from create_driver(), and opened a single article URL.

    # Process the queue, then wait one hour before the next pass.
    poll_seconds = 60 * 60 * 1
    while True:
        getData(key)
        time.sleep(poll_seconds)

