import requests
from bs4 import BeautifulSoup
import re
import json
import redis
import time,datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options
from selenium.webdriver.edge.service import Service
from apscheduler.schedulers.blocking import BlockingScheduler

def create_driver(driver_path=r'D:\soft\msedgedriver.exe', proxy='http://127.0.0.1:1080'):
    """Build a Microsoft Edge WebDriver configured for scraping.

    Args:
        driver_path: Filesystem path of the ``msedgedriver`` executable.
            Defaults to the location this script was originally written for.
        proxy: Value passed to Edge's ``--proxy-server`` switch. Pass ``None``
            or an empty string to start the browser without a proxy.

    Returns:
        A started ``webdriver.Edge`` instance. The caller owns it and is
        responsible for calling ``quit()`` when done.
    """
    edge_service = Service(driver_path)
    edge_options = Options()
    # Drop the 'enable-automation' switch and disable the AutomationControlled
    # Blink feature -- presumably so the site is less likely to treat the
    # browser as an automated one.
    edge_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    edge_options.add_argument('--disable-blink-features=AutomationControlled')
    if proxy:
        edge_options.add_argument(f'--proxy-server={proxy}')
    # Optional tweak kept from the original (disabled): skip images/stylesheets.
    # prefs = {"profile.managed_default_content_settings.images": 2, 'permissions.default.stylesheet': 2}
    # edge_options.add_experimental_option("prefs", prefs)
    return webdriver.Edge(service=edge_service, options=edge_options)

def get_pagesource(max_attempts=None):
    """Open the cn.wsj.com technology section and wait for its embedded state.

    The page is re-parsed every 3 seconds until the first ``<script>`` inside
    ``<body>`` contains a ``__STATE__ =...;`` assignment.

    Args:
        max_attempts: Upper bound on the number of times the page source is
            inspected. ``None`` (the default) polls without limit, which is
            the original behaviour.

    Returns:
        ``(soup, driver)`` once the state script is present, or
        ``(None, driver)`` when the rendered page has no text at all or
        ``max_attempts`` is exhausted. In both cases the caller owns the
        driver and must ``quit()`` it.

    Raises:
        Whatever ``driver.get`` / ``driver.page_source`` raise; the browser is
        closed before the exception propagates.
    """
    # NOTE: an optional Dow Jones SSO login step used to run here (open the
    # sign-in page, fill "//div/input[@name = 'username']", continue, fill the
    # element with id "password-login-password", submit). It is disabled. If it
    # is ever re-enabled, load the credentials from configuration or the
    # environment instead of hard-coding them in this file.
    url = 'https://cn.wsj.com/zh-hans/news/technology?mod=nav_top_section'
    driver = create_driver()
    try:
        driver.get(url)
        time.sleep(3)
        attempts = 0
        while True:
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            # Explicit None checks instead of a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are never swallowed by the loop.
            body = soup.find('body')
            script = body.find('script') if body is not None else None
            if script is not None and re.search(r'__STATE__ =(.*);', script.text):
                return soup, driver
            if soup.text == '':
                # Completely empty document: give up instead of polling.
                return None, driver
            attempts += 1
            if max_attempts is not None and attempts >= max_attempts:
                return None, driver
            time.sleep(3)
    except BaseException:
        # Do not leave an orphaned browser behind when loading/parsing fails
        # or the run is interrupted; the exception itself is re-raised.
        driver.quit()
        raise

def get_newshref(key):
    """Extract the article list from the WSJ technology page state.

    Loads the page via ``get_pagesource``, decodes the JSON assigned to
    ``__STATE__`` in the first ``<script>`` of ``<body>``, and collects every
    entry of its ``data`` mapping whose key starts with ``article``.

    Args:
        key: Redis key prefix. Currently unused here (the direct Redis write
            is disabled); kept so existing callers keep working.

    Returns:
        ``(news_list, driver)`` where ``news_list`` is a list of dicts with
        the keys ``title``, ``summary``, ``publishDate`` (local time,
        ``%Y-%m-%d %H:%M:%S``) and ``newsUrl``; the caller owns ``driver`` and
        must ``quit()`` it. Returns ``None`` when the page could not be
        loaded; in that case the browser has already been closed.

    Raises:
        IndexError, KeyError, ValueError: if the page state does not have the
            expected layout. The browser is closed before re-raising.
    """
    soup, driver = get_pagesource()
    if soup is None:
        # Nothing is handed back to the caller, so close the browser here
        # rather than leaking it.
        driver.quit()
        return None

    try:
        script_text = soup.find('body').find('script').text
        state_json = re.findall(r'__STATE__ =(.*);', script_text)[0].strip()
        reqJson = json.loads(state_json)

        keys = [name for name in reqJson['data'] if name.startswith('article')]
        print(keys)

        news_list = []
        for key_ in keys:
            item = reqJson['data'][key_]['data']['data']
            title = item['headline']
            summary = item['summary']
            try:
                seoId = item['seoId']
            except KeyError:
                # Entries without a seoId have no article URL; skip them.
                continue
            newsUrl = 'https://cn.wsj.com/articles/' + seoId
            print(newsUrl)
            # The state stores the timestamp in milliseconds.
            timestamp = int(item['timestamp']) / 1000
            publishDate = datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')

            news_list.append({
                'title': title,
                'summary': summary,
                'publishDate': publishDate,
                'newsUrl': newsUrl,
            })
    except Exception:
        # The driver would otherwise be lost together with the return value.
        driver.quit()
        raise
    return news_list, driver


def caiji():
    """Collect the WSJ technology article list and store it in Redis.

    Each article dict returned by ``get_newshref`` is written as a Redis hash
    named ``WSJ:NewsInfo:<index>`` through a single pipeline. The browser
    opened for scraping is always closed before returning.

    Returns:
        None. When the list page could not be loaded nothing is written.
    """
    key = 'WSJ:NewsInfo'
    result = get_newshref(key)
    if result is None:
        # get_newshref signals a failed page load with None; unpacking it
        # directly would raise TypeError inside the scheduled job.
        print('WSJ list page could not be loaded; nothing stored')
        return

    news_list, driver = result
    try:
        # NOTE(review): connection details, including the password, are
        # hard-coded; consider moving them to configuration.
        redis_client = redis.Redis(host='114.116.90.53', port=6380, password='clbzzsn', db=6)
        # Queue all writes and send them in one round trip.
        pipeline = redis_client.pipeline()
        for idx, info in enumerate(news_list):
            # NOTE(review): keys are index-based, so hashes from an earlier,
            # longer run are presumably left in place -- confirm if intended.
            hash_key = f'{key}:{idx}'
            pipeline.hset(hash_key, mapping=info)
        pipeline.execute()
        # TODO: visit each info['newsUrl'] with the driver and collect the text
        # of the 'div.article-content' element (previously sketched here).
    finally:
        # Without this every scheduled run leaves an Edge process behind.
        driver.quit()

#华尔街列表定时任务
def wsj_list_task():
    """Register ``caiji`` as a cron job and run the blocking scheduler.

    The job uses the cron fields ``hour=9, minute=0`` with at most one
    concurrent instance. ``scheduler.start()`` blocks the calling thread; any
    ``Exception`` escaping from it is printed rather than propagated.
    """
    cron_options = {'hour': 9, 'minute': 0, 'max_instances': 1}
    scheduler = BlockingScheduler()
    scheduler.add_job(caiji, 'cron', **cron_options)
    try:
        scheduler.start()
    except Exception as err:
        print('定时采集异常', err)

# Script entry point: start the scheduler only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    wsj_list_task()








