华尔街日报多个信息源采集

0ef8e52c · XveLingKun · 9c3eea0f · 0ef8e52c
--- a/comData/dingzhi/wsj-TECH.py
+++ b/comData/dingzhi/wsj-TECH.py
@@ -28,7 +28,7 @@ def create_driver():
    driver = webdriver.Edge(service=edge_service, options=edge_options)
    return driver

-def get_pagesource():
+def get_pagesource(url):
    driver = create_driver()
    # un = 'zhk2058@163.com'
    # pw = 'ZZM205899'
@@ -44,7 +44,8 @@ def get_pagesource():
    # driver.find_element(By.XPATH, '//*[@id="password-login"]/div/form/div[5]/button').click()
    # time.sleep(3)

-    url = 'https://cn.wsj.com/zh-hans/news/technology?mod=nav_top_section'
+    # url = 'https://cn.wsj.com/zh-hans/news/technology?mod=nav_top_section'
+    # url = 'https://cn.wsj.com/zh-hans/news/world?mod=nav_top_section'
    driver.get(url)
    time.sleep(3)
    while True:
@@ -62,8 +63,8 @@ def get_pagesource():
            continue
    return soup, driver

-def get_newshref(key):
-    soup, driver = get_pagesource()
+def get_newshref(key, url):
+    soup, driver = get_pagesource(url)
    if soup:
        pass
    else:
@@ -107,7 +108,29 @@ def get_newshref(key):
 def caiji():
    redis_client = redis.Redis(host='114.116.90.53', port=6380, password='clbzzsn', db=6)
    key = 'WSJ:NewsInfo'
-    news_list, driver = get_newshref(key)
+    url_list = ['https://cn.wsj.com/',
+                'https://cn.wsj.com/zh-hans/news/world?mod=nav_top_section',
+                'https://cn.wsj.com/zh-hans/news/china?mod=nav_top_section',
+                'https://cn.wsj.com/zh-hans/news/markets?mod=nav_top_section',
+                'https://cn.wsj.com/zh-hans/news/economy?mod=nav_top_section',
+                'https://cn.wsj.com/zh-hans/news/business?mod=nav_top_section',
+                'https://cn.wsj.com/zh-hans/news/technology?mod=nav_top_section']
+    for url in url_list:
+        if url == 'https://cn.wsj.com/':
+            print('正在采集WSJ首页...')
+        elif url =='https://cn.wsj.com/zh-hans/news/world?mod=nav_top_section':
+            print('正在采集WSJ国际新闻...')
+        elif url =='https://cn.wsj.com/zh-hans/news/china?mod=nav_top_section':
+            print('正在采集WSJ中国新闻...')
+        elif url =='https://cn.wsj.com/zh-hans/news/markets?mod=nav_top_section':
+            print('正在采集WSJ金融市场...')
+        elif url =='https://cn.wsj.com/zh-hans/news/economy?mod=nav_top_section':
+            print('正在采集WSJ经济新闻...')
+        elif url =='https://cn.wsj.com/zh-hans/news/business?mod=nav_top_section':
+            print('正在采集WSJ商业新闻...')
+        else:
+            print('正在采集WSJ科技新闻...')
+        news_list, driver = get_newshref(key, url)
        # #todo:将获取到的列表全部放进redis等待
        #
        # count = 0
@@ -150,8 +173,8 @@ def wsj_list_task():
        pass

 if __name__ == '__main__':
-    wsj_list_task()
-
+    # wsj_list_task()
+    caiji()