Commit 0ef8e52c by XveLingKun

Collect the Wall Street Journal from multiple news sources

Parent 9c3eea0f
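In short: get_pagesource() and get_newshref() now take the listing-page URL as a parameter instead of hard-coding it, and caiji() loops over seven WSJ sections (front page, world, China, markets, economy, business, technology), printing a progress message per section and writing each batch of scraped entries to Redis through a pipeline. The entry point switches from wsj_list_task() to caiji().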
@@ -28,7 +28,7 @@ def create_driver():
     driver = webdriver.Edge(service=edge_service, options=edge_options)
     return driver

-def get_pagesource():
+def get_pagesource(url):
    driver = create_driver()
    # un = 'zhk2058@163.com'
    # pw = 'ZZM205899'
@@ -44,7 +44,8 @@ def get_pagesource():
     # driver.find_element(By.XPATH, '//*[@id="password-login"]/div/form/div[5]/button').click()
     # time.sleep(3)
-    url = 'https://cn.wsj.com/zh-hans/news/technology?mod=nav_top_section'
+    # url = 'https://cn.wsj.com/zh-hans/news/technology?mod=nav_top_section'
+    # url = 'https://cn.wsj.com/zh-hans/news/world?mod=nav_top_section'
     driver.get(url)
     time.sleep(3)
     while True:
@@ -62,8 +63,8 @@ def get_pagesource():
             continue
     return soup, driver

-def get_newshref(key):
-    soup, driver = get_pagesource()
+def get_newshref(key, url):
+    soup, driver = get_pagesource(url)
     if soup:
         pass
     else:
@@ -107,24 +108,46 @@ def get_newshref(key):

 def caiji():
     redis_client = redis.Redis(host='114.116.90.53', port=6380, password='clbzzsn', db=6)
     key = 'WSJ:NewsInfo'
-    news_list, driver = get_newshref(key)
-    # #todo: put the fetched list into redis to await processing
-    #
-    # count = 0
-    # time.sleep(10)
-    # start a pipeline
-    pipeline = redis_client.pipeline()
-    for idx, info in enumerate(news_list):
-        # href = info['newsUrl']
-        # title = info['title']
-        # summary = info['summary']
-        # publishDate = info['publishDate']
-        # store in redis
-        hash_key = f'{key}:{idx}'
-        pipeline.hset(hash_key, mapping=info)
-
-    # execute the pipeline
-    pipeline.execute()
+    url_list = ['https://cn.wsj.com/',
+                'https://cn.wsj.com/zh-hans/news/world?mod=nav_top_section',
+                'https://cn.wsj.com/zh-hans/news/china?mod=nav_top_section',
+                'https://cn.wsj.com/zh-hans/news/markets?mod=nav_top_section',
+                'https://cn.wsj.com/zh-hans/news/economy?mod=nav_top_section',
+                'https://cn.wsj.com/zh-hans/news/business?mod=nav_top_section',
+                'https://cn.wsj.com/zh-hans/news/technology?mod=nav_top_section']
+    for url in url_list:
+        if url == 'https://cn.wsj.com/':
+            print('Collecting the WSJ front page...')
+        elif url == 'https://cn.wsj.com/zh-hans/news/world?mod=nav_top_section':
+            print('Collecting WSJ world news...')
+        elif url == 'https://cn.wsj.com/zh-hans/news/china?mod=nav_top_section':
+            print('Collecting WSJ China news...')
+        elif url == 'https://cn.wsj.com/zh-hans/news/markets?mod=nav_top_section':
+            print('Collecting WSJ financial markets news...')
+        elif url == 'https://cn.wsj.com/zh-hans/news/economy?mod=nav_top_section':
+            print('Collecting WSJ economy news...')
+        elif url == 'https://cn.wsj.com/zh-hans/news/business?mod=nav_top_section':
+            print('Collecting WSJ business news...')
+        else:
+            print('Collecting WSJ technology news...')
+        news_list, driver = get_newshref(key, url)
+        # #todo: put the fetched list into redis to await processing
+        #
+        # count = 0
+        # time.sleep(10)
+        # start a pipeline
+        pipeline = redis_client.pipeline()
+        for idx, info in enumerate(news_list):
+            # href = info['newsUrl']
+            # title = info['title']
+            # summary = info['summary']
+            # publishDate = info['publishDate']
+            # store in redis
+            hash_key = f'{key}:{idx}'
+            pipeline.hset(hash_key, mapping=info)
+        # execute the pipeline
+        pipeline.execute()
     # driver.get(href)
     # time.sleep(3)
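Review note on the hunk above: idx restarts at 0 for every section URL, so hash_key = f'{key}:{idx}' produces the same keys (WSJ:NewsInfo:0, WSJ:NewsInfo:1, ...) on every pass of the outer loop, and each later section overwrites the previous section's hashes in Redis. A minimal sketch of one fix, keeping a single running counter across sections; it reuses url_list, key, redis_client, and get_newshref from the module above, and the itertools.count helper is this note's suggestion, not part of the commit:

    import itertools

    # One counter shared by all sections, so keys never repeat across passes.
    counter = itertools.count()
    for url in url_list:
        news_list, driver = get_newshref(key, url)
        pipeline = redis_client.pipeline()
        for info in news_list:
            # next(counter) keeps increasing across sections,
            # so later sections no longer overwrite earlier ones
            hash_key = f'{key}:{next(counter)}'
            pipeline.hset(hash_key, mapping=info)
        pipeline.execute()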
@@ -150,8 +173,8 @@ def wsj_list_task():
     pass

 if __name__ == '__main__':
-    wsj_list_task()
+    # wsj_list_task()
+    caiji()
...
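A smaller style point: the new if/elif chain repeats each URL twice, once in url_list and once in a comparison. A URL-to-label mapping would keep the section list and the progress messages in one place; a sketch of that variant (the SECTIONS dict and its English labels are this note's suggestion, not in the commit):

    # Map each listing URL to a progress label; dict order fixes the crawl order.
    SECTIONS = {
        'https://cn.wsj.com/': 'the WSJ front page',
        'https://cn.wsj.com/zh-hans/news/world?mod=nav_top_section': 'WSJ world news',
        'https://cn.wsj.com/zh-hans/news/china?mod=nav_top_section': 'WSJ China news',
        'https://cn.wsj.com/zh-hans/news/markets?mod=nav_top_section': 'WSJ financial markets',
        'https://cn.wsj.com/zh-hans/news/economy?mod=nav_top_section': 'WSJ economy news',
        'https://cn.wsj.com/zh-hans/news/business?mod=nav_top_section': 'WSJ business news',
        'https://cn.wsj.com/zh-hans/news/technology?mod=nav_top_section': 'WSJ technology news',
    }

    for url, label in SECTIONS.items():
        print(f'Collecting {label}...')
        news_list, driver = get_newshref(key, url)
        # ...pipeline writes unchanged from the commit...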