Commit 0ef8e52c by XveLingKun

Collect the Wall Street Journal from multiple news sources

Parent 9c3eea0f
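In short: get_pagesource() and get_newshref() now take the listing-page URL as a parameter instead of hard-coding it, and caiji() loops over seven WSJ sections (front page, world, China, markets, economy, business, technology), printing a progress message per section and writing each batch of scraped entries to Redis through a pipeline. The entry point switches from wsj_list_task() to caiji().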
@@ -28,7 +28,7 @@ def create_driver():
     driver = webdriver.Edge(service=edge_service, options=edge_options)
     return driver

-def get_pagesource():
+def get_pagesource(url):
    driver = create_driver()
    # un = 'zhk2058@163.com'
    # pw = 'ZZM205899'
@@ -44,7 +44,8 @@ def get_pagesource():
     # driver.find_element(By.XPATH, '//*[@id="password-login"]/div/form/div[5]/button').click()
     # time.sleep(3)
-    url = 'https://cn.wsj.com/zh-hans/news/technology?mod=nav_top_section'
+    # url = 'https://cn.wsj.com/zh-hans/news/technology?mod=nav_top_section'
+    # url = 'https://cn.wsj.com/zh-hans/news/world?mod=nav_top_section'
     driver.get(url)
     time.sleep(3)
     while True:
@@ -62,8 +63,8 @@ def get_pagesource():
             continue
     return soup, driver

-def get_newshref(key):
-    soup, driver = get_pagesource()
+def get_newshref(key, url):
+    soup, driver = get_pagesource(url)
     if soup:
         pass
     else:
@@ -107,24 +108,46 @@ def get_newshref(key):

 def caiji():
     redis_client = redis.Redis(host='114.116.90.53', port=6380, password='clbzzsn', db=6)
     key = 'WSJ:NewsInfo'
-    news_list, driver = get_newshref(key)
-    # #todo: put the fetched list into redis to await processing
-    #
-    # count = 0
-    # time.sleep(10)
-    # start a pipeline
-    pipeline = redis_client.pipeline()
-    for idx, info in enumerate(news_list):
-        # href = info['newsUrl']
-        # title = info['title']
-        # summary = info['summary']
-        # publishDate = info['publishDate']
-        # store in redis
-        hash_key = f'{key}:{idx}'
-        pipeline.hset(hash_key, mapping=info)
-
-    # execute the pipeline
-    pipeline.execute()
+    url_list = ['https://cn.wsj.com/',
+                'https://cn.wsj.com/zh-hans/news/world?mod=nav_top_section',
+                'https://cn.wsj.com/zh-hans/news/china?mod=nav_top_section',
+                'https://cn.wsj.com/zh-hans/news/markets?mod=nav_top_section',
+                'https://cn.wsj.com/zh-hans/news/economy?mod=nav_top_section',
+                'https://cn.wsj.com/zh-hans/news/business?mod=nav_top_section',
+                'https://cn.wsj.com/zh-hans/news/technology?mod=nav_top_section']
+    for url in url_list:
+        if url == 'https://cn.wsj.com/':
+            print('Collecting the WSJ front page...')
+        elif url == 'https://cn.wsj.com/zh-hans/news/world?mod=nav_top_section':
+            print('Collecting WSJ world news...')
+        elif url == 'https://cn.wsj.com/zh-hans/news/china?mod=nav_top_section':
+            print('Collecting WSJ China news...')
+        elif url == 'https://cn.wsj.com/zh-hans/news/markets?mod=nav_top_section':
+            print('Collecting WSJ financial markets news...')
+        elif url == 'https://cn.wsj.com/zh-hans/news/economy?mod=nav_top_section':
+            print('Collecting WSJ economy news...')
+        elif url == 'https://cn.wsj.com/zh-hans/news/business?mod=nav_top_section':
+            print('Collecting WSJ business news...')
+        else:
+            print('Collecting WSJ technology news...')
+        news_list, driver = get_newshref(key, url)
+        # #todo: put the fetched list into redis to await processing
+        #
+        # count = 0
+        # time.sleep(10)
+        # start a pipeline
+        pipeline = redis_client.pipeline()
+        for idx, info in enumerate(news_list):
+            # href = info['newsUrl']
+            # title = info['title']
+            # summary = info['summary']
+            # publishDate = info['publishDate']
+            # store in redis
+            hash_key = f'{key}:{idx}'
+            pipeline.hset(hash_key, mapping=info)
+        # execute the pipeline
+        pipeline.execute()
     # driver.get(href)
     # time.sleep(3)
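Review note on the hunk above: idx restarts at 0 for every section URL, so hash_key = f'{key}:{idx}' produces the same keys (WSJ:NewsInfo:0, WSJ:NewsInfo:1, ...) on every pass of the outer loop, and each later section overwrites the previous section's hashes in Redis. A minimal sketch of one fix, keeping a single running counter across sections; it reuses url_list, key, redis_client, and get_newshref from the module above, and the itertools.count helper is this note's suggestion, not part of the commit:

    import itertools

    # One counter shared by all sections, so keys never repeat across passes.
    counter = itertools.count()
    for url in url_list:
        news_list, driver = get_newshref(key, url)
        pipeline = redis_client.pipeline()
        for info in news_list:
            # next(counter) keeps increasing across sections,
            # so later sections no longer overwrite earlier ones
            hash_key = f'{key}:{next(counter)}'
            pipeline.hset(hash_key, mapping=info)
        pipeline.execute()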
@@ -150,8 +173,8 @@ def wsj_list_task():
     pass

 if __name__ == '__main__':
-    wsj_list_task()
+    # wsj_list_task()
+    caiji()
...
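A smaller style point: the new if/elif chain repeats each URL twice, once in url_list and once in a comparison. A URL-to-label mapping would keep the section list and the progress messages in one place; a sketch of that variant (the SECTIONS dict and its English labels are this note's suggestion, not in the commit):

    # Map each listing URL to a progress label; dict order fixes the crawl order.
    SECTIONS = {
        'https://cn.wsj.com/': 'the WSJ front page',
        'https://cn.wsj.com/zh-hans/news/world?mod=nav_top_section': 'WSJ world news',
        'https://cn.wsj.com/zh-hans/news/china?mod=nav_top_section': 'WSJ China news',
        'https://cn.wsj.com/zh-hans/news/markets?mod=nav_top_section': 'WSJ financial markets',
        'https://cn.wsj.com/zh-hans/news/economy?mod=nav_top_section': 'WSJ economy news',
        'https://cn.wsj.com/zh-hans/news/business?mod=nav_top_section': 'WSJ business news',
        'https://cn.wsj.com/zh-hans/news/technology?mod=nav_top_section': 'WSJ technology news',
    }

    for url, label in SECTIONS.items():
        print(f'Collecting {label}...')
        news_list, driver = get_newshref(key, url)
        # ...pipeline writes unchanged from the commit...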