提交 0ef8e52c 作者: XveLingKun

华尔街日报多个信息源采集

上级 9c3eea0f
...@@ -28,7 +28,7 @@ def create_driver(): ...@@ -28,7 +28,7 @@ def create_driver():
driver = webdriver.Edge(service=edge_service, options=edge_options) driver = webdriver.Edge(service=edge_service, options=edge_options)
return driver return driver
def get_pagesource(): def get_pagesource(url):
driver = create_driver() driver = create_driver()
# un = 'zhk2058@163.com' # un = 'zhk2058@163.com'
# pw = 'ZZM205899' # pw = 'ZZM205899'
...@@ -44,7 +44,8 @@ def get_pagesource(): ...@@ -44,7 +44,8 @@ def get_pagesource():
# driver.find_element(By.XPATH, '//*[@id="password-login"]/div/form/div[5]/button').click() # driver.find_element(By.XPATH, '//*[@id="password-login"]/div/form/div[5]/button').click()
# time.sleep(3) # time.sleep(3)
url = 'https://cn.wsj.com/zh-hans/news/technology?mod=nav_top_section' # url = 'https://cn.wsj.com/zh-hans/news/technology?mod=nav_top_section'
# url = 'https://cn.wsj.com/zh-hans/news/world?mod=nav_top_section'
driver.get(url) driver.get(url)
time.sleep(3) time.sleep(3)
while True: while True:
...@@ -62,8 +63,8 @@ def get_pagesource(): ...@@ -62,8 +63,8 @@ def get_pagesource():
continue continue
return soup, driver return soup, driver
def get_newshref(key): def get_newshref(key, url):
soup, driver = get_pagesource() soup, driver = get_pagesource(url)
if soup: if soup:
pass pass
else: else:
...@@ -107,7 +108,29 @@ def get_newshref(key): ...@@ -107,7 +108,29 @@ def get_newshref(key):
def caiji(): def caiji():
redis_client = redis.Redis(host='114.116.90.53', port=6380, password='clbzzsn', db=6) redis_client = redis.Redis(host='114.116.90.53', port=6380, password='clbzzsn', db=6)
key = 'WSJ:NewsInfo' key = 'WSJ:NewsInfo'
news_list, driver = get_newshref(key) url_list = ['https://cn.wsj.com/',
'https://cn.wsj.com/zh-hans/news/world?mod=nav_top_section',
'https://cn.wsj.com/zh-hans/news/china?mod=nav_top_section',
'https://cn.wsj.com/zh-hans/news/markets?mod=nav_top_section',
'https://cn.wsj.com/zh-hans/news/economy?mod=nav_top_section',
'https://cn.wsj.com/zh-hans/news/business?mod=nav_top_section',
'https://cn.wsj.com/zh-hans/news/technology?mod=nav_top_section']
for url in url_list:
if url == 'https://cn.wsj.com/':
print('正在采集WSJ首页...')
elif url =='https://cn.wsj.com/zh-hans/news/world?mod=nav_top_section':
print('正在采集WSJ国际新闻...')
elif url =='https://cn.wsj.com/zh-hans/news/china?mod=nav_top_section':
print('正在采集WSJ中国新闻...')
elif url =='https://cn.wsj.com/zh-hans/news/markets?mod=nav_top_section':
print('正在采集WSJ金融市场...')
elif url =='https://cn.wsj.com/zh-hans/news/economy?mod=nav_top_section':
print('正在采集WSJ经济新闻...')
elif url =='https://cn.wsj.com/zh-hans/news/business?mod=nav_top_section':
print('正在采集WSJ商业新闻...')
else:
print('正在采集WSJ科技新闻...')
news_list, driver = get_newshref(key, url)
# #todo:将获取到的列表全部放进redis等待 # #todo:将获取到的列表全部放进redis等待
# #
# count = 0 # count = 0
...@@ -150,8 +173,8 @@ def wsj_list_task(): ...@@ -150,8 +173,8 @@ def wsj_list_task():
pass pass
if __name__ == '__main__': if __name__ == '__main__':
wsj_list_task() # wsj_list_task()
caiji()
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论