Commit 0ef8e52c by XveLingKun

Collect from multiple Wall Street Journal information sources

Parent 9c3eea0f
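In brief: get_pagesource() and get_newshref() now take the target section URL as a parameter, and caiji() loops over seven WSJ section front pages, writing each batch of article metadata into Redis through a pipeline. A minimal sketch of the new call pattern, reusing the functions defined in this file (the Selenium/BeautifulSoup work inside them is unchanged):

```python
# Sketch only: get_newshref is the function from this file; the list is a
# subset of the seven section URLs added in caiji().
SECTION_URLS = [
    'https://cn.wsj.com/',
    'https://cn.wsj.com/zh-hans/news/world?mod=nav_top_section',
    'https://cn.wsj.com/zh-hans/news/technology?mod=nav_top_section',
]
for url in SECTION_URLS:
    # url is now threaded through to get_pagesource(url) inside get_newshref
    news_list, driver = get_newshref('WSJ:NewsInfo', url)
```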
@@ -28,7 +28,7 @@ def create_driver():
     driver = webdriver.Edge(service=edge_service, options=edge_options)
     return driver
-def get_pagesource():
+def get_pagesource(url):
     driver = create_driver()
     # un = 'zhk2058@163.com'
     # pw = 'ZZM205899'
@@ -44,7 +44,8 @@ def get_pagesource():
     # driver.find_element(By.XPATH, '//*[@id="password-login"]/div/form/div[5]/button').click()
     # time.sleep(3)
-    url = 'https://cn.wsj.com/zh-hans/news/technology?mod=nav_top_section'
+    # url = 'https://cn.wsj.com/zh-hans/news/technology?mod=nav_top_section'
+    # url = 'https://cn.wsj.com/zh-hans/news/world?mod=nav_top_section'
     driver.get(url)
     time.sleep(3)
     while True:
@@ -62,8 +63,8 @@ def get_pagesource():
             continue
     return soup, driver
-def get_newshref(key):
-    soup, driver = get_pagesource()
+def get_newshref(key, url):
+    soup, driver = get_pagesource(url)
     if soup:
         pass
     else:
@@ -107,24 +108,46 @@ def get_newshref(key):
 def caiji():
     redis_client = redis.Redis(host='114.116.90.53', port=6380, password='clbzzsn', db=6)
     key = 'WSJ:NewsInfo'
-    news_list, driver = get_newshref(key)
-    # #todo: push the fetched list into redis to wait
-    #
-    # count = 0
-    # time.sleep(10)
-    # start a pipeline
-    pipeline = redis_client.pipeline()
-    for idx, info in enumerate(news_list):
-        # href = info['newsUrl']
-        # title = info['title']
-        # summary = info['summary']
-        # publishDate = info['publishDate']
-        # store into redis
-        hash_key = f'{key}:{idx}'
-        pipeline.hset(hash_key, mapping=info)
-    # execute the pipeline
-    pipeline.execute()
+    url_list = ['https://cn.wsj.com/',
+                'https://cn.wsj.com/zh-hans/news/world?mod=nav_top_section',
+                'https://cn.wsj.com/zh-hans/news/china?mod=nav_top_section',
+                'https://cn.wsj.com/zh-hans/news/markets?mod=nav_top_section',
+                'https://cn.wsj.com/zh-hans/news/economy?mod=nav_top_section',
+                'https://cn.wsj.com/zh-hans/news/business?mod=nav_top_section',
+                'https://cn.wsj.com/zh-hans/news/technology?mod=nav_top_section']
+    for url in url_list:
+        if url == 'https://cn.wsj.com/':
+            print('Collecting the WSJ front page...')
+        elif url == 'https://cn.wsj.com/zh-hans/news/world?mod=nav_top_section':
+            print('Collecting WSJ world news...')
+        elif url == 'https://cn.wsj.com/zh-hans/news/china?mod=nav_top_section':
+            print('Collecting WSJ China news...')
+        elif url == 'https://cn.wsj.com/zh-hans/news/markets?mod=nav_top_section':
+            print('Collecting WSJ financial markets...')
+        elif url == 'https://cn.wsj.com/zh-hans/news/economy?mod=nav_top_section':
+            print('Collecting WSJ economy news...')
+        elif url == 'https://cn.wsj.com/zh-hans/news/business?mod=nav_top_section':
+            print('Collecting WSJ business news...')
+        else:
+            print('Collecting WSJ technology news...')
+        news_list, driver = get_newshref(key, url)
+        # #todo: push the fetched list into redis to wait
+        #
+        # count = 0
+        # time.sleep(10)
+        # start a pipeline
+        pipeline = redis_client.pipeline()
+        for idx, info in enumerate(news_list):
+            # href = info['newsUrl']
+            # title = info['title']
+            # summary = info['summary']
+            # publishDate = info['publishDate']
+            # store into redis
+            hash_key = f'{key}:{idx}'
+            pipeline.hset(hash_key, mapping=info)
+        # execute the pipeline
+        pipeline.execute()
         # driver.get(href)
         # time.sleep(3)
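Two review notes on the new loop, with a sketch below. First, the url-to-section-name mapping could live in a dict, which removes the if/elif chain. Second, hash_key = f'{key}:{idx}' restarts idx at 0 for every section, so each section overwrites the hashes written by the one before it; folding a per-section sequence number into the key would avoid that. A sketch of both, reusing key, redis_client, and get_newshref from this file (SECTIONS and the key scheme are suggestions, not part of the commit):

```python
SECTIONS = {
    'https://cn.wsj.com/': 'front page',
    'https://cn.wsj.com/zh-hans/news/world?mod=nav_top_section': 'world news',
    'https://cn.wsj.com/zh-hans/news/china?mod=nav_top_section': 'China news',
    'https://cn.wsj.com/zh-hans/news/markets?mod=nav_top_section': 'financial markets',
    'https://cn.wsj.com/zh-hans/news/economy?mod=nav_top_section': 'economy news',
    'https://cn.wsj.com/zh-hans/news/business?mod=nav_top_section': 'business news',
    'https://cn.wsj.com/zh-hans/news/technology?mod=nav_top_section': 'technology news',
}

for seq, url in enumerate(SECTIONS):
    print(f'Collecting WSJ {SECTIONS[url]}...')
    news_list, driver = get_newshref(key, url)
    pipeline = redis_client.pipeline()
    for idx, info in enumerate(news_list):
        # include the section index so sections do not overwrite each other
        pipeline.hset(f'{key}:{seq}:{idx}', mapping=info)
    pipeline.execute()
```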
@@ -150,8 +173,8 @@ def wsj_list_task():
     pass
 if __name__ == '__main__':
-    wsj_list_task()
+    # wsj_list_task()
+    caiji()
......
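For a quick sanity check after a run, the stored hashes can be read back with the same connection settings caiji() uses (a throwaway snippet, not part of the commit):

```python
import redis

client = redis.Redis(host='114.116.90.53', port=6380, password='clbzzsn',
                     db=6, decode_responses=True)
# keys() is fine at this scale; prefer scan_iter() on a large keyspace
for redis_key in sorted(client.keys('WSJ:NewsInfo:*')):
    print(redis_key, client.hgetall(redis_key).get('title'))
```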