提交 8da2fe7c 作者: XveLingKun

华尔街日报多个信息源采集

上级 0ef8e52c
...@@ -107,7 +107,7 @@ def get_newshref(key, url): ...@@ -107,7 +107,7 @@ def get_newshref(key, url):
def caiji(): def caiji():
redis_client = redis.Redis(host='114.116.90.53', port=6380, password='clbzzsn', db=6) redis_client = redis.Redis(host='114.116.90.53', port=6380, password='clbzzsn', db=6)
key = 'WSJ:NewsInfo'
url_list = ['https://cn.wsj.com/', url_list = ['https://cn.wsj.com/',
'https://cn.wsj.com/zh-hans/news/world?mod=nav_top_section', 'https://cn.wsj.com/zh-hans/news/world?mod=nav_top_section',
'https://cn.wsj.com/zh-hans/news/china?mod=nav_top_section', 'https://cn.wsj.com/zh-hans/news/china?mod=nav_top_section',
...@@ -118,18 +118,25 @@ def caiji(): ...@@ -118,18 +118,25 @@ def caiji():
for url in url_list: for url in url_list:
if url == 'https://cn.wsj.com/': if url == 'https://cn.wsj.com/':
print('正在采集WSJ首页...') print('正在采集WSJ首页...')
key = 'WSJ:NewsInfo_sy'
elif url =='https://cn.wsj.com/zh-hans/news/world?mod=nav_top_section': elif url =='https://cn.wsj.com/zh-hans/news/world?mod=nav_top_section':
print('正在采集WSJ国际新闻...') print('正在采集WSJ国际新闻...')
key = 'WSJ:NewsInfo_world'
elif url =='https://cn.wsj.com/zh-hans/news/china?mod=nav_top_section': elif url =='https://cn.wsj.com/zh-hans/news/china?mod=nav_top_section':
print('正在采集WSJ中国新闻...') print('正在采集WSJ中国新闻...')
key = 'WSJ:NewsInfo_china'
elif url =='https://cn.wsj.com/zh-hans/news/markets?mod=nav_top_section': elif url =='https://cn.wsj.com/zh-hans/news/markets?mod=nav_top_section':
print('正在采集WSJ金融市场...') print('正在采集WSJ金融市场...')
key = 'WSJ:NewsInfo_markets'
elif url =='https://cn.wsj.com/zh-hans/news/economy?mod=nav_top_section': elif url =='https://cn.wsj.com/zh-hans/news/economy?mod=nav_top_section':
print('正在采集WSJ经济新闻...') print('正在采集WSJ经济新闻...')
key = 'WSJ:NewsInfo_economy'
elif url =='https://cn.wsj.com/zh-hans/news/business?mod=nav_top_section': elif url =='https://cn.wsj.com/zh-hans/news/business?mod=nav_top_section':
print('正在采集WSJ商业新闻...') print('正在采集WSJ商业新闻...')
key = 'WSJ:NewsInfo_business'
else: else:
print('正在采集WSJ科技新闻...') print('正在采集WSJ科技新闻...')
key = 'WSJ:NewsInfo'
news_list, driver = get_newshref(key, url) news_list, driver = get_newshref(key, url)
# #todo:将获取到的列表全部放进redis等待 # #todo:将获取到的列表全部放进redis等待
# #
......
...@@ -83,16 +83,46 @@ def getData(key): ...@@ -83,16 +83,46 @@ def getData(key):
keys = r.scan_iter(f"{key}*") keys = r.scan_iter(f"{key}*")
for key in keys: for key in keys:
if 'WSJ:NewsInfo_sy' in key.decode():
sid = '1780483604239781890'
info_code = "IN-20240417-0078"
origin = "华尔街日报中文网-首页"
elif "WSJ:NewsInfo_world" in key.decode():
sid = '1780484012605607937'
info_code = "IN-20240417-0081"
origin = "华尔街日报中文网-国际"
elif "WSJ:NewsInfo_china" in key.decode():
sid = '1780484750069108737'
info_code = "IN-20240417-0084"
origin = "华尔街日报中文网-中国"
elif "WSJ:NewsInfo_markets" in key.decode():
sid = '1780489030450884609'
info_code = "IN-20240417-0085"
origin = "华尔街日报中文网-金融市场"
elif "WSJ:NewsInfo_economy" in key.decode():
sid = '1780489531269484545'
info_code = "IN-20240417-0086"
origin = "华尔街日报中文网-经济"
elif "WSJ:NewsInfo_business" in key.decode():
sid = '1780489708428496897'
info_code = "IN-20240417-0087"
origin = "华尔街日报中文网-商业"
else:
sid = '1775455062911447042'
info_code = "IN-20240403-0041"
origin = "华尔街日报中文网-科技"
fields = r.hgetall(key) fields = r.hgetall(key)
decode_fields = {k.decode(): v.decode() for k, v in fields.items()} decode_fields = {k.decode(): v.decode() for k, v in fields.items()}
# 获取一条信息 # 获取一条信息
# r.delete(key) r.delete(key)
# print(f"删除成功{key}") print(f"删除成功{key}")
newsUrl = decode_fields['newsUrl'] newsUrl = decode_fields['newsUrl']
# todo: 判断是否已采集 # todo: 判断是否已采集
try: try:
flag = r_2.sismember('IN-20240403-0041', newsUrl) flag = r_2.sismember(info_code, newsUrl)
if flag: if flag:
log.info('信息已采集入库过') log.info('信息已采集入库过')
continue continue
...@@ -102,15 +132,13 @@ def getData(key): ...@@ -102,15 +132,13 @@ def getData(key):
title = decode_fields['title'] title = decode_fields['title']
summary = decode_fields['summary'] summary = decode_fields['summary']
# todo:发送kafka # todo:发送kafka
sid = '1775455062911447042'
info_code = "IN-20240403-0041"
dic_news = { dic_news = {
'content': '', 'content': '',
'contentWithTag': '', 'contentWithTag': '',
'id': '', 'id': '',
'summary': summary, 'summary': summary,
'origin': '华尔街日报中文网-科技', 'origin': origin,
'publishDate': publishDate, 'publishDate': publishDate,
'sid': sid, 'sid': sid,
'sourceAddress': newsUrl, 'sourceAddress': newsUrl,
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论