Commit 864508c6 Author: 刘伟刚

Merge remote-tracking branch 'origin/master'

@@ -27,7 +27,7 @@ headers = {
     'Cache-Control': 'no-cache',
     'Pragma': 'no-cache'
 }
-taskType = '企业动态/新浪财经'
+taskType = '企业动态/新浪财经/国内'
 pattern = r"\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}"
 # fetch the response page
...
@@ -28,7 +28,7 @@ headers = {
     'Cache-Control': 'no-cache',
     'Pragma': 'no-cache'
 }
-taskType = '企业动态/新浪财经'
+taskType = '企业动态/新浪财经/香港'
 # check that the timestamp is in the expected format
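The comment above introduces format_time, and the `pattern` defined in the first hunk (`\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}`) suggests it validates a 'YYYY-MM-DD HH:MM' shape. A hypothetical sketch of such a check; the real body is not shown in this diff, so treat the logic as an assumption:

import re

pattern = r"\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}"

def format_time(time_str):
    # Hypothetical: collapse whitespace, then keep only timestamps that
    # match the 'YYYY-MM-DD HH:MM' shape; return '' for anything else.
    cleaned = ' '.join(time_str.split())
    return cleaned if re.match(pattern, cleaned) else ''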
@@ -51,7 +51,7 @@ def format_time(time_str):
 def getrequests(url):
     ip = baseCore.get_proxy()
     req = requests.get(url, headers=headers,proxies=ip)
-    req.encoding = req.apparent_encoding
+    req.encoding = 'gbk'
     soup = BeautifulSoup(req.text, 'html.parser')
     return soup
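This hunk pins the response encoding to 'gbk' instead of trusting req.apparent_encoding, whose charset sniffing can misread short or script-heavy Sina pages and produce mojibake. A minimal sketch of the same helper; the gb18030 widening, the timeout, and the stand-in headers dict are my assumptions, not part of the commit:

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}  # stand-in for the module-level headers dict

def getrequests(url, proxies=None):
    req = requests.get(url, headers=headers, proxies=proxies, timeout=10)
    # gb18030 is a superset of gbk: it decodes the same pages plus the
    # rare characters gbk lacks, so it is a safe widening of this pin.
    req.encoding = 'gb18030'
    return BeautifulSoup(req.text, 'html.parser')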
@@ -117,7 +117,7 @@ def getDic(social_code, title, href, pub_time):
         # state = 0
         # takeTime = baseCore.getTimeCost(start_time, time.time())
         # baseCore.recordLog(social_code, taskType, state, takeTime, href, f'{href}===发送Kafka失败')
-        # return 1
+        return 1
     # send the data to Kafka
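Uncommenting `return 1` matters because the caller treats the return value as a success flag: with the whole branch commented out, getDic could fall through and return None, which the caller counts as a failure. How doJob consumes it, per the next hunk:

flag = getDic(social_code, title, href, pub_time)
if flag == 1:
    num_ok += 1     # getDic signalled success
else:
    num_error += 1  # any other value (e.g. None from a fall-through) counts as a failure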
@@ -165,77 +165,77 @@ def selectUrl(url, social_code):
 def doJob():
-    # while True:
-    start_time = time.time()
-    # social_code = baseCore.redicPullData('NewsEnterprise:xgqy_nyse_socialCode')
-    social_code = '91330000747735638J'
-    if not social_code or social_code == 'None':
-        time.sleep(20)
-    data = baseCore.getInfomation(social_code)
-    gpdm = data[3]
-    log.info(f'{social_code}==={gpdm}===开始采集')
-    # if gpdm == '' or not gpdm:
-    #     log.error(f'{social_code}===股票代码为空')
-    #     continue
-    gpdm_ = gpdm.split('.')[0]
-    if len(gpdm_) != 5:
-        gpdm_ = gpdm_.zfill(5)
-    page = 1
-    num_ok = 0
-    num_error =0
-    while True:
-        url = f'http://stock.finance.sina.com.cn/hkstock/go.php/CompanyNews/page/{page}/code/{gpdm_}/.phtml'
-        soup = getrequests(url)
-        if '拒绝访问' in soup.text:
-            log.error(f'{social_code}===ip封禁')
-            state = 0
-            takeTime = baseCore.getTimeCost(start_time, time.time())
-            baseCore.recordLog(social_code, taskType, state, takeTime, url, f'{social_code}===ip封禁')
-            # r.rpush('NewsEnterprise:xgqy_nyse_socialCode',social_code)
-            time.sleep(1800)
-            break
-        next_flg = soup.find('div',class_='part02').text
-        if '暂无数据' in next_flg:
-            break
-        try:
-            li_list = soup.find('ul', class_='list01').find_all('li')
-            for li in li_list:
-                try:
-                    a = li.find('a')
-                    if a:
-                        title = a.text
-                        if title == '':
-                            continue
-                        href = a.get('href')
-                        selects = selectUrl(href,social_code)
-                        if selects:
-                            log.info(f'{href}===已采集过')
-                            continue
-                        pub_time = format_time(li.find('span').text)
-                        print(title)
-                        flag = getDic(social_code,title,href,pub_time)
-                        if flag == 1:
-                            num_ok += 1
-                        else:
-                            num_error += 1
-                        time.sleep(0.5)
-                except Exception as e:
-                    ee = e.__traceback__.tb_lineno
-                    log.error(f'{social_code}===信息采集失败==原因:{ee}行 {e}')
-                    state = 0
-                    takeTime = baseCore.getTimeCost(start_time, time.time())
-                    baseCore.recordLog(social_code, taskType, state, takeTime, url, f'信息采集失败==原因:{ee}行 {e}')
-                    continue
-            # for incremental runs
-            # if selects:
-            #     break
-        except:
-            log.error(f"{social_code}==={gpdm}===第{page}页获取信息列表失败")
-            state = 0
-            takeTime = baseCore.getTimeCost(start_time, time.time())
-            baseCore.recordLog(social_code, taskType, state, takeTime, url, f'获取信息列表失败')
-        page += 1
-    log.info(f'{social_code}==={gpdm}===企业整体耗时{baseCore.getTimeCost(start_time, time.time())}===成功{num_ok}条,失败{num_error}条')
+    while True:
+        start_time = time.time()
+        social_code = baseCore.redicPullData('NewsEnterprise:xgqy_nyse_socialCode')
+        # social_code = '91330000747735638J'
+        if not social_code or social_code == 'None':
+            time.sleep(20)
+        data = baseCore.getInfomation(social_code)
+        gpdm = data[3]
+        log.info(f'{social_code}==={gpdm}===开始采集')
+        # if gpdm == '' or not gpdm:
+        #     log.error(f'{social_code}===股票代码为空')
+        #     continue
+        gpdm_ = gpdm.split('.')[0]
+        if len(gpdm_) != 5:
+            gpdm_ = gpdm_.zfill(5)
+        page = 1
+        num_ok = 0
+        num_error =0
+        while True:
+            url = f'http://stock.finance.sina.com.cn/hkstock/go.php/CompanyNews/page/{page}/code/{gpdm_}/.phtml'
+            soup = getrequests(url)
+            if '拒绝访问' in soup.text:
+                log.error(f'{social_code}===ip封禁')
+                state = 0
+                takeTime = baseCore.getTimeCost(start_time, time.time())
+                baseCore.recordLog(social_code, taskType, state, takeTime, url, f'{social_code}===ip封禁')
+                # r.rpush('NewsEnterprise:xgqy_nyse_socialCode',social_code)
+                time.sleep(1800)
+                break
+            next_flg = soup.find('div',class_='part02').text
+            if '暂无数据' in next_flg:
+                break
+            try:
+                li_list = soup.find('ul', class_='list01').find_all('li')
+                for li in li_list:
+                    try:
+                        a = li.find('a')
+                        if a:
+                            title = a.text
+                            if title == '':
+                                continue
+                            href = a.get('href')
+                            selects = selectUrl(href,social_code)
+                            if selects:
+                                log.info(f'{href}===已采集过')
+                                continue
+                            pub_time = format_time(li.find('span').text)
+                            print(title)
+                            flag = getDic(social_code,title,href,pub_time)
+                            if flag == 1:
+                                num_ok += 1
+                            else:
+                                num_error += 1
+                            time.sleep(0.5)
+                    except Exception as e:
+                        ee = e.__traceback__.tb_lineno
+                        log.error(f'{social_code}===信息采集失败==原因:{ee}行 {e}')
+                        state = 0
+                        takeTime = baseCore.getTimeCost(start_time, time.time())
+                        baseCore.recordLog(social_code, taskType, state, takeTime, url, f'信息采集失败==原因:{ee}行 {e}')
+                        continue
+                # for incremental runs
+                # if selects:
+                #     break
+            except:
+                log.error(f"{social_code}==={gpdm}===第{page}页获取信息列表失败")
+                state = 0
+                takeTime = baseCore.getTimeCost(start_time, time.time())
+                baseCore.recordLog(social_code, taskType, state, takeTime, url, f'获取信息列表失败')
+            page += 1
+        log.info(f'{social_code}==={gpdm}===企业整体耗时{baseCore.getTimeCost(start_time, time.time())}===成功{num_ok}条,失败{num_error}条')
...
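The large hunk above is mostly re-indentation: the body of doJob moves under an outer `while True:` that pulls one credit code from Redis per pass instead of using the hardcoded test code. One detail worth noting: the empty-queue branch sleeps but does not `continue`, so a 'None' pull still flows into baseCore.getInfomation. A minimal sketch of the intended loop, with that fix and the wrapper name marked as my assumptions:

import time

def doJob():
    while True:
        social_code = baseCore.redicPullData('NewsEnterprise:xgqy_nyse_socialCode')
        if not social_code or social_code == 'None':
            time.sleep(20)  # queue empty: wait before polling again
            continue        # assumption: absent in the commit, which falls through
        process_company(social_code)  # hypothetical wrapper for the per-company body above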