Commit 864508c6 · Author: 刘伟刚

Merge remote-tracking branch 'origin/master'

@@ -27,7 +27,7 @@ headers = {
     'Cache-Control': 'no-cache',
     'Pragma': 'no-cache'
 }
-taskType = '企业动态/新浪财经'
+taskType = '企业动态/新浪财经/国内'
 pattern = r"\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}"
 # Fetch the response page
@@ -28,7 +28,7 @@ headers = {
     'Cache-Control': 'no-cache',
     'Pragma': 'no-cache'
 }
-taskType = '企业动态/新浪财经'
+taskType = '企业动态/新浪财经/香港'
 # Check that the timestamp is in the expected format
@@ -51,7 +51,7 @@ def format_time(time_str):
 def getrequests(url):
     ip = baseCore.get_proxy()
     req = requests.get(url, headers=headers, proxies=ip)
-    req.encoding = req.apparent_encoding
+    req.encoding = 'gbk'
     soup = BeautifulSoup(req.text, 'html.parser')
     return soup
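A note on this change: `apparent_encoding` runs byte-level charset detection on every response, which is slow and can mis-guess on short or ambiguous pages, garbling the Chinese text. Sina's HK company-news pages are served in GBK (a superset of GB2312), so pinning the codec skips detection and decodes reliably. A minimal sketch of the difference; the stock code in the URL is an illustrative value, not taken from this commit:

```python
import requests

url = 'http://stock.finance.sina.com.cn/hkstock/go.php/CompanyNews/page/1/code/00700/.phtml'
req = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})

# apparent_encoding guesses the charset from the raw bytes; a wrong guess
# (e.g. Windows-1252 instead of GBK) turns req.text into mojibake.
print(req.apparent_encoding)

# Pinning the encoding avoids the guess entirely.
req.encoding = 'gbk'
print(req.text[:200])
```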
@@ -117,7 +117,7 @@ def getDic(social_code, title, href, pub_time):
         # state = 0
         # takeTime = baseCore.getTimeCost(start_time, time.time())
         # baseCore.recordLog(social_code, taskType, state, takeTime, href, f'{href}===发送Kafka失败')
-        # return 1
+        return 1
     # Send the data to Kafka
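For context on the `return 1` being uncommented: `doJob()` below treats `flag == 1` from `getDic()` as a successfully collected article (`num_ok`) and anything else as a failure (`num_error`). Judging from the commented-out error logging around it, this looks like the Kafka-send failure branch, so returning 1 here makes a failed send count as a success to the caller. A hypothetical skeleton of that contract, not the real `getDic`:

```python
def send_to_kafka(payload):
    # Hypothetical stand-in for the real Kafka producer call.
    raise RuntimeError('kafka send failed')

def getDic(social_code, title, href, pub_time):
    # Illustrative skeleton only.
    try:
        send_to_kafka({'code': social_code, 'title': title,
                       'href': href, 'pub_time': pub_time})
        return 1  # doJob() counts flag == 1 in num_ok
    except Exception:
        # With `return 1` uncommented in this branch, a failed Kafka send is
        # reported exactly like a success; returning 0 instead would route
        # the article into num_error.
        return 1

print(getDic('91330000747735638J', 'title', 'http://example.com', '2023-01-01 00:00'))
```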
@@ -165,77 +165,77 @@ def selectUrl(url, social_code):
 def doJob():
     # while True:
     start_time = time.time()
-    # social_code = baseCore.redicPullData('NewsEnterprise:xgqy_nyse_socialCode')
-    social_code = '91330000747735638J'
+    social_code = baseCore.redicPullData('NewsEnterprise:xgqy_nyse_socialCode')
+    # social_code = '91330000747735638J'
     if not social_code or social_code == 'None':
         time.sleep(20)
     data = baseCore.getInfomation(social_code)
     gpdm = data[3]
     log.info(f'{social_code}==={gpdm}===开始采集')
     # if gpdm == '' or not gpdm:
     #     log.error(f'{social_code}===股票代码为空')
     #     continue
     gpdm_ = gpdm.split('.')[0]
     if len(gpdm_) != 5:
         gpdm_ = gpdm_.zfill(5)
     page = 1
     num_ok = 0
     num_error = 0
     while True:
         url = f'http://stock.finance.sina.com.cn/hkstock/go.php/CompanyNews/page/{page}/code/{gpdm_}/.phtml'
         soup = getrequests(url)
         if '拒绝访问' in soup.text:
             log.error(f'{social_code}===ip封禁')
             state = 0
             takeTime = baseCore.getTimeCost(start_time, time.time())
             baseCore.recordLog(social_code, taskType, state, takeTime, url, f'{social_code}===ip封禁')
             # r.rpush('NewsEnterprise:xgqy_nyse_socialCode', social_code)
             time.sleep(1800)
             break
         next_flg = soup.find('div', class_='part02').text
         if '暂无数据' in next_flg:
             break
         try:
             li_list = soup.find('ul', class_='list01').find_all('li')
             for li in li_list:
                 try:
                     a = li.find('a')
                     if a:
                         title = a.text
                         if title == '':
                             continue
                         href = a.get('href')
                         selects = selectUrl(href, social_code)
                         if selects:
                             log.info(f'{href}===已采集过')
                             continue
                         pub_time = format_time(li.find('span').text)
                         print(title)
                         flag = getDic(social_code, title, href, pub_time)
                         if flag == 1:
                             num_ok += 1
                         else:
                             num_error += 1
                         time.sleep(0.5)
                 except Exception as e:
                     ee = e.__traceback__.tb_lineno
                     log.error(f'{social_code}===信息采集失败==原因:{ee}行 {e}')
                     state = 0
                     takeTime = baseCore.getTimeCost(start_time, time.time())
                     baseCore.recordLog(social_code, taskType, state, takeTime, url, f'信息采集失败==原因:{ee}行 {e}')
                     continue
             # For incremental runs
             # if selects:
             #     break
         except:
             log.error(f"{social_code}==={gpdm}===第{page}页获取信息列表失败")
             state = 0
             takeTime = baseCore.getTimeCost(start_time, time.time())
             baseCore.recordLog(social_code, taskType, state, takeTime, url, f'获取信息列表失败')
         page += 1
     log.info(f'{social_code}==={gpdm}===企业整体耗时{baseCore.getTimeCost(start_time, time.time())}===成功{num_ok}条,失败{num_error}条')
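The substantive change in this hunk is the first two lines: the hardcoded test code `91330000747735638J` is swapped for a pull from the Redis list `NewsEnterprise:xgqy_nyse_socialCode`; the commented-out `r.rpush(...)` in the IP-ban branch is the matching requeue side. A minimal sketch of that list-as-work-queue pattern with redis-py, assuming `baseCore.redicPullData()` is essentially a pop on this list (the real helper may differ):

```python
import redis

r = redis.Redis(host='127.0.0.1', port=6379, decode_responses=True)
QUEUE = 'NewsEnterprise:xgqy_nyse_socialCode'

# Producer side: seed the queue with social credit codes to crawl.
r.rpush(QUEUE, '91330000747735638J')

# Consumer side: pop one code per iteration; None means the queue is empty,
# which matches the `if not social_code or social_code == 'None'` guard above.
social_code = r.lpop(QUEUE)
print(social_code)

# On an IP ban, the commented-out r.rpush(...) would push the code back onto
# the tail of the list so it is retried after the 1800 s cool-down.
```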