提交 6da55a3e 作者: XveLingKun

微信公众号详情采集调整

上级 c12087c7
# -*- coding: utf-8 -*-
'''
成功100 发送数据失败200 请求失败400 文章内容为空500
成功100 发送数据失败200 请求失败400 文章内容为空500 处理style标签失败700
'''
import re
......@@ -65,7 +65,7 @@ def getjsonInfo():
cnx_.commit()
return dict_json, linkid
@retry(tries=3,delay=2)
@retry(tries=20, delay=2)
def getrequest(url_news):
ip = baseCore.get_proxy()
......@@ -74,7 +74,7 @@ def getrequest(url_news):
raise
return res_news
def get_info(dict_json):
def get_info(dict_json, linkid):
# list_all_info = []
# num_caiji = 0
kaishi_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
......@@ -91,25 +91,32 @@ def get_info(dict_json):
url_news = dict_json['link']
info_source_code = dict_json['info_source_code']
# try:
# ip = baseCore.get_proxy()
# res_news = requests.get(url_news, proxies=ip, timeout=20)
# except:
# #400请求失败
# updatewxLink(url_news, info_source_code, 400)
# return False
# while True:
# try:
# ip = baseCore.get_proxy()
# res_news = requests.get(url_news, proxies=ip, timeout=20)
# break
# except:
# time.sleep(3)
#400请求失败
# updatewxLink(url_news, info_source_code, 400)
# return False
# 修改请求方法,retry 3次
try:
res_news = getrequest(url_news)
# print(res_news)
except:
try:
res_news = requests.get(url_news, timeout=20)
except:
# 400请求失败
updatewxLink(url_news, info_source_code, 400)
return False
# 修改回原状态,重新放入redis
updatewxLink(url_news, info_source_code, 0)
log.info(f'{origin}---{news_date}--{news_title}---请求失败-- 重新放入redis')
baseCore.rePutIntoR('WeiXinGZH:linkid', linkid)
# try:
# res_news = requests.get(url_news, timeout=20)
# except:
# # 400请求失败
# updatewxLink(url_news, info_source_code, 400)
return False
soup_news = BeautifulSoup(res_news.content, 'html.parser')
try:
news_html = soup_news.find('div', {'id': 'js_content'})
......@@ -120,7 +127,11 @@ def get_info(dict_json):
del news_html['id']
del news_html['class']
except Exception as e:
log.error(f'{url_news}-----{info_source_code}')
log.info(f'--errorCode:700--{url_news}-----------{e}')
# log.error(f'{url_news}-----{info_source_code}')
updatewxLink(url_news, info_source_code, 0)
log.info(f'{origin}---{news_date}--{news_title}---style标签解析失败---重新放入redis')
baseCore.rePutIntoR('WeiXinGZH:linkid', linkid)
return False
try:
news_content = news_html.text
......@@ -137,7 +148,7 @@ def get_info(dict_json):
insertSql = f"insert into WeixinGZH (site_name,site_url,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s)"
cursor_.execute(insertSql, tuple(false))
cnx_.commit()
updatewxLink(url_news,info_source_code,500)
updatewxLink(url_news, info_source_code, 500)
return False
list_img = news_html.find_all('img')
......@@ -161,22 +172,33 @@ def get_info(dict_json):
except:
img_one.extract()
continue
resp = obsClient.putContent('zzsn', name_img, content=res.content)
resp = None
for i in range(10):
try:
resp = obsClient.putContent('zzsn', name_img, content=res.content)
break
except:
time.sleep(2)
if resp:
pass
else:
img_one.extract()
continue
try:
url_img = resp['body']['objectUrl']
str_url_img = f'<img src="{url_img}">'
except Exception as e:
log.info(f'--error--{url_news}-----------{e}')
log.info(f'--errorCode:300--{url_news}-----------{e}')
updatewxLink(url_news, info_source_code, 300)
return False
try:
img_one.replace_with(BeautifulSoup(str_url_img, 'lxml').img)
except Exception as e:
log.info(f'--error--{url_news}-----------{e}')
log.info(f'--errorCode:300--{url_news}-----------{e}')
updatewxLink(url_news, info_source_code, 300)
return False
except Exception as e:
log.info(f'--error--{url_news}-----------{e}')
log.info(f'--errorCode:600--{url_news}-----------{e}')
updatewxLink(url_news, info_source_code, 600)
return False
......@@ -214,7 +236,7 @@ def get_info(dict_json):
except:
time.sleep(5)
log.info('------数据发送kafka失败------')
updatewxLink(url_news,info_source_code,200)
updatewxLink(url_news, info_source_code, 200)
continue
list_all_info.append(dic_info)
......@@ -237,7 +259,7 @@ def get_info(dict_json):
except:
time.sleep(5)
continue
updatewxLink(url_news,info_source_code,100)
updatewxLink(url_news, info_source_code, 100)
return True
def rm_style_attr(soup):
......@@ -277,7 +299,7 @@ if __name__=="__main__":
dict_json, linkid =getjsonInfo()
try:
if dict_json:
if get_info(dict_json):
if get_info(dict_json, linkid):
num_caiji = num_caiji + 1
log.info(f'-----已采集{num_caiji}篇文章---来源{dict_json["site_name"]}----')
else:
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论