提交 6da55a3e 作者: XveLingKun

微信公众号详情采集调整

上级 c12087c7
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
''' '''
成功100 发送数据失败200 请求失败400 文章内容为空500 成功100 发送数据失败200 请求失败400 文章内容为空500 处理style标签失败700
''' '''
import re import re
...@@ -65,7 +65,7 @@ def getjsonInfo(): ...@@ -65,7 +65,7 @@ def getjsonInfo():
cnx_.commit() cnx_.commit()
return dict_json, linkid return dict_json, linkid
@retry(tries=3,delay=2) @retry(tries=20, delay=2)
def getrequest(url_news): def getrequest(url_news):
ip = baseCore.get_proxy() ip = baseCore.get_proxy()
...@@ -74,7 +74,7 @@ def getrequest(url_news): ...@@ -74,7 +74,7 @@ def getrequest(url_news):
raise raise
return res_news return res_news
def get_info(dict_json): def get_info(dict_json, linkid):
# list_all_info = [] # list_all_info = []
# num_caiji = 0 # num_caiji = 0
kaishi_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) kaishi_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
...@@ -91,25 +91,32 @@ def get_info(dict_json): ...@@ -91,25 +91,32 @@ def get_info(dict_json):
url_news = dict_json['link'] url_news = dict_json['link']
info_source_code = dict_json['info_source_code'] info_source_code = dict_json['info_source_code']
# try: # while True:
# ip = baseCore.get_proxy() # try:
# res_news = requests.get(url_news, proxies=ip, timeout=20) # ip = baseCore.get_proxy()
# except: # res_news = requests.get(url_news, proxies=ip, timeout=20)
# #400请求失败 # break
# updatewxLink(url_news, info_source_code, 400) # except:
# return False # time.sleep(3)
#400请求失败
# updatewxLink(url_news, info_source_code, 400)
# return False
# 修改请求方法,retry 3次 # 修改请求方法,retry 3次
try: try:
res_news = getrequest(url_news) res_news = getrequest(url_news)
# print(res_news) # print(res_news)
except: except:
try: # 修改回原状态,重新放入redis
res_news = requests.get(url_news, timeout=20) updatewxLink(url_news, info_source_code, 0)
except: log.info(f'{origin}---{news_date}--{news_title}---请求失败-- 重新放入redis')
# 400请求失败 baseCore.rePutIntoR('WeiXinGZH:linkid', linkid)
updatewxLink(url_news, info_source_code, 400) # try:
return False # res_news = requests.get(url_news, timeout=20)
# except:
# # 400请求失败
# updatewxLink(url_news, info_source_code, 400)
return False
soup_news = BeautifulSoup(res_news.content, 'html.parser') soup_news = BeautifulSoup(res_news.content, 'html.parser')
try: try:
news_html = soup_news.find('div', {'id': 'js_content'}) news_html = soup_news.find('div', {'id': 'js_content'})
...@@ -120,7 +127,11 @@ def get_info(dict_json): ...@@ -120,7 +127,11 @@ def get_info(dict_json):
del news_html['id'] del news_html['id']
del news_html['class'] del news_html['class']
except Exception as e: except Exception as e:
log.error(f'{url_news}-----{info_source_code}') log.info(f'--errorCode:700--{url_news}-----------{e}')
# log.error(f'{url_news}-----{info_source_code}')
updatewxLink(url_news, info_source_code, 0)
log.info(f'{origin}---{news_date}--{news_title}---style标签解析失败---重新放入redis')
baseCore.rePutIntoR('WeiXinGZH:linkid', linkid)
return False return False
try: try:
news_content = news_html.text news_content = news_html.text
...@@ -137,7 +148,7 @@ def get_info(dict_json): ...@@ -137,7 +148,7 @@ def get_info(dict_json):
insertSql = f"insert into WeixinGZH (site_name,site_url,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s)" insertSql = f"insert into WeixinGZH (site_name,site_url,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s)"
cursor_.execute(insertSql, tuple(false)) cursor_.execute(insertSql, tuple(false))
cnx_.commit() cnx_.commit()
updatewxLink(url_news,info_source_code,500) updatewxLink(url_news, info_source_code, 500)
return False return False
list_img = news_html.find_all('img') list_img = news_html.find_all('img')
...@@ -161,22 +172,33 @@ def get_info(dict_json): ...@@ -161,22 +172,33 @@ def get_info(dict_json):
except: except:
img_one.extract() img_one.extract()
continue continue
resp = obsClient.putContent('zzsn', name_img, content=res.content) resp = None
for i in range(10):
try:
resp = obsClient.putContent('zzsn', name_img, content=res.content)
break
except:
time.sleep(2)
if resp:
pass
else:
img_one.extract()
continue
try: try:
url_img = resp['body']['objectUrl'] url_img = resp['body']['objectUrl']
str_url_img = f'<img src="{url_img}">' str_url_img = f'<img src="{url_img}">'
except Exception as e: except Exception as e:
log.info(f'--error--{url_news}-----------{e}') log.info(f'--errorCode:300--{url_news}-----------{e}')
updatewxLink(url_news, info_source_code, 300) updatewxLink(url_news, info_source_code, 300)
return False return False
try: try:
img_one.replace_with(BeautifulSoup(str_url_img, 'lxml').img) img_one.replace_with(BeautifulSoup(str_url_img, 'lxml').img)
except Exception as e: except Exception as e:
log.info(f'--error--{url_news}-----------{e}') log.info(f'--errorCode:300--{url_news}-----------{e}')
updatewxLink(url_news, info_source_code, 300) updatewxLink(url_news, info_source_code, 300)
return False return False
except Exception as e: except Exception as e:
log.info(f'--error--{url_news}-----------{e}') log.info(f'--errorCode:600--{url_news}-----------{e}')
updatewxLink(url_news, info_source_code, 600) updatewxLink(url_news, info_source_code, 600)
return False return False
...@@ -214,7 +236,7 @@ def get_info(dict_json): ...@@ -214,7 +236,7 @@ def get_info(dict_json):
except: except:
time.sleep(5) time.sleep(5)
log.info('------数据发送kafka失败------') log.info('------数据发送kafka失败------')
updatewxLink(url_news,info_source_code,200) updatewxLink(url_news, info_source_code, 200)
continue continue
list_all_info.append(dic_info) list_all_info.append(dic_info)
...@@ -237,7 +259,7 @@ def get_info(dict_json): ...@@ -237,7 +259,7 @@ def get_info(dict_json):
except: except:
time.sleep(5) time.sleep(5)
continue continue
updatewxLink(url_news,info_source_code,100) updatewxLink(url_news, info_source_code, 100)
return True return True
def rm_style_attr(soup): def rm_style_attr(soup):
...@@ -277,7 +299,7 @@ if __name__=="__main__": ...@@ -277,7 +299,7 @@ if __name__=="__main__":
dict_json, linkid =getjsonInfo() dict_json, linkid =getjsonInfo()
try: try:
if dict_json: if dict_json:
if get_info(dict_json): if get_info(dict_json, linkid):
num_caiji = num_caiji + 1 num_caiji = num_caiji + 1
log.info(f'-----已采集{num_caiji}篇文章---来源{dict_json["site_name"]}----') log.info(f'-----已采集{num_caiji}篇文章---来源{dict_json["site_name"]}----')
else: else:
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论