提交 6da55a3e 作者: XveLingKun

微信公众号详情采集调整

上级 c12087c7
# -*- coding: utf-8 -*-
'''
成功100 发送数据失败200 请求失败400 文章内容为空500
成功100 发送数据失败200 请求失败400 文章内容为空500 处理style标签失败700
'''
import re
......@@ -65,7 +65,7 @@ def getjsonInfo():
cnx_.commit()
return dict_json, linkid
@retry(tries=3,delay=2)
@retry(tries=20, delay=2)
def getrequest(url_news):
ip = baseCore.get_proxy()
......@@ -74,7 +74,7 @@ def getrequest(url_news):
raise
return res_news
def get_info(dict_json):
def get_info(dict_json, linkid):
# list_all_info = []
# num_caiji = 0
kaishi_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
......@@ -91,25 +91,32 @@ def get_info(dict_json):
url_news = dict_json['link']
info_source_code = dict_json['info_source_code']
# try:
# ip = baseCore.get_proxy()
# res_news = requests.get(url_news, proxies=ip, timeout=20)
# except:
# #400请求失败
# updatewxLink(url_news, info_source_code, 400)
# return False
# while True:
# try:
# ip = baseCore.get_proxy()
# res_news = requests.get(url_news, proxies=ip, timeout=20)
# break
# except:
# time.sleep(3)
#400请求失败
# updatewxLink(url_news, info_source_code, 400)
# return False
# 修改请求方法,retry 3次
try:
res_news = getrequest(url_news)
# print(res_news)
except:
try:
res_news = requests.get(url_news, timeout=20)
except:
# 400请求失败
updatewxLink(url_news, info_source_code, 400)
return False
# 修改回原状态,重新放入redis
updatewxLink(url_news, info_source_code, 0)
log.info(f'{origin}---{news_date}--{news_title}---请求失败-- 重新放入redis')
baseCore.rePutIntoR('WeiXinGZH:linkid', linkid)
# try:
# res_news = requests.get(url_news, timeout=20)
# except:
# # 400请求失败
# updatewxLink(url_news, info_source_code, 400)
return False
soup_news = BeautifulSoup(res_news.content, 'html.parser')
try:
news_html = soup_news.find('div', {'id': 'js_content'})
......@@ -120,7 +127,11 @@ def get_info(dict_json):
del news_html['id']
del news_html['class']
except Exception as e:
log.error(f'{url_news}-----{info_source_code}')
log.info(f'--errorCode:700--{url_news}-----------{e}')
# log.error(f'{url_news}-----{info_source_code}')
updatewxLink(url_news, info_source_code, 0)
log.info(f'{origin}---{news_date}--{news_title}---style标签解析失败---重新放入redis')
baseCore.rePutIntoR('WeiXinGZH:linkid', linkid)
return False
try:
news_content = news_html.text
......@@ -137,7 +148,7 @@ def get_info(dict_json):
insertSql = f"insert into WeixinGZH (site_name,site_url,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s)"
cursor_.execute(insertSql, tuple(false))
cnx_.commit()
updatewxLink(url_news,info_source_code,500)
updatewxLink(url_news, info_source_code, 500)
return False
list_img = news_html.find_all('img')
......@@ -161,22 +172,33 @@ def get_info(dict_json):
except:
img_one.extract()
continue
resp = obsClient.putContent('zzsn', name_img, content=res.content)
resp = None
for i in range(10):
try:
resp = obsClient.putContent('zzsn', name_img, content=res.content)
break
except:
time.sleep(2)
if resp:
pass
else:
img_one.extract()
continue
try:
url_img = resp['body']['objectUrl']
str_url_img = f'<img src="{url_img}">'
except Exception as e:
log.info(f'--error--{url_news}-----------{e}')
log.info(f'--errorCode:300--{url_news}-----------{e}')
updatewxLink(url_news, info_source_code, 300)
return False
try:
img_one.replace_with(BeautifulSoup(str_url_img, 'lxml').img)
except Exception as e:
log.info(f'--error--{url_news}-----------{e}')
log.info(f'--errorCode:300--{url_news}-----------{e}')
updatewxLink(url_news, info_source_code, 300)
return False
except Exception as e:
log.info(f'--error--{url_news}-----------{e}')
log.info(f'--errorCode:600--{url_news}-----------{e}')
updatewxLink(url_news, info_source_code, 600)
return False
......@@ -214,7 +236,7 @@ def get_info(dict_json):
except:
time.sleep(5)
log.info('------数据发送kafka失败------')
updatewxLink(url_news,info_source_code,200)
updatewxLink(url_news, info_source_code, 200)
continue
list_all_info.append(dic_info)
......@@ -237,7 +259,7 @@ def get_info(dict_json):
except:
time.sleep(5)
continue
updatewxLink(url_news,info_source_code,100)
updatewxLink(url_news, info_source_code, 100)
return True
def rm_style_attr(soup):
......@@ -277,7 +299,7 @@ if __name__=="__main__":
dict_json, linkid =getjsonInfo()
try:
if dict_json:
if get_info(dict_json):
if get_info(dict_json, linkid):
num_caiji = num_caiji + 1
log.info(f'-----已采集{num_caiji}篇文章---来源{dict_json["site_name"]}----')
else:
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论