微信公众号获取正文

a492eb60 · 薛凌堃 · a9ad4913 · a492eb60
--- a/comData/weixin_solo/oneWeixin2.py
+++ b/comData/weixin_solo/oneWeixin2.py
@@ -4,18 +4,14 @@
 '''

 import requests, time, random, json, pymysql, redis
-import pandas as pd
+
 import urllib3
 from bs4 import BeautifulSoup
-from openpyxl import Workbook
-from selenium import webdriver
+
 from obs import ObsClient
 from kafka import KafkaProducer

-# logging.basicConfig(filename='example.log', level=logging.INFO)
-
 from base.BaseCore import BaseCore
-import os
 baseCore = BaseCore()
 log = baseCore.getLogger()
 cnx_ = baseCore.cnx
@@ -25,48 +21,6 @@ cursor_ = baseCore.cursor
 r = baseCore.r
 urllib3.disable_warnings()

-def check_url(sid, article_url):
-    r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn')
-    res = r.sismember(f'wx_url_{sid}',article_url)
-    if res == 1:
-        return True
-    else:
-        return False
-
-def add_url(sid, article_url):
-    r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn')
-    res = r.sadd(f'wx_url_{sid}', article_url, 3)  # 注意是 保存set的方式
-    if res == 0:  # 若返回0,说明插入不成功，表示有重复
-        return True
-    else:
-        return False
-
-# #定时
-# def getFromSql():
-#     selectSql = "SELECT info_source_code from info_source where site_uri like '%mp.weixin.qq.com%'"
-#     cursor.execute(selectSql)
-#     results = cursor.fetchall()
-#     result_list = [item[0] for item in results]
-#
-#     #放入redis
-#     for item in result_list:
-#         r.rpush('WeiXinGZH:infoSourceCode', item)
-#
-# #刷新浏览器并获得token
-# def flushAndGetToken(list_b):
-#     browser_run = list_b[0]
-#     log.info('======刷新浏览器=====')
-#     browser_run.refresh()
-#     cookie_list = browser_run.get_cookies()
-#     cur_url = browser_run.current_url
-#     token = cur_url.split('token=')[1]
-#     log.info(f'===========当前token为：{token}============')
-#     cookies = {}
-#     for cookie in cookie_list:
-#         cookies[cookie['name']] = cookie['value']
-#     return token,cookies
-
-#采集失败的公众号 重新放入redis
 def rePutIntoR(item):
    r.rpush('WeiXinGZH:infoSourceCode', item)

@@ -165,6 +119,7 @@ def get_info(dict_json):
            img_one.extract()
        else:
            try:
+                try:
                    name_img = url_src.split('/')[-2] + '.' + url_src.split('wx_fmt=')[1]
                except:
                    img_one.extract()
@@ -173,14 +128,24 @@ def get_info(dict_json):
                    res = requests.get(url_src, timeout=20)
                except:
                    img_one.extract()
+                    continue
                resp = obsClient.putContent('zzsn', name_img, content=res.content)
-
+                try:
                    url_img = resp['body']['objectUrl']
                    str_url_img = f'<img src="{url_img}">'
+                except Exception as e:
+                    log.info(f'--error--{url_news}-----------{e}')
+                    updatewxLink(url_news, info_source_code, 300)
+                    return False
                try:
                    img_one.replace_with(BeautifulSoup(str_url_img, 'lxml').img)
                except Exception as e:
-                log.info(f'----{url_news}-----------{e}')
+                    log.info(f'--error--{url_news}-----------{e}')
+                    updatewxLink(url_news, info_source_code, 300)
+                    return False
+            except Exception as e:
+                log.info(f'--error--{url_news}-----------{e}')
+                updatewxLink(url_news, info_source_code, 600)
                return False

    for tag in news_html.descendants: