Commit 475c1e2c  Author: 薛凌堃

WeChat official accounts

Parent db2408bf
@@ -93,7 +93,7 @@ def flushAndGetToken(list_b):
def rePutIntoR(item):
r.rpush('WeiXinGZH:infoSourceCode', item)
def get_info(sid,json_search,origin,info_source_code):
def get_info(sid,json_search,origin,url_,info_source_code,page):
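# url_ is the account's source-page URL, stored with error rows further down;
# page is the 1-based page index that only appears in log lines.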
list_all_info = []
num_caiji = 0
kaishi_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
@@ -103,7 +103,7 @@ def get_info(sid,json_search,origin,info_source_code):
server='https://obs.cn-north-1.myhuaweicloud.com' # your OBS bucket endpoint
)
list_all_news = json_search['app_msg_list']
# which article in the list is being collected
for one_news in list_all_news:
news_title = one_news['title']
timestamp = one_news['create_time']
@@ -114,10 +114,12 @@ def get_info(sid,json_search,origin,info_source_code):
url_ft = check_url(sid, url_news)
if url_ft:
log.info(f'article already collected----article link-----{url_news}')
log.info(f'-----{origin}--page {page}--article already collected--article link--{url_news}-----')
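# the list endpoint appears to return newest articles first, so the first
# already-seen link implies the rest were collected in an earlier run
# (that assumption is what justifies returning here instead of continuing)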
return list_all_info,num_caiji
try:
res_news = requests.get(url_news, timeout=20)
ip = get_proxy()[random.randint(0, 3)]
res_news = requests.get(url_news, timeout=20,proxies=ip)
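# assumes get_proxy() returns at least four entries; randint(0, 3) picks one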
time.sleep(2)
except:
continue
soup_news = BeautifulSoup(res_news.content, 'html.parser')
@@ -132,16 +134,17 @@ def get_info(sid,json_search,origin,info_source_code):
try:
news_content = news_html.text
except:
log.info(f'--------content is empty--------{url_news}--------')
log.info(f'----{origin}--page {page}--article content is empty--{url_news}-----')
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
false = [
news_title,
url_news,
news_html,
url_,
info_source_code,
'article content empty',
time_now
time_now,
url_news
]
insertSql = f"insert into WeixinGZH (site_name,site_url,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s)"
insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,error_type,create_time,news_url) values (%s,%s,%s,%s,%s,%s)"
cursor_.execute(insertSql, tuple(false))
cnx_.commit()
continue
@@ -182,6 +185,8 @@ def get_info(sid,json_search,origin,info_source_code):
section.name = 'div'
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# push the assembled record to Kafka
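# a minimal sketch of that send, assuming kafka-python and a hypothetical
# broker address and topic name (the real producer setup and send sit in the
# elided lines below):
#   from kafka import KafkaProducer
#   producer = KafkaProducer(bootstrap_servers='localhost:9092')  # hypothetical
#   producer.send('weixin_gzh', json.dumps(dic_info).encode('utf-8'))  # hypothetical topic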
dic_info = {
'sid': sid,
'title': news_title,
@@ -207,6 +212,7 @@ def get_info(sid,json_search,origin,info_source_code):
continue
num_caiji = num_caiji + 1
list_all_info.append(dic_info)
time.sleep(5)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_info2 = {
'infoSourceId': sid,
@@ -242,7 +248,7 @@ def job(count,key):
getFromSql()
time.sleep(20)
log.info(f'========this round of accounts is finished; {count} accounts collected=========total time: {baseCore.getTimeCost(start_,time.time())}')
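# return the running count so __main__ can keep a cross-account total
# (see the count = aa assignment there)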
return
return count
sql = f"SELECT site_uri,id,site_name,info_source_code from info_source where info_source_code = '{infoSourceCode}' "
# '一带一路百人论坛'
@@ -282,140 +288,169 @@ def job(count,key):
insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s,%s)"
cursor_.execute(insertSql, tuple(error))
cnx_.commit()
return
return count
fakeid = biz + '=='
url_search = f'https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=0&count=5&fakeid={fakeid}&type=9&query=&token={token}&lang=zh_CN&f=json&ajax=1'
# https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=0&count=5&fakeid=MzAwNDA5Njc1Mg==&type=9&query=&token=550883192&lang=zh_CN&f=json&ajax=1
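# begin/count page through the article list five at a time; fakeid identifies
# the account and token ties the request to the logged-in session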
# get the page count
try:
ip = get_proxy()[random.randint(0, 3)]
json_search = s.get(url_search, headers=headers, proxies=ip,
verify=False).json() # , proxies=ip, verify=False
str_t = json.dumps(json_search)
time.sleep(2)
verify=False).json()
time.sleep(1)
except Exception as e:
log.error(f'===account {origin} request failed! time: {baseCore.getNowTime(1)}======={e}===')
rePutIntoR(info_source_code)
return
#{"base_resp": {"ret": 200003, "err_msg": "invalid session"}}
# TODO:需要判断返回值,根据返回值判断是封号还是biz错误
# {'base_resp': {'err_msg': 'freq control', 'ret': 200013}}========= 封号
# {'base_resp': {'err_msg': 'invalid args', 'ret': 200002}} 公众号biz错误 链接
# 'base_resp': {'err_msg': 'ok', 'ret': 0} 正常
ret = json_search['base_resp']['ret']
if ret == 0:
pass
elif ret == 200013:
# put the code back into redis
# time.sleep(3600)
# refresh -- temporary workaround
rePutIntoR(info_source_code)
log.info(f'======this account has been banned=======')
# for i in range(0,6): #600,1200,1800,2400,3000,3600
# # refresh
# time.sleep(600)
# log.info('=======waited 600 seconds=====refresh the browser=====')
# browser_run = list_b[0]
# browser_run.refresh()
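# jump this key's counter straight to the 50-request cap and let it expire
# after an hour, so the credential sits out until the freq control lifts
# (__main__ gates on new_value < 50)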
r.set(key, 50)
r.expire(key, 3600)
return
elif ret == 200002:
# bad account link: save the error info and error type to the database
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
error = [
origin,
url_,
info_source_code,
str_t,
'invalid biz parameter',
time_now
]
insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s,%s)"
cursor_.execute(insertSql, tuple(error))
cnx_.commit()
log.info(f'account----{origin}----time {baseCore.getTimeCost(start_,time.time())}')
return
elif ret == 200003:
# invalid session
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
error = [
origin,
url_,
info_source_code,
str_t,
'invalid session',
time_now
]
insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s,%s)"
cursor_.execute(insertSql, tuple(error))
cnx_.commit()
log.info(f'account----{origin}----time {baseCore.getTimeCost(start_, time.time())}')
return
else:
log.info(f'----unexpected case-----{json_search}---account {origin}------')
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
error = [
origin,
url_,
info_source_code,
str_t,
'other error',
time_now
]
insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s,%s)"
cursor_.execute(insertSql, tuple(error))
cnx_.commit()
return
list_all = json_search['app_msg_list']
return count
try:
list_all_info,num_caiji= get_info(sid,json_search,origin,info_source_code)
print(f'----------{len(list_all_info)}------{num_caiji}-------')
time.sleep(2)
if len(list_all_info) != 0:
count += 1
Max_data = int(json_search['app_msg_cnt'])
Max_page = int(int(json_search['app_msg_cnt']) / 5)
if int(json_search['app_msg_cnt']) % 5 != 0:
Max_page = Max_page + 1
else:
Max_page = Max_page
except:
Max_page = 1
Max_data = 5
log.info(f'start collecting {origin}-----{Max_page} pages---{Max_data} records in total-----')
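# e.g. app_msg_cnt == 23 gives 23 // 5 == 4 with a remainder, so Max_page == 5;
# the loop below walks begin offsets 0, 5, 10, 15, 20, five articles per request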
for i in range(0, Max_data, 5):
url_search = f'https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin={i}&count=5&fakeid={fakeid}&type=9&query=&token={token}&lang=zh_CN&f=json&ajax=1'
# url_search = f'https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=0&count=5&fakeid={fakeid}&type=9&query=&token={token}&lang=zh_CN&f=json&ajax=1'
# https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=0&count=5&fakeid=MzAwNDA5Njc1Mg==&type=9&query=&token=550883192&lang=zh_CN&f=json&ajax=1
try:
ip = get_proxy()[random.randint(0, 3)]
json_search = s.get(url_search, headers=headers, proxies=ip,
verify=False).json() # , proxies=ip, verify=False
str_t = json.dumps(json_search)
time.sleep(2)
except Exception as e:
log.error(f'===account {origin} request failed! time: {baseCore.getNowTime(1)}======={e}===')
rePutIntoR(info_source_code)
return count
#{"base_resp": {"ret": 200003, "err_msg": "invalid session"}}
# TODO:需要判断返回值,根据返回值判断是封号还是biz错误
# {'base_resp': {'err_msg': 'freq control', 'ret': 200013}}========= 封号
# {'base_resp': {'err_msg': 'invalid args', 'ret': 200002}} 公众号biz错误 链接
# 'base_resp': {'err_msg': 'ok', 'ret': 0} 正常
ret = json_search['base_resp']['ret']
if ret == 0:
pass
elif ret == 200013:
# put the code back into redis
# time.sleep(3600)
# refresh -- temporary workaround
rePutIntoR(info_source_code)
log.info(f'======this account has been banned=======')
# for i in range(0,6): #600,1200,1800,2400,3000,3600
# # refresh
# time.sleep(600)
# log.info('=======waited 600 seconds=====refresh the browser=====')
# browser_run = list_b[0]
# browser_run.refresh()
r.set(key, 50)
r.expire(key, 3600)
return count
elif ret == 200002:
# bad account link: save the error info and error type to the database
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
success = [
error = [
origin,
url_,
info_source_code,
'collected successfully',
num_caiji,
str_t,
'invalid biz parameter',
time_now
]
# save the success record
insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,success_info,success_num,create_time) values (%s,%s,%s,%s,%s,%s)"
cursor_.execute(insertSql, tuple(success))
insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s,%s)"
cursor_.execute(insertSql, tuple(error))
cnx_.commit()
# all articles for this official account have been collected
log.info(f'{fakeid}, account {origin}: collected successfully! {count} accounts so far, time {baseCore.getTimeCost(start_,time.time())}')
log.info(f'account----{origin}----time {baseCore.getTimeCost(start_,time.time())}')
return count
elif ret == 200003:
# invalid session
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
error = [
origin,
url_,
info_source_code,
str_t,
'invalid session',
time_now
]
insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s,%s)"
cursor_.execute(insertSql, tuple(error))
cnx_.commit()
log.info(f'account----{origin}----time {baseCore.getTimeCost(start_, time.time())}')
return count
else:
log.info(f'{fakeid}, account {origin}: URL already exists! time {baseCore.getTimeCost(start_,time.time())}')
except Exception as e:
# JSON for this account parsed fine, but collecting its data failed
count += 1
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
false = [
origin,
url_,
info_source_code,
str(e),
'collection failed',
time_now
]
# save the failure record
insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s,%s)"
cursor_.execute(insertSql, tuple(false))
cnx_.commit()
log.info(f'{fakeid}, account {origin} collection failed!!!!!! time {baseCore.getTimeCost(start_, time.time())}')
log.info(f'----unexpected case-----{json_search}---account {origin}------')
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
error = [
origin,
url_,
info_source_code,
str_t,
'other error',
time_now
]
insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s,%s)"
cursor_.execute(insertSql, tuple(error))
cnx_.commit()
return count
time.sleep(2)
return count
list_all = json_search['app_msg_list']
try:
# collect the article info for each page
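# i is the begin offset (0, 5, 10, ...), giving the 1-based page number below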
page = int(i/5+1)
log.info(f'---{origin}---------start collecting page {page}-----------')
list_all_info,num_caiji= get_info(sid,json_search,origin,url_,info_source_code,page)
print(f'----articles collected on page {page}-----{len(list_all_info)}------{num_caiji}-------')
time.sleep(2)
if len(list_all_info) != 0:
count += 1
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
success = [
origin,
url_,
info_source_code,
'collected successfully',
num_caiji,
time_now,
]
# save the success record
insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,success_info,success_num,create_time) values (%s,%s,%s,%s,%s,%s)"
cursor_.execute(insertSql, tuple(success))
cnx_.commit()
# all articles for this official account have been collected
log.info(f'---articles collected on page {page}---{len(list_all_info)}---{num_caiji}---time {baseCore.getTimeCost(start_,time.time())}')
else:
log.info(f'----page {page} collected {num_caiji} articles--URL already exists!-----time {baseCore.getTimeCost(start_,time.time())}')
return count
except Exception as e:
# JSON for this account parsed fine, but collecting its data failed
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
false = [
origin,
url_,
info_source_code,
str(e),
'collection failed',
time_now
]
# save the failure record
insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s,%s)"
cursor_.execute(insertSql, tuple(false))
cnx_.commit()
log.info(f'{fakeid}, account {origin} collection failed!!!!!! time {baseCore.getTimeCost(start_, time.time())}')
count += 1
log.info(f'{fakeid}, account {origin}: collected successfully! {count} accounts so far, time {baseCore.getTimeCost(start_, time.time())}')
time.sleep(2)
return count
if __name__=="__main__":
requests.DEFAULT_RETRIES = 5
@@ -463,8 +498,12 @@ if __name__=="__main__":
new_value = baseCore.incrSet(key)
baseCore.getttl(key)
if new_value < 50:
aa = job(count,key)
count = aa
try:
aa = job(count,key)
count = aa
time.sleep(20)
except:
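# swallow per-account failures so the main loop keeps going; back off briefly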
time.sleep(10)
else:
# refresh the browser
browser_run = list_b[0]
......