微信公众号采集

54d3083a · XveLingKun · f434a907 · 54d3083a · 54d3083a · 54d3083a
--- a/comData/weixin_solo/oneWeixin2.py
+++ b/comData/weixin_solo/oneWeixin2.py
@@ -24,7 +24,7 @@ r = baseCore.r
 urllib3.disable_warnings()
 def rePutIntoR(item):
-    r.rpush('WeiXinGZH:infoSourceCode', item)
+    r.rpush('WeiXinGZH:linkid', item)
 def updatewxLink(link,info_source_code,state):
    updateSuccess = f"update wx_link set state= {state} where link='{link}' and info_source_code='{info_source_code}' "
@@ -39,7 +39,7 @@ def getjsonInfo():
        pass
    else:
        log.info('-----没有数据了-----')
-        return False
+        return False, False
    #从数据库中获取信息 一条
    select_sql = f"select * from wx_link  where state=0 and id= '{linkid}'"
    cursor_.execute(select_sql)
@@ -49,7 +49,7 @@ def getjsonInfo():
        pass
    else:
        log.info('-----没有数据了-----')
-        return False
+        return False, False
    dict_json = {
        'sid':row[1],
        'site_uri':row[2],
@@ -63,7 +63,7 @@ def getjsonInfo():
    update_sql = f"update wx_link set state=1 where link='{row[7]}' and info_source_code='{row[4]}' "
    cursor_.execute(update_sql)
    cnx_.commit()
-    return dict_json
+    return dict_json, linkid
 @retry(tries=3,delay=2)
 def getrequest(url_news):
@@ -72,7 +72,7 @@ def getrequest(url_news):
    res_news = requests.get(url_news, proxies=ip, timeout=20)
    if res_news.status_code != 200:
        raise
+    return res_news
 def get_info(dict_json):
    # list_all_info = []
@@ -102,6 +102,7 @@ def get_info(dict_json):
    # 修改请求方法,retry 3次
    try:
        res_news = getrequest(url_news)
+        # print(res_news)
    except:
        try:
            res_news = requests.get(url_news, timeout=20)
@@ -118,7 +119,7 @@ def get_info(dict_json):
        news_html = rm_style_attr(news_html)
        del news_html['id']
        del news_html['class']
-    except:
+    except Exception as e:
        log.error(f'{url_news}-----{info_source_code}')
        return False
    try:
@@ -255,9 +256,15 @@ def rm_style_attr(soup):
        except:
            continue
+    # first_div = soup.select('div[id="js_content"]')
+    # # 设置style属性
+    # first_div['style'] = 'width: 814px ; margin: 0 auto;'
    first_div = soup.select('div[id="js_content"]')
-    # 设置style属性
+    if first_div:
-    first_div['style'] = 'width: 814px ; margin: 0 auto;'
+        first_div = first_div[0]  # 获取第一个匹配的元素
+        first_div['style'] = 'width: 814px ; margin: 0 auto;'  # 设置style属性
    return soup
 if __name__=="__main__":
@@ -267,12 +274,15 @@ if __name__=="__main__":
        #一次拿取一篇文章
        # todo: 从redis拿数据 更新mysql状态
-        dict_json =getjsonInfo()
+        dict_json, linkid =getjsonInfo()
-        if dict_json:
+        try:
-            if get_info(dict_json):
+            if dict_json:
-                num_caiji = num_caiji + 1
+                if get_info(dict_json):
-                log.info(f'-----已采集{num_caiji}篇文章---来源{dict_json["site_name"]}----')
+                    num_caiji = num_caiji + 1
-        else:
+                    log.info(f'-----已采集{num_caiji}篇文章---来源{dict_json["site_name"]}----')
-            continue
+            else:
+                continue
+        except:
+            rePutIntoR(linkid)
    baseCore.close()
\ No newline at end of file
--- a/comData/weixin_solo/test.py
+++ b/comData/weixin_solo/test.py
+# -*- coding: utf-8 -*-
+import re
 import time
 import pandas as pd
@@ -40,6 +43,7 @@ import pandas as pd
 #     else:
 #         pass
 import redis
+from bs4 import BeautifulSoup
 def check_url():
@@ -61,11 +65,51 @@ def test(dic_user_count):
    return dic_user_count
 def test1():
-    dic_user_count = {"A":0}
+    dic_user_count = {}
    for i in range(3):
        dic_user_count = test(dic_user_count)
        print(dic_user_count)
+def rm_style_attr(soup):
+    """Strip selected inline CSS declarations and pin the article container width.
+
+    For every tag returned by ``soup.find_all(style=True)`` the declarations
+    ``visibility``, ``font-family``, ``color`` and ``font-size`` are removed
+    from its ``style`` attribute (everything from the property name up to the
+    next ``;``). The first element matching ``div[id="js_content"]``, if any,
+    then gets a fixed-width, centred style.
+
+    :param soup: parsed document exposing ``find_all`` and ``select`` --
+        presumably a ``BeautifulSoup`` object (bs4 is imported by this file).
+    :return: the same ``soup`` object, modified in place.
+    """
+    # The patterns used to carry an inline ``(?s)`` in the middle of the
+    # expression. Python 3.11+ rejects that with re.error, which the handler
+    # below swallowed, so nothing was ever stripped. Passing re.S through
+    # ``flags`` keeps the dot-matches-newline behaviour on every version.
+    # NOTE(review): the patterns are unanchored, so 'color:' also matches
+    # inside 'background-color:' -- kept as-is; confirm this is intended.
+    stripped_props = ('visibility', 'font-family', 'color', 'font-size')
+    # Walk every tag that carries a style attribute.
+    for style_tag in soup.find_all(style=True):
+        try:
+            styleattr = style_tag['style']
+            for prop in stripped_props:
+                styleattr = re.sub(prop + r':.{1,}?;', '', styleattr, flags=re.S)
+            style_tag['style'] = styleattr
+        except Exception:
+            # Best effort: a tag whose style cannot be processed is left as-is.
+            continue
+    first_div = soup.select('div[id="js_content"]')
+    if first_div:
+        # Only the first matching element receives the fixed style.
+        first_div[0]['style'] = 'width: 814px ; margin: 0 auto;'
+    return soup
+def aaa(dic_user_count):
+    """Bump the "A" entry of ``dic_user_count`` three times, in place.
+
+    A missing "A" key is created on the first bump; nothing is returned.
+    """
+    for _ in range(3):
+        # A missing key counts as zero before the increment.
+        dic_user_count["A"] = dic_user_count.get("A", 0) + 1
 if __name__ == "__main__":
-    test1()
+    # html = """<div class="rich_media_content js_underline_content autoTypeSetting24psection" id="js_content" style="width: 814px ; margin: 0 auto;"><section style="white-space: normal;text-indent: 2em;margin-top: 16px;margin-bottom: 16px;line-height: 1.75em;"><span style="outline: 0px;color: rgb(51, 51, 51);letter-spacing: 0.544px;text-indent: 2em;">3月31日，中国十七冶建安分公司承建的黄石EOD项目大泉路生态廊道工程马鞍山路高架桥顺利通车。<br/></span></section><section style="white-space: normal;margin-top: 16px;margin-bottom: 16px;line-height: 1.75em;text-indent: 0em;"><img class="rich_pages wxw-img" data-backh="312" data-backw="578" data-imgfileid="100024978" data-ratio="0.539568345323741" data-src="https://mmbiz.qpic.cn/mmbiz_png/ibEX3YMicPu80BnkeVDWh45k2S5saQQqDfvKJfBiblZO6OjbyWrYSJ3c2fib2eQReXQSLMONFicRD0fT2OdFY4da0og/640?wx_fmt=png&amp;from=appmsg" data-type="png" data-w="695" style="border: none;width: 100%;height: auto;" title=""/></section><section style="white-space: normal;text-indent: 2em;margin-top: 16px;margin-bottom: 16px;line-height: 1.75em;"><span style="outline: 0px;color: rgb(51, 51, 51);letter-spacing: 0.544px;text-indent: 2em;">马鞍山路高架桥全长315米，设计速度为60km/h，采用双向四车道。该桥横跨马鞍山路，建成后将缓解马鞍山路和大泉路交叉口的交通压力，对减轻市民们的出行焦虑有着重要意义，同时将进一步加强大冶、阳新与黄石城区的联系，为黄石更好融入“武鄂黄黄”都市圈奠定基础。</span></section><section style="white-space: normal;text-indent: 2em;margin-top: 16px;margin-bottom: 16px;line-height: 1.75em;"><span style="outline: 0px;color: rgb(51, 51, 51);letter-spacing: 0.544px;text-indent: 2em;">为全力推动马鞍山路高架桥建成，项目管理团队锚定节点目标不动摇，倒排工期，通过精心组织，优化施工方案，主动出击，联合行政主管部门多次召开推进会，克服施工过程中的管线迁改难题，为马鞍山路高架桥顺利通车提供坚实组织保障。</span></section><section style="white-space: normal;margin-top: 16px;margin-bottom: 16px;line-height: 1.75em;text-indent: 0em;"><img class="rich_pages wxw-img" data-backh="374" data-backw="578" data-imgfileid="100024979" data-ratio="0.6474820143884892" 
data-src="https://mmbiz.qpic.cn/mmbiz_png/ibEX3YMicPu80BnkeVDWh45k2S5saQQqDfZBWgkUaG20wMwKDxZ4SjjFwZg0bvicuAcLPUia6ECktibM9KPj5NIrmicg/640?wx_fmt=png&amp;from=appmsg" data-type="png" data-w="695" style="border: none;width: 100%;height: auto;" title=""/></section><section style="white-space: normal;text-indent: 2em;margin-top: 16px;margin-bottom: 16px;line-height: 1.75em;"><span style="outline: 0px;color: rgb(51, 51, 51);letter-spacing: 0.544px;text-indent: 2em;">项目部将牢牢把握“十七冶发展要义”，深入践行“24小时工作法”，全力打造精品工程、安全工程、民生工程,为黄石打造武汉都市圈贡献十七冶力量。</span></section><section style="white-space: normal;text-indent: 2em;margin-top: 16px;margin-bottom: 16px;line-height: 1.75em;"><span style="outline: 0px;color: rgb(51, 51, 51);letter-spacing: 0.544px;text-indent: 2em;"></span></section><section style="white-space: normal;text-indent: 2em;margin-top: 16px;margin-bottom: 16px;line-height: 1.75em;"><span style="outline: 0px;color: rgb(51, 51, 51);letter-spacing: 0.544px;text-indent: 2em;"></span></section><section style="margin-top: 16px;margin-bottom: 16px;white-space: normal;text-indent: 2em;line-height: 1.75em;"><section data-id="103115" data-role="splitline" data-tools="135编辑器" style="outline: 0px;font-variant-ligatures: normal;letter-spacing: 0.544px;orphans: 2;widows: 2;background-color: rgb(255, 255, 255);color: rgb(34, 34, 34);font-size: 16px;font-family: 微软雅黑;text-indent: 32px;overflow-wrap: break-word !important;"><section style="margin: 16px auto;outline: 0px;text-align: center;text-indent: 2em;line-height: 1.75em;overflow-wrap: break-word !important;"><section style="outline: 0px;display: flex;justify-content: center;align-items: center;overflow-wrap: break-word !important;"><section style="margin-left: 2px;outline: 0px;height: 6px;width: 6px;border-radius: 100%;border-width: 1px;border-style: solid;border-color: rgb(35, 35, 35);overflow: hidden;overflow-wrap: break-word !important;"><br data-filtered="filtered" style="outline: 0px;overflow-wrap: break-word 
!important;"/></section><section style="margin-left: 2px;outline: 0px;height: 6px;width: 6px;border-radius: 100%;border-width: 1px;border-style: solid;border-color: rgb(35, 35, 35);overflow: hidden;overflow-wrap: break-word !important;"> <br data-filtered="filtered" style="outline: 0px;overflow-wrap: break-word !important;"/></section><section style="margin-left: 2px;outline: 0px;height: 6px;width: 6px;border-radius: 100%;border-width: 1px;border-style: solid;border-color: rgb(35, 35, 35);overflow: hidden;overflow-wrap: break-word !important;"><br style="outline: 0px;overflow-wrap: break-word !important;"/></section><section style="margin-left: 4px;outline: 0px;height: 1px;background-color: rgb(35, 35, 35);flex: 1 1 0%;overflow: hidden;overflow-wrap: break-word !important;"><br data-filtered="filtered" style="outline: 0px;overflow-wrap: break-word !important;"/></section></section></section></section><section style="margin: 16px 8px;outline: 0px;font-variant-ligatures: normal;letter-spacing: 0.544px;orphans: 2;text-indent: 0em;widows: 2;background-color: rgb(255, 255, 255);color: rgb(34, 34, 34);font-family: 微软雅黑;text-align: left;line-height: 1.75em;overflow-wrap: break-word !important;"><span style="outline: 0px;font-size: 14px;color: rgb(136, 136, 136);overflow-wrap: break-word !important;">来源：中国企业网</span><img class="rich_pages wxw-img __bg_gif" data-backh="192" data-backw="320" data-fileid="100002618" data-galleryid="" data-imgfileid="100024527" data-ratio="0.6" data-src="https://mmbiz.qpic.cn/mmbiz_gif/ibEX3YMicPu82TZf4RScpazSD7OuViaH4cEUx9rCibPavn2cJXiagJrmVuVTpOJgibBV8368H2RYxxYp3Fhn1a7SU20Q/640?wx_fmt=gif" data-type="gif" data-w="1000" style="letter-spacing: 0.544px;text-indent: 0em;outline: 0px;color: rgb(136, 136, 136);font-size: 15px;text-align: center;width: 562px;visibility: visible !important;height: auto !important;" width="320px"/></section></section><p style="display: none;"><mp-style-type data-value="3"></mp-style-type></p></div>"""
+    # soup = BeautifulSoup(html, 'html.parser')
+    # soup = rm_style_attr(soup)
+    # print(soup)
+    dic_user_count = {}
+    aaa(dic_user_count)
+    if dic_user_count:
+        for key, value in dic_user_count.items():
+            print(f"====账号{key}，采集公众号个数{value}")
--- a/comData/weixin_solo/wxList.py
+++ b/comData/weixin_solo/wxList.py
@@ -10,8 +10,8 @@ import urllib3
 from pymysql.converters import escape_string
 import sys
-sys.path.append('D:\\zzsn_spider\\base')
+# sys.path.append('D:\\zzsn\\base')
-import BaseCore
+from base import BaseCore
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 baseCore = BaseCore.BaseCore()
@@ -184,7 +184,7 @@ def getToken():
    return row[0]
-# 获取列表数据
+# 获取列表数据  每一页换一次公众号
 def getPageData(dic_url, page, dic_user_count):
    url_ = dic_url['url_']
    origin = dic_url['name']
@@ -206,7 +206,6 @@ def getPageData(dic_url, page, dic_user_count):
    user_name = tokenAndCookie[2]
    token = tokenAndCookie[0]
    log.info(f"获取token到----{token}----{user_name}")
-    dic_user_count[user_name] = 0
    cookies = json.loads(tokenAndCookie[1])
    # s.cookies.update(cookies)
@@ -223,18 +222,22 @@ def getPageData(dic_url, page, dic_user_count):
    str_t = json.dumps(json_search)
    ret = json_search['base_resp']['ret']
    if ret == 0:
-        dic_user_count[user_name] += 1
+        # 使用一次就记录一次
-        pass
+        if user_name in dic_user_count:
+            dic_user_count[user_name] += 1
+        else:
+            dic_user_count[user_name] = 1
    elif ret == 200013:
        log.info(f'======{origin}-----{biz}----{user_name}账号被封=======')
        # 封号修改token
        updateTokeen(token, 1)
-        return getPageData(dic_url, page, dic_user_count)
+        return getPageData(dic_url, page, dic_user_count), dic_user_count
    elif ret == 200002:
        log.info(f'======{origin}-----{biz}----该公众号号biz错误，请检查=======')
        error = [origin, url_, info_source_code, str_t, '无效biz参数']
        insertBadSql(error)
-        return True
+        return True, dic_user_count
    elif ret == 200003:
        log.info(f'======{origin}-----{biz}----{user_name}账号无效session=======')
        # session失效修改token
@@ -255,7 +258,7 @@ def getPageData(dic_url, page, dic_user_count):
        error = [origin, url_, info_source_code, str_t, '其他错误']
        insertBadSql(error)
        updateTokeen(token, 2)
-        return True
+        return True, dic_user_count
    # 修改token使用时间
    updateTokeen(token, 3)
    # 保存数据到数据库
@@ -263,7 +266,7 @@ def getPageData(dic_url, page, dic_user_count):
 # 获取微信公众号数据
-def getWxList(infoSourceCode):
+def getWxList(infoSourceCode, dic_user_count):
    dic_url = getSourceInfo(infoSourceCode)
    log.info(f"======{infoSourceCode}----开始采集=======")
@@ -276,7 +279,7 @@ def getWxList(infoSourceCode):
        return
    origin = dic_url['name']
    biz = dic_url['biz']
-    dic_user_count = {}
    for page in range(1, 6):
        retFlag, dic_user_count = getPageData(dic_url, page, dic_user_count)
        time.sleep(random.randint(60, 181))
@@ -286,8 +289,6 @@ def getWxList(infoSourceCode):
        else:
            # 没有结束
            pass
-    for key, value in dic_user_count.items():
-        log.info(f"====账号{key}，采集公众号个数{value}")
    log.info(f"======{origin}-----{biz}----结束采集=======")
@@ -310,17 +311,26 @@ def getnumber_redis():
 if __name__ == "__main__":
+    # getFromSql()
    numbers = getnumber_redis()
    log.info("当前批次采集公众号个数{}".format(numbers))
    time.sleep(3)
+    dic_user_count = {}
+    # dic_user_count = {
+    #     'name': '',
+    #     'use_count': 0,
+    #     'gzh_count': 0
+    # }
    while True:
        start = time.time()
        log.info(f"开始时间{baseCore.getNowTime(1)}")
        infoSourceCode = baseCore.redicPullData('WeiXinGZH:infoSourceCode')
+        # infoSourceCode = 'IN-20220609-57899'
        if infoSourceCode == 'None' or infoSourceCode == None:
            log.info("redis已经没有数据了，重新放置数据")
            log.info(f"采集完一轮公众号耗时{baseCore.getTimeCost(start, time.time())}")
-            # getFromSql()
            # time.sleep(60)
            # numbers = getnumber_redis()
            # log.info("当前批次采集公众号个数{}".format(numbers))
@@ -328,7 +338,10 @@ if __name__ == "__main__":
            # infoSourceCode = baseCore.redicPullData('WeiXinGZH:infoSourceCode')
            continue
-        getWxList(infoSourceCode)
+        getWxList(infoSourceCode, dic_user_count)
+        if dic_user_count:
+            for key, value in dic_user_count.items():
+                log.info(f"====账号{key}，使用次数{value}")
+        # break
    # infoSourceCode = 'IN-20220917-0159'
    # getWxList(infoSourceCode)