Commit 5dc4e829 by 薛凌堃

WeChat official accounts

Parent 060ce7c4
@@ -6,16 +6,17 @@ import sys
 import time
 import logbook
 import logbook.more
+import pandas as pd
 import zhconv
 import pymysql
 import redis
 from selenium import webdriver
 from selenium.webdriver.chrome.service import Service
+from openpyxl import Workbook
+import langid
 # Note: call BaseCore.close() before the program exits to release related resources
-import langid
 class BaseCore:
@@ -475,6 +476,16 @@ class BaseCore:
         return 'cn'
         return result[0]
+    def writerToExcel(self, detailList, filename):
+        # filename='baidu搜索.xlsx'
+        # Read the existing xlsx file
+        existing_data = pd.read_excel(filename, engine='openpyxl')
+        # Build a DataFrame from the new rows
+        new_data = pd.DataFrame(data=detailList)
+        # Append the new rows to the end of the existing data
+        combined_data = existing_data.append(new_data, ignore_index=True)
+        # Write the combined result back to the xlsx file
+        combined_data.to_excel(filename, index=False)
+        # return combined_data
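
Editor's note: the new writerToExcel relies on DataFrame.append, which was deprecated in pandas 1.4 and removed in pandas 2.0, so the method raises AttributeError on current pandas. A minimal append-safe sketch using pd.concat instead (the snake_case helper name is illustrative, not from the repo):

import pandas as pd

def writer_to_excel(detail_list, filename):
    # Read whatever the workbook already holds (an empty sheet yields an empty frame)
    existing = pd.read_excel(filename, engine='openpyxl')
    # pd.concat replaces the removed DataFrame.append
    combined = pd.concat([existing, pd.DataFrame(detail_list)], ignore_index=True)
    combined.to_excel(filename, index=False)

Note that this read-modify-rewrite pattern re-reads the whole file on every call, which is fine for small daily logs like the ones written in this commit but scales poorly for large sheets.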
@@ -6,6 +6,7 @@ import requests, time, random, json, pymysql, redis
 import pandas as pd
 import urllib3
 from bs4 import BeautifulSoup
+from openpyxl import Workbook
 from selenium import webdriver
 from obs import ObsClient
 from kafka import KafkaProducer
@@ -13,6 +14,7 @@ from kafka import KafkaProducer
 # logging.basicConfig(filename='example.log', level=logging.INFO)
 from base.BaseCore import BaseCore
+import os
 baseCore = BaseCore()
 log = baseCore.getLogger()
@@ -22,7 +24,7 @@ urllib3.disable_warnings()
 def check_url(sid, article_url):
     r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn')
     res = r.sismember(f'wx_url_{sid}', article_url)  # note: URLs are stored as a Redis set
-    if res == 1:  # a return of 0 means the insert failed, i.e. a duplicate
+    if res == 1:
         return True
     else:
         return False
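
Editor's note: check_url only reads the set; its counterpart add_url (left as a commented-out call later in this diff) is not shown in this commit. A sketch of how the pair presumably fits together with SADD/SISMEMBER — the add_url body below is an assumption, not code from the repo:

import redis

r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn')

def add_url(sid, article_url):
    # Assumed counterpart: SADD returns 1 when the URL is new, 0 when it already exists
    return r.sadd(f'wx_url_{sid}', article_url) == 1

def check_url(sid, article_url):
    # SISMEMBER is truthy when the URL has been seen before
    return bool(r.sismember(f'wx_url_{sid}', article_url))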
@@ -79,9 +81,9 @@ def get_info(json_search):
         url_news = one_news['link']
-        url_ft = check_url(sid, url_news)
-        if url_ft:
-            return list_all_info, url_news, news_title
+        # url_ft = check_url(sid, url_news)
+        # if url_ft:
+        #     return list_all_info,url_news,news_title
         try:
             res_news = requests.get(url_news, timeout=20)
         except:
@@ -147,16 +149,16 @@ def get_info(json_search):
             'source': '11',
             'createDate': time_now
         }
-        for nnn in range(0, 3):
-            try:
-                producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
-                kafka_result = producer.send("crawlerInfo", json.dumps(dic_info, ensure_ascii=False).encode('utf8'))
-                kafka_time_out = kafka_result.get(timeout=10)
-                # add_url(sid, url_news)
-                break
-            except:
-                time.sleep(5)
-                continue
+        # for nnn in range(0, 3):
+        #     try:
+        #         producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
+        #         kafka_result = producer.send("crawlerInfo", json.dumps(dic_info, ensure_ascii=False).encode('utf8'))
+        #         kafka_time_out = kafka_result.get(timeout=10)
+        #         # add_url(sid, url_news)
+        #         break
+        #     except:
+        #         time.sleep(5)
+        #         continue
         num_caiji = num_caiji + 1
         list_all_info.append(dic_info)
         time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
@@ -169,15 +171,15 @@ def get_info(json_search):
             'dispatcherStatus': '1',
             'source': '1',
         }
-        for nnn2 in range(0, 3):
-            try:
-                producer2 = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
-                kafka_result2 = producer2.send("collectionAndDispatcherInfo",
-                                               json.dumps(dic_info2, ensure_ascii=False).encode('utf8'))
-                break
-            except:
-                time.sleep(5)
-                continue
+        # for nnn2 in range(0, 3):
+        #     try:
+        #         producer2 = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
+        #         kafka_result2 = producer2.send("collectionAndDispatcherInfo",
+        #                                        json.dumps(dic_info2, ensure_ascii=False).encode('utf8'))
+        #         break
+        #     except:
+        #         time.sleep(5)
+        #         continue
     return list_all_info, url_news, news_title
 if __name__=="__main__":
@@ -227,7 +229,8 @@ if __name__=="__main__":
         cookies[cookie['name']] = cookie['value']
     s = requests.session()
+    # count of official accounts processed in this run
+    count = 0
     while True:
         all = []
         list_all_info = []
@@ -306,14 +309,13 @@ if __name__=="__main__":
             fakeid = biz + '=='
             url_search = f'https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=5&count=5&fakeid={fakeid}&type=9&query=&token={token}&lang=zh_CN&f=json&ajax=1'
-            # count of official accounts processed
-            count = 0
             try:
                 ip = get_proxy()[random.randint(0, 3)]
                 json_search = s.get(url_search, headers=headers, proxies=ip,
                                     verify=False).json()  # , proxies=ip, verify=False
                 time.sleep(2)
-                break
             except:
                 log.info(f'===公众号{origin}请求失败!当前时间:{baseCore.getNowTime(1)}===')
                 # error_text = str(json_search)
@@ -340,9 +342,18 @@ if __name__=="__main__":
             # df_error_biz.to_excel(f'./错误biz/{excel_name}.xlsx', index=False)
             # changed to:
-            with pd.ExcelWriter(f'./错误biz/{excel_name}2.xlsx', engine='xlsxwriter',
-                                options={'strings_to_urls': False}) as writer:
-                df_error_biz.to_excel(writer, index=False)
+            file_path = f'./错误biz/{excel_name}.xlsx'
+            if os.path.exists(file_path):
+                pass
+            else:
+                workbook = Workbook()
+                workbook.save(file_path)
+                workbook.close()
+            # with pd.ExcelWriter(file_path, engine='xlsxwriter',
+            #                     options={'strings_to_urls': False}) as writer:
+            baseCore.writerToExcel(df_error_biz, file_path)
+            # combined_data.to_excel(writer, index=False)
             bb = time.sleep(3600)
             log.info(f'========当前账号可能被封,等待时长{bb}======')
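
Editor's note: every save site in this commit now repeats the same steps — build the path, create an empty workbook if the file does not exist, then hand the rows to baseCore.writerToExcel. A consolidated sketch of that pattern (ensure_excel is an illustrative helper name, not a function from the repo). Note also that time.sleep(3600) returns None, so the 等待时长{bb} log line above always prints None rather than the wait time:

import os
from openpyxl import Workbook

def ensure_excel(file_path):
    # Create an empty workbook once so that pd.read_excel inside
    # BaseCore.writerToExcel has a file to open on the first append
    if not os.path.exists(file_path):
        workbook = Workbook()
        workbook.save(file_path)
        workbook.close()

# usage, mirroring the hunk above:
# ensure_excel(file_path)
# baseCore.writerToExcel(df_error_biz, file_path)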
@@ -363,25 +374,42 @@ if __name__=="__main__":
         try:
             list_all_info, url_news, news_title = get_info(json_search)
-            time.sleep(10)
+            time.sleep(2)
             count += 1
             if len(list_all_info):
                 for dic_one in list_all_info:
                     all.append(dic_one)
-                df_info = pd.DataFrame(all)
+                # df_info = pd.DataFrame(all)
                 excel_name = time.strftime("%Y-%m-%d", time.localtime())
                 try:
+                    file_path = f'./运行结果/{excel_name}_实时数据.xlsx'
+                    if os.path.exists(file_path):
+                        pass
+                    else:
+                        workbook = Workbook()
+                        workbook.save(file_path)
+                        workbook.close()
                     # df_info.to_excel(f'./运行结果/{excel_name}_实时数据.xlsx', index=False)
-                    with pd.ExcelWriter(f'./运行结果/{excel_name}_实时数据.xlsx', engine='xlsxwriter',
-                                        options={'strings_to_urls': False}) as writer:
-                        df_info.to_excel(writer, index=False)
+                    # with pd.ExcelWriter(file_path, engine='xlsxwriter',
+                    #                     options={'strings_to_urls': False}) as writer:
+                    baseCore.writerToExcel(all, file_path)
+                    # combined_data.to_excel(writer, index=False)
                 except:
+                    file_path = f'./运行结果/{excel_name}_2_实时数据.xlsx'
+                    if os.path.exists(file_path):
+                        pass
+                    else:
+                        workbook = Workbook()
+                        workbook.save(file_path)
+                        workbook.close()
                     # df_info.to_excel(f'./运行结果/{excel_name}_2_实时数据.xlsx', index=False)
-                    with pd.ExcelWriter(f'./运行结果/{excel_name}_2_实时数据.xlsx', engine='xlsxwriter',
-                                        options={'strings_to_urls': False}) as writer:
-                        df_info.to_excel(writer, index=False)
+                    # with pd.ExcelWriter(file_path, engine='xlsxwriter',
+                    #                     options={'strings_to_urls': False}) as writer:
+                    baseCore.writerToExcel(all, file_path)
+                    # combined_data.to_excel(writer, index=False)
                 # all articles of this official account have been collected
                 # print(f'{fakeid}:采集成功!')
                 log.info(f'{fakeid}、公众号{origin}:采集成功!、已采集{count}个公众号')
@@ -401,9 +429,18 @@ if __name__=="__main__":
             df_error_url = pd.DataFrame({'公众号:': get_error_origin,
                                          'code': get_error_code,
                                          '信息': list_error_url})
+            file_path = f'./保存失败/{excel_name}.xlsx'
+            if os.path.exists(file_path):
+                pass
+            else:
+                workbook = Workbook()
+                workbook.save(file_path)
+                workbook.close()
             # df_error_url.to_excel(f'./保存失败/{excel_name}.xlsx', index=False)
-            with pd.ExcelWriter(f'./保存失败/{excel_name}.xlsx', engine='xlsxwriter', options={'strings_to_urls': False}) as writer:
-                df_error_url.to_excel(writer, index=False)
+            # with pd.ExcelWriter(file_path, engine='xlsxwriter', options={'strings_to_urls': False}) as writer:
+            baseCore.writerToExcel(df_error_url, file_path)
+            # combined_data.to_excel(writer, index=False)
             time.sleep(1)
         else:
@@ -418,24 +455,19 @@ if __name__=="__main__":
         df_error_json = pd.DataFrame({'公众号:': json_error_origin,
                                       'code': json_error_code,
                                       '信息': json_error_biz})
+        file_path = f'./错误文件/{time_end}.xlsx'
+        if os.path.exists(file_path):
+            pass
+        else:
+            workbook = Workbook()
+            workbook.save(file_path)
+            workbook.close()
         # df_error_json.to_excel(f'./错误文件/{time_end}.xlsx', index=False)
-        with pd.ExcelWriter(f'./错误文件/{time_end}.xlsx', engine='xlsxwriter',
-                            options={'strings_to_urls': False}) as writer:
-            df_error_json.to_excel(writer, index=False)
+        # with pd.ExcelWriter(file_path, engine='xlsxwriter',
+        #                     options={'strings_to_urls': False}) as writer:
+        baseCore.writerToExcel(df_error_json, file_path)
+        # combined_data.to_excel(writer, index=False)
-        # error_text_txt = fakeid
-        # with open(f'./错误文件/{time_end}.txt', 'w') as f:
-        #     f.write(error_text_txt)
-        # time.sleep(2)
-        # browser_run = list_b[0]
-        # browser_run.refresh()
-        # cookie_list = browser_run.get_cookies()
-        # cur_url = browser_run.current_url
-        # token = cur_url.split('token=')[1]
-        # print(token)
-        # cookies = {}
-        # for cookie in cookie_list:
-        #     cookies[cookie['name']] = cookie['value']
         time_end = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
         log.info(f'运行结束,时间为:{time_end}')
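
Editor's note: a caveat in the last hunk — file_path = f'./错误文件/{time_end}.xlsx' embeds a timestamp such as 2023-07-12 15:30:00, and the colons make the name invalid on Windows file systems. A sanitized variant (illustrative, not part of this commit):

import time

time_end = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
safe_name = time_end.replace(':', '-')  # colons are rejected by Windows
file_path = f'./错误文件/{safe_name}.xlsx'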
......
+import pandas as pd
+def writeaa():
+    detailList = []
+    aa = {
+        'id': 3,
+        'name': 'qqqwe'
+    }
+    detailList.append(aa)
+    writerToExcel(detailList)
+# Append data to the Excel file
+def writerToExcel(detailList):
+    # filename='baidu搜索.xlsx'
+    # Read the existing xlsx file
+    existing_data = pd.read_excel(filename, engine='openpyxl')
+    # Build a DataFrame from the new rows
+    new_data = pd.DataFrame(data=detailList)
+    # Append the new rows to the end of the existing data
+    combined_data = existing_data.append(new_data, ignore_index=True)
+    # Write the combined result back to the xlsx file
+    combined_data.to_excel(filename, index=False)
+from openpyxl import Workbook
+if __name__ == '__main__':
+    filename = 'test1.xlsx'
+    # Create a workbook
+    workbook = Workbook(filename)
+    workbook.save(filename)
+    writeaa()
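
Editor's note: two caveats in this scratch test. writerToExcel reads the module-level filename rather than taking it as a parameter, and Workbook(filename) passes the file name into openpyxl's write_only flag (the constructor takes no file name argument), which leaves the saved workbook without a normal default sheet. A corrected sketch under those assumptions:

import pandas as pd
from openpyxl import Workbook

def writer_to_excel(detail_list, filename):
    # Append rows via pd.concat (DataFrame.append was removed in pandas 2.0)
    existing = pd.read_excel(filename, engine='openpyxl')
    combined = pd.concat([existing, pd.DataFrame(detail_list)], ignore_index=True)
    combined.to_excel(filename, index=False)

if __name__ == '__main__':
    filename = 'test1.xlsx'
    Workbook().save(filename)  # no-arg constructor; the file name goes to save()
    writer_to_excel([{'id': 3, 'name': 'qqqwe'}], filename)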