Commit 5dc4e829 by 薛凌堃

WeChat official accounts

Parent 060ce7c4
@@ -6,16 +6,17 @@ import sys
 import time
 import logbook
 import logbook.more
+import pandas as pd
 import zhconv
 import pymysql
 import redis
 from selenium import webdriver
 from selenium.webdriver.chrome.service import Service
+from openpyxl import Workbook
+import langid
 # Note: call BaseCore.close() before the program exits to release related resources
-import langid
 class BaseCore:
@@ -475,6 +476,16 @@ class BaseCore:
         return 'cn'
         return result[0]
+    def writerToExcel(self, detailList, filename):
+        # filename='baidu搜索.xlsx'
+        # Read the existing xlsx file
+        existing_data = pd.read_excel(filename, engine='openpyxl')
+        # Build a DataFrame from the new rows
+        new_data = pd.DataFrame(data=detailList)
+        # Append the new rows to the end of the existing data
+        combined_data = existing_data.append(new_data, ignore_index=True)
+        # Write the combined result back to the xlsx file
+        combined_data.to_excel(filename, index=False)
+        # return combined_data
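
Editor's note: the new writerToExcel relies on DataFrame.append, which was deprecated in pandas 1.4 and removed in pandas 2.0, so the method raises AttributeError on current pandas. A minimal append-safe sketch using pd.concat instead (the snake_case helper name is illustrative, not from the repo):

import pandas as pd

def writer_to_excel(detail_list, filename):
    # Read whatever the workbook already holds (an empty sheet yields an empty frame)
    existing = pd.read_excel(filename, engine='openpyxl')
    # pd.concat replaces the removed DataFrame.append
    combined = pd.concat([existing, pd.DataFrame(detail_list)], ignore_index=True)
    combined.to_excel(filename, index=False)

Note that this read-modify-rewrite pattern re-reads the whole file on every call, which is fine for small daily logs like the ones written in this commit but scales poorly for large sheets.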
@@ -6,6 +6,7 @@ import requests, time, random, json, pymysql, redis
 import pandas as pd
 import urllib3
 from bs4 import BeautifulSoup
+from openpyxl import Workbook
 from selenium import webdriver
 from obs import ObsClient
 from kafka import KafkaProducer
@@ -13,6 +14,7 @@ from kafka import KafkaProducer
 # logging.basicConfig(filename='example.log', level=logging.INFO)
 from base.BaseCore import BaseCore
+import os
 baseCore = BaseCore()
 log = baseCore.getLogger()
@@ -22,7 +24,7 @@ urllib3.disable_warnings()
 def check_url(sid, article_url):
     r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn')
     res = r.sismember(f'wx_url_{sid}', article_url)  # note: URLs are stored as a Redis set
-    if res == 1:  # a return of 0 means the insert failed, i.e. a duplicate
+    if res == 1:
         return True
     else:
         return False
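
Editor's note: check_url only reads the set; its counterpart add_url (left as a commented-out call later in this diff) is not shown in this commit. A sketch of how the pair presumably fits together with SADD/SISMEMBER — the add_url body below is an assumption, not code from the repo:

import redis

r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn')

def add_url(sid, article_url):
    # Assumed counterpart: SADD returns 1 when the URL is new, 0 when it already exists
    return r.sadd(f'wx_url_{sid}', article_url) == 1

def check_url(sid, article_url):
    # SISMEMBER is truthy when the URL has been seen before
    return bool(r.sismember(f'wx_url_{sid}', article_url))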
@@ -79,9 +81,9 @@ def get_info(json_search):
         url_news = one_news['link']
-        url_ft = check_url(sid, url_news)
-        if url_ft:
-            return list_all_info, url_news, news_title
+        # url_ft = check_url(sid, url_news)
+        # if url_ft:
+        #     return list_all_info,url_news,news_title
         try:
             res_news = requests.get(url_news, timeout=20)
         except:
@@ -147,16 +149,16 @@ def get_info(json_search):
             'source': '11',
             'createDate': time_now
         }
-        for nnn in range(0, 3):
-            try:
-                producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
-                kafka_result = producer.send("crawlerInfo", json.dumps(dic_info, ensure_ascii=False).encode('utf8'))
-                kafka_time_out = kafka_result.get(timeout=10)
-                # add_url(sid, url_news)
-                break
-            except:
-                time.sleep(5)
-                continue
+        # for nnn in range(0, 3):
+        #     try:
+        #         producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
+        #         kafka_result = producer.send("crawlerInfo", json.dumps(dic_info, ensure_ascii=False).encode('utf8'))
+        #         kafka_time_out = kafka_result.get(timeout=10)
+        #         # add_url(sid, url_news)
+        #         break
+        #     except:
+        #         time.sleep(5)
+        #         continue
         num_caiji = num_caiji + 1
         list_all_info.append(dic_info)
         time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
@@ -169,15 +171,15 @@ def get_info(json_search):
             'dispatcherStatus': '1',
             'source': '1',
         }
-        for nnn2 in range(0, 3):
-            try:
-                producer2 = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
-                kafka_result2 = producer2.send("collectionAndDispatcherInfo",
-                                               json.dumps(dic_info2, ensure_ascii=False).encode('utf8'))
-                break
-            except:
-                time.sleep(5)
-                continue
+        # for nnn2 in range(0, 3):
+        #     try:
+        #         producer2 = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
+        #         kafka_result2 = producer2.send("collectionAndDispatcherInfo",
+        #                                        json.dumps(dic_info2, ensure_ascii=False).encode('utf8'))
+        #         break
+        #     except:
+        #         time.sleep(5)
+        #         continue
     return list_all_info, url_news, news_title
 if __name__=="__main__":
@@ -227,7 +229,8 @@ if __name__=="__main__":
         cookies[cookie['name']] = cookie['value']
     s = requests.session()
+    # count of official accounts processed in this run
+    count = 0
     while True:
         all = []
         list_all_info = []
@@ -306,14 +309,13 @@ if __name__=="__main__":
             fakeid = biz + '=='
             url_search = f'https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=5&count=5&fakeid={fakeid}&type=9&query=&token={token}&lang=zh_CN&f=json&ajax=1'
-            # count of official accounts processed
-            count = 0
             try:
                 ip = get_proxy()[random.randint(0, 3)]
                 json_search = s.get(url_search, headers=headers, proxies=ip,
                                     verify=False).json()  # , proxies=ip, verify=False
                 time.sleep(2)
-                break
             except:
                 log.info(f'===公众号{origin}请求失败!当前时间:{baseCore.getNowTime(1)}===')
                 # error_text = str(json_search)
@@ -340,9 +342,18 @@ if __name__=="__main__":
             # df_error_biz.to_excel(f'./错误biz/{excel_name}.xlsx', index=False)
             # changed to:
-            with pd.ExcelWriter(f'./错误biz/{excel_name}2.xlsx', engine='xlsxwriter',
-                                options={'strings_to_urls': False}) as writer:
-                df_error_biz.to_excel(writer, index=False)
+            file_path = f'./错误biz/{excel_name}.xlsx'
+            if os.path.exists(file_path):
+                pass
+            else:
+                workbook = Workbook()
+                workbook.save(file_path)
+                workbook.close()
+            # with pd.ExcelWriter(file_path, engine='xlsxwriter',
+            #                     options={'strings_to_urls': False}) as writer:
+            baseCore.writerToExcel(df_error_biz, file_path)
+            # combined_data.to_excel(writer, index=False)
             bb = time.sleep(3600)
             log.info(f'========当前账号可能被封,等待时长{bb}======')
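
Editor's note: every save site in this commit now repeats the same steps — build the path, create an empty workbook if the file does not exist, then hand the rows to baseCore.writerToExcel. A consolidated sketch of that pattern (ensure_excel is an illustrative helper name, not a function from the repo). Note also that time.sleep(3600) returns None, so the 等待时长{bb} log line above always prints None rather than the wait time:

import os
from openpyxl import Workbook

def ensure_excel(file_path):
    # Create an empty workbook once so that pd.read_excel inside
    # BaseCore.writerToExcel has a file to open on the first append
    if not os.path.exists(file_path):
        workbook = Workbook()
        workbook.save(file_path)
        workbook.close()

# usage, mirroring the hunk above:
# ensure_excel(file_path)
# baseCore.writerToExcel(df_error_biz, file_path)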
@@ -363,25 +374,42 @@ if __name__=="__main__":
         try:
             list_all_info, url_news, news_title = get_info(json_search)
-            time.sleep(10)
+            time.sleep(2)
             count += 1
             if len(list_all_info):
                 for dic_one in list_all_info:
                     all.append(dic_one)
-                df_info = pd.DataFrame(all)
+                # df_info = pd.DataFrame(all)
                 excel_name = time.strftime("%Y-%m-%d", time.localtime())
                 try:
+                    file_path = f'./运行结果/{excel_name}_实时数据.xlsx'
+                    if os.path.exists(file_path):
+                        pass
+                    else:
+                        workbook = Workbook()
+                        workbook.save(file_path)
+                        workbook.close()
                     # df_info.to_excel(f'./运行结果/{excel_name}_实时数据.xlsx', index=False)
-                    with pd.ExcelWriter(f'./运行结果/{excel_name}_实时数据.xlsx', engine='xlsxwriter',
-                                        options={'strings_to_urls': False}) as writer:
-                        df_info.to_excel(writer, index=False)
+                    # with pd.ExcelWriter(file_path, engine='xlsxwriter',
+                    #                     options={'strings_to_urls': False}) as writer:
+                    baseCore.writerToExcel(all, file_path)
+                    # combined_data.to_excel(writer, index=False)
                 except:
+                    file_path = f'./运行结果/{excel_name}_2_实时数据.xlsx'
+                    if os.path.exists(file_path):
+                        pass
+                    else:
+                        workbook = Workbook()
+                        workbook.save(file_path)
+                        workbook.close()
                     # df_info.to_excel(f'./运行结果/{excel_name}_2_实时数据.xlsx', index=False)
-                    with pd.ExcelWriter(f'./运行结果/{excel_name}_2_实时数据.xlsx', engine='xlsxwriter',
-                                        options={'strings_to_urls': False}) as writer:
-                        df_info.to_excel(writer, index=False)
+                    # with pd.ExcelWriter(file_path, engine='xlsxwriter',
+                    #                     options={'strings_to_urls': False}) as writer:
+                    baseCore.writerToExcel(all, file_path)
+                    # combined_data.to_excel(writer, index=False)
                 # all articles of this official account have been collected
                 # print(f'{fakeid}:采集成功!')
                 log.info(f'{fakeid}、公众号{origin}:采集成功!、已采集{count}个公众号')
@@ -401,9 +429,18 @@ if __name__=="__main__":
             df_error_url = pd.DataFrame({'公众号:': get_error_origin,
                                          'code': get_error_code,
                                          '信息': list_error_url})
+            file_path = f'./保存失败/{excel_name}.xlsx'
+            if os.path.exists(file_path):
+                pass
+            else:
+                workbook = Workbook()
+                workbook.save(file_path)
+                workbook.close()
             # df_error_url.to_excel(f'./保存失败/{excel_name}.xlsx', index=False)
-            with pd.ExcelWriter(f'./保存失败/{excel_name}.xlsx', engine='xlsxwriter', options={'strings_to_urls': False}) as writer:
-                df_error_url.to_excel(writer, index=False)
+            # with pd.ExcelWriter(file_path, engine='xlsxwriter', options={'strings_to_urls': False}) as writer:
+            baseCore.writerToExcel(df_error_url, file_path)
+            # combined_data.to_excel(writer, index=False)
             time.sleep(1)
         else:
@@ -418,24 +455,19 @@ if __name__=="__main__":
         df_error_json = pd.DataFrame({'公众号:': json_error_origin,
                                       'code': json_error_code,
                                       '信息': json_error_biz})
+        file_path = f'./错误文件/{time_end}.xlsx'
+        if os.path.exists(file_path):
+            pass
+        else:
+            workbook = Workbook()
+            workbook.save(file_path)
+            workbook.close()
         # df_error_json.to_excel(f'./错误文件/{time_end}.xlsx', index=False)
-        with pd.ExcelWriter(f'./错误文件/{time_end}.xlsx', engine='xlsxwriter',
-                            options={'strings_to_urls': False}) as writer:
-            df_error_json.to_excel(writer, index=False)
+        # with pd.ExcelWriter(file_path, engine='xlsxwriter',
+        #                     options={'strings_to_urls': False}) as writer:
+        baseCore.writerToExcel(df_error_json, file_path)
+        # combined_data.to_excel(writer, index=False)
-        # error_text_txt = fakeid
-        # with open(f'./错误文件/{time_end}.txt', 'w') as f:
-        #     f.write(error_text_txt)
-        # time.sleep(2)
-        # browser_run = list_b[0]
-        # browser_run.refresh()
-        # cookie_list = browser_run.get_cookies()
-        # cur_url = browser_run.current_url
-        # token = cur_url.split('token=')[1]
-        # print(token)
-        # cookies = {}
-        # for cookie in cookie_list:
-        #     cookies[cookie['name']] = cookie['value']
         time_end = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
         log.info(f'运行结束,时间为:{time_end}')
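
Editor's note: a caveat in the last hunk — file_path = f'./错误文件/{time_end}.xlsx' embeds a timestamp such as 2023-07-12 15:30:00, and the colons make the name invalid on Windows file systems. A sanitized variant (illustrative, not part of this commit):

import time

time_end = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
safe_name = time_end.replace(':', '-')  # colons are rejected by Windows
file_path = f'./错误文件/{safe_name}.xlsx'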
......
+import pandas as pd
+def writeaa():
+    detailList = []
+    aa = {
+        'id': 3,
+        'name': 'qqqwe'
+    }
+    detailList.append(aa)
+    writerToExcel(detailList)
+# Append data to the Excel file
+def writerToExcel(detailList):
+    # filename='baidu搜索.xlsx'
+    # Read the existing xlsx file
+    existing_data = pd.read_excel(filename, engine='openpyxl')
+    # Build a DataFrame from the new rows
+    new_data = pd.DataFrame(data=detailList)
+    # Append the new rows to the end of the existing data
+    combined_data = existing_data.append(new_data, ignore_index=True)
+    # Write the combined result back to the xlsx file
+    combined_data.to_excel(filename, index=False)
+from openpyxl import Workbook
+if __name__ == '__main__':
+    filename = 'test1.xlsx'
+    # Create a workbook
+    workbook = Workbook(filename)
+    workbook.save(filename)
+    writeaa()
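
Editor's note: two caveats in this scratch test. writerToExcel reads the module-level filename rather than taking it as a parameter, and Workbook(filename) passes the file name into openpyxl's write_only flag (the constructor takes no file name argument), which leaves the saved workbook without a normal default sheet. A corrected sketch under those assumptions:

import pandas as pd
from openpyxl import Workbook

def writer_to_excel(detail_list, filename):
    # Append rows via pd.concat (DataFrame.append was removed in pandas 2.0)
    existing = pd.read_excel(filename, engine='openpyxl')
    combined = pd.concat([existing, pd.DataFrame(detail_list)], ignore_index=True)
    combined.to_excel(filename, index=False)

if __name__ == '__main__':
    filename = 'test1.xlsx'
    Workbook().save(filename)  # no-arg constructor; the file name goes to save()
    writer_to_excel([{'id': 3, 'name': 'qqqwe'}], filename)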