Commit e694b41b by XveLingKun

5/10

Parent 6da55a3e
...
@@ -891,6 +891,7 @@ class BaseCore:
                page_size = doc.page_count
                for page in doc.pages():
                    retData['content'] += page.get_text()
        except:
            log = self.getLogger()
            log.error(f'文件损坏')
...
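The hunk above reads PDF text with what appears to be PyMuPDF (`page_count`, `pages()`, `get_text()` are fitz APIs). A minimal standalone sketch of that pattern, not part of the commit; `sample.pdf` is a placeholder path and the broad except mirrors the original's handling:

import fitz  # PyMuPDF

def read_pdf_text(path):
    """Return the concatenated text of all pages, or '' if the file cannot be parsed."""
    content = ''
    try:
        with fitz.open(path) as doc:
            page_count = doc.page_count  # same attribute the hunk above reads
            for page in doc.pages():
                content += page.get_text()
    except Exception:
        print('文件损坏')  # the original logs this via log.error
    return content

if __name__ == '__main__':
    print(len(read_pdf_text('sample.pdf')))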
...
@@ -324,7 +324,7 @@ def AnnualEnterprise():
    gn_social_list = [item[0] for item in gn_result]
    print('=======')
    for item in gn_social_list:
-        r.rpush('AnnualEnterprise:gnqy_socialCode', item)
+        r.rpush('AnnualEnterprise:zjh_socialCode', item)
    closeSql(cnx,cursor)
#企业年报定时任务
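For context, these task functions feed a Redis list that a separate crawler later drains. A minimal sketch of that push/pop pattern, not part of the commit; the connection settings below are placeholders, not the project's:

import redis

# placeholder connection settings, not the project's real ones
r = redis.Redis(host='127.0.0.1', port=6379, db=6)

def enqueue(codes):
    # producer side: push each social credit code onto the task list
    for code in codes:
        r.rpush('AnnualEnterprise:zjh_socialCode', code)

def dequeue_one():
    # consumer side: pop one code, or None when the list is empty
    item = r.lpop('AnnualEnterprise:zjh_socialCode')
    return item.decode() if item else None

if __name__ == '__main__':
    enqueue(['91370212MA3MJMA0XW'])
    print(dequeue_one())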
...
@@ -514,7 +514,7 @@ def NQEnterprise():
    for item in nq_social_list:
        #新三板企业财务数据 上市信息 核心人员已采集 企业动态、企业公告未采集 企业公告脚本已开发,企业动态需要每天放入redis
        # r.rpush('NQEnterprise:nq_Ipo', item)
-        r.rpush('NQEnterprise:nq_finance',item)
+        r.rpush('NQEnterprise:nq_finance', item)
        # r.rpush('NQEnterprise:nq_notice',item)
    closeSql(cnx_, cursor_)
...
@@ -674,10 +674,11 @@ if __name__ == "__main__":
    # BaseInfoEnterprise()
    # FBS()
    # MengZhi()
-    NQEnterprise()
+    # NQEnterprise()
    # SEC_CIK()
    # dujioashou()
    # omeng()
+    AnnualEnterprise()
    # AnnualEnterpriseUS()
    # NoticeEnterprise_task()
    # AnnualEnterprise_task()
...
...
@@ -88,6 +88,12 @@ def doJob():
            'version': 'TYC-Web'
        }
        cookies_list, id_cookie, user_name = token.get_cookies()
+        if cookies_list:
+            pass
+        else:
+            log.info("没有账号了,等待30分钟")
+            time.sleep(30 * 60)
+            return '', '', ''
        log.info(f'=====当前使用的是{user_name}的cookie======')
        cookies = {}
        for cookie in cookies_list:
...
@@ -97,7 +103,7 @@ def doJob():
        # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
        # social_code = baseCore.redicPullData('CorPersonEnterprise:gnqy_socialCode')
        # 判断 如果Redis中已经没有数据,则等待
-        social_code = '91110108780992804C'
+        social_code = '91370212MA3MJMA0XW'
        if social_code == None:
            time.sleep(20)
            continue
...
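The loop after `cookies = {}` presumably converts the Selenium-style cookie list returned by `token.get_cookies()` into a dict usable by requests. A small sketch under that assumption, not part of the commit; the cookie names and URL are illustrative placeholders:

import requests

def cookie_list_to_dict(cookies_list):
    # Selenium's driver.get_cookies() returns dicts with 'name' and 'value' keys
    return {c['name']: c['value'] for c in cookies_list}

if __name__ == '__main__':
    # illustrative values only
    cookies_list = [{'name': 'auth_token', 'value': 'xxx'}, {'name': 'tyc-user-info', 'value': 'yyy'}]
    cookies = cookie_list_to_dict(cookies_list)
    resp = requests.get('https://www.tianyancha.com/', cookies=cookies, timeout=20)  # placeholder URL
    print(resp.status_code)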
...
@@ -26,7 +26,7 @@ if __name__ == "__main__":
    name = input('所属用户:')
    driver = create_driver()
    driver.get(url)
-    time.sleep(80)
+    time.sleep(60)
    cookies = driver.get_cookies()
    # print(driver.get_cookies())
...
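The captured cookies are handed off by a token module that is not shown in this diff. A purely hypothetical sketch of persisting `driver.get_cookies()` to Redis as JSON; the key name and connection details are invented for illustration only:

import json
import redis

def save_cookies(cookies, user_name):
    # hypothetical store: one JSON blob per user inside a Redis hash
    r = redis.Redis(host='127.0.0.1', port=6379, db=0)
    r.hset('tyc:cookies', user_name, json.dumps(cookies, ensure_ascii=False))

# usage after the manual login wait in the script above:
# save_cookies(driver.get_cookies(), name)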
title gwyRelevantDocuments
chcp 65001
cd /d %~dp0
python38 gwyRelevantDocuments.py
\ No newline at end of file

title gwyfile
chcp 65001
cd /d %~dp0
python38 gwyfile.py
\ No newline at end of file

title gwyparts
chcp 65001
cd /d %~dp0
python38 gwyparts.py
import redis

from base.BaseCore import BaseCore

baseCore = BaseCore()
log = baseCore.getLogger()
cnx = baseCore.cnx
cursor = baseCore.cursor
r = redis.Redis(host="114.116.90.53", port=6380, password='clbzzsn', db=6)


def query():
    sql = "select id from wx_link where state = '300' order by publish_time desc"
    cursor.execute(sql)
    result = cursor.fetchall()
    return result


if __name__ == "__main__":
    result = query()
# result = ['1990264','1990265','1998085','1998086','2039312','2067942','2087699','2087700','2087701','2087774','2087775','2087776','2087777','2088091','2088092','2088093','2088445','2088446','2088447','2088455','2121977','2385237','2385238','2386227','2678376','2678377','2678421','2678422','2678425','2731944','2731945','2731946','2732184','2732185','2732205','2732206','2732313','2732314','2732317','2732318','2732319','2732320','2732321','2732323','2732438','2732439','2732440','2732453','2732455','2732456','2732483','2732497','2732910','2732911','2732912','2732913','2732915','2732918','2732952','2732953','2732958','2732959','2732960','2733052','2733053','2733097','2733100','2733101','2733120','2733121','2733123','2733124','2733127','2733128','2733130','2733146','2733147','2733148','2733149','2733150','2733151','2733152','2733153','2733154','2733155','2733156','2733157','2733328','2733345','2733346','2733515','2733518','2733519','2733534','2733536','2733537','2733565','2733566','2733595','2733596','2733598','2733627','2733703','2733705','2733706','2733814','2733958','2733959','2733960','2734035','2734062','2734113','2734180','2734182','2734270','2734271','2734272','2734273','2734274','2734275','2734276','2734307','2734308','2734311','2734312','2734313','2734314','2734315','2734316','2734317','2734324','2734325','2734326','2734328','2734329','2734330','2734339','2734340','2734341','2734388','2734389','2734536','2734537','2734538','2735181','2735182','2735183','2735184','2735185','2735186','2735187','2735188','2735190','2735191','2735194','2735196','2735266','2735267','2735268','2735269','2735270','2735271','2735272','2735276','2735277','2735278','2735279','2735280','2735281','2735282','2735283','2735297','2735561','2735625','2735627','2735628','2735662','2735663','2736211','2736212','2736213','2736214','2736215','2736216','2736544','2736545','2736546','2736559','2736677','2736678','2736817','2736819','2736820','2736821','2736823','2736824','2736825','2736828','2736905','2736906','2736907','2736912','2736913','2736914','2736915','2736916','2736917','2736963','2736964','2736988','2736989','2736990','2736991','2737108','2737111','2737600','2737601','2737604','2737701','2737702','2737703','2737759','2737928','2737930','2737931','2738046','2738050','2738051','2738052','2738053','2738356','2738357','2738358','2738456','2738460','2738461','2738485','2738486','2738607','2738608','2738609','2739613','2739614','2739615','2739649','2739650','2739651','2739908','2739909','2739910','2739911','2739912','2739913','2740019','2740022','2740023','2740123','2740207','2740208','2740209','2740252','2740255','2740256','2740269','2740270','2740271','2740412','2740413','2740485','2740486','2740487','2740535','2740536','2740537','2740538','2740539','2740540','2740541','2740542','2740543','2740544','2740545','2740546','2740547','2740548','2740549','2740659','2740660','2740661','2740662','2740663','2740664','2740924','2740926','2740927','2740964','2740965','2740966','2741091','2741092','2741093','2741098','2741099','2741100','2741129','2741130','2741131','2744702','2744703','2744704','2744705','2759363','2759364','2759365','2759546','2759547','2759548','2759549','2759550','2759551','2759552','2759553','2759554','2759555','2759556','2759800','2759801','2759802','2759803','2759805','2759806','2759829','2760062','2760063','2760064','2760729','2760730','2760733','2760899','2760900','2760902','2760903','2760904','2760905','2761327','2761328','2761332','2761783','2761784','2761785','2761795','2761797','2761799','2761805','27
61819','2761820','2761821','2761822','2761891','2761892','2761893','2761894','2761895','2761896','2761897','2761900','2762070','2762071','2762072','2762073','2762074','2762075','2762076','2762077','2762078','2762079','2762080','2762081','2762082','2762083','2762084','2762085','2762087','2762088','2762089','2762090','2762091','2762092','2762093','2762125','2762126','2762127','2762137','2762138','2762139','2762160','2762161','2762162','2762195','2762196','2762197','2762410','2762411','2762419','2762470','2762471','2762472','2762873','2762875','2762877','2762930','2762931','2762937','2762938','2762939','2762940','2762941','2763276','2763277','2763278','2763304','2763305','2763307','2763308','2763309','2763310','2763312','2763697','2763698','2763699','2763700','2763701','2763702','2763703','2764035','2764036','2764037','2764039','2764040','2764041','2764042','2764043','2764044','2764045','2764046','2764047','2764048','2764049','2764050','2764051','2764055','2764056','2764057','2764059','2764060','2764062','2764063','2764064','2764065','2764164','2764165','2764369','2764370','2764567','2764568','2764570','2764618','2764619','2764620','2764744','2764745','2764748','2764770','2764771','2764772','2764869','2764870','2764871','2764898','2764899','2764900','2764901','2764902','2764903','2764904','2764905','2764906','2764907','2764908','2764909','2764910','2764911','2764912','2764913','2764914','2764915','2764916','2764917','2764918','2764933','2764934','2764935','2764936','2764937','2764938','2764939','2764957','2764958','2764959','2764960','2764961','2764963','2764964','2764965','2764966','2765020','2765021','2765022','2765023','2765024','2765026','2765229','2765230','2765231','2765232','2765233','2765293','2765294','2765295','2765296','2765297','2765298','2765299','2765300','2765301','2765302','2765303','2765304','2765305','2765306','2765307','2765308','2765414','2765416','2765424','2765571','2765572','2765573','2765796','2765797','2765798','2765804','2765805','2765807','2765808','2765809','2765810','2765811','2765812','2765813','2765814','2765815','2765816','2765820','2765821','2765822','2766021','2766022','2766023','2766024','2766025','2766048','2766060','2766061','2766062','2766063','2766064','2766066','2766068','2766069','2766071','2766072','2766073','2766074','2766075','2766169','2766194','2766195','2766197','2766208','2766209','2766244','2766245','2766246','2766536','2766537','2766538','2766539','2766540','2766547','2766669','2766670','2766671','2766673','2766674','2766675','2766676','2766677','2766678','2766679','2766680','2766681','2766682','2766790','2766792','2766826','2766827','2767032','2767120','2767121','2767122','2767123','2767126','2767127','2767128','2767129','2767130','2767131','2767132','2767133','2767134','2767135','2767136','2767137','2767138','2767139','2767173','2767174','2767408','2767409','2767410','2767411','2767502','2767503','2767534','2767535','2767545','2767546','2767547','2767548','2767600','2767602','2767642','2767643','2767655','2767656','2767717','2767718','2767719','2767720','2767732','2767740','2767741','2767756','2767758','2767766','2767767','2767807','2767808','2767809','2767810','2767817','2767818','2767825','2767827','2767828','2767829','2767840','2767887','2767898','2767899','2767900','2767901','2767902','2767903','2767906','2767907','2767908','2767955','2768155','2768156','2768166','2768167','2768168','2768170','2768284','2768360','2768368','2768378','2768826','2768827','2768845','2768846','2768847','2768848','2768849','2768850','2768851','2768852','2768871
','2768872','2768877','2768878','2768879','2768880','2768888','2768912','2768913','2768914','2768916','2768917','2768918','2768919','2768920','2768921','2768922','2768923','2768924','2769071','2769075','2769240','2769258','2769270','2769271','2769272','2769399','2769400','2769427','2769428','2769429','2769430','2769489','2769491','2769542','2769543','2769568','2769569','2770721','2770722','2770723','2770724','2770725','2770726','2770728','2770729','2770730','2770731','2770732','2770733','2770734','2770738','2770739','2770793','2770796','2770903','2770904','2770906','2770907','2770909','2770977','2770978','2771278','2771280','2771661','2771662','2771929','2771932','2772086','2772087','2772088','2772089','2772090','2772130','2772132','2772319','2772320','2772409','2772410','2772423','2772424','2772425','2772426','2772630','2772632','2772721','2772723','2772724','2772725','2772737','2772738','2772749','2772750','2772751','2772752','2772753','2773224','2773227','2773253','2773254','2773287','2773289','2773346','2773347','2773348','2773349','2773350','2773351','2773385','2773386','2773387','2773388','2773558','2773559','2773563','2773564','2773565','2773566','2773567','2773568','2773777','2773778','2773805','2773806','2773808','2773834','2773836','2773837','2773838','2773839','2773841','2774222','2774223','2774302','2774303','2774319','2774320','2774334','2774335','2774440','2774512','2774513','2774527','2774528','2774529','2774531','2774660','2774807','2774830','2774847','2775022','2775023','2775164','2775165','2775350','2775351','2775386','2775387','2775482','2775483','2775677','2775680','2776678','2776681','2776869','2776872','2776873','2776881','2776882','2776883','2776884','2776885','2776993','2776994','2776996','2776997','2777106','2777115','2777123','2777124','2777125','2777126','2777127','2777128','2777129','2777130','2777131','2777132','2777237','2777240','2777814','2777815','2777817','2777819','2777820','2777821','2777822','2777823','2777824','2777825','2777826','2777827','2777828','2777829','2777830','2778099','2778100','2778267','2778268','2778484','2778486','2778601','2778603','2778707','2778708','2779029','2779030','2779031','2779032','2779033','2779034','2779035','2779047','2779048','2779253','2779369','2779371','2779528','2779529','2779530','2779533','2779591','2779592','2779790','2779791','2780196','2780501','2780504']
    for id in result:
        # 放入redis
        r.rpush("WeiXinGZH:linkid_fail", id[0])
        # r.rpush("WeiXinGZH:linkid_fail", id)
\ No newline at end of file
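The list filled above is presumably drained elsewhere by the WeChat crawler. A minimal consumer sketch against the same key, not part of the commit; the re-crawl step is a placeholder:

import redis

r = redis.Redis(host="114.116.90.53", port=6380, password='clbzzsn', db=6)

def drain():
    while True:
        link_id = r.lpop("WeiXinGZH:linkid_fail")
        if link_id is None:
            break  # queue is empty
        # placeholder for the real re-crawl logic keyed by wx_link.id
        print(f"re-crawl link id {link_id.decode()}")

if __name__ == "__main__":
    drain()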
"""获取每天失败的列表--返回给数据组"""
import datetime
import time
import pandas as pd
import smtplib
from email.header import Header
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
import pymysql
from base.BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# cnx = baseCore.cnx
# cursor = baseCore.cursor
def sendEmail(file_name, receiver, filename):
    file = open(file_name, 'rb').read()
    # 发送邮箱地址
    sender = '1195236739@qq.com'
    # 发送邮箱登录 账户 密码
    username = '1195236739@qq.com'
    password = 'gatvszshadvpgjci'
    smtpserver = 'smtp.qq.com'
    # # 接收邮箱地址
    # receiver = 'fujunxue@ciglobal.cn'
    maile_title = filename
    message = MIMEMultipart()
    message['From'] = sender
    message['To'] = receiver
    message['Subject'] = Header(maile_title, 'utf-8')
    message.attach(MIMEText(filename, 'plain', 'utf-8'))
    xlsxApart = MIMEApplication(file)
    xlsxApart.add_header('Content-Disposition', 'attachment', filename=filename)
    message.attach(xlsxApart)
    smtpObj = smtplib.SMTP_SSL(smtpserver)  # 注意:如果遇到发送失败的情况(提示远程主机拒接连接),这里要使用SMTP_SSL方法
    smtpObj.connect(smtpserver, port=465)
    smtpObj.login(username, password)
    smtpObj.sendmail(sender, receiver, message.as_string())
    print("邮件发送成功!!!")
    smtpObj.quit()
# 解析失败
def get_failed_list(today_time):
    cnx = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='caiji',
                          charset='utf8mb4')
    cursor = cnx.cursor()
    sql = f"select * from wx_link where (state = '300' or state = '600' or state = '200') and create_time >= '{today_time}' "
    # sql = f"select * from wx_link where state='800' "
    print(sql)
    cursor.execute(sql)
    result = cursor.fetchall()
    cursor.close()
    cnx.close()
    return result
# 发布内容不存在
def get_null_list(today_time):
    cnx = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='caiji',
                          charset='utf8mb4')
    cursor = cnx.cursor()
    sql = f"select * from wx_link where (state='800' or state='500') and create_time >= '{today_time}' "
    cursor.execute(sql)
    result = cursor.fetchall()
    cursor.close()
    cnx.close()
    return result
def get_info(result):
    fail_list = []
    for info in result:
        site_name = info[3]  # 公众号
        info_source_code = info[4]
        title = info[5]
        publish_time = info[6]
        link = info[7]
        # 写入dataframe
        # 创建一个字典,其中包含当前行的数据
        row = {
            '公众号': site_name,
            '公众号编码': info_source_code,
            '标题': title,
            '发布时间': publish_time,
            '链接': link
        }
        fail_list.append(row)
    return fail_list
if __name__ == "__main__":
    # 创建一个空的DataFrame,其中包含你需要的列名
    while True:
        # 计算今天的时间
        now = datetime.datetime.now()
        print(now)
        time.sleep(1)
        print(datetime.datetime.now().replace(hour=23, minute=59, second=59, microsecond=0))
        if now >= datetime.datetime.now().replace(hour=23, minute=59, second=59, microsecond=0):
            pass
        else:
            continue
        today_time = str(now.strftime("%Y-%m-%d 00:00:00"))
        print(today_time)
        result = get_failed_list(today_time)
        result_null = get_null_list(today_time)
        if result:
            fail_list = get_info(result)
            result_df = pd.DataFrame(fail_list)
            result_df.to_excel(f"./database/{today_time[:10]}_微信公众号采集失败列表.xlsx", index=False)
            sendEmail(f"./database/{today_time[:10]}_微信公众号采集失败列表.xlsx", "fujunxue@ciglobal.cn", "微信公众号采集失败列表")
            sendEmail(f"./database/{today_time[:10]}_微信公众号采集失败列表.xlsx", "mr@ciglobal.cn", "微信公众号采集失败列表")
        else:
            log.info(f'{today_time} 没有采集失败的文章')
        if result_null:
            null_list = get_info(result_null)
            null_df = pd.DataFrame(null_list)
            null_df.to_excel(f"./database/{today_time[:10]}_微信公众号文章内容为空列表.xlsx", index=False)
            sendEmail(f"./database/{today_time[:10]}_微信公众号文章内容为空列表.xlsx", "fujunxue@ciglobal.cn", "微信公众号文章内容为空列表")
            sendEmail(f"./database/{today_time[:10]}_微信公众号文章内容为空列表.xlsx", "mr@ciglobal.cn", "微信公众号文章内容为空列表")
        else:
            log.info(f'{today_time} 没有采集到空的文章')
# -*- coding: utf-8 -*-
'''
-成功100 发送数据失败200 请求失败400 文章内容为空500 处理style标签失败700
+成功100 发送数据失败200 请求失败400 文章内容为空500 处理style标签失败700 发布内容不存在800 图片处理失败300、600
'''
import re
...
@@ -118,6 +118,10 @@ def get_info(dict_json, linkid):
        # updatewxLink(url_news, info_source_code, 400)
        return False
    soup_news = BeautifulSoup(res_news.content, 'html.parser')
+    if '此内容发送失败无法查看' in soup_news.text or '该页面不存在' in soup_news.text or '该内容已被发布者删除' in soup_news.text or '此内容因违规无法查看' in soup_news.text:
+        log.info(f'--errorCode:800--{origin}---{news_date}---{news_title}----内容无法查看')
+        updatewxLink(url_news, info_source_code, 800)
+        return False
    try:
        news_html = soup_news.find('div', {'id': 'js_content'})
        news_html['style'] = 'width: 814px ; margin: 0 auto;'
...
@@ -228,7 +232,7 @@ def get_info(dict_json, linkid):
    }
    for nnn in range(0, 3):
        try:
-            producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
+            producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 7, 0))
            kafka_result = producer.send("crawlerInfo", json.dumps(dic_info, ensure_ascii=False).encode('utf8'))
            kafka_time_out = kafka_result.get(timeout=10)
            # add_url(sid, url_news)
...
@@ -252,7 +256,7 @@ def get_info(dict_json, linkid):
    }
    for nnn2 in range(0, 3):
        try:
-            producer2 = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
+            producer2 = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 7, 0))
            kafka_result2 = producer2.send("collectionAndDispatcherInfo",
                                           json.dumps(dic_info2, ensure_ascii=False).encode('utf8'))
            break
...
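The only change in the two Kafka hunks above is pinning `api_version` on the kafka-python producer, which skips the broker version probe that can fail during auto-detection. A minimal sketch of the same send-with-retries pattern, not part of the commit:

import json
from kafka import KafkaProducer

def send_info(dic_info, topic="crawlerInfo"):
    # pinning api_version skips kafka-python's broker version probe
    for _ in range(3):
        try:
            producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'],
                                     api_version=(2, 7, 0))
            future = producer.send(topic, json.dumps(dic_info, ensure_ascii=False).encode('utf8'))
            future.get(timeout=10)
            return True
        except Exception as e:
            print(f'kafka send failed: {e}')
    return False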
-import time
-import pandas as pd
+import re
+import requests
+from bs4 import BeautifulSoup
+from retry import retry
-# def writeaa():
-#     detailList=[]
-#     aa={
-#         'id':3,
-#         'name':'qqqwe'
-#     }
-#     detailList.append(aa)
-#     writerToExcel(detailList)
-# 将数据追加到excel
-# def writerToExcel(detailList):
-#     # filename='baidu搜索.xlsx'
-#     # 读取已存在的xlsx文件
-#     existing_data = pd.read_excel(filename,engine='openpyxl')
-#     # 创建新的数据
-#     new_data = pd.DataFrame(data=detailList)
-#     # 将新数据添加到现有数据的末尾
-#     combined_data = existing_data.append(new_data, ignore_index=True)
-#     # 将结果写入到xlsx文件
-#     combined_data.to_excel(filename, index=False)
-#
-# from openpyxl import Workbook
-#
-# if __name__ == '__main__':
-#     filename='test1.xlsx'
-#     # # 创建一个工作簿
-#     workbook = Workbook(filename)
-#     workbook.save(filename)
-#     writeaa()
-# gpdm = '01109.HK'
-# if 'HK' in str(gpdm):
-#     tmp_g = str(gpdm).split('.')[0]
-#     if len(tmp_g) == 5:
-#         gpdm = str(gpdm)[1:]
-#         print(gpdm)
-# else:
-#     pass
from base.BaseCore import BaseCore
baseCore = BaseCore()
-r = baseCore.r
-# #自增并设置过期时间
-# while True:
-#     key = 'mykey'
-#     expiration_time = 60 # 设置过期时间 60秒
-#     #设置自增
-#     r.incr(key)
-#
-#
-#     value = int(r.get(key).decode())
-#     if value > 10:
-#         print(value)
-#         # 设置过期时间
-#         r.expire(key, expiration_time)
-#         time.sleep(70)
-#         print('------------------')
-#         continue
-#     # print(value)
-#
-#     print("==========")
-# expiration_time = 60
-# # 创建PubSub对象
-# p = r.pubsub()
-#
-# # 订阅过期事件
-# p.psubscribe('__keyevent@6__:expired')
-# aa = p.listen()
-# # 监听过期事件
-# for message in p.listen():
-#     if message['type'] == 'pmessage':
-#         expired_key = message['data'].decode()
-#         print('过期的key:', expired_key)
-#         if expired_key == 'counter':
-#             # 执行重置操作
-#             r.set('counter', 0)
-#             print('计数器已重置为0')
-# # 设置自增
-# r.incr('counter')
-# # 设置过期时间
-# r.expire('counter', expiration_time)
-for i in range(0, 24, 5):
-    print(i)
+@retry(tries=3, delay=2)
+def getrequest(url_news):
+    ip = baseCore.get_proxy()
+    res_news = requests.get(url_news, proxies=ip, timeout=20)
+    if res_news.status_code != 200:
+        raise
+    return res_news
+
+
+def rm_style_attr(soup):
+    # 查找所有含有style属性的标签
+    style_tags = soup.find_all(style=True)
+    # 遍历每个style标签
+    for style_tag in style_tags:
+        try:
+            # 使用正则表达式替换
+            styleattr = style_tag['style']
+            styleattr = re.sub(r'visibility:(?s).{1,}?;', '', styleattr)
+            styleattr = re.sub(r'font-family:(?s).{1,}?;', '', styleattr)
+            styleattr = re.sub(r'color:(?s).{1,}?;', '', styleattr)
+            styleattr = re.sub(r'font-size:(?s).{1,}?;', '', styleattr)
+            style_tag['style'] = styleattr
+        except:
+            continue
+
+    # first_div = soup.select('div[id="js_content"]')
+    # # 设置style属性
+    # first_div['style'] = 'width: 814px ; margin: 0 auto;'
+
+    first_div = soup.select('div[id="js_content"]')
+    if first_div:
+        first_div = first_div[0]  # 获取第一个匹配的元素
+        first_div['style'] = 'width: 814px ; margin: 0 auto;'  # 设置style属性
+
+    return soup
+
+
+if __name__ == "__main__":
+    # url_news = "http://mp.weixin.qq.com/s?__biz=MjM5NDMxOTMwNg==&mid=2653175413&idx=1&sn=8c0853ddab6e27799c4452e0b6e63156&chksm=bd5900d08a2e89c698de51f102b7423b33a27522966ca2218ca1b8ef290837b0087173c74bcb#rd"
+    url_news = "http://mp.weixin.qq.com/s?__biz=MzU4ODQwNTIxMw==&mid=2247528290&idx=4&sn=370655b44dfd31b99984e2eeeb4868e0&chksm=fddf6fd0caa8e6c63a0b5e4fece250415fcb56f03f305450b1434978769b443eaa416342326e#rd"
+    # 修改请求方法,retry 3次
+    try:
+        res_news = getrequest(url_news)
+        print(res_news)
+    except:
+        try:
+            res_news = requests.get(url_news, timeout=20)
+            print('请求成功')
+        except:
+            res_news = None
+            pass
+    soup_news = BeautifulSoup(res_news.content, 'html.parser')
+    if '此内容发送失败无法查看' in soup_news.text or '该页面不存在' in soup_news.text or '该内容已被发布者删除' in soup_news.text or '此内容因违规无法查看' in soup_news.text:
+        print('失败')
+    try:
+        news_html = soup_news.find('div', {'id': 'js_content'})
+        news_html['style'] = 'width: 814px ; margin: 0 auto;'
+        # del news_html['style']
+        news_html = rm_style_attr(news_html)
+        # print(news_html)
+        del news_html['id']
+        del news_html['class']
+    except Exception as e:
+        print(e)
+        news_html = None
+    # print(news_html)
+    news_content = news_html.text
+    list_img = news_html.find_all('img')
+    for num_img in range(len(list_img)):
+        img_one = list_img[num_img]
+        url_src = img_one.get('data-src')
+        # print(url_src)
+        if url_src and 'gif' in url_src:
+            url_img = ''
+            img_one.extract()
+        else:
+            try:
+                try:
+                    name_img = url_src.split('/')[-2] + '.' + url_src.split('wx_fmt=')[1]
+                except:
+                    img_one.extract()
+                    continue
+                try:
+                    res = requests.get(url_src, timeout=20)
+                except:
+                    img_one.extract()
+                    continue
+            except Exception as e:
+                print(f'--error--{url_news}-----------{e}')
+    for tag in news_html.descendants:
+        try:
+            del tag['style']
+        except:
+            pass
+    list_section = news_html.find_all('section')
+    for section in list_section:
+        section.name = 'div'
+    print(news_html)
\ No newline at end of file