提交 ca6cfd8d 作者: 薛凌堃

Merge remote-tracking branch 'origin/master'

......@@ -2,6 +2,7 @@
'''
成功100 发送数据失败200 请求失败400 文章内容为空500
'''
import re
import requests, time, random, json, pymysql, redis
......@@ -32,7 +33,7 @@ def updatewxLink(link,info_source_code,state):
def getjsonInfo():
#从数据库中获取信息 一条
select_sql = "select * from wx_link where state=0 order by id asc limit 1"
select_sql = "select * from wx_link where state=100 order by id asc limit 1"
cursor_.execute(select_sql)
row = cursor_.fetchone()
cnx_.commit()
......@@ -85,8 +86,10 @@ def get_info(dict_json):
soup_news = BeautifulSoup(res_news.content, 'html.parser')
news_html = soup_news.find('div', {'id': 'js_content'})
news_html['style'] = 'width: 814px ; margin: 0 auto;'
try:
del news_html['style']
#del news_html['style']
news_html=rm_style_attr(news_html)
del news_html['id']
del news_html['class']
except:
......@@ -209,6 +212,27 @@ def get_info(dict_json):
updatewxLink(url_news,info_source_code,100)
return True
# Matches one complete CSS declaration for a property that must be removed.
# - The look-behind keeps "color" from matching inside "background-color"
#   (or any other "*-color" property), which would leave a broken fragment.
# - "[^;]*" also spans newlines, like the DOTALL intent of the old patterns.
# - "(?:;|$)" also removes a final declaration that has no trailing ';'.
_STRIPPED_STYLE_RE = re.compile(
    r'(?<![\w-])(?:visibility|font-family|font-size|color)\s*:[^;]*(?:;|$)',
    re.IGNORECASE,
)


def rm_style_attr(soup):
    """Strip unwanted inline CSS from an article node and fix its width.

    For every descendant tag that carries a ``style`` attribute, the
    ``visibility``, ``font-family``, ``color`` and ``font-size``
    declarations are removed; all other declarations are kept.  Afterwards
    the ``div#js_content`` container (either ``soup`` itself or a
    descendant of it) gets the fixed layout style
    ``'width: 814px ; margin: 0 auto;'``.

    Args:
        soup: A BeautifulSoup ``Tag`` / ``BeautifulSoup`` object, modified
            in place.  ``None`` is tolerated and returned unchanged.

    Returns:
        The same ``soup`` object that was passed in.
    """
    if soup is None:
        return soup

    for tag in soup.find_all(style=True):
        style = tag.get('style')
        # Defensive: only rewrite plain string values.
        if isinstance(style, str):
            tag['style'] = _STRIPPED_STYLE_RE.sub('', style)

    # select()/select_one() only look at descendants, so when the caller
    # hands us the js_content div itself we must target ``soup`` directly.
    # select_one() returns a single Tag (or None) rather than a list, so the
    # item assignment below is valid.
    if soup.get('id') == 'js_content':
        container = soup
    else:
        container = soup.select_one('div[id="js_content"]')
    if container is not None:
        container['style'] = 'width: 814px ; margin: 0 auto;'
    return soup
if __name__=="__main__":
num_caiji = 0
list_all_info = []
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论