提交 a492eb60 作者: 薛凌堃

微信公众号获取正文

上级 a9ad4913
...@@ -4,18 +4,14 @@ ...@@ -4,18 +4,14 @@
''' '''
import requests, time, random, json, pymysql, redis import requests, time, random, json, pymysql, redis
import pandas as pd
import urllib3 import urllib3
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from openpyxl import Workbook
from selenium import webdriver
from obs import ObsClient from obs import ObsClient
from kafka import KafkaProducer from kafka import KafkaProducer
# logging.basicConfig(filename='example.log', level=logging.INFO)
from base.BaseCore import BaseCore from base.BaseCore import BaseCore
import os
baseCore = BaseCore() baseCore = BaseCore()
log = baseCore.getLogger() log = baseCore.getLogger()
cnx_ = baseCore.cnx cnx_ = baseCore.cnx
...@@ -25,48 +21,6 @@ cursor_ = baseCore.cursor ...@@ -25,48 +21,6 @@ cursor_ = baseCore.cursor
r = baseCore.r r = baseCore.r
urllib3.disable_warnings() urllib3.disable_warnings()
def check_url(sid, article_url):
    """Return True if *article_url* was already collected for account *sid*.

    Membership is tracked in the per-account Redis set ``wx_url_{sid}``;
    SISMEMBER yields 1 when the member exists, 0 otherwise.
    """
    conn = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn')
    return conn.sismember(f'wx_url_{sid}', article_url) == 1
def add_url(sid, article_url):
    """Record *article_url* in the per-account Redis set ``wx_url_{sid}``.

    Returns True when the URL was already present (SADD added nothing,
    i.e. a duplicate), False when it was newly inserted.
    """
    r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn')
    # Store the URL in a set so lookups are deduplicated.  The original call
    # passed a stray extra member ``3`` (sadd(key, article_url, 3)), which
    # polluted the set with the literal "3" and skewed the added-count below;
    # only the URL itself is added now.
    res = r.sadd(f'wx_url_{sid}', article_url)
    if res == 0:  # 0 new members added -> the URL was already in the set
        return True
    else:
        return False
# #定时
# def getFromSql():
# selectSql = "SELECT info_source_code from info_source where site_uri like '%mp.weixin.qq.com%'"
# cursor.execute(selectSql)
# results = cursor.fetchall()
# result_list = [item[0] for item in results]
#
# #放入redis
# for item in result_list:
# r.rpush('WeiXinGZH:infoSourceCode', item)
#
# #刷新浏览器并获得token
# def flushAndGetToken(list_b):
# browser_run = list_b[0]
# log.info('======刷新浏览器=====')
# browser_run.refresh()
# cookie_list = browser_run.get_cookies()
# cur_url = browser_run.current_url
# token = cur_url.split('token=')[1]
# log.info(f'===========当前token为:{token}============')
# cookies = {}
# for cookie in cookie_list:
# cookies[cookie['name']] = cookie['value']
# return token,cookies
# Push a failed official-account code back onto the Redis queue for re-collection.
def rePutIntoR(item):
    """Requeue *item* (an info_source_code) on the 'WeiXinGZH:infoSourceCode' list."""
    r.rpush('WeiXinGZH:infoSourceCode', item)
...@@ -165,22 +119,33 @@ def get_info(dict_json): ...@@ -165,22 +119,33 @@ def get_info(dict_json):
img_one.extract() img_one.extract()
else: else:
try: try:
name_img = url_src.split('/')[-2] + '.' + url_src.split('wx_fmt=')[1] try:
except: name_img = url_src.split('/')[-2] + '.' + url_src.split('wx_fmt=')[1]
img_one.extract() except:
continue img_one.extract()
try: continue
res = requests.get(url_src, timeout=20) try:
except: res = requests.get(url_src, timeout=20)
img_one.extract() except:
resp = obsClient.putContent('zzsn', name_img, content=res.content) img_one.extract()
continue
url_img = resp['body']['objectUrl'] resp = obsClient.putContent('zzsn', name_img, content=res.content)
str_url_img = f'<img src="{url_img}">' try:
try: url_img = resp['body']['objectUrl']
img_one.replace_with(BeautifulSoup(str_url_img, 'lxml').img) str_url_img = f'<img src="{url_img}">'
except Exception as e:
log.info(f'--error--{url_news}-----------{e}')
updatewxLink(url_news, info_source_code, 300)
return False
try:
img_one.replace_with(BeautifulSoup(str_url_img, 'lxml').img)
except Exception as e:
log.info(f'--error--{url_news}-----------{e}')
updatewxLink(url_news, info_source_code, 300)
return False
except Exception as e: except Exception as e:
log.info(f'----{url_news}-----------{e}') log.info(f'--error--{url_news}-----------{e}')
updatewxLink(url_news, info_source_code, 600)
return False return False
for tag in news_html.descendants: for tag in news_html.descendants:
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论