提交 54d3083a 作者: XveLingKun

微信公众号采集

上级 f434a907
...@@ -24,7 +24,7 @@ r = baseCore.r ...@@ -24,7 +24,7 @@ r = baseCore.r
urllib3.disable_warnings() urllib3.disable_warnings()
def rePutIntoR(item): def rePutIntoR(item):
r.rpush('WeiXinGZH:infoSourceCode', item) r.rpush('WeiXinGZH:linkid', item)
def updatewxLink(link,info_source_code,state): def updatewxLink(link,info_source_code,state):
updateSuccess = f"update wx_link set state= {state} where link='{link}' and info_source_code='{info_source_code}' " updateSuccess = f"update wx_link set state= {state} where link='{link}' and info_source_code='{info_source_code}' "
...@@ -39,7 +39,7 @@ def getjsonInfo(): ...@@ -39,7 +39,7 @@ def getjsonInfo():
pass pass
else: else:
log.info('-----没有数据了-----') log.info('-----没有数据了-----')
return False return False, False
#从数据库中获取信息 一条 #从数据库中获取信息 一条
select_sql = f"select * from wx_link where state=0 and id= '{linkid}'" select_sql = f"select * from wx_link where state=0 and id= '{linkid}'"
cursor_.execute(select_sql) cursor_.execute(select_sql)
...@@ -49,7 +49,7 @@ def getjsonInfo(): ...@@ -49,7 +49,7 @@ def getjsonInfo():
pass pass
else: else:
log.info('-----没有数据了-----') log.info('-----没有数据了-----')
return False return False, False
dict_json = { dict_json = {
'sid':row[1], 'sid':row[1],
'site_uri':row[2], 'site_uri':row[2],
...@@ -63,7 +63,7 @@ def getjsonInfo(): ...@@ -63,7 +63,7 @@ def getjsonInfo():
update_sql = f"update wx_link set state=1 where link='{row[7]}' and info_source_code='{row[4]}' " update_sql = f"update wx_link set state=1 where link='{row[7]}' and info_source_code='{row[4]}' "
cursor_.execute(update_sql) cursor_.execute(update_sql)
cnx_.commit() cnx_.commit()
return dict_json return dict_json, linkid
@retry(tries=3,delay=2) @retry(tries=3,delay=2)
def getrequest(url_news): def getrequest(url_news):
...@@ -72,7 +72,7 @@ def getrequest(url_news): ...@@ -72,7 +72,7 @@ def getrequest(url_news):
res_news = requests.get(url_news, proxies=ip, timeout=20) res_news = requests.get(url_news, proxies=ip, timeout=20)
if res_news.status_code != 200: if res_news.status_code != 200:
raise raise
return res_news
def get_info(dict_json): def get_info(dict_json):
# list_all_info = [] # list_all_info = []
...@@ -102,6 +102,7 @@ def get_info(dict_json): ...@@ -102,6 +102,7 @@ def get_info(dict_json):
# 修改请求方法,retry 3次 # 修改请求方法,retry 3次
try: try:
res_news = getrequest(url_news) res_news = getrequest(url_news)
# print(res_news)
except: except:
try: try:
res_news = requests.get(url_news, timeout=20) res_news = requests.get(url_news, timeout=20)
...@@ -118,7 +119,7 @@ def get_info(dict_json): ...@@ -118,7 +119,7 @@ def get_info(dict_json):
news_html = rm_style_attr(news_html) news_html = rm_style_attr(news_html)
del news_html['id'] del news_html['id']
del news_html['class'] del news_html['class']
except: except Exception as e:
log.error(f'{url_news}-----{info_source_code}') log.error(f'{url_news}-----{info_source_code}')
return False return False
try: try:
...@@ -255,9 +256,15 @@ def rm_style_attr(soup): ...@@ -255,9 +256,15 @@ def rm_style_attr(soup):
except: except:
continue continue
# first_div = soup.select('div[id="js_content"]')
# # 设置style属性
# first_div['style'] = 'width: 814px ; margin: 0 auto;'
first_div = soup.select('div[id="js_content"]') first_div = soup.select('div[id="js_content"]')
# 设置style属性 if first_div:
first_div['style'] = 'width: 814px ; margin: 0 auto;' first_div = first_div[0] # 获取第一个匹配的元素
first_div['style'] = 'width: 814px ; margin: 0 auto;' # 设置style属性
return soup return soup
if __name__=="__main__": if __name__=="__main__":
...@@ -267,12 +274,15 @@ if __name__=="__main__": ...@@ -267,12 +274,15 @@ if __name__=="__main__":
#一次拿取一篇文章 #一次拿取一篇文章
# todo: 从redis拿数据 更新mysql状态 # todo: 从redis拿数据 更新mysql状态
dict_json =getjsonInfo() dict_json, linkid =getjsonInfo()
if dict_json: try:
if get_info(dict_json): if dict_json:
num_caiji = num_caiji + 1 if get_info(dict_json):
log.info(f'-----已采集{num_caiji}篇文章---来源{dict_json["site_name"]}----') num_caiji = num_caiji + 1
else: log.info(f'-----已采集{num_caiji}篇文章---来源{dict_json["site_name"]}----')
continue else:
continue
except:
rePutIntoR(linkid)
baseCore.close() baseCore.close()
\ No newline at end of file
# -*- coding: utf-8 -*-
import re
import time import time
import pandas as pd import pandas as pd
...@@ -40,6 +43,7 @@ import pandas as pd ...@@ -40,6 +43,7 @@ import pandas as pd
# else: # else:
# pass # pass
import redis import redis
from bs4 import BeautifulSoup
def check_url(): def check_url():
...@@ -61,11 +65,51 @@ def test(dic_user_count): ...@@ -61,11 +65,51 @@ def test(dic_user_count):
return dic_user_count return dic_user_count
def test1(): def test1():
dic_user_count = {"A":0} dic_user_count = {}
for i in range(3): for i in range(3):
dic_user_count = test(dic_user_count) dic_user_count = test(dic_user_count)
print(dic_user_count) print(dic_user_count)
def rm_style_attr(soup):
    """Strip selected CSS declarations from every inline ``style`` attribute.

    For each tag that carries a ``style`` attribute, the ``visibility``,
    ``font-family``, ``color`` and ``font-size`` declarations (up to and
    including their terminating ``;``) are removed.  Afterwards the first
    ``div`` with ``id="js_content"``, if present, gets a fixed-width,
    centred style.

    :param soup: parsed document exposing ``find_all`` and ``select``
        (a BeautifulSoup object in this file).
    :return: the same ``soup`` object, modified in place.
    """
    # The dot must also match newlines.  The original patterns embedded
    # ``(?s)`` in the middle of the expression, which Python 3.11+ rejects
    # with re.error; passing re.DOTALL as a flag is the equivalent form.
    # NOTE(review): 'color:' also matches inside 'background-color:', and a
    # final declaration without a trailing ';' is never removed -- confirm
    # whether that is intended.
    patterns = (
        r'visibility:.{1,}?;',
        r'font-family:.{1,}?;',
        r'color:.{1,}?;',
        r'font-size:.{1,}?;',
    )

    # Visit every tag that has a style attribute.
    for style_tag in soup.find_all(style=True):
        try:
            styleattr = style_tag['style']
            for pattern in patterns:
                styleattr = re.sub(pattern, '', styleattr, flags=re.DOTALL)
            style_tag['style'] = styleattr
        except (KeyError, TypeError):
            # Best effort: skip tags whose style is missing or not a string.
            continue

    first_div = soup.select('div[id="js_content"]')
    if first_div:
        # Only the first matching element receives the layout style.
        first_div[0]['style'] = 'width: 814px ; margin: 0 auto;'
    return soup
def aaa(dic_user_count):
    """Add 3 to the ``"A"`` counter of ``dic_user_count`` in place.

    The counter is created when the key is absent.  Nothing is returned;
    the caller observes the change through the dict it passed in.

    :param dic_user_count: mutable mapping of counters.
    """
    for _ in range(3):
        # dict.get supplies the 0 start value, so no membership test is needed.
        dic_user_count["A"] = dic_user_count.get("A", 0) + 1
if __name__ == "__main__": if __name__ == "__main__":
test1() # html = """<div class="rich_media_content js_underline_content autoTypeSetting24psection" id="js_content" style="width: 814px ; margin: 0 auto;"><section style="white-space: normal;text-indent: 2em;margin-top: 16px;margin-bottom: 16px;line-height: 1.75em;"><span style="outline: 0px;color: rgb(51, 51, 51);letter-spacing: 0.544px;text-indent: 2em;">3月31日,中国十七冶建安分公司承建的黄石EOD项目大泉路生态廊道工程马鞍山路高架桥顺利通车。<br/></span></section><section style="white-space: normal;margin-top: 16px;margin-bottom: 16px;line-height: 1.75em;text-indent: 0em;"><img class="rich_pages wxw-img" data-backh="312" data-backw="578" data-imgfileid="100024978" data-ratio="0.539568345323741" data-src="https://mmbiz.qpic.cn/mmbiz_png/ibEX3YMicPu80BnkeVDWh45k2S5saQQqDfvKJfBiblZO6OjbyWrYSJ3c2fib2eQReXQSLMONFicRD0fT2OdFY4da0og/640?wx_fmt=png&amp;from=appmsg" data-type="png" data-w="695" style="border: none;width: 100%;height: auto;" title=""/></section><section style="white-space: normal;text-indent: 2em;margin-top: 16px;margin-bottom: 16px;line-height: 1.75em;"><span style="outline: 0px;color: rgb(51, 51, 51);letter-spacing: 0.544px;text-indent: 2em;">马鞍山路高架桥全长315米,设计速度为60km/h,采用双向四车道。该桥横跨马鞍山路,建成后将缓解马鞍山路和大泉路交叉口的交通压力,对减轻市民们的出行焦虑有着重要意义,同时将进一步加强大冶、阳新与黄石城区的联系,为黄石更好融入“武鄂黄黄”都市圈奠定基础。</span></section><section style="white-space: normal;text-indent: 2em;margin-top: 16px;margin-bottom: 16px;line-height: 1.75em;"><span style="outline: 0px;color: rgb(51, 51, 51);letter-spacing: 0.544px;text-indent: 2em;">为全力推动马鞍山路高架桥建成,项目管理团队锚定节点目标不动摇,倒排工期,通过精心组织,优化施工方案,主动出击,联合行政主管部门多次召开推进会,克服施工过程中的管线迁改难题,为马鞍山路高架桥顺利通车提供坚实组织保障。</span></section><section style="white-space: normal;margin-top: 16px;margin-bottom: 16px;line-height: 1.75em;text-indent: 0em;"><img class="rich_pages wxw-img" data-backh="374" data-backw="578" data-imgfileid="100024979" data-ratio="0.6474820143884892" 
data-src="https://mmbiz.qpic.cn/mmbiz_png/ibEX3YMicPu80BnkeVDWh45k2S5saQQqDfZBWgkUaG20wMwKDxZ4SjjFwZg0bvicuAcLPUia6ECktibM9KPj5NIrmicg/640?wx_fmt=png&amp;from=appmsg" data-type="png" data-w="695" style="border: none;width: 100%;height: auto;" title=""/></section><section style="white-space: normal;text-indent: 2em;margin-top: 16px;margin-bottom: 16px;line-height: 1.75em;"><span style="outline: 0px;color: rgb(51, 51, 51);letter-spacing: 0.544px;text-indent: 2em;">项目部将牢牢把握“十七冶发展要义”,深入践行“24小时工作法”,全力打造精品工程、安全工程、民生工程,为黄石打造武汉都市圈贡献十七冶力量。</span></section><section style="white-space: normal;text-indent: 2em;margin-top: 16px;margin-bottom: 16px;line-height: 1.75em;"><span style="outline: 0px;color: rgb(51, 51, 51);letter-spacing: 0.544px;text-indent: 2em;"></span></section><section style="white-space: normal;text-indent: 2em;margin-top: 16px;margin-bottom: 16px;line-height: 1.75em;"><span style="outline: 0px;color: rgb(51, 51, 51);letter-spacing: 0.544px;text-indent: 2em;"></span></section><section style="margin-top: 16px;margin-bottom: 16px;white-space: normal;text-indent: 2em;line-height: 1.75em;"><section data-id="103115" data-role="splitline" data-tools="135编辑器" style="outline: 0px;font-variant-ligatures: normal;letter-spacing: 0.544px;orphans: 2;widows: 2;background-color: rgb(255, 255, 255);color: rgb(34, 34, 34);font-size: 16px;font-family: 微软雅黑;text-indent: 32px;overflow-wrap: break-word !important;"><section style="margin: 16px auto;outline: 0px;text-align: center;text-indent: 2em;line-height: 1.75em;overflow-wrap: break-word !important;"><section style="outline: 0px;display: flex;justify-content: center;align-items: center;overflow-wrap: break-word !important;"><section style="margin-left: 2px;outline: 0px;height: 6px;width: 6px;border-radius: 100%;border-width: 1px;border-style: solid;border-color: rgb(35, 35, 35);overflow: hidden;overflow-wrap: break-word !important;"><br data-filtered="filtered" style="outline: 0px;overflow-wrap: break-word 
!important;"/></section><section style="margin-left: 2px;outline: 0px;height: 6px;width: 6px;border-radius: 100%;border-width: 1px;border-style: solid;border-color: rgb(35, 35, 35);overflow: hidden;overflow-wrap: break-word !important;"> <br data-filtered="filtered" style="outline: 0px;overflow-wrap: break-word !important;"/></section><section style="margin-left: 2px;outline: 0px;height: 6px;width: 6px;border-radius: 100%;border-width: 1px;border-style: solid;border-color: rgb(35, 35, 35);overflow: hidden;overflow-wrap: break-word !important;"><br style="outline: 0px;overflow-wrap: break-word !important;"/></section><section style="margin-left: 4px;outline: 0px;height: 1px;background-color: rgb(35, 35, 35);flex: 1 1 0%;overflow: hidden;overflow-wrap: break-word !important;"><br data-filtered="filtered" style="outline: 0px;overflow-wrap: break-word !important;"/></section></section></section></section><section style="margin: 16px 8px;outline: 0px;font-variant-ligatures: normal;letter-spacing: 0.544px;orphans: 2;text-indent: 0em;widows: 2;background-color: rgb(255, 255, 255);color: rgb(34, 34, 34);font-family: 微软雅黑;text-align: left;line-height: 1.75em;overflow-wrap: break-word !important;"><span style="outline: 0px;font-size: 14px;color: rgb(136, 136, 136);overflow-wrap: break-word !important;">来源:中国企业网</span><img class="rich_pages wxw-img __bg_gif" data-backh="192" data-backw="320" data-fileid="100002618" data-galleryid="" data-imgfileid="100024527" data-ratio="0.6" data-src="https://mmbiz.qpic.cn/mmbiz_gif/ibEX3YMicPu82TZf4RScpazSD7OuViaH4cEUx9rCibPavn2cJXiagJrmVuVTpOJgibBV8368H2RYxxYp3Fhn1a7SU20Q/640?wx_fmt=gif" data-type="gif" data-w="1000" style="letter-spacing: 0.544px;text-indent: 0em;outline: 0px;color: rgb(136, 136, 136);font-size: 15px;text-align: center;width: 562px;visibility: visible !important;height: auto !important;" width="320px"/></section></section><p style="display: none;"><mp-style-type data-value="3"></mp-style-type></p></div>"""
# soup = BeautifulSoup(html, 'html.parser')
# soup = rm_style_attr(soup)
# print(soup)
dic_user_count = {}
aaa(dic_user_count)
if dic_user_count:
for key, value in dic_user_count.items():
print(f"====账号{key},采集公众号个数{value}")
...@@ -10,8 +10,8 @@ import urllib3 ...@@ -10,8 +10,8 @@ import urllib3
from pymysql.converters import escape_string from pymysql.converters import escape_string
import sys import sys
sys.path.append('D:\\zzsn_spider\\base') # sys.path.append('D:\\zzsn\\base')
import BaseCore from base import BaseCore
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
baseCore = BaseCore.BaseCore() baseCore = BaseCore.BaseCore()
...@@ -184,7 +184,7 @@ def getToken(): ...@@ -184,7 +184,7 @@ def getToken():
return row[0] return row[0]
# 获取列表数据 # 获取列表数据 每一页换一次公众号
def getPageData(dic_url, page, dic_user_count): def getPageData(dic_url, page, dic_user_count):
url_ = dic_url['url_'] url_ = dic_url['url_']
origin = dic_url['name'] origin = dic_url['name']
...@@ -206,7 +206,6 @@ def getPageData(dic_url, page, dic_user_count): ...@@ -206,7 +206,6 @@ def getPageData(dic_url, page, dic_user_count):
user_name = tokenAndCookie[2] user_name = tokenAndCookie[2]
token = tokenAndCookie[0] token = tokenAndCookie[0]
log.info(f"获取token到----{token}----{user_name}") log.info(f"获取token到----{token}----{user_name}")
dic_user_count[user_name] = 0
cookies = json.loads(tokenAndCookie[1]) cookies = json.loads(tokenAndCookie[1])
# s.cookies.update(cookies) # s.cookies.update(cookies)
...@@ -223,18 +222,22 @@ def getPageData(dic_url, page, dic_user_count): ...@@ -223,18 +222,22 @@ def getPageData(dic_url, page, dic_user_count):
str_t = json.dumps(json_search) str_t = json.dumps(json_search)
ret = json_search['base_resp']['ret'] ret = json_search['base_resp']['ret']
if ret == 0: if ret == 0:
dic_user_count[user_name] += 1 # 使用一次就记录一次
pass if user_name in dic_user_count:
dic_user_count[user_name] += 1
else:
dic_user_count[user_name] = 1
elif ret == 200013: elif ret == 200013:
log.info(f'======{origin}-----{biz}----{user_name}账号被封=======') log.info(f'======{origin}-----{biz}----{user_name}账号被封=======')
# 封号修改token # 封号修改token
updateTokeen(token, 1) updateTokeen(token, 1)
return getPageData(dic_url, page, dic_user_count) return getPageData(dic_url, page, dic_user_count), dic_user_count
elif ret == 200002: elif ret == 200002:
log.info(f'======{origin}-----{biz}----该公众号号biz错误,请检查=======') log.info(f'======{origin}-----{biz}----该公众号号biz错误,请检查=======')
error = [origin, url_, info_source_code, str_t, '无效biz参数'] error = [origin, url_, info_source_code, str_t, '无效biz参数']
insertBadSql(error) insertBadSql(error)
return True return True, dic_user_count
elif ret == 200003: elif ret == 200003:
log.info(f'======{origin}-----{biz}----{user_name}账号无效session=======') log.info(f'======{origin}-----{biz}----{user_name}账号无效session=======')
# session失效修改token # session失效修改token
...@@ -255,7 +258,7 @@ def getPageData(dic_url, page, dic_user_count): ...@@ -255,7 +258,7 @@ def getPageData(dic_url, page, dic_user_count):
error = [origin, url_, info_source_code, str_t, '其他错误'] error = [origin, url_, info_source_code, str_t, '其他错误']
insertBadSql(error) insertBadSql(error)
updateTokeen(token, 2) updateTokeen(token, 2)
return True return True, dic_user_count
# 修改token使用时间 # 修改token使用时间
updateTokeen(token, 3) updateTokeen(token, 3)
# 保存数据到数据库 # 保存数据到数据库
...@@ -263,7 +266,7 @@ def getPageData(dic_url, page, dic_user_count): ...@@ -263,7 +266,7 @@ def getPageData(dic_url, page, dic_user_count):
# 获取微信公众号数据 # 获取微信公众号数据
def getWxList(infoSourceCode): def getWxList(infoSourceCode, dic_user_count):
dic_url = getSourceInfo(infoSourceCode) dic_url = getSourceInfo(infoSourceCode)
log.info(f"======{infoSourceCode}----开始采集=======") log.info(f"======{infoSourceCode}----开始采集=======")
...@@ -276,7 +279,7 @@ def getWxList(infoSourceCode): ...@@ -276,7 +279,7 @@ def getWxList(infoSourceCode):
return return
origin = dic_url['name'] origin = dic_url['name']
biz = dic_url['biz'] biz = dic_url['biz']
dic_user_count = {}
for page in range(1, 6): for page in range(1, 6):
retFlag, dic_user_count = getPageData(dic_url, page, dic_user_count) retFlag, dic_user_count = getPageData(dic_url, page, dic_user_count)
time.sleep(random.randint(60, 181)) time.sleep(random.randint(60, 181))
...@@ -286,8 +289,6 @@ def getWxList(infoSourceCode): ...@@ -286,8 +289,6 @@ def getWxList(infoSourceCode):
else: else:
# 没有结束 # 没有结束
pass pass
for key, value in dic_user_count.items():
log.info(f"====账号{key},采集公众号个数{value}")
log.info(f"======{origin}-----{biz}----结束采集=======") log.info(f"======{origin}-----{biz}----结束采集=======")
...@@ -310,17 +311,26 @@ def getnumber_redis(): ...@@ -310,17 +311,26 @@ def getnumber_redis():
if __name__ == "__main__": if __name__ == "__main__":
# getFromSql()
numbers = getnumber_redis() numbers = getnumber_redis()
log.info("当前批次采集公众号个数{}".format(numbers)) log.info("当前批次采集公众号个数{}".format(numbers))
time.sleep(3) time.sleep(3)
dic_user_count = {}
# dic_user_count = {
# 'name': '',
# 'use_count': 0,
# 'gzh_count': 0
# }
while True: while True:
start = time.time() start = time.time()
log.info(f"开始时间{baseCore.getNowTime(1)}") log.info(f"开始时间{baseCore.getNowTime(1)}")
infoSourceCode = baseCore.redicPullData('WeiXinGZH:infoSourceCode') infoSourceCode = baseCore.redicPullData('WeiXinGZH:infoSourceCode')
# infoSourceCode = 'IN-20220609-57899'
if infoSourceCode == 'None' or infoSourceCode == None: if infoSourceCode == 'None' or infoSourceCode == None:
log.info("redis已经没有数据了,重新放置数据") log.info("redis已经没有数据了,重新放置数据")
log.info(f"采集完一轮公众号耗时{baseCore.getTimeCost(start, time.time())}") log.info(f"采集完一轮公众号耗时{baseCore.getTimeCost(start, time.time())}")
# getFromSql()
# time.sleep(60) # time.sleep(60)
# numbers = getnumber_redis() # numbers = getnumber_redis()
# log.info("当前批次采集公众号个数{}".format(numbers)) # log.info("当前批次采集公众号个数{}".format(numbers))
...@@ -328,7 +338,10 @@ if __name__ == "__main__": ...@@ -328,7 +338,10 @@ if __name__ == "__main__":
# infoSourceCode = baseCore.redicPullData('WeiXinGZH:infoSourceCode') # infoSourceCode = baseCore.redicPullData('WeiXinGZH:infoSourceCode')
continue continue
getWxList(infoSourceCode) getWxList(infoSourceCode, dic_user_count)
if dic_user_count:
for key, value in dic_user_count.items():
log.info(f"====账号{key},使用次数{value}")
# break
# infoSourceCode = 'IN-20220917-0159' # infoSourceCode = 'IN-20220917-0159'
# getWxList(infoSourceCode) # getWxList(infoSourceCode)
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论