提交 6ce462aa 作者: XveLingKun

谷歌搜索

上级 9d49a0cd
# 获取详情页
# 获取详情页
import time
from bs4 import BeautifulSoup
def get_detail_html():
    """Parse a detail page, strip CSS classes from its HTML and store the result.

    Each loop iteration:

    1. reads ``title`` and ``detailUrl`` from ``detailmsg``;
    2. calls ``getDetailmsg(detailmsg)`` and reads ``content`` and
       ``contentHtml`` from the returned mapping;
    3. skips the message when ``content`` is shorter than 100 characters;
    4. removes every ``class`` attribute from ``contentHtml``;
    5. builds a process item with ``getProcessitem`` and adds its
       ``sourceAddress`` to the ``pygoogle_google`` set through ``r.sadd``;
    6. passes the detail record to ``itemInsertToTable``.

    The loop stops after the first iteration whose outer ``try`` body runs to
    the end without raising; iterations that hit ``continue`` or the outer
    ``except`` are repeated.

    Relies on names that are not defined in this function: ``detailmsg``,
    ``logger``, ``r``, ``getDetailmsg``, ``getProcessitem``,
    ``itemInsertToTable`` and ``BeautifulSoup``.
    """
    while True:
        # TODO: read the next detail message from redis.
        # NOTE(review): `detailmsg` is never assigned in this function, so it
        # has to exist at module level until the TODO above is implemented;
        # otherwise the next line raises NameError outside any handler.
        title = detailmsg['title']
        detailUrl = detailmsg['detailUrl']
        logger.info("%s:%s开始解析详情数据\n" % (title, detailUrl))
        try:
            try:
                bdetail = getDetailmsg(detailmsg)
                content = bdetail['content']
                contentHtml = bdetail['contentHtml']
                # Very short bodies are skipped without being stored.
                if len(content) < 100:
                    continue
                soup = BeautifulSoup(contentHtml, "html.parser")
                # Drop the class attribute from every element that has one.
                for element in soup.find_all(class_=True):
                    del element.attrs["class"]
                contentHtml = str(soup)
                bdetail['content'] = content
                bdetail['contentHtml'] = contentHtml
            except Exception:
                logger.info('详情解析失败')
                continue
            processitem = getProcessitem(bdetail)
            try:
                # Kafka publishing is disabled for now, so the "sent" flag is
                # hard-coded to True.
                flg = True
                if flg:
                    r.sadd('pygoogle_' + 'google', processitem['sourceAddress'])
                    # Insert into the database.
                    try:
                        # The original lines here were garbled into
                        # `items.apitemInsertToTablepend(bdetail)` / `(items)`,
                        # which always raised AttributeError.
                        # NOTE(review): assumes `itemInsertToTable` is
                        # available at module level - confirm.
                        items = [bdetail]
                        itemInsertToTable(items)
                    except Exception as e:
                        logger.info(f"插入数据库失败!{bdetail['kword']}===={e}")
            except Exception as e:
                logger.info(f"{e}{bdetail['kword']}===={detailUrl}")
            time.sleep(1)
        except Exception:
            # Back off before retrying after an unexpected failure.
            time.sleep(5)
            logger.info("详情页解析异常!" + detailUrl)
        else:
            # NOTE(review): attaching this `else` to the outer `try` is the
            # only syntactically valid reading of the original; it may have
            # been meant for an "is there more work?" check - confirm.
            break
\ No newline at end of file
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论