提交 6fc56b71 作者: XveLingKun

元搜索修改

上级 2c80eba5
#coding=utf-8
#coding=utf-8
......@@ -420,7 +420,7 @@ class BaiduSpider(object):
lang=self.detect_language(title)
raw_html=self.webDriver(url)
sm=SmartExtractor(lang)
article=sm.extract_by_html(raw_html)
article=sm.extract_by_html(raw_html, title)
#todo:标题获取全部标题
title=article.title
content=article.cleaned_text
......@@ -429,7 +429,7 @@ class BaiduSpider(object):
try:
raw_html=self.webDriver(url)
sm=SmartExtractor(lang)
article=sm.extract_by_html(raw_html)
article=sm.extract_by_html(raw_html, title)
# todo:标题获取全部标题
title = article.title
content=article.cleaned_text
......
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
......@@ -129,8 +129,31 @@ class SmartExtractor:
"""
按HTML采集内容
"""
#todo 1018去除head标签 将textarea标签替换成div标签
from bs4 import BeautifulSoup
html_ = BeautifulSoup(html, 'html.parser')
head_tag = html_.find('head')
if head_tag:
head_tag.decompose()
aside_tag = html_.find('aside')
if aside_tag:
aside_tag.decompose()
textarea_tags = html_.find_all('textarea')
if textarea_tags:
for textarea in textarea_tags:
try:
# 创建一个新的 <div> 标签
div_tag = html_.new_tag('div')
# 将 <textarea> 的内容移动到新的 <div> 标签中
div_tag.string = textarea.string
# 替换 <textarea> 标签
textarea.replace_with(div_tag)
except Exception as e:
continue
# 采集正文:传入html
article = self.goose.extract(raw_html=html)
article = self.goose.extract(raw_html=str(html_)) # str(html_)
return self.get_extraction_result(article, link_text)
......
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
......@@ -91,6 +91,7 @@ class SmartExtractorUtility:
'span',
'td',
'p',
'title'
]
# 对比标题前,统一将空格剔除(2022-09-21):
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论