提交 6fc56b71 作者: XveLingKun

元搜索修改

上级 2c80eba5
#coding=utf-8 #coding=utf-8
...@@ -420,7 +420,7 @@ class BaiduSpider(object): ...@@ -420,7 +420,7 @@ class BaiduSpider(object):
lang=self.detect_language(title) lang=self.detect_language(title)
raw_html=self.webDriver(url) raw_html=self.webDriver(url)
sm=SmartExtractor(lang) sm=SmartExtractor(lang)
article=sm.extract_by_html(raw_html) article=sm.extract_by_html(raw_html, title)
#todo:标题获取全部标题 #todo:标题获取全部标题
title=article.title title=article.title
content=article.cleaned_text content=article.cleaned_text
...@@ -429,7 +429,7 @@ class BaiduSpider(object): ...@@ -429,7 +429,7 @@ class BaiduSpider(object):
try: try:
raw_html=self.webDriver(url) raw_html=self.webDriver(url)
sm=SmartExtractor(lang) sm=SmartExtractor(lang)
article=sm.extract_by_html(raw_html) article=sm.extract_by_html(raw_html, title)
# todo:标题获取全部标题 # todo:标题获取全部标题
title = article.title title = article.title
content=article.cleaned_text content=article.cleaned_text
......
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
...@@ -129,8 +129,31 @@ class SmartExtractor: ...@@ -129,8 +129,31 @@ class SmartExtractor:
""" """
按HTML采集内容 按HTML采集内容
""" """
#todo 1018去除head标签 将textarea标签替换成div标签
from bs4 import BeautifulSoup
html_ = BeautifulSoup(html, 'html.parser')
head_tag = html_.find('head')
if head_tag:
head_tag.decompose()
aside_tag = html_.find('aside')
if aside_tag:
aside_tag.decompose()
textarea_tags = html_.find_all('textarea')
if textarea_tags:
for textarea in textarea_tags:
try:
# 创建一个新的 <div> 标签
div_tag = html_.new_tag('div')
# 将 <textarea> 的内容移动到新的 <div> 标签中
div_tag.string = textarea.string
# 替换 <textarea> 标签
textarea.replace_with(div_tag)
except Exception as e:
continue
# 采集正文:传入html # 采集正文:传入html
article = self.goose.extract(raw_html=html) article = self.goose.extract(raw_html=str(html_)) # str(html_)
return self.get_extraction_result(article, link_text) return self.get_extraction_result(article, link_text)
......
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
...@@ -91,6 +91,7 @@ class SmartExtractorUtility: ...@@ -91,6 +91,7 @@ class SmartExtractorUtility:
'span', 'span',
'td', 'td',
'p', 'p',
'title'
] ]
# 对比标题前,统一将空格剔除(2022-09-21): # 对比标题前,统一将空格剔除(2022-09-21):
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
请 注册 或者 登录 后发表评论