元搜索修改

6fc56b71 · XveLingKun · 2c80eba5 · 6fc56b71 · 6fc56b71 · 6fc56b71
--- a/百度采集/baidu_comm/baiduSpider.py
+++ b/百度采集/baidu_comm/baiduSpider.py
-#coding=utf-8
+#coding=utf-8
@@ -420,7 +420,7 @@ class BaiduSpider(object):
            lang=self.detect_language(title)
            raw_html=self.webDriver(url)
            sm=SmartExtractor(lang)
-            article=sm.extract_by_html(raw_html)
+            article=sm.extract_by_html(raw_html, title)
            #todo:标题获取全部标题
            title=article.title
            content=article.cleaned_text
@@ -429,7 +429,7 @@ class BaiduSpider(object):
            try:
                raw_html=self.webDriver(url)
                sm=SmartExtractor(lang)
-                article=sm.extract_by_html(raw_html)
+                article=sm.extract_by_html(raw_html, title)
                # todo:标题获取全部标题
                title = article.title
                content=article.cleaned_text

--- a/百度采集/baidu_comm/smart_extractor.py
+++ b/百度采集/baidu_comm/smart_extractor.py
-# -*- coding: utf-8 -*-
+# -*- coding: utf-8 -*-
@@ -129,8 +129,31 @@ class SmartExtractor:
        """
        按HTML采集内容
        """
+        # todo 1018: drop the <head> tag and the first <aside> tag, and replace every <textarea> with a <div>
+        from bs4 import BeautifulSoup
+        html_ = BeautifulSoup(html, 'html.parser')
+        head_tag = html_.find('head')
+        if head_tag:
+            head_tag.decompose()
+        aside_tag = html_.find('aside')
+        if aside_tag:
+            aside_tag.decompose()
+        textarea_tags = html_.find_all('textarea')
+        if textarea_tags:
+            for textarea in textarea_tags:
+                try:
+                    # 创建一个新的 <div> 标签
+                    div_tag = html_.new_tag('div')
+
+                    # 将 <textarea> 的内容移动到新的 <div> 标签中
+                    div_tag.string = textarea.string
+
+                    # 替换 <textarea> 标签
+                    textarea.replace_with(div_tag)
+                except Exception as e:
+                    continue
        # 采集正文：传入html
-        article = self.goose.extract(raw_html=html)
+        article = self.goose.extract(raw_html=str(html_))   # pass the cleaned soup, serialized back to an HTML string

        return self.get_extraction_result(article, link_text)


--- a/百度采集/baidu_comm/smart_extractor_utility.py
+++ b/百度采集/baidu_comm/smart_extractor_utility.py
-# -*- coding: utf-8 -*-
+# -*- coding: utf-8 -*-
@@ -91,6 +91,7 @@ class SmartExtractorUtility:
            'span',
            'td',
            'p',
+            'title'
        ]

        # 对比标题前，统一将空格剔除（2022-09-21）：