天眼查企业动态

75452674 · 丁双波 · f0fe0069 · 75452674 · 75452674 · 75452674
--- a/base/smart/__init__.py
+++ b/base/smart/__init__.py
--- a/base/smart/entity.py
+++ b/base/smart/entity.py
+# -*- coding: utf-8 -*-
+
+# 智能采集请求
+# 1、考虑：请求智能采集时，不再使用实体类
+#    a. 仍使用：通过HTTP的 raw 请求体，直接传递HTML源文件，通过query参数传递 lang-code、link-text 参数
+#    b. 原因：在 postman 中，不方便进行测试，无法使用粘贴后的HTML源文件
+# 2、不考虑：使用实体类，利大于弊
+#    a. 使用实体类，方便扩展参数字段
+#    b. 方便展示接口文档：调用 json_parameter_utility.get_json_parameters 函数，可显示请求实体类
+class ExtractionRequest:
+    """Request payload for one smart-extraction call.
+
+    The class-level attributes below serve as defaults; ``from_dict`` sets
+    per-instance values on top of them.
+    """
+
+    # Language code: needed when extracting non-Chinese articles.
+    lang_code = ""
+    # Link text: used to extract the title; without it title accuracy drops.
+    link_text = ""
+    # Article page HTML source: used to extract title, publish time, content, etc.
+    article_html = ""
+
+    @staticmethod
+    def from_dict(dictionary: dict):
+        """Create an ExtractionRequest and copy every key of *dictionary* onto it.
+
+        Each key becomes an instance attribute via ``setattr``; keys are not
+        validated against the declared fields.
+        """
+        request = ExtractionRequest()
+        # An alternative that was tried: request.__dict__.update(dictionary)
+        for name, value in dictionary.items():
+            setattr(request, name, value)
+        return request
+
+    def to_dict(self):
+        """Return a new dict copied from the instance ``__dict__``.
+
+        Intended as a JSON serializer hook, e.g.
+        ``json.dumps(obj, default=ExtractionRequest.to_dict)``.
+        Only attributes stored on the instance appear in the result; class-level
+        defaults that were never assigned on the instance are not in ``__dict__``.
+        """
+        return dict(self.__dict__)
+
+
+# 采集结果
+class ExtractionResult:
+    """Container for the fields extracted from one article.
+
+    The class-level attributes below are empty-string defaults; the extractor
+    assigns real values on the instance.
+    """
+
+    # Title
+    title = ""
+    # Publish date
+    publish_date = ""
+    # Body text (keeps all HTML markup, e.g. br, img)
+    text = ""
+    # URL
+    url = ""
+
+    # Summary (meta description)
+    meta_description = ""
+    # Clean body text (without HTML)
+    cleaned_text = ""
+
+    # Source (currently only supported for Chinese websites)
+    # source = ""
+
+    # Top image (top_image never yielded any content, so it is no longer used)
+    # top_image = ""
+
+    def to_dict(self):
+        """Return a new dict copied from the instance ``__dict__``.
+
+        Intended as a JSON serializer hook, e.g.
+        ``json.dumps(extraction_result, default=ExtractionResult.to_dict)``.
+        Only attributes stored on the instance appear in the result.
+        """
+        return dict(self.__dict__)
+
+
+class UrlPickingRequest:
+    """Request payload for picking article URLs out of a list page."""
+
+    # Response URL of the list page.
+    # 1. Serves as the base URL for joining extracted relative URLs.
+    # 2. The base URL must be the *response* URL.
+    # 3. Example: after requests.get(url) in Python, use resp.url as the base URL.
+    list_page_resp_url = ""
+    # List page HTML source: used to extract article URLs.
+    list_page_html = ""
+
+    @staticmethod
+    def from_dict(dictionary: dict):
+        """Create a UrlPickingRequest and copy every key of *dictionary* onto it.
+
+        Each key becomes an instance attribute via ``setattr``; keys are not
+        validated against the declared fields.
+        """
+        request = UrlPickingRequest()
+        for name, value in dictionary.items():
+            setattr(request, name, value)
+        return request
+
+    def to_dict(self):
+        """Return a new dict copied from the instance ``__dict__``.
+
+        Intended as a JSON serializer hook, e.g.
+        ``json.dumps(obj, default=UrlPickingRequest.to_dict)``.
+        Only attributes stored on the instance appear in the result.
+        """
+        return dict(self.__dict__)
--- a/base/smart/smart_extractor.py
+++ b/base/smart/smart_extractor.py
+# -*- coding: utf-8 -*-
+import pandas as pd
+import requests
+from goose3 import Goose
+from goose3.text import StopWordsChinese, StopWordsKorean, StopWordsArabic
+from base.smart.entity import *
+from base.smart.smart_extractor_utility import SmartExtractorUtility
+# goose3自带的lxml，提示找不到etree，但仍可使用
+from lxml import etree
+from lxml.html import HtmlElement
+
+
+class SmartExtractor:
+    """Wrapper around a goose3 ``Goose`` instance that packs results into ExtractionResult."""
+
+    @staticmethod
+    def get_supported_lang_code_dict():
+        """Return the mapping of supported language codes to display names.
+
+        1. Languages that need a tokenizer / stop-word class (3 kinds):
+           Chinese, Korean, Arabic.
+        2. Languages that need no tokenizer and are passed as a language code
+           (16 kinds); English and Russian were tested separately.
+        """
+        supported_lang_code_dict = {
+            'cn': '中文',  # Chinese
+            'zh-cn': '简体中文',  # Simplified Chinese
+            'ko': '韩语',  # Korean
+            'ar': '阿拉伯语',  # Arabic
+            'en': '英语',  # English
+            'ru': '俄语',  # Russian
+            'da': '丹麦语',  # Danish
+            'de': '德语',  # German
+            'es': '西班牙语',  # Spanish
+            'fi': '芬兰语',  # Finnish
+            'fr': '法语',  # French
+            'hu': '匈牙利语',  # Hungarian
+            'id': '印度尼西亚语',  # Indonesian
+            'it': '意大利语',  # Italian
+            'nb': '挪威语（伯克梅尔）',  # Norwegian (Bokmal)
+            'nl': '荷兰语',  # Dutch
+            'no': '挪威文（耐诺斯克）',  # Norwegian (Nynorsk)
+            'pl': '波兰语',  # Polish
+            'pt': '葡萄牙语',  # Portuguese
+            'sv': '瑞典语',  # Swedish
+        }
+
+        return supported_lang_code_dict
+
+    def __init__(self, lang_code='cn'):
+        """Create the underlying Goose instance for *lang_code* (default ``'cn'``).
+
+        Previously the argument was ignored and a Chinese-stopword Goose was
+        always built. Now:
+        - Chinese / Korean / Arabic get their dedicated stop-word class;
+        - any other supported code is passed to goose3 as ``target_language``;
+        - ``None`` or an unknown code falls back to Chinese, which is what every
+          call used to get, so existing callers keep working.
+        """
+        stopwords_class_by_lang = {
+            'cn': StopWordsChinese,
+            'zh-cn': StopWordsChinese,
+            'ko': StopWordsKorean,
+            'ar': StopWordsArabic,
+        }
+        if lang_code in stopwords_class_by_lang:
+            self.goose = Goose({'stopwords_class': stopwords_class_by_lang[lang_code]})
+        elif lang_code in SmartExtractor.get_supported_lang_code_dict():
+            # No tokenizer needed: disable meta-language detection and force the code.
+            self.goose = Goose({'use_meta_language': False, 'target_language': lang_code})
+        else:
+            self.goose = Goose({'stopwords_class': StopWordsChinese})
+
+    def get_extraction_result(self, article, link_text=''):
+        """Collect data from a goose3 *article* into a new ExtractionResult.
+
+        :param article: the object returned by ``Goose.extract``.
+        :param link_text: optional link text used to help locate the title.
+        :return: the populated ExtractionResult.
+        """
+        extraction_result = ExtractionResult()
+
+        # Title: taken from page elements rather than goose's article.title.
+        extraction_result.title = SmartExtractorUtility.get_article_title(article, link_text)
+        # Publish date
+        extraction_result.publish_date = SmartExtractorUtility.get_publish_date(article)
+        # Body text (keeps all HTML markup, e.g. br, img)
+        extraction_result.text = SmartExtractorUtility.get_article_text(article)
+        # URL
+        extraction_result.url = article.final_url
+
+        # Summary
+        extraction_result.meta_description = article.meta_description
+        # Clean body text (without HTML)
+        extraction_result.cleaned_text = article.cleaned_text
+        # Source (currently only supported for Chinese websites); always empty here.
+        extraction_result.source = ''
+
+        return extraction_result
+
+    def getContentByUrl(self, url):
+        """Extract the page at *url* and return only goose's ``cleaned_text``."""
+        article = self.goose.extract(url=url)
+        return article.cleaned_text
+
+    def extract_by_url(self, url, link_text=''):
+        """Extract the page at *url* and return a full ExtractionResult."""
+        article = self.goose.extract(url=url)
+        # Alternative entry point: self.goose.extract(raw_html=html)
+
+        return self.get_extraction_result(article, link_text)
+
+#url_list = [["搜狐新闻",'https://news.tianyancha.com/ll_uc76l7d774.html?gid=1499023','430418'],.....]
+def extract_by_url_test(url_list,list_info_all):
+    """Extract each entry of *url_list* by URL and return the info list of the LAST entry.
+
+    :param url_list: list of 3-item sequences; index 1 is the URL to extract,
+        indexes 0 and 2 are copied into the result unchanged.
+    :param list_info_all: not used by this function.
+        NOTE(review): the name suggests results were meant to be accumulated
+        here — confirm with the caller before changing.
+    :return: ``[title, publish_date, cleaned_text, url, entry[0], entry[2], '天眼查']``
+        for the last entry, or ``[]`` when *url_list* is empty (previously this
+        raised UnboundLocalError).
+    """
+    # Sample URLs used for manual testing:
+    # url_list = [
+    #     # "http://www.news.cn/politics/2022-07/31/c_1128879636.htm",  # short text
+    #     # "https://baijiahao.baidu.com/s?id=1741311527693101670",  # several images
+    #     # "https://news.cctv.com/2022/08/16/ARTIERrXbbVtVUaQU0pMzQxf220816.shtml",  # several images and one video (content XPath test failed)
+    #     # "http://opinion.people.com.cn/n1/2022/0803/c1003-32492653.html",  # people.com.cn
+    #     # Korean: JoongAng Ilbo - politics
+    #     # "https://www.joongang.co.kr/article/25094974",
+    #     # "https://www.joongang.co.kr/article/25094967",
+    #     # English: Kathmandu Post - national-security
+    #     # "https://kathmandupost.com/national-security/2020/01/17/police-s-intelligence-continues-to-fail-them-as-chand-party-claims-explosion",
+    #     # "https://kathmandupost.com/national-security/2019/11/04/india-s-new-political-map-places-disputed-territory-of-kalapani-inside-its-own-borders",  # publish-time extraction test
+    #     # Russian: SB Belarus Today - world
+    #     # "https://www.sb.by/articles/byvshiy-premer-ministr-italii-zayavil-chto-strane-sleduet-otkazatsya-ot-gaza-iz-rossii.html",
+    #     # 'https://www.sb.by/articles/kryuchkov-predupredil-o-nepopravimykh-posledstviyakh-dlya-ukrainy-v-sluchae-udarov-po-krymu.html',
+    #     # Arabic
+    #     # "http://arabic.people.com.cn/n3/2022/0822/c31659-10137917.html",
+    #     # "http://arabic.people.com.cn/n3/2022/0822/c31657-10137909.html",
+    #     # Title extraction tests
+    #     # "http://www.sasac.gov.cn/n4470048/n16518962/n20928507/n20928570/c25819031/content.html",
+    #     # "http://www.forestry.gov.cn/main/102/20220823/092407820617754.html",
+    #     # "http://www.sasac.gov.cn/n2588025/n2588139/c25825832/content.html", # title extracted as empty
+    #     # 'http://www.crfeb.com.cn/1j/_124/2005409/index.html',   # content extraction failed
+    #     # 'http://www.crfeb.com.cn/1j/_124/912248/index.html',  # content extraction failed
+    #     # 'https://www.crcc.cn/art/2021/11/12/art_205_3413380.html',  # CRCC - work news (wrong date extracted)
+    #     # 'http://ccecc.crcc.cn/art/2015/11/19/art_7608_1136312.html',  # CCECC - several columns (wrong date extracted)
+    #     # 'http://v.people.cn/n1/2022/0901/c444662-32517559.html',    # people.cn video: title must START with the element title, "contains" is not enough
+    #     # 'https://www.chec.bj.cn/cn/xwzx/gsyw/2022/202207/t20220706_8128.html', # CHEC - company news (title extraction failed)
+    #     # 'https://www.cscec.com/xwzx_new/gsyw_new/202208/3570377.html', # CSCEC - news (title extraction failed)
+    #     # 'https://www.crbc.com/site/crbc/276/info/2022/46884837.html',  # CRBC - several columns (title extraction failed)
+    #     # 'http://www.cgcoc.com.cn/news/432.html',  # CGCOC - news center (title and content extraction failed)
+    #     # 'http://www.mcc.com.cn/mcc/_132154/_132572/308233/index.html'  # China Minmetals (test: body extraction failed)
+    #     # 'http://www.powerchina.cn/art/2015/5/27/art_7449_441845.html',  # PowerChina (test: title and body extraction failed)
+    #     # PowerChina (test: title extraction failed): compared with the link text and the title tag, the
+    #     # element title "秉承丝路精髓  抒写锦绣华章" contains one extra space in the middle
+    #     # 'http://world.people.com.cn/n1/2022/0624/c1002-32455607.html',  # title extraction failed: looks fine at first sight
+    #     # 'https://www.cscec.com/xwzx_new/zqydt_new/202209/3578274.html',  # CSCEC - enterprise news: wrong date, today's date was extracted
+    #     'https://3g.k.sohu.com/t/n705260979'    # Tianyancha - enterprise announcement
+    # ]
+
+    # Language code
+    lang_code = 'cn'
+    # lang_code = 'ko'
+    # lang_code = 'en'
+    # lang_code = 'ru'
+    # lang_code = 'ar'
+
+    # Stays empty when url_list has no entries, so the return below is always bound.
+    list_info = []
+
+    # Example entry: ["搜狐新闻",'https://news.tianyancha.com/ll_uc76l7d774.html?gid=1499023','430418']
+    for url in url_list:
+        print()
+        print("-" * 100)
+        print('请求URL：', url[1])
+        extraction_result = SmartExtractor(lang_code).extract_by_url(url[1])
+
+        # Notes on JSON conversion:
+        # 1. json.dumps(extraction_result) raises
+        #    TypeError: Object of type ExtractionResult is not JSON serializable
+        # 2. json.dumps(extraction_result, default=ExtractionResult.to_dict) works
+        # 3. json.dumps(...) returns str with Chinese escaped; str(to_dict()) does not escape
+        list_info = [
+            extraction_result.title,
+            extraction_result.publish_date,
+            extraction_result.cleaned_text,
+            extraction_result.url,
+            url[0],
+            url[2],
+            '天眼查',
+        ]
+    return list_info
--- a/base/smart/smart_extractor_utility.py
+++ b/base/smart/smart_extractor_utility.py
+import re
+from goose3.article import Article
+from lxml import etree
+from lxml.html import HtmlElement
+
+
+class SmartExtractorUtility:
+    """Static helpers that derive title, publish date and body HTML from a goose3 Article."""
+
+    # Minimum length (after removing spaces) for an element's text to count as a title.
+    title_min_len = 6
+
+    @staticmethod
+    def extract_publish_date(html):
+        """Return the first visible date-like string found in *html*, or ``""``.
+
+        Patterns are tried in list order; the first pattern with any match wins.
+        A date only counts when it sits between a ``>`` and a following ``<``
+        with no other ``>`` in between, so dates inside tag attributes are skipped.
+        """
+        pattern_list = [
+            # 2010-10-1 8:00:00
+            r"20\d{2}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2}",
+            # 2010-10-1 8:00
+            r"20\d{2}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}",
+            # 2010年10月1日 8:00:00
+            r"20\d{2}年\d{1,2}月\d{1,2}日 \d{1,2}:\d{1,2}:\d{1,2}",
+            # 2010年10月1日 8:00
+            r"20\d{2}年\d{1,2}月\d{1,2}日 \d{1,2}:\d{1,2}",
+            # 2010/10/1 8:00:00
+            r"20\d{2}/\d{1,2}/\d{1,2} \d{1,2}:\d{1,2}:\d{1,2}",
+            # 2010/10/1 8:00
+            r"20\d{2}/\d{1,2}/\d{1,2} \d{1,2}:\d{1,2}",
+            # 2010-10-1
+            r"20\d{2}-\d{1,2}-\d{1,2}",
+            # 2010年10月1日
+            r"20\d{2}年\d{1,2}月\d{1,2}日",
+            # 2010/10/1
+            r"20\d{2}/\d{1,2}/\d{1,2}",
+            # 2022.08.28
+            # (the trailing comma was missing, which silently concatenated this
+            # pattern with the next one so neither format could match)
+            r"20\d{2}\.\d{1,2}\.\d{1,2}",
+            # 12-07-02 10:10
+            r"\d{2}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}",
+            # "1 month ago"
+            r"\d+(&nbsp;| )*月前",
+            # "12 days ago"
+            r"\d+(&nbsp;| )*天前",
+            # "2 hours ago"
+            r"\d+(&nbsp;| )*小时前",
+            # "15 minutes ago"
+            r"\d+(&nbsp;| )*分钟前",
+            # "yesterday&nbsp;17:59"
+            r"昨天(&nbsp;| )*\d{1,2}:\d{1,2}",
+        ]
+
+        for pattern in pattern_list:
+            # Only visible text: between '>' and '<' with no further '>' inside.
+            tag_pattern = f'>[^>]*(?P<date>{pattern})[^>]*<'
+            match = re.search(tag_pattern, html)
+            if match:
+                return match.group('date')
+
+        # No pattern matched.
+        return ""
+
+    @staticmethod
+    def add_html_br(cleaned_text):
+        """Replace newlines in *cleaned_text* with ``<br>``.
+
+        Double newlines are replaced first (goose's cleaned_text uses double
+        newlines), so each paragraph break yields a single ``<br>``.
+        """
+        cleaned_text = cleaned_text.replace("\n\n", "<br>")
+        cleaned_text = cleaned_text.replace("\n", "<br>")
+        return cleaned_text
+
+    @staticmethod
+    def get_article_title(article: Article, link_text=''):
+        """Return the title found in a page element, or ``''`` when none qualifies.
+
+        Elements are scanned tag by tag in this order: h1, h2, h3, div, span,
+        td, p. An element's text is accepted when, with spaces removed, it is at
+        least ``title_min_len`` characters long and the link text or the
+        ``<title>`` text starts or ends with it. The returned text keeps its
+        original spaces.
+
+        Why elements instead of ``article.title``: the ``<title>`` tag usually
+        carries site/column suffixes or prefixes, e.g.
+        ``中国能建：聚焦价值创造 打造国企改革发展“红色引擎”－国务院国有资产监督管理委员会``
+        while the element holds only the headline.
+
+        Why "starts/ends with" rather than "contains": for
+        http://v.people.cn/n1/2022/0901/c444662-32517559.html the title tag is
+        ``亿缕阳光丨小生意，大格局--人民视频--人民网``; a "contains" test would
+        pick up the link text ``人民网`` instead of the ``<h2>`` headline.
+
+        Why spaces are stripped before comparing (2022-09-21): the element
+        title may contain an extra space compared with the link text / title
+        tag (``秉承丝路精髓  抒写锦绣华章``).
+
+        ``article.title`` is deliberately not used as a fallback: it is only
+        the raw ``<title>`` content and was judged unreliable.
+        """
+        title_element_list = [
+            'h1',
+            'h2',
+            'h3',
+            'div',
+            'span',
+            'td',
+            'p',
+        ]
+
+        # Tolerate None for either comparison source instead of raising.
+        link_text = (link_text or '').replace(" ", "")
+        tag_title = (article.title or '').replace(" ", "")
+
+        doc = article.raw_doc.getroottree()
+        for title_element in title_element_list:
+            for element in doc.xpath(f'//{title_element}'):
+                # Plain text of the element, child elements included.
+                text = etree.tounicode(element, method='text').strip()
+                text_no_space = text.replace(" ", "")
+                # Check the length first to rule out short fragments.
+                if len(text_no_space) < SmartExtractorUtility.title_min_len:
+                    continue
+                if (link_text.startswith(text_no_space) or link_text.endswith(text_no_space)
+                        or tag_title.startswith(text_no_space) or tag_title.endswith(text_no_space)):
+                    return text
+
+        # Nothing matched: return an empty title.
+        return ''
+
+    @staticmethod
+    def get_publish_date(article: Article):
+        """Return the publish date as a string, or ``''`` when nothing is found.
+
+        The regex scan of the raw HTML is tried first. Reason: on the Kathmandu
+        Post test page ``publish_datetime_utc`` yielded 2015-02-05 (taken from a
+        JSON ``datePublished`` in a script) while the visible date was
+        August 19, 2022. Chinese sites must always go through the regex.
+        Fallback order: ``publish_datetime_utc`` (formatted ``%Y-%m-%d``), then
+        ``publish_date``.
+        """
+        publish_date = SmartExtractorUtility.extract_publish_date(article.raw_html)
+        if publish_date != '':
+            return publish_date
+        if article.publish_datetime_utc:
+            return article.publish_datetime_utc.strftime('%Y-%m-%d')
+        if article.publish_date:
+            return article.publish_date
+        return ''
+
+    @staticmethod
+    def get_article_text(article: Article):
+        """Return the article body as HTML.
+
+        Approaches that were rejected:
+        1. cleaned_text plus ``<br>``: loses images and the original ``<p>`` tags.
+        2. Serializing ``top_node`` directly: keeps ``<p>`` but loses ``<img>``.
+
+        Approach used: locate ``top_node`` again in the ORIGINAL document
+        (``raw_doc``) and serialize that element. ``getpath`` on top_node was
+        seen to be off by one element
+        (https://news.cctv.com/2022/08/16/ARTIERrXbbVtVUaQU0pMzQxf220816.shtml:
+        got .../div[4], real .../div[5]), so the lookup prefers ``id``, then
+        ``class``, and only then the positional path.
+
+        When ``top_node`` is not an HtmlElement (e.g. None, seen on
+        https://baijiahao.baidu.com/s?id=1741311527693101670) the result is
+        cleaned_text with ``<br>`` added.
+        """
+        top_node = article.top_node
+        if not isinstance(top_node, HtmlElement):
+            print("SmartExtractor：article.top_node 为 {}，不是 HtmlElement 对象。".format(article.top_node))
+            return SmartExtractorUtility.add_html_br(article.cleaned_text)
+
+        doc = article.raw_doc.getroottree()
+        # The attribute value is bound as an XPath variable rather than
+        # formatted into the expression, so quotes in id/class cannot break it.
+        if 'id' in top_node.attrib:
+            element_list = doc.xpath("//*[@id=$value]", value=top_node.attrib['id'])
+        elif 'class' in top_node.attrib:
+            element_list = doc.xpath("//*[@class=$value]", value=top_node.attrib['class'])
+        else:
+            element_list = doc.xpath(top_node.getroottree().getpath(top_node))
+
+        if element_list:
+            # Found in the original document: serialize the first match.
+            text = etree.tounicode(element_list[0], method='html')
+        else:
+            # Not found: fall back to top_node's own HTML (img tags are not kept).
+            text = etree.tounicode(top_node, method='html')
+
+        return text
--- a/comData/__init__.py
+++ b/comData/__init__.py
--- a/comData/reademe.txt
+++ b/comData/reademe.txt
+企业数据采集
+企业数据采集相关脚本都放到此目录
--- a/comData/tcyQydt/__init__.py
+++ b/comData/tcyQydt/__init__.py
--- a/comData/tcyQydt/getTycId.py
+++ b/comData/tcyQydt/getTycId.py
+# 根据信用代码获取天眼查id
+import json
+import random
+import time
+
+import pymysql
+import requests
+
+from base.BaseCore import BaseCore
+
+# Shared helper object and logger used by every function in this script.
+baseCore = BaseCore()
+log = baseCore.getLogger()
+# Request headers sent to the Tianyancha API. getTycIdByXYDM adds/overwrites
+# 'User-Agent' on this shared dict before each request.
+# SECURITY NOTE(review): the auth token is hard-coded in source control; it
+# should be loaded from configuration or the environment and rotated.
+headers={
+    'X-AUTH-TOKEN':'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxMzY4MzgxNjk4NCIsImlhdCI6MTY5MDE3ODYyOCwiZXhwIjoxNjkyNzcwNjI4fQ.VV3Zoa4RM5nVN8UXBc0-81KMGqLzTOme6rButeETGfFQi7p5h4ydg8CFrEsizr_iFwB3_BVaKR2o2xR-M4ipbQ',
+    'X-TYCID':'77e997401d5f11ee9e91d5a0fd3c0b83',
+    'version':'TYC-Web',
+    'Content-Type':'application/json;charset=UTF-8'
+}
+# Module-level MySQL connection and cursor, opened at import time and closed
+# at the end of beginWork().
+# SECURITY NOTE(review): host, user and password are hard-coded here; move
+# them to configuration/environment and rotate the password.
+cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
+cursor= cnx.cursor()
+
+#根据信用代码获取天眼查id 企业名字等信息
+def getTycIdByXYDM(xydm):
+    """Look up a company on Tianyancha by its credit code *xydm*.
+
+    Posts the code as ``keyword`` to the suggest endpoint and inspects the
+    first item of the response's ``data`` list.
+
+    :return: ``{'state': bool, 'tycData': dict | None}``. ``state`` is True and
+        ``tycData`` is that first item only when its ``matchType`` equals
+        ``'信用代码匹配'``; any other outcome (different match type, request or
+        parsing error) is logged and returns ``state`` False.
+    """
+    result = {'state': False, 'tycData': None}
+    url = f"https://capi.tianyancha.com/cloud-tempest/search/suggest/v3?_={baseCore.getNowTime(3)}"
+    ip = baseCore.get_proxy()
+    payload = {'keyword': xydm}
+    try:
+        # A fresh random User-Agent is written into the shared headers dict.
+        headers['User-Agent'] = baseCore.getRandomUserAgent()
+        response = requests.post(url, json=payload, headers=headers, verify=False, proxies=ip)
+        # Pause 3-5 seconds after every request.
+        time.sleep(random.randint(3, 5))
+        body = json.loads(response.content.decode('utf-8'))
+        first_hit = body['data'][0]
+        if first_hit['matchType'] != '信用代码匹配':
+            log.error(f"{xydm}------{body}")
+            return result
+        result['state'] = True
+        result['tycData'] = first_hit
+        return result
+    except Exception as e:
+        log.error(f"{xydm}---exception---{e}")
+        return result
+# 更新天眼查企业基本信息
+def updateTycInfo(id,retData):
+    """Write the lookup result for row *id* back to ``tyc_com_info`` and commit.
+
+    :param id: primary key of the ``tyc_com_info`` row.
+    :param retData: dict as returned by getTycIdByXYDM (``state``, ``tycData``).
+        When ``state`` is true the row gets state=3 plus the company fields;
+        otherwise it only gets state=4.
+
+    The statement is parameterized: the values come from an external API and
+    may contain quotes, which broke (and could inject into) the previous
+    string-built SQL.
+    """
+    if retData['state']:
+        data = retData['tycData']
+        updateSql = ("update tyc_com_info set state=3,update_date=now(),tycid=%s, type=%s,comName=%s,name=%s"
+                     ",alias=%s,logo=%s,claimLevel=%s,regStatus=%s where id=%s")
+        fields = ('id', 'type', 'comName', 'name', 'alias', 'logo', 'claimLevel', 'regStatus')
+        # Values are stored in string form as before; None becomes SQL NULL
+        # instead of the literal text 'None'.
+        params = [None if data[key] is None else str(data[key]) for key in fields]
+        params.append(id)
+    else:
+        updateSql = "update tyc_com_info set state=4,update_date=now()  where id=%s"
+        params = [id]
+    cursor.execute(updateSql, params)
+    cnx.commit()
+
+#采集工作
+def beginWork():
+    """Process pending ``tyc_com_info`` rows (state=1) one at a time until none remain.
+
+    Each iteration fetches the oldest pending row, looks up its credit code via
+    getTycIdByXYDM and stores the outcome with updateTycInfo. The cursor and
+    connection are closed in a ``finally`` block, so they are also released
+    when an iteration raises.
+    """
+    selectSql = "select id,xydm from tyc_com_info where state=1 order by update_date asc ,id asc limit 1"
+    try:
+        while True:
+            start = time.time()
+            cursor.execute(selectSql)
+            row = cursor.fetchone()
+            if not row:
+                log.info("没有数据了，结束脚本")
+                break
+            record_id, xydm = row[0], row[1]
+            retData = getTycIdByXYDM(xydm)
+            updateTycInfo(record_id, retData)
+            log.info(f"{record_id}---{xydm}----处理完成，耗时：{baseCore.getTimeCost(start,time.time())}")
+    finally:
+        cursor.close()
+        cnx.close()
+
+# Start the processing loop only when this file is run as a script.
+if __name__ == '__main__':
+    beginWork()
\ No newline at end of file
--- a/comData/tcyQydt/reademe.txt
+++ b/comData/tcyQydt/reademe.txt
+天眼查企业动态脚本
+start.bat 双击启动脚本
+tyc_qydt.py 脚本是获取天眼查企业动态的脚本
+
+脚本逻辑
+1：读取待采集企业的天眼查id
+select id,xydm,tycid from ssqy_tyc where state=1  order by date_time asc  limit 1
+2：根据天眼查id访问新闻列表接口，查询第一页
+url = f'https://capi.tianyancha.com/cloud-yq-news/company/detail/publicmsg/news/webSimple?_={t}&id={tyc_code}&ps={pageSize}&pn=1&emotion=-100&event=-100'
+3：获取总数，计算分页数
+ if (total > 0):
+        if (total % pageSize == 0):
+            totalPage = total // pageSize
+        else:
+            totalPage = total // pageSize + 1
+4：循环分页数，一次拿一页数据
+ for num in range(1, totalPage+1):
+5：循环每页数据，判断库中该企业是否存在对应新闻
+sel_sql = '''select social_credit_code from brpa_source_article where source_address = %s and social_credit_code=%s '''
+如果不存在，使用智能抽取获取正文
+    contentText = smart.extract_by_url(link).text
+    采集成功
+        保存新闻到库中
+        insert_sql = '''insert into brpa_source_article(social_credit_code,title,summary,content,publish_date,source_address,origin,author,type,lang) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
+    采集失败
+        保存到失败列表中
+        insert_err_sql = f"insert into dt_err(xydm,`from`,url,title,pub_date,zhaiyao,create_date,state,pageNo,pageIndex) values('{social_code}','{source}','{link}','{title}','{time_format}','{info_page['abstracts']}',now(),1,{num},{pageIndex})"
+
+如果存在
+跳过
+6：更新企业状态、采集成功数、总数、失败数等信息
+updateEndSql = f"update ssqy_tyc set state={stateNum},total={total},okCount={okCount},errorCount={errorCount},repetCount={repetCount} ,date_time=now() where id={id}"
+
+
--- a/comData/tcyQydt/start.bat
+++ b/comData/tcyQydt/start.bat
+rem Set the console window title.
+title qydt
+rem NOTE(review): presumably activates the default conda environment first - confirm.
+call activate
+rem Activate the conda environment named luyuen@310.
+call conda activate luyuen@310
+rem Run the enterprise-news collection script.
+python tyc_qydt.py
+rem Keep the window open after the script exits.
+pause
\ No newline at end of file
--- a/comData/tcyQydt/test.py
+++ b/comData/tcyQydt/test.py
+from base.smart import smart_extractor
+
+# Manual smoke test: extract one article by URL and print its body HTML.
+extractor = smart_extractor.SmartExtractor('cn')
+SAMPLE_URL = "https://blog.csdn.net/qq_41767116/article/details/119988991"
+print(extractor.extract_by_url(SAMPLE_URL).text)
\ No newline at end of file
--- a/comData/tcyQydt/tyc_qydt.py
+++ b/comData/tcyQydt/tyc_qydt.py
+"""
+"""
+import json
+import requests,time,pymysql 
+import jieba
+
+from base.BaseCore import BaseCore
+from base.smart import smart_extractor
+
+# NOTE(review): the return value is discarded; presumably this line only exists
+# to force jieba to be imported/loaded before extraction starts - confirm that
+# calling cut() without consuming its result actually has that effect.
+jieba.cut("必须加载jieba")
+# Create the shared extractor, requesting the Chinese ('cn') configuration.
+smart =smart_extractor.SmartExtractor('cn')
+# Shared helper object and logger used by every function in this script.
+baseCore = BaseCore()
+log = baseCore.getLogger()
+# Module-level MySQL connection and cursor, opened at import time and closed
+# at the end of doJob().
+# SECURITY NOTE(review): host, user and password are hard-coded here; move
+# them to configuration/environment and rotate the password.
+cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
+cursor= cnx.cursor()
+# Number of news items requested per page of the list API.
+pageSize = 10
+headers = {
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+        'Accept-Encoding': 'gzip, deflate, br',
+        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
+        'Connection': 'keep-alive',
+        'Content-Type': 'application/json',
+        'Cookie':'jsid=SEO-BAIDU-ALL-SY-000001; TYCID=77e997401d5f11ee9e91d5a0fd3c0b89; ssuid=6450041974; _ga=GA1.2.858826166.1688800641; _gid=GA1.2.2142449376.1689575510; tyc-user-info-save-time=1689764135027; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22309757777%22%2C%22first_id%22%3A%22189345cb10257d-0cfee05327f673-26031d51-1327104-189345cb10375b%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTg5MzQ1Y2IxMDI1N2QtMGNmZWUwNTMyN2Y2NzMtMjYwMzFkNTEtMTMyNzEwNC0xODkzNDVjYjEwMzc1YiIsIiRpZGVudGl0eV9sb2dpbl9pZCI6IjMwOTc1Nzc3NyJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%22309757777%22%7D%2C%22%24device_id%22%3A%22189345cb10257d-0cfee05327f673-26031d51-1327104-189345cb10375b%22%7D; bannerFlag=true; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1689752829,1689821665,1689831487,1689845884; searchSessionId=1689845917.81838207; HWWAFSESID=146bb1d25b1515339d3; HWWAFSESTIME=1689858023324; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1689859758',
+        'Host': 'capi.tianyancha.com',
+        'Origin': 'https://www.tianyancha.com',
+        'Referer': 'https://www.tianyancha.com/',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.51'
+}
+
+def _fetch_with_retry(url):
+    """GET *url* through a proxy, trying up to 3 times; return the response or None."""
+    for attempt in range(0, 3):
+        try:
+            ip = baseCore.get_proxy()
+            headers['User-Agent'] = baseCore.getRandomUserAgent()
+            return requests.get(url=url, headers=headers, proxies=ip, verify=False)
+        except Exception as e:
+            log.error(f"request请求异常----{attempt}-----{e}")
+    return None
+
+
+def beinWork(tyc_code,social_code):
+    """Collect all news items of one company from Tianyancha and store them.
+
+    :param tyc_code: Tianyancha company id, used in the list API URL.
+    :param social_code: credit code, stored with every saved article and used
+        for the duplicate check.
+    :return: ``{'state', 'total', 'okCount', 'errorCount', 'repetCount'}``.
+        ``state`` is False only when the first page / total could not be read.
+
+    Flow: request page 1 to read the total, compute the page count, then for
+    every page and item: skip links already in ``brpa_source_article``, extract
+    the body with the shared extractor, insert the article on success or a
+    ``dt_err`` row on failure.
+
+    Fixes compared with the previous version:
+    - a request that fails all 3 attempts no longer raises NameError;
+    - the retry log message is now actually formatted;
+    - the ``dt_err`` insert is parameterized (titles/abstracts contain quotes);
+    - ``okCount`` is increased only after the article insert has committed,
+      and a failed insert is counted as an error;
+    - swallowed exceptions are logged.
+    """
+    time.sleep(3)
+    retData={'state':False,'total':0,'okCount':0,'errorCount':0,'repetCount':0}
+    t=time.time()
+    url = f'https://capi.tianyancha.com/cloud-yq-news/company/detail/publicmsg/news/webSimple?_={t}&id={tyc_code}&ps={pageSize}&pn=1&emotion=-100&event=-100'
+    response = _fetch_with_retry(url)
+    if response is None or response.status_code != 200:
+        log.error(f"{tyc_code}-----获取总数接口失败")
+        return retData
+    try:
+        json_1 = json.loads(response.content.decode('utf-8'))
+        # int() also rejects a missing/None total inside the guarded block.
+        total = int(json_1['data']['total'])
+    except Exception:
+        log.error(f"{tyc_code}-----获取总数失败")
+        return retData
+    if total <= 0:
+        log.error(f"{tyc_code}--------总数为0")
+        retData['state']=True
+        return retData
+    # Ceiling division: number of pages needed for `total` items.
+    totalPage = (total + pageSize - 1) // pageSize
+    log.info(f"{tyc_code}-------总数：{total}----总页数:{totalPage}")
+
+    retData['total']=total
+    okCount = 0
+    errorCount = 0
+    repetCount = 0
+    for num in range(1, totalPage+1):
+        time.sleep(3)
+        log.info(f"获取分页数据--{tyc_code}----分页{num}----开始")
+        start_page = time.time()
+        url_page = f'https://capi.tianyancha.com/cloud-yq-news/company/detail/publicmsg/news/webSimple?_={time.time()}&id={tyc_code}&ps={pageSize}&pn={num}&emotion=-100&event=-100'
+        response_page = _fetch_with_retry(url_page)
+        if response_page is None or response_page.status_code != 200:
+            log.error(f"{tyc_code}--{num}页---获取分页数据失败")
+            # The whole page is counted as failed.
+            errorCount = errorCount + pageSize
+            continue
+        try:
+            json_page = json.loads(response_page.content.decode('utf-8'))
+            info_list_page = json_page['data']['items'] or []
+        except Exception:
+            log.error(f"{tyc_code}--{num}页---获取分页数据失败")
+            errorCount = errorCount + pageSize
+            continue
+        for pageIndex, info_page in enumerate(info_list_page, start=1):
+            title = info_page['title']
+            source = info_page['website']
+            link = info_page['uri']
+
+            # Duplicate check: same link already stored for this company.
+            sel_sql = '''select social_credit_code from brpa_source_article where source_address = %s and social_credit_code=%s '''
+            cursor.execute(sel_sql, (link, social_code))
+            if cursor.fetchall():
+                log.info(f'{tyc_code}-----{social_code}----{link}:已经存在')
+                repetCount = repetCount + 1
+                continue
+            try:
+                # 'rtm' is divided by 1000 before being passed to localtime().
+                time_struct = time.localtime(int(info_page['rtm'] / 1000))
+                time_format = time.strftime("%Y-%m-%d %H-%M-%S", time_struct)
+            except Exception:
+                time_format = baseCore.getNowTime(1)
+            try:
+                # Smart extraction of the article body.
+                contentText = smart.extract_by_url(link).text
+            except Exception as e:
+                log.error(f'获取正文失败：--------{tyc_code}--------{num}--------{link}--------{e}')
+                contentText = ''
+            if not contentText:
+                log.error(f'获取正文失败：--------{tyc_code}--------{num}--------{link}')
+                errorCount = errorCount + 1
+                try:
+                    insert_err_sql = "insert into dt_err(xydm,`from`,url,title,pub_date,zhaiyao,create_date,state,pageNo,pageIndex) values(%s,%s,%s,%s,%s,%s,now(),1,%s,%s)"
+                    cursor.execute(insert_err_sql, (social_code, source, link, title, time_format,
+                                                    info_page['abstracts'], num, pageIndex))
+                    cnx.commit()
+                except Exception as e:
+                    # Best effort: a failed error-record insert must not stop the run.
+                    log.error(f'{social_code}----{link}:dt_err insert failed----{e}')
+                continue
+            try:
+                insert_sql = '''insert into brpa_source_article(social_credit_code,title,summary,content,publish_date,source_address,origin,author,type,lang) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
+                list_info = [
+                    social_code,
+                    title,
+                    info_page['abstracts'],  # summary
+                    contentText,  # body
+                    time_format,  # publish time
+                    link,
+                    '天眼查',
+                    source,
+                    '2',
+                    'zh'
+                ]
+                cursor.execute(insert_sql, tuple(list_info))
+                cnx.commit()
+                # Counted only once the row is committed.
+                okCount = okCount + 1
+                log.info(f'{social_code}----{link}:新增一条')
+            except Exception as e:
+                errorCount = errorCount + 1
+                log.error(f'传输失败:{social_code}----{link}----{e}')
+        log.info(f"获取分页数据--{tyc_code}----分页{num}，耗时{baseCore.getTimeCost(start_page, time.time())}")
+
+    retData['state'] = True
+    retData['okCount'] = okCount
+    retData['errorCount'] = errorCount
+    retData['repetCount'] = repetCount
+    return  retData
+
+def doJob():
+    """Process pending ``ssqy_tyc`` rows (state=1) one at a time until none remain.
+
+    For each row: mark it state=2, collect its news with beinWork, then store
+    the counters and the final state (3 when beinWork reports success, 4
+    otherwise). Cursor, connection and baseCore are released in a ``finally``
+    block so they are also freed when an iteration raises.
+
+    NOTE(review): if beinWork raises, the row stays in state=2 and is not
+    picked up again by the state=1 query - confirm whether that is intended.
+    """
+    selectSql = "select id,xydm,tycid from ssqy_tyc where state=1  order by date_time asc  limit 1"
+    updateBeginSql = "update ssqy_tyc set state=2,date_time=now() where id=%s"
+    updateEndSql = "update ssqy_tyc set state=%s,total=%s,okCount=%s,errorCount=%s,repetCount=%s ,date_time=now() where id=%s"
+    try:
+        while True:
+            cursor.execute(selectSql)
+            row = cursor.fetchone()
+            if not row:
+                log.info("没有数据了，结束脚本")
+                break
+            record_id, xydm, tycid = row[0], row[1], row[2]
+            log.info(f"{record_id}---{xydm}----{tycid}----开始处理")
+            start_time = time.time()
+            cursor.execute(updateBeginSql, (record_id,))
+            cnx.commit()
+
+            # Collect the company's news items.
+            retData = beinWork(tycid, xydm)
+            total = retData['total']
+            okCount = retData['okCount']
+            errorCount = retData['errorCount']
+            repetCount = retData['repetCount']
+            stateNum = 3 if retData['state'] else 4
+
+            cursor.execute(updateEndSql, (stateNum, total, okCount, errorCount, repetCount, record_id))
+            cnx.commit()
+            log.info(f"{record_id}---{xydm}----{tycid}----结束处理，耗时{baseCore.getTimeCost(start_time, time.time())}---总数:{total}---成功数:{okCount}----失败数:{errorCount}--重复数:{repetCount}")
+    finally:
+        cursor.close()
+        cnx.close()
+        # Release resources held by baseCore.
+        baseCore.close()
+
+
+# Press the green button in the gutter to run the script.
+
+
+# Start the collection loop only when this file is run as a script.
+if __name__ == '__main__':
+    doJob()
+
+