公告动态自动化

793652a0 · 薛凌堃 · 610b0b53 · 793652a0 · 793652a0 · 793652a0
--- a/base/BaseCore.py
+++ b/base/BaseCore.py
+# 核心工具包
 import os
 import random
 import socket
@@ -5,13 +6,18 @@ import sys
 import time
 import logbook
 import logbook.more
-# 核心工具包
+import zhconv
+
 import pymysql
 import redis
 from selenium import webdriver
 from selenium.webdriver.chrome.service import Service

+
 # 注意 程序退出前 调用BaseCore.close() 关闭相关资源
+import langid
+
+
 class BaseCore:
    # 序列号
    __seq = 0
@@ -211,8 +217,16 @@ class BaseCore:
        'Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'
    ]

-    # 连接到Redis
-    r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
+    def __init__(self):
+        self.__cnx_proxy = pymysql.connect(host='114.115.159.144', user='root', password='zzsn9988', db='clb_project',
+                                           charset='utf8mb4')
+        self.__cursor_proxy = self.__cnx_proxy.cursor()
+        self.cnx = pymysql.connect(host='114.115.159.144', user='root', password='zzsn9988', db='caiji',
+                                   charset='utf8mb4')
+
+        self.cursor = self.cnx.cursor()
+        # 连接到Redis
+        self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)

    def close(self):
        try:
@@ -222,15 +236,7 @@ class BaseCore:
            self.cnx.close()
        except :
            pass
-    def __init__(self):
-        self.__cnx_proxy = pymysql.connect(host='114.115.159.144', user='root', password='zzsn9988', db='clb_project',
-                                           charset='utf8mb4')
-        self.__cursor_proxy = self.__cnx_proxy.cursor()
-        self.cnx = pymysql.connect(host='114.115.159.144', user='root', password='zzsn9988', db='caiji',
-                                   charset='utf8mb4')

-        self.cursor = self.cnx.cursor()
-        pass

    # 计算耗时
    def getTimeCost(self,start, end):
@@ -354,28 +360,37 @@ class BaseCore:
                str = str[0:end+1]
        return str

-    # def pullDateFromSql(self):
-    #     query = "select SocialCode from EnterpriseInfo "
-    #     self.cursor.execute(query)
-    #     result  = self.cursor.fetchall()
-    #     social_list = list(result)
-    #     return social_list
-    #
-    # def redisPushData(self,social_list):
-    #
-    #     #将数据插入到redis中
-    #     for item in social_list:
-    #         self.r.rpush('qy_socialCode', item)
+    # 繁体字转简体字
+    def hant_2_hans(hant_str: str):
+        '''
+        Function: 将 hant_str 由繁体转化为简体
+        '''
+        return zhconv.convert(hant_str, 'zh-hans')

-    # 从Redis的List中获取并移除一个元素
-    def redicPullData(self,type):
+    # 判断字符串里是否含数字
+    def str_have_num(str_num):
+        panduan = False

-        if type == 1:
-            gn_item = self.r.lpop('gnqy_socialCode')
-            return gn_item.decode() if gn_item else None
-        if type == 2:
-            gw_item = self.r.lpop('gwqy_socialCode')
-            return gw_item.decode() if gw_item else None
+        for str_1 in str_num:
+            ppp = str_1.isdigit()
+            if ppp:
+                panduan = ppp
+        return panduan
+
+    # # 从Redis的List中获取并移除一个元素
+    # def redicPullData(self,type,key):
+    # #1 表示国内 2 表示国外
+    #     if type == 1:
+    #         gn_item = self.r.lpop(key)
+    #         return gn_item.decode() if gn_item else None
+    #     if type == 2:
+    #         gw_item = self.r.lpop(key)
+    #         return gw_item.decode() if gw_item else None
+
+    # 从Redis的List中获取并移除一个元素
+    def redicPullData(self,key):
+        item = self.r.lpop(key)
+        return item.decode() if item else None

    # 获得脚本进程PID
    def getPID(self):
@@ -401,8 +416,9 @@ class BaseCore:
            "excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        chrome_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
-        chrome_options.add_argument(
-            'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
+
+        chrome_options.add_argument(self.getRandomUserAgent())
+            # 'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
        driver = webdriver.Chrome(chrome_options=chrome_options, service=service)
        with open('../../base/stealth.min.js') as f:
            js = f.read()
@@ -438,4 +454,20 @@ class BaseCore:
            print(e)
        self.cnx.commit()

+    def GetToken(self):
+        """Return the QCC (企查查) token from the first row of table QCC_token, or None when the query yields no row."""
+        # Note carried over from the original: the token must be re-captured (packet sniffing) about every two hours.
+        self.cursor.execute("select token from QCC_token ")
+        row = self.cursor.fetchone()
+        return row[0] if row else None
+
+    def detect_language(self, text):
+        # 使用langid.py判断文本的语言
+        result = langid.classify(text)
+        if result == '':
+            return 'cn'
+        if result[0] == '':
+            return 'cn'
+        return result[0]
+

--- a/base/RedisPPData.py
+++ b/base/RedisPPData.py
 import time

-import pymysql
-import redis
 from base import BaseCore
 from apscheduler.schedulers.blocking import BlockingScheduler

-
 basecore = BaseCore.BaseCore()
 log = basecore.getLogger()
+cnx = basecore.cnx
+cursor = basecore.cursor
+r = basecore.r

-# 连接到Redis
-r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
+# # 连接到Redis
+# r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
+#
+# cnx = pymysql.connect(host='114.115.159.144', user='root', password='zzsn9988', db='caiji',
+#                            charset='utf8mb4')
+# cursor = cnx.cursor()

-cnx = pymysql.connect(host='114.115.159.144', user='root', password='zzsn9988', db='caiji',
-                           charset='utf8mb4')
-cursor = cnx.cursor()
+# def pullDateFromSql():
+#     gn_query = "select SocialCode from EnterpriseInfo where Place = '1' "
+#     cursor.execute(gn_query)
+#     gn_result = cursor.fetchall()
+#
+#     gw_query = "select SocialCode from EnterpriseInfo where Place = '2' "
+#     cursor.execute(gw_query)
+#     gw_result = cursor.fetchall()
+#
+#     gw_social_list = [item[0] for item in gw_result]
+#     gn_social_list = [item[0] for item in gn_result]
+#     return gn_social_list,gw_social_list

-def pullDateFromSql():
-    gn_query = "select SocialCode from EnterpriseInfo where Place = '1' limit 1 "
+def NewsEnterprise():
+    #获取国内企业
+    gn_query = "select SocialCode from EnterpriseInfo where Place = '1'"
    cursor.execute(gn_query)
    gn_result = cursor.fetchall()
-
-    gw_query = "select SocialCode from EnterpriseInfo where Place = '2' limit 1 "
+    #获取国外企业
+    gw_query = "select SocialCode from EnterpriseInfo where Place = '2'"
    cursor.execute(gw_query)
    gw_result = cursor.fetchall()

    gw_social_list = [item[0] for item in gw_result]
    gn_social_list = [item[0] for item in gn_result]
-    return gn_social_list,gw_social_list
-
-def redisPushData():
+    # return gn_social_list, gw_social_list
    print('=======')
-    gn_social_list,gw_social_list = pullDateFromSql()
+    # gn_social_list,gw_social_list = pullDateFromSql()
    #将数据插入到redis中
    for item in gn_social_list:
-        r.rpush('gnqy_socialCode', item)
+        r.rpush('NewsEnterprise:gnqy_socialCode', item)

    for item in gw_social_list:
-        r.rpush('gwqy_socialCode', item)
-
-# 从Redis的List中获取并移除一个元素
-def redicPullData(type):
-    gn_item = r.lpop('gn_socialCode')
-    gw_item = r.lpop('gw_socialCode')
-    #1 表示国内  2 表示国外
-    if type==1:
-        return gn_item.decode() if gn_item else None
-    if type==2:
-        return gw_item.decode() if gw_item else None
-
-def task(task_time):
+        r.rpush('NewsEnterprise:gwqy_socialCode', item)
+
+def NewsEnterprise_task():
+    # 实例化一个调度器
+    scheduler = BlockingScheduler()
+    # 每天执行一次
+    scheduler.add_job(NewsEnterprise, 'cron', hour=12,minute=0,max_instances=2)
+    try:
+        # redisPushData  # 定时开始前执行一次
+        scheduler.start()
+    except Exception as e:
+        print('定时采集异常', e)
+        pass
+
+def NoticeEnterprise():
+    # 获取国内企业
+    gn_query = "select SocialCode from EnterpriseInfo where Place = '1' and SecuritiesCode is not null limit 1 "
+    cursor.execute(gn_query)
+    gn_result = cursor.fetchall()
+    gn_social_list = [item[0] for item in gn_result]
+    print('=======')
+    for item in gn_social_list:
+        r.rpush('NoticeEnterprise:gnqy_socialCode', item)
+
+def NoticeEnterprise_task():
+    # 实例化一个调度器
+    scheduler = BlockingScheduler()
+    # 每天执行一次
+    scheduler.add_job(NoticeEnterprise, 'cron', hour=12,minute=0)
+    try:
+        # redisPushData  # 定时开始前执行一次
+        scheduler.start()
+    except Exception as e:
+        print('定时采集异常', e)
+        pass
+
+def AnnualEnterprise():
+    # 获取国内企业
+    gn_query = "select SocialCode from EnterpriseInfo where Place = '1' and SecuritiesCode is not null"
+    cursor.execute(gn_query)
+    gn_result = cursor.fetchall()
+    gn_social_list = [item[0] for item in gn_result]
+    print('=======')
+    for item in gn_social_list:
+        r.rpush('AnnualEnterprise:gnqy_socialCode', item)
+
+def AnnualEnterprise_task():
+    # 实例化一个调度器
+    scheduler = BlockingScheduler()
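+    # NOTE: second='*/10' fires every 10 seconds, not once a year; presumably a debug schedule, confirm before deploying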
+    scheduler.add_job(AnnualEnterprise, 'cron', second='*/10')
+    try:
+        # redisPushData  # 定时开始前执行一次
+        scheduler.start()
+    except Exception as e:
+        print('定时采集异常', e)
+        pass
+
+def BaseInfoEnterprise():
+    # 获取国内企业
+    gn_query = "select SocialCode from EnterpriseInfo where Place = '1' limit 1 "
+    cursor.execute(gn_query)
+    gn_result = cursor.fetchall()
+    gn_social_list = [item[0] for item in gn_result]
+    print('=======')
+    for item in gn_social_list:
+        r.rpush('BaseInfoEnterprise:gnqy_socialCode', item)
+
+#企业基本信息
+def BaseInfoEnterprise_task():
    # 实例化一个调度器
    scheduler = BlockingScheduler()
-    # 每半分钟执行一次
-    scheduler.add_job(redisPushData, 'cron', second=task_time, max_instances=3)
-    # 每天早上9点执行一次
-    # scheduler.add_job(self.auto_tb(), 'cron', day='*', hour=12, minute=5, start_date='2021-12-16 09:00:00',end_date='2023-11-30 23:59:59')
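+    # NOTE: second='*/10' fires every 10 seconds, not once a year; presumably a debug schedule, confirm before deploying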
+    scheduler.add_job(BaseInfoEnterprise, 'cron', second='*/10')
    try:
        # redisPushData  # 定时开始前执行一次
        scheduler.start()
@@ -63,8 +130,13 @@ def task(task_time):
        print('定时采集异常', e)
        pass

+
 if __name__ == "__main__":
    start = time.time()
-    task_time = '*/10'
-    task(task_time)
-    log.info(f'====={basecore.getNowTime(1)}=====添加数据成功======耗时：{basecore.getTimeCost(start,time.time())}===')
\ No newline at end of file
+    # NewsEnterprise_task()
+    # NoticeEnterprise_task()
+    AnnualEnterprise_task()
+    log.info(f'====={basecore.getNowTime(1)}=====添加数据成功======耗时：{basecore.getTimeCost(start,time.time())}===')
+    # cnx.close()
+    # cursor.close()
+    # basecore.close()
--- a/base/smart/smart_extractor.py
+++ b/base/smart/smart_extractor.py
@@ -50,7 +50,43 @@ class SmartExtractor:
        构造器：未指定 lang_code 参数时，默认为 cn
        """
        # 支持语言
-        self.goose = Goose({'stopwords_class': StopWordsChinese})
+        supported_lang_code_list = list(SmartExtractor.get_supported_lang_code_dict())
+
+        # 初始化 goose 对象：
+        # 1、根据语言代码，创建 goose 对象
+        if lang_code is None or lang_code == 'cn' or lang_code == 'zh-cn'or lang_code == 'zh':
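+            # Requires word segmentation: Chinese
+            # 1. When lang_code is not passed, or is passed as None, Chinese segmentation is used by default
+            # 2. Flask web API: when the GET parameter lang_code is absent, lang_code arrives as None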
+            self.goose = Goose({'stopwords_class': StopWordsChinese})
+        elif lang_code == 'ko':
+            # 需要分词：韩语
+            # 1、测试：只传递语言，不传递分词器
+            # self.goose = Goose({'use_meta_language': False, 'target_language': 'ko'})  # 测试失败：正文采集为空
+            # self.goose = Goose()    # 测试失败：正文采集为空
+            # 韩语分词：测试成功
+            self.goose = Goose({'stopwords_class': StopWordsKorean})
+        elif lang_code == 'ar':
+            # 需要分词：阿拉伯语
+            # self.goose = Goose({'use_meta_language': False, 'target_language': 'en'})       # 测试失败：正文采集为空
+            # self.goose = Goose()    # 测试成功
+            # self.goose = Goose({'use_meta_language': False, 'target_language': lang_code})  # 测试成功：直接传递语言编码
+            self.goose = Goose({'stopwords_class': StopWordsArabic})
+        elif lang_code == 'en':
+            # 单独测试：英文
+            # self.goose = Goose({'use_meta_language': False, 'target_language': 'en'})
+            # 测试成功：创建Goose对象时，不指定语言默认为英文分词
+            self.goose = Goose()
+        elif lang_code == 'ru':
+            # 单独测试：俄语
+            # self.goose = Goose({'use_meta_language': False, 'target_language': 'en'})       # 测试失败：正文采集为空
+            self.goose = Goose({'use_meta_language': False, 'target_language': lang_code})  # 测试成功：直接传递语言编码
+        elif lang_code in supported_lang_code_list:
+            # 其它语言编码，统一处理，不再单独测试
+            self.goose = Goose({'use_meta_language': False, 'target_language': lang_code})
+        else:
+            # 未识别的语言代码
+            raise Exception(f'智能采集时，无法识别语言代码：{lang_code}')

    def get_extraction_result(self, article, link_text=''):
        """

--- a/comData/BaseInfo_qcc/getQccId.py
+++ b/comData/BaseInfo_qcc/getQccId.py
+
+# -*- coding: utf-8 -*-
+
+import time
+from urllib.parse import quote
+import requests
+import urllib3
+
+
+headers = {
+        'Host': 'xcx.qcc.com',
+        'Connection': 'keep-alive',
+        'Qcc-Platform': 'mp-weixin',
+        'Qcc-Timestamp': '',
+        'Qcc-Version': '1.0.0',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat',
+        'content-type': 'application/json',
+        'Referer': 'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html',
+        'Accept-Encoding': 'gzip, deflate, br,'
+    }
+# 通过企业名称或信用代码获取企查查id
+def find_id_by_name(name):
+    urllib3.disable_warnings()
+
+    qcc_key = name
+    t = str(int(time.time()) * 1000)
+    headers['Qcc-Timestamp'] = t
+    url = f"https://xcx.qcc.com/mp-weixin/forwardApp/v3/base/advancedSearch?token={token}&t={t}&pageIndex=1&needGroup=yes&insuredCntStart=&insuredCntEnd=&startDateBegin=&startDateEnd=&registCapiBegin=&registCapiEnd=&countyCode=&province=&sortField=&isSortAsc=&searchKey={quote(qcc_key)}&searchIndex=default&industryV3="
+    for lll in range(1, 6):
+        try:
+            resp_dict = requests.get(url=url, headers=headers, verify=False).json()
+            break
+        except:
+            print('重试')
+            time.sleep(5)
+            continue
+    time.sleep(2)
+    if resp_dict['result']['Result']:
+        result_dict = resp_dict['result']['Result'][0]
+        KeyNo = result_dict['KeyNo']
+        Name = result_dict['Name'].replace('<em>', '').replace('</em>', '').strip()
+        if Name == '':
+            KeyNo = ''
+    else:
+        KeyNo = ''
+
+    print("{}，企业代码为:{}".format(qcc_key, KeyNo))
+    return KeyNo
\ No newline at end of file
--- a/comData/annualReport_ZJH/证监会-年报.py
+++ b/comData/annualReport_ZJH/证监会-年报.py
--- a/comData/noticeReport_ZJH/证监会-公告.py
+++ b/comData/noticeReport_ZJH/证监会-公告.py
-"""
+"""
@@ -28,11 +28,11 @@ cursor_ = cnx_.cursor()
 tracker_conf = get_tracker_conf('./client.conf')
 client = Fdfs_client(tracker_conf)

+taskType = '企业公告/证监会'

-def RequestUrl(url, payload, social_code):
+def RequestUrl(url, payload, social_code,start_time):
    # ip = get_proxy()[random.randint(0, 3)]
-    start_time_url = time.time()
-    taskType = '公告'
+
    for m in range(0, 3):
        try:
            response = requests.post(url=url, headers=headers, data=payload)  # ,proxies=ip)
@@ -46,19 +46,17 @@ def RequestUrl(url, payload, social_code):
    if response.status_code == 200:
        # 请求成功，处理响应数据
        # print(response.text)
+        soup = BeautifulSoup(response.text, 'html.parser')
        pass
    else:
        # 请求失败，输出错误信息
        log.error('请求失败:', url)
        state = 0
-        takeTime = baseCore.getTimeCost(start_time_url, time.time())
+        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, url, '请求失败')
-
-    soup = BeautifulSoup(response.text, 'html.parser')
-
+        soup = ''
    return soup

-
 def getUrl(code, url_parms, Catagory2_parms):
    # 深市
    if code[0] == '2' or code[0] == '0' or code[0] == '3':
@@ -147,28 +145,33 @@ def InsterInto(short_name, social_code, name_pdf, pub_time, pdf_url, report_type
        return inster

    # 信息插入数据库
-    insert_sql = '''insert into brpa_source_article(social_credit_code,title,summary,content,publish_date,source_address,origin,author,type,lang) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
-
-    list_info = [
-        social_code,
-        name_pdf,
-        '',  # 摘要
-        '',  # 正文
-        pub_time,  # 发布时间
-        pdf_url,
-        '证监会',
-        report_type,
-        '1',
-        'zh'
-    ]
-    cursor_.execute(insert_sql, tuple(list_info))
-    cnx_.commit()
-    insert = True
-    return insert
+    try:
+        insert_sql = '''insert into brpa_source_article(social_credit_code,title,summary,content,publish_date,source_address,origin,author,type,lang) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
+
+        list_info = [
+            social_code,
+            name_pdf,
+            '',  # 摘要
+            '',  # 正文
+            pub_time,  # 发布时间
+            pdf_url,
+            '证监会',
+            report_type,
+            '1',
+            'zh'
+        ]
+        cursor_.execute(insert_sql, tuple(list_info))
+        cnx_.commit()
+        insert = True
+        return insert
+    except:
+        state = 0
+        takeTime = baseCore.getTimeCost(start_time, time.time())
+        baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '数据库传输失败')
+        return insert


 def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time):
-    taskType = '公告'
    sel_sql = "select article_id from brpa_source_article where source_address = %s"
    cursor_.execute(sel_sql, pdf_url)
    row = cursor_.fetchone()
@@ -251,7 +254,8 @@ def SpiderByZJH(url, payload, dic_info, start_time):  # dic_info 数据库中获
    short_name = dic_info[4]

    soup = RequestUrl(url, payload, social_code, start_time)
-
+    if soup == '':
+        return
    # 先获取页数

    page = soup.find('div', class_='pages').find('ul', class_='g-ul').text
@@ -274,6 +278,8 @@ def SpiderByZJH(url, payload, dic_info, start_time):  # dic_info 数据库中获
            href = url.split('index')[0] + f'index_{i}_f.html'

        soup = RequestUrl(href, payload, social_code, start_time)
+        if soup == '':
+            continue
        tr_list = soup.find('div', id='txt').find_all('tr')
        pageIndex = 0
        for tr in tr_list[1:]:
@@ -303,7 +309,7 @@ def SpiderByZJH(url, payload, dic_info, start_time):  # dic_info 数据库中获
                    log.info(f'{short_name}==============解析传输操作成功')
                    state = 1
                    takeTime = baseCore.getTimeCost(start_time, time.time())
-                    baseCore.recordLog(social_code, '公告', state, takeTime, pdf_url, '')
+                    baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '')
                    pass
                else:
                    errorCount += 1
@@ -354,8 +360,9 @@ if __name__ == '__main__':
    while True:
        start_time = time.time()
        # 获取企业信息
-        social_code = ''
-        if social_code == '':
+        social_code = baseCore.redicPullData('NoticeEnterprise:gnqy_socialCode')
+        # 判断 如果Redis中已经没有数据，则等待
+        if social_code == None:
            time.sleep(20)
            continue
        dic_info = baseCore.getInfomation(social_code)

--- a/comData/tcyQydt/test.py
+++ b/comData/tcyQydt/test.py
 import json

+from bs4 import BeautifulSoup
+import langid
+
 from base.BaseCore import BaseCore
+baseCore =BaseCore()
+# print(baseCore.detect_language("是对jhjjhjhhjjhjhjh的浮点数"))
+
+
+#
+# def detect_language(text):
+#     # 使用langid.py判断文本的语言
+#     lang, confidence = langid.classify(text)
+#     print(lang,confidence)
+#     return lang
+# detect_language("123")

-s='jQuery1124020359136113854692_1688967721474({"rc":0,"rt":6,"svr":182993358,"lt":1,"full":1,"dlmkts":"","data":{"total":5488,"diff":[{"f1":2,"f2":35.37,"f3":130.87,"f4":20.05,"f5":505082,"f6":1561753667.0,"f7":72.85,"f8":73.63,"f9":79.87,"f10":"-","f11":-0.34,"f12":"603119","f13":1,"f14":"N\xe6\xb5\x99\xe8\x8d\xa3","f15":37.54,"f16":26.38,"f17":28.88,"f18":15.32,"f20":9903600000,"f21":2426214099,"f22":-0.03,"f23":6.46,"f24":130.87,"f25":130.87,"f62":503279629.0,"f115":70.77,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":70.7,"f3":26.98,"f4":15.02,"f5":278191,"f6":2015432017.69,"f7":19.83,"f8":73.92,"f9":44.38,"f10":"-","f11":0.41,"f12":"301371","f13":0,"f14":"N\xe6\x95\xb7\xe5\xb0\x94\xe4\xbd\xb3","f15":80.04,"f16":69.0,"f17":80.0,"f18":55.68,"f20":28285656000,"f21":2660599297,"f22":0.11,"f23":5.64,"f24":26.98,"f25":26.98,"f62":476657031.0,"f115":33.47,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":27.6,"f3":20.0,"f4":4.6,"f5":135775,"f6":348360366.27,"f7":21.04,"f8":33.94,"f9":212.8,"f10":3.1,"f11":0.0,"f12":"301316","f13":0,"f14":"\xe6\x85\xa7\xe5\x8d\x9a\xe4\xba\x91\xe9\x80\x9a","f15":27.6,"f16":22.76,"f17":23.11,"f18":23.0,"f20":11040276000,"f21":1104274261,"f22":0.0,"f23":11.68,"f24":18.1,"f25":44.43,"f62":107348086.0,"f115":124.43,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":43.62,"f3":20.0,"f4":7.27,"f5":75204,"f6":311935188.44,"f7":21.79,"f8":29.67,"f9":56.11,"f10":13.27,"f11":0.0,"f12":"301289","f13":0,"f14":"\xe5\x9b\xbd\xe7\xbc\x86\xe6\xa3\x80\xe6\xb5\x8b","f15":43.62,"f16":35.7,"f17":36.61,"f18":36.35,"f20":3402360000,"f21":1105762682,"f22":0.0,"f23":3.86,"f24":28.26,"f25":35.55,"f62":80534335.0,"f115":47.25,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":40.98,"f3":20.0,"f4":6.83,"f5":118733,"f6":464542197.42,"f7":20.73,"f8":40.73,"f9":56.02,"f10":2.57,"f11":0.0,"f12":"300881","f13":0,"f14":"\xe7\x9b\x9b\xe5\xbe\xb7\xe9\x91\xab\xe6\xb3\xb0","f15":40.98,"f16":33.9,"f17":33
.9,"f18":34.15,"f20":4507800000,"f21":1194567000,"f22":0.0,"f23":5.48,"f24":23.81,"f25":42.05,"f62":16802132.0,"f115":56.01,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":21.0,"f3":19.45,"f4":3.42,"f5":50301,"f6":97244231.42,"f7":16.1,"f8":16.87,"f9":46.64,"f10":1.95,"f11":1.35,"f12":"873576","f13":0,"f14":"\xe5\xa4\xa9\xe5\x8a\x9b\xe5\xa4\x8d\xe5\x90\x88","f15":21.0,"f16":18.17,"f17":18.18,"f18":17.58,"f20":2247000000,"f21":626162250,"f22":0.72,"f23":5.16,"f24":50.21,"f25":50.21,"f62":11286257.0,"f115":29.96,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":76.8,"f3":16.21,"f4":10.71,"f5":153518,"f6":1100431330.98,"f7":23.24,"f8":73.58,"f9":190.79,"f10":1.6,"f11":0.27,"f12":"301315","f13":0,"f14":"\xe5\xa8\x81\xe5\xa3\xab\xe9\xa1\xbf","f15":79.31,"f16":63.95,"f17":63.95,"f18":66.09,"f20":6758400000,"f21":1602347750,"f22":0.17,"f23":7.03,"f24":137.84,"f25":137.84,"f62":112419255.0,"f115":102.68,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":72.99,"f3":16.17,"f4":10.16,"f5":106236,"f6":714127513.24,"f7":23.68,"f8":52.41,"f9":123.41,"f10":1.71,"f11":0.4,"f12":"301141","f13":0,"f14":"\xe4\xb8\xad\xe7\xa7\x91\xe7\xa3\x81\xe4\xb8\x9a","f15":74.88,"f16":60.0,"f17":62.85,"f18":62.83,"f20":6466528467,"f21":1479619267,"f22":0.07,"f23":3.14,"f24":96.74,"f25":78.02,"f62":-26422445.0,"f115":87.31,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":27.3,"f3":12.81,"f4":3.1,"f5":171865,"f6":442577004.48,"f7":15.25,"f8":7.3,"f9":-156.2,"f10":0.94,"f11":-0.15,"f12":"300551","f13":0,"f14":"\xe5\x8f\xa4\xe9\xb3\x8c\xe7\xa7\x91\xe6\x8a\x80","f15":27.55,"f16":23.86,"f17":24.2,"f18":24.2,"f20":9439055235,"f21":6427896275,"f22":-0.11,"f23":8.93,"f24":48.37,"f25":133.73,"f62":16013778.0,"f115":-126.12,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":84.3,"f3":12.18,"f4":9.15,"f5":124022,"f6":989104033.4,"f7":17.33,"f8":64.35,"f9":99.53,"f10":1.15,"f11":0.19,"f12":"301398","f13":0,"f14":"\xe6\x98\x9f\
xe6\xba\x90\xe5\x8d\x93\xe9\x95\x81","f15":86.5,"f16":73.48,"f17":75.48,"f18":75.15,"f20":6744000000,"f21":1624735481,"f22":-0.04,"f23":6.81,"f24":157.88,"f25":173.35,"f62":-26812467.0,"f115":105.29,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":34.85,"f3":10.95,"f4":3.44,"f5":27626,"f6":95746251.0,"f7":9.87,"f8":7.18,"f9":-37.27,"f10":9.74,"f11":-0.03,"f12":"688622","f13":1,"f14":"\xe7\xa6\xbe\xe4\xbf\xa1\xe4\xbb\xaa\xe5\x99\xa8","f15":36.0,"f16":32.9,"f17":35.0,"f18":31.41,"f20":2439416569,"f21":1341637317,"f22":-0.03,"f23":4.74,"f24":-5.76,"f25":7.23,"f62":18152096.0,"f115":-36.22,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":87.8,"f3":10.66,"f4":8.46,"f5":22037,"f6":184811228.0,"f7":11.33,"f8":6.52,"f9":116.36,"f10":4.84,"f11":1.09,"f12":"688776","f13":1,"f14":"\xe5\x9b\xbd\xe5\x85\x89\xe7\x94\xb5\xe6\xb0\x94","f15":87.99,"f16":79.0,"f17":79.0,"f18":79.34,"f20":9516064188,"f21":2968587801,"f22":-0.22,"f23":5.39,"f24":-5.88,"f25":-29.79,"f62":2907315.0,"f115":65.69,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":2.05,"f3":10.22,"f4":0.19,"f5":3258788,"f6":657251653.18,"f7":9.68,"f8":6.48,"f9":-12.82,"f10":3.95,"f11":0.0,"f12":"000413","f13":0,"f14":"\xe4\xb8\x9c\xe6\x97\xad\xe5\x85\x89\xe7\x94\xb5","f15":2.05,"f16":1.87,"f17":1.87,"f18":1.86,"f20":11547137393,"f21":10310048690,"f22":0.0,"f23":0.52,"f24":17.82,"f25":15.82,"f62":213263692.0,"f115":-8.55,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":2.7,"f3":10.2,"f4":0.25,"f5":1107878,"f6":291343381.08,"f7":11.84,"f8":7.94,"f9":-19.65,"f10":2.01,"f11":0.0,"f12":"002256","f13":0,"f14":"\xe5\x85\x86\xe6\x96\xb0\xe8\x82\xa1\xe4\xbb\xbd","f15":2.7,"f16":2.41,"f17":2.44,"f18":2.45,"f20":5082512054,"f21":3769280384,"f22":0.0,"f23":4.31,"f24":11.11,"f25":12.97,"f62":96164236.0,"f115":-99.3,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":2.92,"f3":10.19,"f4":0.27,"f5":1178068,"f6":333498626.0,"f7":9.06,"f8":7.34,"f9":7.63,"f
10":1.4,"f11":0.0,"f12":"600239","f13":1,"f14":"\xe4\xba\x91\xe5\x8d\x97\xe5\x9f\x8e\xe6\x8a\x95","f15":2.92,"f16":2.68,"f17":2.69,"f18":2.65,"f20":4688605774,"f21":4688605774,"f22":0.0,"f23":2.89,"f24":28.07,"f25":51.3,"f62":27795948.0,"f115":-16.59,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":3.15,"f3":10.14,"f4":0.29,"f5":2973491,"f6":920586623.66,"f7":8.74,"f8":28.9,"f9":-7.07,"f10":4.18,"f11":0.0,"f12":"002630","f13":0,"f14":"\xe5\x8d\x8e\xe8\xa5\xbf\xe8\x83\xbd\xe6\xba\x90","f15":3.15,"f16":2.9,"f17":2.95,"f18":2.86,"f20":3719520000,"f21":3240482440,"f22":0.0,"f23":4.9,"f24":26.51,"f25":7.14,"f62":-18293260.0,"f115":-5.07,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":4.79,"f3":10.11,"f4":0.44,"f5":1857359,"f6":864538200.0,"f7":10.8,"f8":9.31,"f9":24.64,"f10":9.05,"f11":0.0,"f12":"600577","f13":1,"f14":"\xe7\xb2\xbe\xe8\xbe\xbe\xe8\x82\xa1\xe4\xbb\xbd","f15":4.79,"f16":4.32,"f17":4.35,"f18":4.35,"f20":9959122877,"f21":9559956211,"f22":0.0,"f23":2.07,"f24":14.05,"f25":16.26,"f62":161845983.0,"f115":26.21,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":4.36,"f3":10.1,"f4":0.4,"f5":617159,"f6":264661451.0,"f7":11.62,"f8":2.74,"f9":122.48,"f10":3.79,"f11":0.0,"f12":"601777","f13":1,"f14":"\xe5\x8a\x9b\xe5\xb8\x86\xe7\xa7\x91\xe6\x8a\x80","f15":4.36,"f16":3.9,"f17":3.95,"f18":3.96,"f20":19931840280,"f21":9811962000,"f22":0.0,"f23":1.94,"f24":24.22,"f25":12.95,"f62":41966291.0,"f115":137.9,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":3.27,"f3":10.1,"f4":0.3,"f5":290547,"f6":93712867.6,"f7":8.08,"f8":2.28,"f9":1394.52,"f10":1.03,"f11":0.0,"f12":"002175","f13":0,"f14":"\xe4\xb8\x9c\xe6\x96\xb9\xe6\x99\xba\xe9\x80\xa0","f15":3.27,"f16":3.03,"f17":3.04,"f18":2.97,"f20":4175072977,"f21":4175040277,"f22":0.0,"f23":8.53,"f24":13.54,"f25":-8.66,"f62":52561839.0,"f115":41.98,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":2.51,"f3":10.09,"f4":0.23,"f5":1715205,"f6":423246793
.0,"f7":10.96,"f8":5.97,"f9":-4.84,"f10":2.8,"f11":0.0,"f12":"600569","f13":1,"f14":"\xe5\xae\x89\xe9\x98\xb3\xe9\x92\xa2\xe9\x93\x81","f15":2.51,"f16":2.26,"f17":2.26,"f18":2.28,"f20":7209777679,"f21":7209777679,"f22":0.0,"f23":1.02,"f24":17.84,"f25":21.26,"f62":88473646.0,"f115":-2.55,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2}]}});'
+from textblob import TextBlob

-baseCore = BaseCore()
-jsondata =baseCore.getSubStr(s,'{','}')
+def detect_language(text):
+    blob = TextBlob(text)
+    lang = blob.detect_language()
+    return lang

-retJsonData = json.loads(jsondata)
+text = "Hello, how are you?"
+language = detect_language(text)
+print(language)

-dataList = retJsonData['data']['diff']
-print(len(dataList))

-for dataIndex in range(len(dataList)):
-    print(f"{dataList[dataIndex]['f12']}----{dataList[dataIndex]['f14']}")
\ No newline at end of file
--- a/comData/tcyQydt/tyc_qydt_add.py
+++ b/comData/tcyQydt/tyc_qydt_add.py
@@ -31,7 +31,7 @@ import urllib3
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 jieba.cut("必须加载jieba")
 # 初始化，设置中文分词
-smart = smart_extractor.SmartExtractor('cn')
+
 baseCore = BaseCore()
 log = baseCore.getLogger()
 cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
@@ -51,9 +51,10 @@ headers = {
 }


+taskType = '企业动态/天眼查'
+
 def beinWork(tyc_code, social_code):
    start_time = time.time()
-    taskType = '企业动态'
    time.sleep(3)
    # retData={'up_state':False,'total':0,'up_okCount':0,'up_errorCount':0,'up_repetCount':0}
    retData = {'total': 0, 'up_okCount': 0, 'up_errorCount': 0, 'up_repetCount': 0}
@@ -67,10 +68,6 @@ def beinWork(tyc_code, social_code):
            # time.sleep(random.randint(3, 5))
            break
        except Exception as e:
-            log.error(f"request请求异常----{m}-----{e}")
-            state = 0
-            takeTime = baseCore.getTimeCost(start_time, time.time())
-            baseCore.recordLog(social_code, taskType, state, takeTime, url, e)
            pass

    if (response.status_code == 200):
@@ -87,7 +84,7 @@ def beinWork(tyc_code, social_code):
        total = json_1['data']['total']
    except:
        log.error(f"{tyc_code}-----获取总数失败")
-        e = '获取总是失败'
+        e = '获取总数失败'
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, url, e)
@@ -171,6 +168,8 @@ def beinWork(tyc_code, social_code):
                time_format = baseCore.getNowTime(1)
            try:
                # 开始进行智能解析
+                lang = baseCore.detect_language(title)
+                smart = smart_extractor.SmartExtractor(lang)
                contentText = smart.extract_by_url(link).text
                # time.sleep(3)
            except Exception as e:
@@ -236,44 +235,46 @@ def beinWork(tyc_code, social_code):
                    'socialCreditCode': social_code,
                    'year': time_format[:4]
                }
+            except Exception as e:
+                log.info(f'传输失败:{social_code}----{link}')
+                e = '数据库传输失败'
+                state = 0
+                takeTime = baseCore.getTimeCost(start_time, time.time())
+                baseCore.recordLog(social_code, taskType, state, takeTime, link, e)
+                continue
                # print(dic_news)
                # 将相应字段通过kafka传输保存
-                try:
-                    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
-                    kafka_result = producer.send("researchReportTopic",
-                                                 json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
+            try:
+                producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
+                kafka_result = producer.send("researchReportTopic",
+                                             json.dumps(dic_news, ensure_ascii=False).encode('utf8'))

-                    print(kafka_result.get(timeout=10))
+                print(kafka_result.get(timeout=10))

-                    dic_result = {
-                        'success': 'ture',
-                        'message': '操作成功',
-                        'code': '200',
-                    }
-                    log.info(dic_result)
-                    # 传输成功,写入日志中
-                    state = 1
-                    takeTime = baseCore.getTimeCost(start_time, time.time())
-                    baseCore.recordLog(social_code, taskType, state, takeTime, link, '')
-                    # return True
-                except Exception as e:
-                    dic_result = {
-                        'success': 'false',
-                        'message': '操作失败',
-                        'code': '204',
-                        'e': e
-                    }
-                    log.error(dic_result)
-                    e = str(e) + '操作失败'
-                    state = 0
-                    takeTime = baseCore.getTimeCost(start_time, time.time())
-                    baseCore.recordLog(social_code, taskType, state, takeTime, link, e)
+                dic_result = {
+                    'success': 'ture',
+                    'message': '操作成功',
+                    'code': '200',
+                }
+                log.info(dic_result)
+                # 传输成功,写入日志中
+                state = 1
+                takeTime = baseCore.getTimeCost(start_time, time.time())
+                baseCore.recordLog(social_code, taskType, state, takeTime, link, '')
+                # return True
            except Exception as e:
-                log.info(f'传输失败:{social_code}----{link}')
-                e = '传输失败'
+                dic_result = {
+                    'success': 'false',
+                    'message': '操作失败',
+                    'code': '204',
+                    'e': e
+                }
+                log.error(dic_result)
+                e = 'Kafka操作失败'
                state = 0
                takeTime = baseCore.getTimeCost(start_time, time.time())
                baseCore.recordLog(social_code, taskType, state, takeTime, link, e)
+
        log.info(f"获取分页数据--{tyc_code}----分页{num}，耗时{baseCore.getTimeCost(start_page, time.time())}")


@@ -287,8 +288,9 @@ def beinWork(tyc_code, social_code):
 # 日志信息保存至现已创建好数据库中,因此并没有再对此前保存日志信息数据库进行保存
 def doJob():
    while True:
+
        # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
-        social_code = baseCore.redicPullData(1)
+        social_code = baseCore.redicPullData('NewsEnterprise:gnqy_socialCode')
        # 判断 如果Redis中已经没有数据，则等待
        if social_code == 'None':
            time.sleep(20)

--- a/comData/yhcj/NewsYahooAuto.py
+++ b/comData/yhcj/NewsYahooAuto.py
--- a/comData/yhcj/雅虎财经_企业动态.py
+++ b/comData/yhcj/雅虎财经_企业动态.py