Commit 5122cc37  Author: 薛凌堃

2023/8/12

Parent 98ca1672
@@ -364,14 +364,14 @@ class BaseCore:
         return str

     # 繁体字转简体字
-    def hant_2_hans(hant_str: str):
+    def hant_2_hans(self,hant_str: str):
         '''
         Function: 将 hant_str 由繁体转化为简体
         '''
         return zhconv.convert(hant_str, 'zh-hans')

     # 判断字符串里是否含数字
-    def str_have_num(str_num):
+    def str_have_num(self,str_num):
         panduan = False
         for str_1 in str_num:
@@ -463,6 +463,7 @@ class BaseCore:
         # token = '67ec7402166df1da84ae83c4b95cefc0'  # 需要隔两个小时左右抓包修改
         self.cursor.execute(query)
         token = self.cursor.fetchone()[0]
+        return token

     # 检测语言
     def detect_language(self, text):
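With the change above, GetToken() now returns the token it reads from the database, and hant_2_hans / str_have_num become instance methods. A minimal usage sketch of the patched helpers (illustrative only; it assumes a BaseCore instance constructed exactly as the scripts below construct it):

from base.BaseCore import BaseCore

baseCore = BaseCore()
token = baseCore.GetToken()                     # the token is now returned to the caller
simplified = baseCore.hant_2_hans('企業資訊')    # -> '企业资讯'
if baseCore.str_have_num('兴业银行2023'):        # presumably True when the string contains a digit
    pass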
# -*- coding: utf-8 -*-
import pandas as pd
import time
import requests
import json
from kafka import KafkaProducer
from base.BaseCore import BaseCore
from getQccId import find_id_by_name
baseCore = BaseCore()
cnx = baseCore.cnx
cursor = baseCore.cursor
log = baseCore.getLogger()
# 通过企查查id获取企业基本信息
def info_by_id(com_id,com_name):
aa_dict_list = []
t = str(int(time.time()) * 1000)
headers['Qcc-Timestamp'] = t
url = "https://xcx.qcc.com/mp-weixin/forwardApp/v1/ent/detail?token={}&t={}&unique={}".format(token, t, com_id)
resp_dict = requests.get(url=url, headers=headers, verify=False).json()
time.sleep(2)
com_jc_name = ''
try:
result_dict = resp_dict['result']['Company']
except:
print(com_name + ":获取失败")
return aa_dict_list
company_name = result_dict['Name']
CreditCode = result_dict['CreditCode']
if CreditCode is None:
CreditCode = ''
try:
OperName = result_dict['Oper']['Name']
except:
OperName = ''
if OperName is None:
OperName = ''
if baseCore.str_have_num(OperName):
OperName = ''
try:
Status = result_dict['ShortStatus']
except:
Status = ''
if Status is None:
Status = ''
try:
StartDate = result_dict['StartDate']
except:
StartDate = ''
if StartDate is None:
StartDate = ''
try:
RegistCapi = result_dict['RegistCapi']
except:
RegistCapi = ''
if RegistCapi is None:
RegistCapi = ''
RecCap = '' # result_dict['RecCap'] #实际缴纳金额,现已没有显示
if RecCap is None:
RecCap = ''
try:
OrgNo = result_dict['CreditCode'][8:-2] + '-' + result_dict['CreditCode'][-2] # 组织机构代码,现已没有显示
except:
OrgNo = ''
if OrgNo is None:
OrgNo = ''
try:
TaxNo = result_dict['TaxNo']
except:
TaxNo = ''
if TaxNo is None:
TaxNo = ''
try:
EconKind = result_dict['EconKind']
except:
EconKind = ''
if EconKind is None:
EconKind = ''
TermStart = '' # result_dict['TermStart'] 营业期限自,现已没有显示
if TermStart is None:
TermStart = ''
TeamEnd = '' # result_dict['TeamEnd']营业期限至,现已没有显示
if TeamEnd is None:
TeamEnd = ''
try:
SubIndustry = result_dict['Industry']['SubIndustry']
except:
SubIndustry = ''
if SubIndustry is None:
SubIndustry = ''
try:
Province = result_dict['Area']['Province']
except:
Province = ''
try:
City = result_dict['Area']['City']
except:
City = ''
try:
County = result_dict['Area']['County']
except:
County = ''
try:
region = Province + City + County
except:
region = ''
BelongOrg = '' # result_dict['BelongOrg']登记机关,现已没有显示
can_bao = ''
CommonList = [] # result_dict['CommonList']参保人数,现已没有显示
for Common_dict in CommonList:
try:
KeyDesc = Common_dict['KeyDesc']
except:
continue
if KeyDesc == '参保人数':
can_bao = Common_dict['Value']
if can_bao == '0':
can_bao = ''
OriginalName = ''
try:
OriginalName_lists = result_dict['OriginalName']
for OriginalName_dict in OriginalName_lists:
OriginalName += OriginalName_dict['Name'] + ' '
except:
OriginalName = ''
try:
OriginalName = OriginalName.strip()
except:
OriginalName = ''
EnglishName = '' # result_dict['EnglishName']企业英文名,现已没有显示
if EnglishName is None:
EnglishName = ''
IxCode = '' # result_dict['IxCode']进出口企业代码,现已没有显示
if IxCode is None:
IxCode = ''
Address = result_dict['Address']
if Address is None:
Address = ''
Scope = '' # result_dict['Scope']经营范围,现已没有显示
if Scope is None:
Scope = ''
try:
PhoneNumber = result_dict['companyExtendInfo']['Tel']
except:
PhoneNumber = ''
if PhoneNumber is None:
PhoneNumber = ''
try:
WebSite = result_dict['companyExtendInfo']['WebSite']
except:
WebSite = None
if WebSite is None:
try:
WebSite = result_dict['ContactInfo']['WebSite'][0]['Url']
except:
WebSite = ''
try:
Email = result_dict['companyExtendInfo']['Email']
except:
Email = ''
if Email is None:
Email = ''
try:
Desc = result_dict['companyExtendInfo']['Desc']
except:
Desc = ''
if Desc is None:
Desc = ''
try:
Info = result_dict['companyExtendInfo']['Info']
except:
Info = ''
if Info is None:
Info = ''
company_name = baseCore.hant_2_hans(company_name)
t = str(int(time.time()) * 1000)
headers['Qcc-Timestamp'] = t
url = "https://xcx.qcc.com/mp-weixin/forwardApp/v6/base/getEntDetail?token={}&t={}&unique={}".format(token, t,
com_id)
resp_dict2 = requests.get(url=url, headers=headers, verify=False).json()
time.sleep(1)
try:
com2 = resp_dict2['result']['Company']
except:
com2 = ''
try:
Scope = com2['Scope']
except:
Scope = ''
try:
CheckDate = com2['CheckDate']
except:
CheckDate = ''
if CheckDate is None:
CheckDate = ''
try:
TaxpayerType = com2['TaxpayerType'] #纳税人资质
except:
TaxpayerType = ''
if TaxpayerType is None:
TaxpayerType = ''
try:
No = com2['No']
except:
No = ''
if No is None:
No = ''
try:
IxCode = com2['IxCode']
except:
IxCode = ''
try:
OrgNo = com2['OrgNo']
except:
OrgNo = ''
try:
for Common_t in com2['CommonList']:
try:
if Common_t['KeyDesc'] == '参保人数':
can_bao = Common_t['Value']
except:
pass
except:
can_bao = ''
try:
TermStart = com2['TermStart']
except:
TermStart = ''
try:
TeamEnd = com2['TeamEnd']
except:
TeamEnd = ''
try:
RecCap = com2['RecCap']
except:
RecCap = ''
try:
No = com2['No']
except:
No = ''
try:
SubIndustry = com2['IndustryArray'][-1]
except:
SubIndustry = ''
try:
BelongOrg = com2['BelongOrg']
except:
BelongOrg = ''
try:
EnglishName = com2['EnglishName']
except:
EnglishName = ''
aa_dict = {
'qccId': com_id, # 企查查企业id
'name': company_name, # 企业名称
'shortName': com_jc_name, # 企业简称
'socialCreditCode': CreditCode, # 统一社会信用代码
'legalPerson': OperName, # 法定代表人
'officialPhone': PhoneNumber, # 电话
'officialUrl': WebSite, # 官网
'officialEmail': Email, # 邮箱
'briefInfo': Desc, # 简介
'registerStatus': Status, # 登记状态
'incorporationDate': StartDate, # 成立日期
'capital': RegistCapi, # 注册资本
'paidCapital': RecCap, # 实缴资本
'approvalDate': CheckDate, # 核准日期
'organizationCode': OrgNo, # 组织机构代码
'registerNo': No, # 工商注册号
'taxpayerNo': CreditCode, # 纳税人识别号
'type': EconKind, # 企业类型
'businessStartDate': TermStart, # 营业期限自
'businessEndDate': TeamEnd, # 营业期限至
'taxpayerQualification': TaxpayerType, # 纳税人资质
'industry': SubIndustry, # 所属行业
'region': region,
'province': Province, # 所属省
'city': City, # 所属市
'county': County, # 所属县
'registerDepartment': BelongOrg, # 登记机关
'scale': Info, # 人员规模
'insured': can_bao, # 参保人数
'beforeName': OriginalName, # 曾用名
'englishName': EnglishName, # 英文名
'importExportEnterpriseCode': IxCode, # 进出口企业代码
'address': Address, # 地址
'businessRange': Scope, # 经营范围
'status': 0, # 状态
}
aa_dict_list.append(aa_dict)
print(company_name + ":爬取完成")
return aa_dict_list
if __name__ == '__main__':
taskType = '基本信息/企查查'
headers = {
'Host': 'xcx.qcc.com',
'Connection': 'keep-alive',
'Qcc-Platform': 'mp-weixin',
'Qcc-Timestamp': '',
'Qcc-Version': '1.0.0',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat',
'content-type': 'application/json',
'Referer': 'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html',
'Accept-Encoding': 'gzip, deflate, br,'
}
#从redis里拿数据
while True:
start = time.time()
# TODO:需要隔两个小时左右抓包修改,token从数据库中获得
token = baseCore.GetToken()
list_weicha = []
list_all_info = []
name_list = []
start_time = time.time()
# 获取企业信息
query = "SELECT * FROM Tfbs where col3 is not null and length(col3)>3 and col3 not like 'ZZSN%' and state1=1 limit 1 "
#兴业银行
# query = "SELECT * FROM Tfbs where col3 is not null and length(col3)>3 and col3 not like 'ZZSN%' and col5='兴业银行'"
cursor.execute(query)
row = cursor.fetchone()
if row:
pass
else:
print('没有数据了,结束脚本')
break
com_name = row[6]
social_code = row[4]
code = row[7]
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
updateBeginSql = f"update Tfbs set state1=0,date2='{time_now}' where col3='{social_code}' "
# print(updateBeginSql)
cursor.execute(updateBeginSql)
cnx.commit()
company_id = find_id_by_name(start,token,social_code)
if company_id == False:
#表示token失效
time.sleep(10)
updateBeginSql = f"update Tfbs set state1=1,date2='{time_now}' where col3='{social_code}' "
# print(updateBeginSql)
cursor.execute(updateBeginSql)
cnx.commit()
continue
if company_id == "":
log.info(com_name + ":企业ID获取失败")
list_weicha.append(com_name + ":企业ID获取失败")
#400表示企业更新失败
updateBeginSql = f"update Tfbs set state1=400,date2='{time_now}' where col3='{social_code}' "
# print(updateBeginSql)
cursor.execute(updateBeginSql)
cnx.commit()
continue
else:
post_data_list = info_by_id(company_id,social_code)
for post_data in post_data_list:
list_all_info.append(post_data)
if post_data is None:
log.info(com_name + ":企业信息获取失败")
list_weicha.append(com_name + ":企业信息获取失败")
# 400表示企业更新失败
updateBeginSql = f"update Tfbs set state1=400,date2='{time_now}' where col3='{social_code}' "
# print(updateBeginSql)
cursor.execute(updateBeginSql)
cnx.commit()
continue
get_name = post_data['name']
get_socialcode = post_data['socialCreditCode']
name_compile = {
'yuan_name':com_name,
'get_name':get_name
}
name_list.append(name_compile)
log.info(f'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time,time.time())}')
try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 0, 2))
kafka_result = producer.send("regionInfo", json.dumps(post_data, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10))
except:
exception = 'kafka传输失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(get_socialcode, taskType, state, takeTime, '', exception)
log.info(f"{get_name}--{get_socialcode}--kafka传输失败")
#200表示成功
updateBeginSql = f"update Tfbs set state1=200,date2='{time_now}' where col3='{social_code}' "
# print(updateBeginSql)
cursor.execute(updateBeginSql)
cnx.commit()
nowtime = baseCore.getNowTime(1).replace('-','_')[:10]
companyName = pd.DataFrame(name_list)
companyName.to_excel(f'./data/企业名称对比_{nowtime}.xlsx',index=False)
false_com = pd.DataFrame(list_weicha)
false_com.to_excel(f'./data/采集失败企业名单_{nowtime}.xlsx',index=False)
@@ -5,8 +5,10 @@ import time
 from urllib.parse import quote
 import requests
 import urllib3
+from base.BaseCore import BaseCore
+baseCore = BaseCore()
+log = baseCore.getLogger()
 headers = {
     'Host': 'xcx.qcc.com',
     'Connection': 'keep-alive',
@@ -19,7 +21,7 @@ headers = {
     'Accept-Encoding': 'gzip, deflate, br,'
 }

 # 通过企业名称或信用代码获取企查查id
-def find_id_by_name(name):
+def find_id_by_name(start,token,name):
     urllib3.disable_warnings()
     qcc_key = name
@@ -35,14 +37,19 @@ def find_id_by_name(name):
             time.sleep(5)
             continue
     time.sleep(2)
-    if resp_dict['result']['Result']:
-        result_dict = resp_dict['result']['Result'][0]
-        KeyNo = result_dict['KeyNo']
-        Name = result_dict['Name'].replace('<em>', '').replace('</em>', '').strip()
-        if Name == '':
-            KeyNo = ''
-    else:
-        KeyNo = ''
+    try:
+        if resp_dict['result']['Result']:
+            result_dict = resp_dict['result']['Result'][0]
+            KeyNo = result_dict['KeyNo']
+            Name = result_dict['Name'].replace('<em>', '').replace('</em>', '').strip()
+            if Name == '':
+                KeyNo = ''
+        else:
+            KeyNo = ''
+    except:
+        KeyNo = False
+        log.info(f'====token失效====时间{baseCore.getTimeCost(start,time.time())}')
+        return KeyNo
     print("{},企业代码为:{}".format(qcc_key, KeyNo))
     return KeyNo
\ No newline at end of file
import json
import requests,time,re,random,pymysql
import pandas as pd
from bs4 import BeautifulSoup
import urllib3
from base.BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
cnx = pymysql.connect(host='114.115.159.144',user='root', password='zzsn9988', db='clb_project', charset='utf8mb4')
cursor = cnx.cursor()
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
def get_proxy():
sql = "select proxy from clb_proxy"
cursor.execute(sql)
proxy_lists = cursor.fetchall()
ip_list = []
for proxy_ in proxy_lists:
ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
proxy_list = []
for str_ip in ip_list:
str_ip_list = str_ip.split('-')
proxyMeta = "http://%(host)s:%(port)s" % {
"host": str_ip_list[0],
"port": str_ip_list[1],
}
proxy = {
"HTTP": proxyMeta,
"HTTPS": proxyMeta
}
proxy_list.append(proxy)
return proxy_list
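# A minimal usage sketch of get_proxy() (illustrative; it assumes clb_proxy rows
# store each proxy as a single 'host-port' string such as ('1.2.3.4-8080',)):
#   proxy_list = get_proxy()   # -> [{'HTTP': 'http://1.2.3.4:8080', 'HTTPS': 'http://1.2.3.4:8080'}, ...]
#   ip = proxy_list[random.randint(0, 3)]   # the loop below picks one of the first four entries
#   requests.get(url, headers=headers, proxies=ip, verify=False)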
headers = {
'Cookie': 'TYCID=82cbe530204b11ed9f23298cecec1c60; ssuid=3927938144; _ga=GA1.2.1842488970.1670638075; jsid=SEO-BAIDU-ALL-SY-000001; tyc-user-info={%22state%22:%220%22%2C%22vipManager%22:%220%22%2C%22mobile%22:%2215565837784%22}; tyc-user-info-save-time=1678953978429; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNTU2NTgzNzc4NCIsImlhdCI6MTY3ODk1Mzk3OCwiZXhwIjoxNjgxNTQ1OTc4fQ.wsNxLWMkZVrtOEvo_CCDPD38R7F23c5yk7dFAdHkwFPkZhEEvmiv0nlt7UD0ZWfo3t8aYxc4qvu4ueEgMubJ5g; tyc-user-phone=%255B%252215565837784%2522%255D; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22284710084%22%2C%22first_id%22%3A%22182b9ca585ead-089598c1d7f7928-26021d51-1327104-182b9ca585f7f1%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfbG9naW5faWQiOiIyODQ3MTAwODQiLCIkaWRlbnRpdHlfY29va2llX2lkIjoiMTgyYjljYTU4NWVhZC0wODk1OThjMWQ3Zjc5MjgtMjYwMjFkNTEtMTMyNzEwNC0xODJiOWNhNTg1ZjdmMSJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%22284710084%22%7D%2C%22%24device_id%22%3A%22182b9ca585ead-089598c1d7f7928-26021d51-1327104-182b9ca585f7f1%22%7D; HWWAFSESID=fa776898fa88a6520ea; HWWAFSESTIME=1679899464128; csrfToken=m3cB6mHsznwIuppkT-S8oYc6; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1679016180,1679471093,1679732923,1679899468; bdHomeCount=28; bannerFlag=true; show_activity_id_92=92; searchSessionId=1679899783.48494979; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1679899783',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
}
list_code = []
while True:
list_weicha = []
list_all_info = []
name_list = []
start_time = time.time()
# 获取企业信息
query = "SELECT * FROM Tfbs where col3 is not null and length(col3)>3 and col3 not like 'ZZSN%' and state2 is null limit 1 "
#兴业银行
# query = "SELECT * FROM Tfbs where col3 is not null and length(col3)>3 and col3 not like 'ZZSN%' and col5='兴业银行'"
cursor_.execute(query)
row = cursor_.fetchone()
if row:
pass
else:
print('没有数据了,结束脚本')
break
com_name = row[6]
social_code = row[4]
code = row[7]
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
updateBeginSql = f"update Tfbs set state1=0,date2='{time_now}' where col3='{social_code}' "
# print(updateBeginSql)
cursor_.execute(updateBeginSql)
cnx_.commit()
t = time.time()
ip = get_proxy()[random.randint(0,3)]
url_t = f'https://www.tianyancha.com/search?key={social_code}&sessionNo={t}'
res_t = requests.get(url_t,headers=headers, proxies=ip,verify=False) #, proxies=ip,verify=False
time.sleep(10)
soup_t = BeautifulSoup(res_t.content, 'html.parser')
try:
com_id = soup_t.find('div',{'class':'index_header__x2QZ3'}).find('a').get('href').split('/')[-1]
print(f"{com_name}:{com_id}")
except:
com_id = '--'
print(f'{com_name}:没有查询到该企业')
#colext1获取天眼查id
updateBeginSql = f"update Tfbs set state2=0,colext1='{com_id}',date2='{time_now}' where col3='{social_code}' "
cursor_.execute(updateBeginSql)
cnx_.commit()
log.info(f'{com_name}===天眼查id更新入库===== ')
if com_id == '--':
continue
list_one_info = []
list_all_1 = []
list_all_2 = []
# 采集天眼查企业核心人员并通过接口入库
log.info('=====开始采集企业核心人员=======')
print(f'{social_code}:{com_id}')
num = 1
for page in range(1, 2):
t = int(time.time() * 1000)
url = f'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={t}&gid={com_id}&pageSize=20&pageNum={page}'
ip = get_proxy()[random.randint(0, 3)]
res = requests.get(url, headers=headers, proxies=ip) # ,verify=False
time.sleep(10)
list_all = res.json()['data']['dataList']
if list_all:
for one_info in list_all:
name = one_info['name']
sex = one_info['sex']
education = one_info['education']
position = one_info['position']
Salary = one_info['salary']
try:
birthYear = 2023 - int(one_info['age'])
except:
birthYear = ''
StockKeepings = one_info['numberOfShares']
currentTerm = one_info['term']
personInfo = one_info['resume']
try:
person_img = one_info['logo']
except:
person_img = '--'
dic_json = {
"socialCreditCode": social_code,
"name": name,
"sex": sex,
"education": education,
"position": position,
"salary": Salary,
"birthYear": birthYear,
"shareNum": StockKeepings,
"shareRatio": '',
"benefitShare": '',
"currentTerm": currentTerm,
"personInfo": personInfo,
"sort": str(num)
}
dic_json_img = {
"socialCreditCode": social_code,
"name": name,
"sex": sex,
"education": education,
"position": position,
"salary": Salary,
"birthYear": birthYear,
"shareNum": StockKeepings,
"shareRatio": '',
"benefitShare": '',
"currentTerm": currentTerm,
"personInfo": personInfo,
"头像": person_img,
"sort": str(num)
}
num = num + 1
list_one_info.append(dic_json)
list_all_2.append(dic_json_img)
else:
t = int(time.time() * 1000)
url = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={com_id}&pageSize=20&pageNum={page}'
ip = get_proxy()[random.randint(0, 3)]
res = requests.get(url, headers=headers, proxies=ip) # ,verify=False
list_all = res.json()['data']['result']
for one_info in list_all:
name = one_info['name']
sex = ''
education = ''
position = one_info['typeSore']
Salary = ''
birthYear = ''
shareRatio = one_info['percent']
try:
benefitShare = one_info['finalBenefitShares']
except:
benefitShare = ''
person_id = one_info['id']
person_url = f'https://www.tianyancha.com/human/{person_id}-c{com_id}'
person_res = requests.get(person_url, headers=headers, proxies=ip)
person_soup = BeautifulSoup(person_res.content, 'html.parser')
try:
personInfo = person_soup.find('span', {'class': '_56d0a'}).text.strip()
except:
personInfo = ''
try:
person_img = one_info['logo']
except:
person_img = '--'
dic_json = {
"socialCreditCode": social_code,
"name": name,
"sex": sex,
"education": education,
"position": position,
"salary": Salary,
"birthYear": birthYear,
"shareNum": '',
"shareRatio": shareRatio,
"benefitShare": benefitShare,
"currentTerm": '',
"personInfo": personInfo,
"sort": str(num)
}
dic_json_img = {
"socialCreditCode": social_code,
"name": name,
"sex": sex,
"education": education,
"position": position,
"salary": Salary,
"birthYear": birthYear,
"shareNum": '',
"shareRatio": shareRatio,
"benefitShare": benefitShare,
"currentTerm": '',
"personInfo": personInfo,
"头像": person_img,
"sort": str(num)
}
num = num + 1
list_one_info.append(dic_json)
list_all_2.append(dic_json_img)
log.info(f'{com_name}===该企业采集完成====')
df_info = pd.DataFrame(list_one_info)
df_info.to_excel('主要人员.xlsx', index=False)
json_updata = json.dumps(list_one_info)
if json_updata == '[]':
continue
else:
pass
response = requests.post('http://114.115.236.206:8088/sync/executive', data=json_updata, timeout=300,
verify=False)
print(response.text)
cnx.close()
cursor.close()
baseCore.close()
# df_img = pd.DataFrame(list_all_2)
# df_img.to_excel('企业主要人员-头像-23年500强新榜.xlsx',index=False)
from fdfs_client.client import get_tracker_conf, Fdfs_client
from bs4 import BeautifulSoup
import requests, re, time, pymysql, fitz
import urllib3
from base import BaseCore
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
baseCore = BaseCore.BaseCore()
# conn = cx_Oracle.connect('cis/ZZsn9988_1qaz@114.116.91.1:1521/orcl')
cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
cursor_ = cnx.cursor()
cnx_ = baseCore.cnx
cursor = baseCore.cursor
tracker_conf = get_tracker_conf('./client.conf')
client = Fdfs_client(tracker_conf)
taskType = '企业年报/证监会'
# def get_proxy():
# cursor = cnx_ip.cursor()
# sql = "select proxy from clb_proxy"
# cursor.execute(sql)
# proxy_lists = cursor.fetchall()
# ip_list = []
# for proxy_ in proxy_lists:
# ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
# proxy_list = []
# for str_ip in ip_list:
# str_ip_list = str_ip.split('-')
# proxyMeta = "http://%(host)s:%(port)s" % {
# "host": str_ip_list[0],
# "port": str_ip_list[1],
# }
# proxy = {
# "HTTP": proxyMeta,
# "HTTPS": proxyMeta
# }
# proxy_list.append(proxy)
# return proxy_list
def RequestUrl(url, payload, item_id, start_time):
# ip = get_proxy()[random.randint(0, 3)]
response = requests.post(url=url, headers=headers, data=payload) # ,proxies=ip)
response.encoding = response.apparent_encoding
# 检查响应状态码
if response.status_code == 200:
# 请求成功,处理响应数据
# print(response.text)
soup = BeautifulSoup(response.text, 'html.parser')
pass
else:
# 请求失败,输出错误信息
print('请求失败:', response.status_code, response.text)
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(item_id, taskType, state, takeTime, url, '请求失败')
soup = ''
return soup
def tableUpdate(year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status,
create_by, create_time, page_size):
sel_sql = '''select item_id from clb_sys_attachment where item_id = %s and year = %s'''
cursor_.execute(sel_sql, (item_id, year))
selects = cursor_.fetchone()
if selects:
print(f'{name_pdf},{year}已存在')
else:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status,
create_by,
create_time, page_size)
cursor_.execute(Upsql, values) # 插入
cnx.commit() # 提交
print("更新完成:{}".format(Upsql))
# 采集信息
def SpiderByZJH(url, payload, dic_info, num, start_time):
item_id = dic_info[2]
# years = dic_info['call_year']
short_name = dic_info[4]
soup = RequestUrl(url, payload, item_id, start_time)
if soup == '':
return
# 先获取页数
page = soup.find('div', class_='pages').find('ul', class_='g-ul').text
total = re.findall(r'\d+', page)[0]
r_page = int(total) % 15
if r_page == 0:
Maxpage = int(total) // 15
else:
Maxpage = int(total) // 15 + 1
# 首页和其他页不同,遍历 如果是首页 修改一下链接
for i in range(1, Maxpage + 1):
if i == 1:
href = url
else:
# http://eid.csrc.gov.cn/101811/index_3_f.html
href = url.split('index')[0] + f'index_{i}_f.html'
soup = RequestUrl(href, payload, item_id, start_time)
if soup == '':
continue
tr_list = soup.find('div', id='txt').find_all('tr')
for tr in tr_list[1:]:
td_list = tr.find_all('td')
pdf_url_info = td_list[2]
# print(pdf_url)
pdf_url = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[0].strip('\'')
name_pdf = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[1].strip('\'')
# pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[2].strip('\'')
# print(name)
report_type = td_list[4].text.strip()
# print(report_type)
if report_type == '年报':
if '摘要' in name_pdf:
continue
# 年份还从pdf名称里抽取
try:
year = re.findall('\d{4}\s*年', name_pdf)[0].replace('年', '')
except Exception as e:
pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[2].strip('\'')[:4]
year = int(pub_time) - 1
year = str(year)
page_size = 0
sel_sql = '''select item_id,year from clb_sys_attachment where item_id = %s and year = %s'''
cursor_.execute(sel_sql, (item_id, year))
selects = cursor_.fetchone()
if selects:
print(f'com_name:{short_name}、{year}已存在')
continue
else:
# 类型为年报的话就解析该年报pdf,并入库
for i in range(0, 3):
try:
resp_content = requests.request("GET", pdf_url).content
# 获取pdf页数
with fitz.open(stream=resp_content, filetype='pdf') as doc:
page_size = doc.page_count
break
except Exception as e:
print(e)
time.sleep(3)
continue
if page_size < 1:
# pdf解析失败
print(f'==={short_name}、{year}===pdf解析失败')
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, 'pdf解析失败')
continue
result = ''
for i in range(0, 3):
try:
result = client.upload_by_buffer(resp_content, file_ext_name='pdf')
break
except Exception as e:
print(e)
time.sleep(3)
continue
if result == '':
e = '上传服务器失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, e)
continue
if 'Remote file_id' in str(result) and 'Uploaded size' in str(result):
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
type_id = '1'
item_id = item_id
group_name = 'group1'
path = bytes.decode(result['Remote file_id']).replace('group1', '')
full_path = bytes.decode(result['Remote file_id'])
category = 'pdf'
file_size = result['Uploaded size']
order_by = num
status = 1
create_by = 'XueLingKun'
create_time = time_now
page_size = page_size
try:
tableUpdate(year, name_pdf, type_id, item_id, group_name, path, full_path,
category, file_size, order_by, status, create_by, create_time, page_size)
state = 1
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, '')
except:
e = '数据库传输失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, e)
num = num + 1
time.sleep(2)
else:
e = '采集失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, e)
continue
else:
continue
def getUrl(code, url_parms, Catagory2_parms):
# 深市
if code[0] == '2' or code[0] == '0' or code[0] == '3':
url = f'http://eid.csrc.gov.cn/{url_parms[1]}/index_f.html'
Catagory2 = Catagory2_parms[1]
# 构建POST请求的参数,prodType --- 股票代码
payload2 = {
'prodType': f'{code}',
'prodType2': '代码/简称/拼音缩写 ',
'keyWord': '',
'keyWord2': '关键字',
'startDate': '',
'startDate2': '请输入开始时间',
'endDate': '',
'endDate2': '请输入结束时间',
'selCatagory2': f'{Catagory2}',
'selBoardCode0': '',
'selBoardCode': ''
}
dic_parms = {
'code': code,
'url': url,
'Catagory2': Catagory2,
'payload': payload2
}
# 沪市
if code[0] == '9' or code[0] == '6':
url = f'http://eid.csrc.gov.cn/{url_parms[0]}/index_f.html'
Catagory2 = Catagory2_parms[0]
payload1 = {
'prodType': f'{code}',
'prodType2': '代码/简称/拼音缩写 ',
'keyWord': '',
'keyWord2': '关键字',
'startDate': '',
'startDate2': '请输入开始时间',
'endDate': '',
'endDate2': '请输入结束时间',
'selCatagory2': f'{Catagory2}',
'selCatagory3': '',
'selBoardCode0': '',
'selBoardCode': '',
}
dic_parms = {
'code': code,
'url': url,
'Catagory2': Catagory2,
'payload': payload1
}
# 北交所
if code[0] == '8' or code[0] == '4':
try:
url = f'http://eid.csrc.gov.cn/{url_parms[2]}/index_f.html'
except:
return
Catagory2 = Catagory2_parms[2]
payload3 = {
'prodType': f'{code}',
'prodType2': '代码/简称/拼音缩写 ',
'keyWord': '',
'keyWord2': '关键字',
'startDate': '',
'startDate2': '请输入开始时间',
'endDate': '',
'endDate2': '请输入结束时间',
'selCatagory2': f'{Catagory2}'
}
dic_parms = {
'code': code,
'url': url,
'Catagory2': Catagory2,
'payload': payload3
}
return dic_parms
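# A quick illustration of the routing above (sample stock code '600036' is assumed;
# it starts with '6', so the Shanghai branch is taken):
#   dic_parms = getUrl('600036', ['101111', '101811', '102611'], ['9604', '10058', '10162'])
#   -> {'code': '600036', 'url': 'http://eid.csrc.gov.cn/101111/index_f.html',
#       'Catagory2': '9604', 'payload': {... 'prodType': '600036', 'selCatagory2': '9604' ...}}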
#state1
if __name__ == '__main__':
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Content-Length': '380',
'Content-Type': 'application/x-www-form-urlencoded',
'Cookie': 'acw_tc=01c6049e16908026442931294e4d0b65d95e3ba93ac19993d151844ac6',
'Host': 'eid.csrc.gov.cn',
'Origin': 'http://eid.csrc.gov.cn',
'Pragma': 'no-cache',
'Referer': 'http://eid.csrc.gov.cn/101111/index_1_f.html',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
}
header = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Cookie': 'ba17301551dcbaf9_gdp_user_key=; gdp_user_id=gioenc-4c21c93a%2Ccdgd%2C5c8b%2Cc32e%2C8g0229546a17; ba17301551dcbaf9_gdp_session_id_dc777856-a24e-4008-a8a6-af88d75bae2b=true; ba17301551dcbaf9_gdp_sequence_ids={%22globalKey%22:3%2C%22VISIT%22:2%2C%22PAGE%22:2}; acw_tc=71dbb29c16908906086793104e8117f44af84d756f68927c202e9a70b1',
'Host': 'static.sse.com.cn',
'Pragma': 'no-cache',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
}
# 读取数据库获取股票代码 简称 以及 社会信用代码
num = 1
while True:
start_time = time.time()
# 获取企业信息
# social_code = baseCore.redicPullData('AnnualEnterprise:gnqy_socialCode')
# if social_code == '':
# time.sleep(20)
# continue
# 获取企业信息
query = "SELECT * FROM Tfbs_bak where col3 is not null and length(col3)>3 and col3 not like 'ZZSN%' and state1='1' limit 1 "
# 兴业银行
# query = "SELECT * FROM Tfbs_bak where col3 is not null and length(col3)>3 and col3 not like 'ZZSN%' and col5='通威股份'"
cursor.execute(query)
row = cursor.fetchone()
if row:
pass
else:
print('没有数据了,结束脚本')
break
# tycid = row[14]
com_name = row[6]
social_code = row[4]
code = row[7]
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
#1表示拿到数据
updateBeginSql = f"update Tfbs_bak set state1='0' and date1='{time_now}' where col3='{social_code}' "
cursor.execute(updateBeginSql)
cnx.commit()
dic_info = baseCore.getInfomation(social_code)
# count = dic_info[15]
# 沪市http://eid.csrc.gov.cn/101111/index.html 深市http://eid.csrc.gov.cn/101811/index.html 北交所http://eid.csrc.gov.cn/102611/index.html
# url 变量 翻页 栏目 http://eid.csrc.gov.cn/101811/index_3_f.html
url_parms = ['101111', '101811', '102611']
Catagory2_parms = ['9604', '10058', '10162']
# 根据股票代码选链接
# 股票代码0、2、3开头的为深圳交易所,6、9开头的为上海交易所,4、8开头的为北京交易所
try:
code = dic_info[3]
except Exception as e:
print(e,social_code)
continue
dic_parms = getUrl(code, url_parms, Catagory2_parms)
SpiderByZJH(dic_parms["url"], dic_parms["payload"], dic_info, num, start_time)
end_time = time.time()
print(f'{com_name} ---- 该企业耗时 ---- {end_time - start_time}')
# count += 1
runType = 'AnnualReportCount'
# baseCore.updateRun(social_code, runType, count)
cnx.close()
cursor_.close()
baseCore.close()
@@ -123,6 +123,8 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
             report_type = td_list[4].text.strip()
             # print(report_type)
             if report_type == '年报':
+                if '摘要' in name_pdf:
+                    continue
                 # 年份还从pdf名称里抽取
                 try:
                     year = re.findall('\d{4}\s*年', name_pdf)[0].replace('年', '')
"""
证监会公告采集,只能按照搜索企业来采,从上市库里拿企业数据,sys_enterprise_ipo_copy1
craw_state:已采集过表示为True,未采集表示为0,拿取数据表示为ing,解析失败表示为400
update_state:为1 表示需要更新,用来增量循环
如何统计出来该报告采到了没有,dt_error库统计失败的信息
"""
import json
import re
import time
import fitz
import pymysql
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
from datetime import datetime
from base import BaseCore
from fdfs_client.client import get_tracker_conf, Fdfs_client
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
# cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
cnx_ = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
# cnx_ip = pymysql.connect(host='114.115.159.144',user='root', password='zzsn9988', db='clb_project', charset='utf8mb4')
# cursor = cnx.cursor()
cursor_ = cnx_.cursor()
cnx = baseCore.cnx
cursor = baseCore.cursor
tracker_conf = get_tracker_conf('./client.conf')
client = Fdfs_client(tracker_conf)
taskType = '企业公告/证监会'
def RequestUrl(url, payload, social_code,start_time):
# ip = get_proxy()[random.randint(0, 3)]
for m in range(0, 3):
try:
response = requests.post(url=url, headers=headers, data=payload) # ,proxies=ip)
response.encoding = response.apparent_encoding
break
except Exception as e:
log.error(f"request请求异常----{m}-----{e}")
pass
# 检查响应状态码
if response.status_code == 200:
# 请求成功,处理响应数据
# print(response.text)
soup = BeautifulSoup(response.text, 'html.parser')
pass
else:
# 请求失败,输出错误信息
log.error('请求失败:', url)
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, url, '请求失败')
soup = ''
return soup
def getUrl(code, url_parms, Catagory2_parms):
# 深市
if code[0] == '2' or code[0] == '0' or code[0] == '3':
url = f'http://eid.csrc.gov.cn/{url_parms[1]}/index_f.html'
Catagory2 = Catagory2_parms[1]
# 构建POST请求的参数,prodType --- 股票代码
payload2 = {
'prodType': f'{code}',
'prodType2': '代码/简称/拼音缩写 ',
'keyWord': '',
'keyWord2': '关键字',
'startDate': '',
'startDate2': '请输入开始时间',
'endDate': '',
'endDate2': '请输入结束时间',
'selCatagory2': f'{Catagory2}',
'selBoardCode0': '',
'selBoardCode': ''
}
dic_parms = {
'code': code,
'url': url,
'Catagory2': Catagory2,
'payload': payload2
}
# 沪市
if code[0] == '9' or code[0] == '6':
url = f'http://eid.csrc.gov.cn/{url_parms[0]}/index_f.html'
Catagory2 = Catagory2_parms[0]
payload1 = {
'prodType': f'{code}',
'prodType2': '代码/简称/拼音缩写 ',
'keyWord': '',
'keyWord2': '关键字',
'startDate': '',
'startDate2': '请输入开始时间',
'endDate': '',
'endDate2': '请输入结束时间',
'selCatagory2': f'{Catagory2}',
'selCatagory3': '',
'selBoardCode0': '',
'selBoardCode': '',
}
dic_parms = {
'code': code,
'url': url,
'Catagory2': Catagory2,
'payload': payload1
}
# 北交所
if code[0] == '8' or code[0] == '4':
try:
url = f'http://eid.csrc.gov.cn/{url_parms[2]}/index_f.html'
except:
return
Catagory2 = Catagory2_parms[2]
payload3 = {
'prodType': f'{code}',
'prodType2': '代码/简称/拼音缩写 ',
'keyWord': '',
'keyWord2': '关键字',
'startDate': '',
'startDate2': '请输入开始时间',
'endDate': '',
'endDate2': '请输入结束时间',
'selCatagory2': f'{Catagory2}'
}
dic_parms = {
'code': code,
'url': url,
'Catagory2': Catagory2,
'payload': payload3
}
return dic_parms
def InsterInto(short_name, social_code, name_pdf, pub_time, pdf_url, report_type):
insert = False
sel_sql = '''select social_credit_code,source_address from brpa_source_article where social_credit_code = %s and source_address = %s'''
cursor_.execute(sel_sql, (social_code, pdf_url))
selects = cursor_.fetchone()
if selects:
print(f'com_name:{short_name}、{pdf_url}已存在')
return insert
# 信息插入数据库
try:
insert_sql = '''insert into brpa_source_article(social_credit_code,title,summary,content,publish_date,source_address,origin,author,type,lang) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
list_info = [
social_code,
name_pdf,
'', # 摘要
'', # 正文
pub_time, # 发布时间
pdf_url,
'证监会',
report_type,
'1',
'zh'
]
cursor_.execute(insert_sql, tuple(list_info))
cnx_.commit()
insert = True
return insert
except:
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '数据库传输失败')
return insert
def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time):
sel_sql = "select article_id from brpa_source_article where source_address = %s"
cursor_.execute(sel_sql, pdf_url)
row = cursor_.fetchone()
id = row[0]
# 先获取PDF链接下载pdf,在解析内容
try:
res = requests.get(pdf_url)
content = ''
# 读取文件内容,
with fitz.open(stream=res.content, filetype='pdf') as doc:
for page in doc.pages():
content += page.get_text()
except:
# print('解析失败')
dic_result = {
'success': 'false',
'message': 'PDF解析失败',
'code': '204',
}
print(dic_result)
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, dic_result['message'])
return False
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_news = {
'attachmentIds': id,
'author': '',
'content': content,
'contentWithTag': '',
'createDate': time_now,
'deleteFlag': '0',
'id': '',
'keyWords': '',
'lang': 'zh',
'origin': '证监会',
'publishDate': pub_time,
'sid': '1684032033495392257',
'sourceAddress': pdf_url, # 原文链接
'summary': '',
'title': pdf_name,
'type': 3,
'socialCreditCode': social_code,
'year': year
}
# print(dic_news)
# 将相应字段通过kafka传输保存
try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
kafka_result = producer.send("researchReportTopic", json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10))
dic_result = {
'success': 'true',
'message': '操作成功',
'code': '200',
}
print(dic_result)
return True
except Exception as e:
dic_result = {
'success': 'false',
'message': '操作失败',
'code': '204',
'e': e
}
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, 'Kafka操作失败')
print(dic_result)
return False
# 采集信息
def SpiderByZJH(url, payload, dic_info, start_time): # dic_info 数据库中获取到的基本信息
okCount = 0
errorCount = 0
social_code = dic_info[2]
short_name = dic_info[4]
soup = RequestUrl(url, payload, social_code, start_time)
if soup == '':
return False
# 先获取页数
try:
page = soup.find('div', class_='pages').find('ul', class_='g-ul').text
except:
e = f"该企业没有{dic_parms['Catagory2']}数据"
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, dic_parms['url'], e)
return False
total = re.findall(r'\d+', page)[0]
r_page = int(total) % 15
if r_page == 0:
Maxpage = int(total) // 15
else:
Maxpage = int(total) // 15 + 1
log.info(f'{short_name}====={code}===========一共{total}条,{Maxpage}页')
# 首页和其他页不同,遍历 如果是首页 修改一下链接
for i in range(1, Maxpage + 1):
log.info(f'==========正在采集第{i}页=========')
if i == 1:
href = url
else:
# http://eid.csrc.gov.cn/101811/index_3_f.html
href = url.split('index')[0] + f'index_{i}_f.html'
soup = RequestUrl(href, payload, social_code, start_time)
if soup == '':
continue
tr_list = soup.find('div', id='txt').find_all('tr')
pageIndex = 0
for tr in tr_list[1:]:
pageIndex += 1
td_list = tr.find_all('td')
pdf_url_info = td_list[2]
# print(pdf_url)
pdf_url = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[0].strip('\'')
name_pdf = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[1].strip('\'')
pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[2].strip('\'')
year = pub_time[:4]
report_type = td_list[4].text.strip()
# 信息插入数据库
insert = InsterInto(short_name, social_code, name_pdf, pub_time, pdf_url, report_type)
log.info(f'======={short_name}========{code}===插入公告库成功')
if insert:
# # 公告信息列表
# okCount = okCount + 1
# 解析PDF内容,先获取PDF链接 下载 解析成功,解析失败 ,传输成功,传输失败
result = GetContent(pdf_url, name_pdf, social_code, year, pub_time, start_time)
if result:
# 公告信息列表
okCount = okCount + 1
log.info(f'{short_name}==============解析传输操作成功')
state = 1
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '')
pass
else:
errorCount += 1
# time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
log.error(f'{short_name}=============解析或传输操作失败')
# try:
# insert_err_sql = f"insert into dt_err(xydm,`from`,url,title,pub_date,zhaiyao,create_date,state,pageNo,pageIndex,type) values('{social_code}','证监会','{pdf_url}','{name_pdf}','{pub_time}',' ',now(),1,{i},{pageIndex},'1')"
# cursor_.execute(insert_err_sql)
# cnx_.commit()
# except:
# pass
continue
return True
#state2
if __name__ == '__main__':
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Content-Length': '380',
'Content-Type': 'application/x-www-form-urlencoded',
'Cookie': 'acw_tc=01c6049e16908026442931294e4d0b65d95e3ba93ac19993d151844ac6',
'Host': 'eid.csrc.gov.cn',
'Origin': 'http://eid.csrc.gov.cn',
'Pragma': 'no-cache',
'Referer': 'http://eid.csrc.gov.cn/101111/index_1_f.html',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
}
header = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Cookie': 'ba17301551dcbaf9_gdp_user_key=; gdp_user_id=gioenc-4c21c93a%2Ccdgd%2C5c8b%2Cc32e%2C8g0229546a17; ba17301551dcbaf9_gdp_session_id_dc777856-a24e-4008-a8a6-af88d75bae2b=true; ba17301551dcbaf9_gdp_sequence_ids={%22globalKey%22:3%2C%22VISIT%22:2%2C%22PAGE%22:2}; acw_tc=71dbb29c16908906086793104e8117f44af84d756f68927c202e9a70b1',
'Host': 'static.sse.com.cn',
'Pragma': 'no-cache',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
}
# dic_parms = {}
# 读取数据库获取股票代码 简称 以及 社会信用代码
while True:
start_time = time.time()
# 获取企业信息
# social_code = baseCore.redicPullData('NoticeEnterprise:gnqy_socialCode')
# 判断 如果Redis中已经没有数据,则等待
# if social_code == None:
# time.sleep(20)
# continue
# 获取企业信息
# query = "SELECT * FROM Tfbs_bak where col3 is not null and length(col3)>3 and col3 not like 'ZZSN%' and state2 is Null limit 1 "
# 兴业银行
query = "SELECT * FROM Tfbs_bak where col3 is not null and length(col3)>3 and col3 not like 'ZZSN%' and col5='兴业银行'"
cursor.execute(query)
row = cursor.fetchone()
if row:
pass
else:
print('没有数据了,结束脚本')
break
# tycid = row[14]
com_name = row[6]
social_code = row[4]
code = row[7]
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# 1表示拿到数据
updateBeginSql = f"update Tfbs_bak set state2='1',date1='{time_now}' where col3='{social_code}' "
cursor.execute(updateBeginSql)
cnx.commit()
dic_info = baseCore.getInfomation(social_code)
count = dic_info[16]
# 沪市http://eid.csrc.gov.cn/101111/index.html 深市http://eid.csrc.gov.cn/101811/index.html 北交所http://eid.csrc.gov.cn/102611/index.html
# url 变量 翻页 栏目 http://eid.csrc.gov.cn/101811/index_3_f.html
# 发行上市公告,北交所没有该栏目
url_parms = ['101110', '101810']
Catagory2_parms = ['9603', '10057']
# 临时报告
url_parms_ls = ['101112', '101812', '102612']
Catagory2_parms_ls = ['9605', '10059', '10163']
# 根据股票代码选链接
# 股票代码0、2、3开头的为深圳交易所,6、9开头的为上海交易所,4、8开头的为北京交易所
code = dic_info[3]
short_name = dic_info[4]
dic_parms = getUrl(code, url_parms, Catagory2_parms)
dic_parms_ls = getUrl(code, url_parms_ls, Catagory2_parms_ls)
if len(dic_parms) > 0:
start_time_cj = time.time()
result = SpiderByZJH(dic_parms["url"], dic_parms["payload"], dic_info, start_time)
if result:
log.info(f'{code}==========={short_name},发行公告成功,耗时{baseCore.getTimeCost(start_time_cj, time.time())}')
else:
log.info(f'{code}==========={short_name},发行公告失败,耗时{baseCore.getTimeCost(start_time_cj, time.time())}')
start_time_ls = time.time()
result_ls = SpiderByZJH(dic_parms_ls['url'], dic_parms_ls['payload'], dic_info, start_time)
if result_ls:
log.info(f'{code}==========={short_name},临时报告成功,耗时{baseCore.getTimeCost(start_time_ls, time.time())}')
else:
log.info(f'{code}==========={short_name},临时报告失败,耗时{baseCore.getTimeCost(start_time_ls, time.time())}')
# UpdateInfoSql(retData,retData_ls,social_code)
# log.info(f'{code}================更新成功')
end_time = time.time()
log.info(f'{short_name} ---- 该企业耗时 ---- {baseCore.getTimeCost(start_time, end_time)}-----------')
count += 1
# runType = 'NoticeReportCount'
# baseCore.updateRun(code, runType, count)
cursor.close()
cnx.close()
cursor_.close()
cnx_.close()
# 释放资源
baseCore.close()
"""
增量采集:
取state为3、update_state为空的企业 表示上次采集成功的企业,
新增update_state字段,取一个企业更新为2,表示该企业正在采集。
采集完毕更新为1.
表示已经采集完成。跟据date_time 来排列 每次就不会拿到重复的数据。
okCount
errorCount
repectCount
新增三个字段分别对应更新的up_okCount up_errorCount up_repectCount ,
记录这些更新的数据 然后加到原来的数据上表示该企业已采集多少动态
8.8日改版,企业动态也传kafka
"""
import json
import requests,time,pymysql
import jieba
import sys
from kafka import KafkaProducer
from base.BaseCore import BaseCore
from base.smart import smart_extractor
# sys.path.append('D:/KK/zzsn_spider/base')
# import BaseCore
# from smart import smart_extractor
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
jieba.cut("必须加载jieba")
# 初始化,设置中文分词
smart =smart_extractor.SmartExtractor('cn')
baseCore = BaseCore()
log = baseCore.getLogger()
cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
cursor= cnx.cursor()
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
pageSize = 10
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Cookie':'jsid=SEO-BAIDU-ALL-SY-000001; TYCID=77e997401d5f11ee9e91d5a0fd3c0b89; ssuid=6450041974; _ga=GA1.2.858826166.1688800641; _gid=GA1.2.2142449376.1689575510; tyc-user-info-save-time=1689764135027; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22309757777%22%2C%22first_id%22%3A%22189345cb10257d-0cfee05327f673-26031d51-1327104-189345cb10375b%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTg5MzQ1Y2IxMDI1N2QtMGNmZWUwNTMyN2Y2NzMtMjYwMzFkNTEtMTMyNzEwNC0xODkzNDVjYjEwMzc1YiIsIiRpZGVudGl0eV9sb2dpbl9pZCI6IjMwOTc1Nzc3NyJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%22309757777%22%7D%2C%22%24device_id%22%3A%22189345cb10257d-0cfee05327f673-26031d51-1327104-189345cb10375b%22%7D; bannerFlag=true; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1689752829,1689821665,1689831487,1689845884; searchSessionId=1689845917.81838207; HWWAFSESID=146bb1d25b1515339d3; HWWAFSESTIME=1689858023324; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1689859758',
'Host': 'capi.tianyancha.com',
'Origin': 'https://www.tianyancha.com',
'Referer': 'https://www.tianyancha.com/',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.51'
}
taskType = '企业动态/天眼查'
def beinWork(tyc_code, social_code):
start_time = time.time()
time.sleep(3)
# retData={'up_state':False,'total':0,'up_okCount':0,'up_errorCount':0,'up_repetCount':0}
retData = {'total': 0, 'up_okCount': 0, 'up_errorCount': 0, 'up_repetCount': 0}
t = time.time()
url = f'https://capi.tianyancha.com/cloud-yq-news/company/detail/publicmsg/news/webSimple?_={t}&id={tyc_code}&ps={pageSize}&pn=1&emotion=-100&event=-100'
for m in range(0, 3):
try:
ip = baseCore.get_proxy()
headers['User-Agent'] = baseCore.getRandomUserAgent()
response = requests.get(url=url, headers=headers, proxies=ip, verify=False)
# time.sleep(random.randint(3, 5))
break
except Exception as e:
pass
if (response.status_code == 200):
pass
else:
log.error(f"{tyc_code}-----获取总数接口失败")
e = '获取总数接口失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, url, e)
return retData
try:
json_1 = json.loads(response.content.decode('utf-8'))
total = json_1['data']['total']
except:
log.error(f"{tyc_code}-----获取总数失败")
e = '获取总数失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, url, e)
return retData
if (total > 0):
if (total % pageSize == 0):
totalPage = total // pageSize
else:
totalPage = total // pageSize + 1
else:
log.error(f"{tyc_code}--------总数为0")
retData['state'] = True
return retData
log.info(f"{tyc_code}-------总数:{total}----总页数:{totalPage}")
retData['total'] = total
up_okCount = 0
up_errorCount = 0
up_repetCount = 0
for num in range(1, totalPage + 1):
time.sleep(3)
log.info(f"获取分页数据--{tyc_code}----分页{num}----开始")
start_page = time.time()
url_page = f'https://capi.tianyancha.com/cloud-yq-news/company/detail/publicmsg/news/webSimple?_={time.time()}&id={tyc_code}&ps={pageSize}&pn={num}&emotion=-100&event=-100'
for m in range(0, 3):
try:
ip = baseCore.get_proxy()
headers['User-Agent'] = baseCore.getRandomUserAgent()
response_page = requests.get(url=url_page, headers=headers, proxies=ip, verify=False)
# time.sleep(3)
break
except:
pass
if (response_page.status_code == 200):
pass
else:
log.error(f"{tyc_code}--{num}页---获取分页数据失败")
e = '获取分页数据失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, url_page, e)
up_errorCount = up_errorCount + pageSize
continue
try:
json_page = json.loads(response_page.content.decode('utf-8'))
info_list_page = json_page['data']['items']
except:
log.error(f"{tyc_code}--{num}页---获取分页数据失败")
e = '获取分页数据失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, url_page, e)
up_errorCount = up_errorCount + pageSize
continue
pageIndex = 0
for info_page in info_list_page:
pageIndex = pageIndex + 1
title = info_page['title']
source = info_page['website']
link = info_page['uri']
try:
sel_sql = '''select social_credit_code from brpa_source_article where source_address = %s and social_credit_code=%s and type='2' '''
cursor.execute(sel_sql, (link, social_code))
except Exception as e:
print(e)
selects = cursor.fetchone()
if selects:
log.info(f'{tyc_code}-----{social_code}----{link}:已经存在')
# todo:如果该条数据存在则说明该条数据之后的都已经采集完成,就可以跳出函数,执行下一个企业
retData['up_okCount'] = up_okCount
retData['up_errorCount'] = up_errorCount
retData['up_repetCount'] = up_repetCount
return retData
try:
time_struct = time.localtime(int(info_page['rtm'] / 1000)) # 首先把时间戳转换为结构化时间
time_format = time.strftime("%Y-%m-%d %H-%M-%S", time_struct) # 把结构化时间转换为格式化时间
except:
time_format = baseCore.getNowTime(1)
try:
# 开始进行智能解析
lang = baseCore.detect_language(title)
smart = smart_extractor.SmartExtractor(lang)
contentText = smart.extract_by_url(link).text
# time.sleep(3)
except Exception as e:
contentText = ''
if contentText == '':
log.error(f'获取正文失败:--------{tyc_code}--------{num}--------{link}')
e = '获取正文失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, link, e)
up_errorCount = up_errorCount + 1
try:
insert_err_sql = f"insert into dt_err(xydm,`from`,url,title,pub_date,zhaiyao,create_date,state,pageNo,pageIndex) values('{social_code}','{source}','{link}','{title}','{time_format}','{info_page['abstracts']}',now(),1,{num},{pageIndex})"
cursor.execute(insert_err_sql)
cnx.commit()
except:
pass
continue
try:
insert_sql = '''insert into brpa_source_article(social_credit_code,title,summary,content,publish_date,source_address,origin,author,type,lang) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
# 动态信息列表
up_okCount = up_okCount + 1
list_info = [
social_code,
title,
info_page['abstracts'], # 摘要
contentText, # 正文
time_format, # 发布时间
link,
'天眼查',
source,
'2',
'zh'
]
cursor.execute(insert_sql, tuple(list_info))
cnx.commit()
# 采集一条资讯记录一条,记录该企业采到了多少的资讯
log.info(f'{social_code}----{link}:新增一条')
sel_sql = "select article_id from brpa_source_article where source_address = %s and social_credit_code = %s"
cursor.execute(sel_sql, (link, social_code))
row = cursor.fetchone()
id = row[0]
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:插入一条数据,并传入kafka
dic_news = {
'attachmentIds': id,
'author': '',
'content': contentText,
'contentWithTag': contentText,
'createDate': time_now,
'deleteFlag': '0',
'id': '',
'keyWords': '',
'lang': 'zh',
'origin': '天眼查',
'publishDate': time_format,
'sid': '1684032033495392257',
'sourceAddress': link, # 原文链接
'summary': info_page['abstracts'],
'title': title,
'type': 2,
'socialCreditCode': social_code,
'year': time_format[:4]
}
except Exception as e:
log.info(f'传输失败:{social_code}----{link}')
e = '数据库传输失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, link, e)
continue
# print(dic_news)
# 将相应字段通过kafka传输保存
try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
kafka_result = producer.send("researchReportTopic",
json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10))
dic_result = {
'success': 'true',
'message': '操作成功',
'code': '200',
}
log.info(dic_result)
# 传输成功,写入日志中
state = 1
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, link, '')
# return True
except Exception as e:
dic_result = {
'success': 'false',
'message': '操作失败',
'code': '204',
'e': e
}
log.error(dic_result)
e = 'Kafka操作失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, link, e)
log.info(f"获取分页数据--{tyc_code}----分页{num},耗时{baseCore.getTimeCost(start_page, time.time())}")
retData['up_okCount'] = up_okCount
retData['up_errorCount'] = up_errorCount
retData['up_repetCount'] = up_repetCount
return retData
def doJob():
while True:
# 获取企业信息
query = "SELECT * FROM Tfbs_bak where col3 is not null and length(col3)>3 and col6 not like '%HK%' and col3 not like 'ZZSN%' and state3 is null limit 1 "
# 兴业银行
# query = "SELECT * FROM Tfbs where col3 is not null and length(col3)>3 and col3 not like 'ZZSN%' and col5='兴业银行'"
cursor_.execute(query)
row = cursor_.fetchone()
if row:
pass
else:
print('没有数据了,结束脚本')
break
tycid = row[16]
com_name = row[6]
xydm = row[4]
code = row[7]
count = 0
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
#0 表示拿取数据
updateBeginSql = f"update Tfbs_bak set state3='0',date3='{time_now}' where col3='{xydm}' "
# print(updateBeginSql)
cursor_.execute(updateBeginSql)
cnx_.commit()
log.info(f"{id}---{xydm}----{tycid}----开始处理")
start_time = time.time()
# 开始采集企业动态
retData = beinWork(tycid, xydm)
# 信息采集完成后将该企业的采集次数更新
runType = 'NewsRunCount'
count += 1
# baseCore.updateRun(xydm, runType, count)
total = retData['total']
up_okCount = retData['up_okCount']
up_errorCount = retData['up_errorCount']
up_repetCount = retData['up_repetCount']
log.info(
f"{id}---{xydm}----{tycid}----结束处理,耗时{baseCore.getTimeCost(start_time, time.time())}---总数:{total}---成功数:{up_okCount}----失败数:{up_errorCount}--重复数:{up_repetCount}")
# 200 表示成功
updateBeginSql = f"update Tfbs_bak set state3='200',date3='{time_now}' where col3='{xydm}' "
# print(updateBeginSql)
cursor_.execute(updateBeginSql)
cnx_.commit()
cursor.close()
cnx.close()
# 释放资源
baseCore.close()
# Press the green button in the gutter to run the script.
#state3
if __name__ == '__main__':
doJob()
@@ -5,26 +5,19 @@ import langid
 from base.BaseCore import BaseCore

 baseCore =BaseCore()
+import pymysql
 # print(baseCore.detect_language("是对jhjjhjhhjjhjhjh的浮点数"))
+# cnx_ = baseCore.cnx
+# cursor_ = baseCore.cursor
+cnx_ = pymysql.connect(host='114.115.159.144', user='root', password='zzsn9988', db='caiji',
+                       charset='utf8mb4')
+cursor_ = cnx_.cursor()
+updateBeginSql = f"update Tfbs set state3=%s where col3=%s "
+# print(updateBeginSql)
+cursor_.execute(updateBeginSql,(200,'91350000158142711F'))
+cnx_.commit()
+#
-# def detect_language(text):
-#     # 使用langid.py判断文本的语言
-#     lang, confidence = langid.classify(text)
-#     print(lang,confidence)
-#     return lang
-# detect_language("123")
-from textblob import TextBlob
-def detect_language(text):
-    blob = TextBlob(text)
-    lang = blob.detect_language()
-    return lang
-text = "Hello, how are you?"
-language = detect_language(text)
-print(language)
 '''
-补充智库动态没有公众号信息数据的公众号 记录一天能采多少公众号
-从库中读取信息,根据域名找到属于公众号的链接,
-设置time.sleep 等待到每天执行
 '''
-import requests, time, random, json, pymysql, redis
+import requests, time, re, datetime, random, json, pymysql, redis
 import pandas as pd
 import urllib3
 from bs4 import BeautifulSoup
@@ -216,7 +213,7 @@ if __name__=="__main__":
     # browser2.get(url)
     # browser3.get(url)
     # 可改动
-    time.sleep(50)
+    time.sleep(30)
     num_b = 0
     browser_run = list_b[0]
     log.info('======刷新浏览器=====')
@@ -313,13 +310,13 @@ if __name__=="__main__":
         count = 0
         try:
             ip = get_proxy()[random.randint(0, 3)]
-            json_search = s.get(url_search, headers=baseCore.getRandomUserAgent(), proxies=ip,
+            json_search = s.get(url_search, headers=headers, proxies=ip,
                                 verify=False).json()  # , proxies=ip, verify=False
             time.sleep(2)
             break
         except:
             log.info(f'===公众号{origin}请求失败!当前时间:{baseCore.getNowTime(1)}===')
-            error_text = str(json_search)
+            # error_text = str(json_search)
             json_search = ''
             aa = time.sleep(600)
             log.info(f'======等待时间{aa}=======')