Commit e694b41b  Author: XveLingKun

5/10

Parent 6da55a3e
...@@ -891,6 +891,7 @@ class BaseCore: ...@@ -891,6 +891,7 @@ class BaseCore:
page_size = doc.page_count page_size = doc.page_count
for page in doc.pages(): for page in doc.pages():
retData['content'] += page.get_text() retData['content'] += page.get_text()
except: except:
log = self.getLogger() log = self.getLogger()
log.error(f'文件损坏') log.error(f'文件损坏')
......
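The hunk above walks PDF pages with what looks like PyMuPDF (page_count, pages(), get_text()) and swallows failures with a bare except. A minimal standalone sketch of that pattern, assuming the doc object comes from PyMuPDF (fitz) and logging the actual error (illustrative, not part of the commit):

import fitz  # PyMuPDF

def read_pdf_text(path, logger):
    content = ''
    try:
        doc = fitz.open(path)
        page_size = doc.page_count      # same attribute the hunk reads
        for page in doc.pages():
            content += page.get_text()
    except Exception as e:              # keep the cause instead of a bare except
        logger.error(f'文件损坏: {e}')
    return content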
...@@ -324,7 +324,7 @@ def AnnualEnterprise(): ...@@ -324,7 +324,7 @@ def AnnualEnterprise():
gn_social_list = [item[0] for item in gn_result] gn_social_list = [item[0] for item in gn_result]
print('=======') print('=======')
for item in gn_social_list: for item in gn_social_list:
r.rpush('AnnualEnterprise:gnqy_socialCode', item) r.rpush('AnnualEnterprise:zjh_socialCode', item)
closeSql(cnx,cursor) closeSql(cnx,cursor)
#企业年报定时任务 #企业年报定时任务
...@@ -514,7 +514,7 @@ def NQEnterprise(): ...@@ -514,7 +514,7 @@ def NQEnterprise():
for item in nq_social_list: for item in nq_social_list:
#新三板企业财务数据 上市信息 核心人员已采集 企业动态、企业公告未采集 企业公告脚本已开发,企业动态需要每天放入redis #新三板企业财务数据 上市信息 核心人员已采集 企业动态、企业公告未采集 企业公告脚本已开发,企业动态需要每天放入redis
# r.rpush('NQEnterprise:nq_Ipo', item) # r.rpush('NQEnterprise:nq_Ipo', item)
r.rpush('NQEnterprise:nq_finance',item) r.rpush('NQEnterprise:nq_finance', item)
# r.rpush('NQEnterprise:nq_notice',item) # r.rpush('NQEnterprise:nq_notice',item)
closeSql(cnx_, cursor_) closeSql(cnx_, cursor_)
...@@ -674,10 +674,11 @@ if __name__ == "__main__": ...@@ -674,10 +674,11 @@ if __name__ == "__main__":
# BaseInfoEnterprise() # BaseInfoEnterprise()
# FBS() # FBS()
# MengZhi() # MengZhi()
NQEnterprise() # NQEnterprise()
# SEC_CIK() # SEC_CIK()
# dujioashou() # dujioashou()
# omeng() # omeng()
AnnualEnterprise()
# AnnualEnterpriseUS() # AnnualEnterpriseUS()
# NoticeEnterprise_task() # NoticeEnterprise_task()
# AnnualEnterprise_task() # AnnualEnterprise_task()
......
...@@ -88,6 +88,12 @@ def doJob(): ...@@ -88,6 +88,12 @@ def doJob():
'version': 'TYC-Web' 'version': 'TYC-Web'
} }
cookies_list, id_cookie, user_name = token.get_cookies() cookies_list, id_cookie, user_name = token.get_cookies()
if cookies_list:
pass
else:
log.info("没有账号了,等待30分钟")
time.sleep(30 * 60)
return '', '', ''
log.info(f'=====当前使用的是{user_name}的cookie======') log.info(f'=====当前使用的是{user_name}的cookie======')
cookies = {} cookies = {}
for cookie in cookies_list: for cookie in cookies_list:
...@@ -97,7 +103,7 @@ def doJob(): ...@@ -97,7 +103,7 @@ def doJob():
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息 # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
# social_code = baseCore.redicPullData('CorPersonEnterprise:gnqy_socialCode') # social_code = baseCore.redicPullData('CorPersonEnterprise:gnqy_socialCode')
# 判断 如果Redis中已经没有数据,则等待 # 判断 如果Redis中已经没有数据,则等待
social_code = '91110108780992804C' social_code = '91370212MA3MJMA0XW'
if social_code == None: if social_code == None:
time.sleep(20) time.sleep(20)
continue continue
......
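The loop body that fills the cookies dict is elided by the hunk above. A hedged sketch of the usual conversion from Selenium-style cookie dicts to a requests cookie mapping (the sample data is illustrative):

cookies_list = [{'name': 'TYCID', 'value': 'abc'}, {'name': 'auth_token', 'value': 'xyz'}]  # illustrative
cookies = {}
for cookie in cookies_list:
    cookies[cookie['name']] = cookie['value']
# the dict can then be passed along as requests.get(url, headers=headers, cookies=cookies)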
...@@ -26,7 +26,7 @@ if __name__ == "__main__": ...@@ -26,7 +26,7 @@ if __name__ == "__main__":
name = input('所属用户:') name = input('所属用户:')
driver = create_driver() driver = create_driver()
driver.get(url) driver.get(url)
time.sleep(80) time.sleep(60)
cookies = driver.get_cookies() cookies = driver.get_cookies()
# print(driver.get_cookies()) # print(driver.get_cookies())
......
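The script above waits for a manual login and then reads driver.get_cookies(). One plausible way to persist that list so a later token.get_cookies() call can serve it is a JSON dump; this is only a sketch, the project itself appears to keep login info in MongoDB ('天眼查登录信息'), and the file name here is an assumption:

import json

cookies = driver.get_cookies()          # Selenium returns a list of cookie dicts
with open(f'{name}_tyc_cookies.json', 'w', encoding='utf-8') as f:
    json.dump(cookies, f, ensure_ascii=False)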
title gwyRelevantDocuments
chcp 65001
cd /d %~dp0
python38 gwyRelevantDocuments.py
\ No newline at end of file
title gwyfile
chcp 65001
cd /d %~dp0
python38 gwyfile.py
\ No newline at end of file
title gwyparts
chcp 65001
cd /d %~dp0
python38 gwyparts.py
import redis
from base.BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
cnx = baseCore.cnx
cursor = baseCore.cursor
r = redis.Redis(host="114.116.90.53", port=6380, password='clbzzsn', db=6)
def query():
sql = "select id from wx_link where state = '300' order by publish_time desc"
cursor.execute(sql)
result = cursor.fetchall()
return result
if __name__ == "__main__":
result = query()
# result = ['1990264','1990265','1998085','1998086','2039312','2067942','2087699','2087700','2087701','2087774','2087775','2087776','2087777','2088091','2088092','2088093','2088445','2088446','2088447','2088455','2121977','2385237','2385238','2386227','2678376','2678377','2678421','2678422','2678425','2731944','2731945','2731946','2732184','2732185','2732205','2732206','2732313','2732314','2732317','2732318','2732319','2732320','2732321','2732323','2732438','2732439','2732440','2732453','2732455','2732456','2732483','2732497','2732910','2732911','2732912','2732913','2732915','2732918','2732952','2732953','2732958','2732959','2732960','2733052','2733053','2733097','2733100','2733101','2733120','2733121','2733123','2733124','2733127','2733128','2733130','2733146','2733147','2733148','2733149','2733150','2733151','2733152','2733153','2733154','2733155','2733156','2733157','2733328','2733345','2733346','2733515','2733518','2733519','2733534','2733536','2733537','2733565','2733566','2733595','2733596','2733598','2733627','2733703','2733705','2733706','2733814','2733958','2733959','2733960','2734035','2734062','2734113','2734180','2734182','2734270','2734271','2734272','2734273','2734274','2734275','2734276','2734307','2734308','2734311','2734312','2734313','2734314','2734315','2734316','2734317','2734324','2734325','2734326','2734328','2734329','2734330','2734339','2734340','2734341','2734388','2734389','2734536','2734537','2734538','2735181','2735182','2735183','2735184','2735185','2735186','2735187','2735188','2735190','2735191','2735194','2735196','2735266','2735267','2735268','2735269','2735270','2735271','2735272','2735276','2735277','2735278','2735279','2735280','2735281','2735282','2735283','2735297','2735561','2735625','2735627','2735628','2735662','2735663','2736211','2736212','2736213','2736214','2736215','2736216','2736544','2736545','2736546','2736559','2736677','2736678','2736817','2736819','2736820','2736821','2736823','2736824','2736825','2736828','2736905','2736906','2736907','2736912','2736913','2736914','2736915','2736916','2736917','2736963','2736964','2736988','2736989','2736990','2736991','2737108','2737111','2737600','2737601','2737604','2737701','2737702','2737703','2737759','2737928','2737930','2737931','2738046','2738050','2738051','2738052','2738053','2738356','2738357','2738358','2738456','2738460','2738461','2738485','2738486','2738607','2738608','2738609','2739613','2739614','2739615','2739649','2739650','2739651','2739908','2739909','2739910','2739911','2739912','2739913','2740019','2740022','2740023','2740123','2740207','2740208','2740209','2740252','2740255','2740256','2740269','2740270','2740271','2740412','2740413','2740485','2740486','2740487','2740535','2740536','2740537','2740538','2740539','2740540','2740541','2740542','2740543','2740544','2740545','2740546','2740547','2740548','2740549','2740659','2740660','2740661','2740662','2740663','2740664','2740924','2740926','2740927','2740964','2740965','2740966','2741091','2741092','2741093','2741098','2741099','2741100','2741129','2741130','2741131','2744702','2744703','2744704','2744705','2759363','2759364','2759365','2759546','2759547','2759548','2759549','2759550','2759551','2759552','2759553','2759554','2759555','2759556','2759800','2759801','2759802','2759803','2759805','2759806','2759829','2760062','2760063','2760064','2760729','2760730','2760733','2760899','2760900','2760902','2760903','2760904','2760905','2761327','2761328','2761332','2761783','2761784','2761785','2761795','2761797','2761799','2761805','27
61819','2761820','2761821','2761822','2761891','2761892','2761893','2761894','2761895','2761896','2761897','2761900','2762070','2762071','2762072','2762073','2762074','2762075','2762076','2762077','2762078','2762079','2762080','2762081','2762082','2762083','2762084','2762085','2762087','2762088','2762089','2762090','2762091','2762092','2762093','2762125','2762126','2762127','2762137','2762138','2762139','2762160','2762161','2762162','2762195','2762196','2762197','2762410','2762411','2762419','2762470','2762471','2762472','2762873','2762875','2762877','2762930','2762931','2762937','2762938','2762939','2762940','2762941','2763276','2763277','2763278','2763304','2763305','2763307','2763308','2763309','2763310','2763312','2763697','2763698','2763699','2763700','2763701','2763702','2763703','2764035','2764036','2764037','2764039','2764040','2764041','2764042','2764043','2764044','2764045','2764046','2764047','2764048','2764049','2764050','2764051','2764055','2764056','2764057','2764059','2764060','2764062','2764063','2764064','2764065','2764164','2764165','2764369','2764370','2764567','2764568','2764570','2764618','2764619','2764620','2764744','2764745','2764748','2764770','2764771','2764772','2764869','2764870','2764871','2764898','2764899','2764900','2764901','2764902','2764903','2764904','2764905','2764906','2764907','2764908','2764909','2764910','2764911','2764912','2764913','2764914','2764915','2764916','2764917','2764918','2764933','2764934','2764935','2764936','2764937','2764938','2764939','2764957','2764958','2764959','2764960','2764961','2764963','2764964','2764965','2764966','2765020','2765021','2765022','2765023','2765024','2765026','2765229','2765230','2765231','2765232','2765233','2765293','2765294','2765295','2765296','2765297','2765298','2765299','2765300','2765301','2765302','2765303','2765304','2765305','2765306','2765307','2765308','2765414','2765416','2765424','2765571','2765572','2765573','2765796','2765797','2765798','2765804','2765805','2765807','2765808','2765809','2765810','2765811','2765812','2765813','2765814','2765815','2765816','2765820','2765821','2765822','2766021','2766022','2766023','2766024','2766025','2766048','2766060','2766061','2766062','2766063','2766064','2766066','2766068','2766069','2766071','2766072','2766073','2766074','2766075','2766169','2766194','2766195','2766197','2766208','2766209','2766244','2766245','2766246','2766536','2766537','2766538','2766539','2766540','2766547','2766669','2766670','2766671','2766673','2766674','2766675','2766676','2766677','2766678','2766679','2766680','2766681','2766682','2766790','2766792','2766826','2766827','2767032','2767120','2767121','2767122','2767123','2767126','2767127','2767128','2767129','2767130','2767131','2767132','2767133','2767134','2767135','2767136','2767137','2767138','2767139','2767173','2767174','2767408','2767409','2767410','2767411','2767502','2767503','2767534','2767535','2767545','2767546','2767547','2767548','2767600','2767602','2767642','2767643','2767655','2767656','2767717','2767718','2767719','2767720','2767732','2767740','2767741','2767756','2767758','2767766','2767767','2767807','2767808','2767809','2767810','2767817','2767818','2767825','2767827','2767828','2767829','2767840','2767887','2767898','2767899','2767900','2767901','2767902','2767903','2767906','2767907','2767908','2767955','2768155','2768156','2768166','2768167','2768168','2768170','2768284','2768360','2768368','2768378','2768826','2768827','2768845','2768846','2768847','2768848','2768849','2768850','2768851','2768852','2768871
','2768872','2768877','2768878','2768879','2768880','2768888','2768912','2768913','2768914','2768916','2768917','2768918','2768919','2768920','2768921','2768922','2768923','2768924','2769071','2769075','2769240','2769258','2769270','2769271','2769272','2769399','2769400','2769427','2769428','2769429','2769430','2769489','2769491','2769542','2769543','2769568','2769569','2770721','2770722','2770723','2770724','2770725','2770726','2770728','2770729','2770730','2770731','2770732','2770733','2770734','2770738','2770739','2770793','2770796','2770903','2770904','2770906','2770907','2770909','2770977','2770978','2771278','2771280','2771661','2771662','2771929','2771932','2772086','2772087','2772088','2772089','2772090','2772130','2772132','2772319','2772320','2772409','2772410','2772423','2772424','2772425','2772426','2772630','2772632','2772721','2772723','2772724','2772725','2772737','2772738','2772749','2772750','2772751','2772752','2772753','2773224','2773227','2773253','2773254','2773287','2773289','2773346','2773347','2773348','2773349','2773350','2773351','2773385','2773386','2773387','2773388','2773558','2773559','2773563','2773564','2773565','2773566','2773567','2773568','2773777','2773778','2773805','2773806','2773808','2773834','2773836','2773837','2773838','2773839','2773841','2774222','2774223','2774302','2774303','2774319','2774320','2774334','2774335','2774440','2774512','2774513','2774527','2774528','2774529','2774531','2774660','2774807','2774830','2774847','2775022','2775023','2775164','2775165','2775350','2775351','2775386','2775387','2775482','2775483','2775677','2775680','2776678','2776681','2776869','2776872','2776873','2776881','2776882','2776883','2776884','2776885','2776993','2776994','2776996','2776997','2777106','2777115','2777123','2777124','2777125','2777126','2777127','2777128','2777129','2777130','2777131','2777132','2777237','2777240','2777814','2777815','2777817','2777819','2777820','2777821','2777822','2777823','2777824','2777825','2777826','2777827','2777828','2777829','2777830','2778099','2778100','2778267','2778268','2778484','2778486','2778601','2778603','2778707','2778708','2779029','2779030','2779031','2779032','2779033','2779034','2779035','2779047','2779048','2779253','2779369','2779371','2779528','2779529','2779530','2779533','2779591','2779592','2779790','2779791','2780196','2780501','2780504']
for id in result:
# 放入redis
r.rpush("WeiXinGZH:linkid_fail", id[0])
# r.rpush("WeiXinGZH:linkid_fail", id)
\ No newline at end of file
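The script above only refills the Redis list. A hedged sketch of the consuming side, which would pop the ids back out with lpop and re-dispatch them (the worker itself is not part of this commit):

import redis

r = redis.Redis(host="114.116.90.53", port=6380, password='clbzzsn', db=6)
while True:
    link_id = r.lpop("WeiXinGZH:linkid_fail")
    if link_id is None:                 # list drained
        break
    print(f"re-crawl wx_link id {link_id.decode()}")   # redis-py returns bytes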
"""获取每天失败的列表--返回给数据组"""
import datetime
import time
import pandas as pd
import smtplib
from email.header import Header
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
import pymysql
from base.BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# cnx = baseCore.cnx
# cursor = baseCore.cursor
def sendEmail(file_name, receiver, filename):
file = open(file_name, 'rb').read()
# 发送邮箱地址
sender = '1195236739@qq.com'
# 发送邮箱登录 账户 密码
username = '1195236739@qq.com'
password = 'gatvszshadvpgjci'
smtpserver = 'smtp.qq.com'
# # 接收邮箱地址
# receiver = 'fujunxue@ciglobal.cn'
maile_title = filename
message = MIMEMultipart()
message['From'] = sender
message['To'] = receiver
message['Subject'] = Header(maile_title, 'utf-8')
message.attach(MIMEText(filename, 'plain', 'utf-8'))
xlsxApart = MIMEApplication(file)
xlsxApart.add_header('Content-Disposition', 'attachment', filename=filename)
message.attach(xlsxApart)
smtpObj = smtplib.SMTP_SSL(smtpserver) # 注意:如果遇到发送失败的情况(提示远程主机拒接连接),这里要使用SMTP_SSL方法
smtpObj.connect(smtpserver, port=465)
smtpObj.login(username, password)
smtpObj.sendmail(sender, receiver, message.as_string())
print("邮件发送成功!!!")
smtpObj.quit()
# 解析失败
def get_failed_list(today_time):
cnx = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='caiji',
charset='utf8mb4')
cursor = cnx.cursor()
sql = f"select * from wx_link where (state = '300' or state = '600' and state = '200') and create_time >= '{today_time}' "
# sql = f"select * from wx_link where state='800' "
print(sql)
cursor.execute(sql)
result = cursor.fetchall()
cursor.close()
cnx.close()
return result
# 发布内容不存在
def get_null_list(today_time):
cnx = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='caiji',
charset='utf8mb4')
cursor = cnx.cursor()
sql = f"select * from wx_link where (state='800' or state='500') and create_time >= '{today_time}' "
cursor.execute(sql)
result = cursor.fetchall()
cursor.close()
cnx.close()
return result
def get_info(result):
fail_list = []
for info in result:
site_name = info[3] # 公众号
info_source_code = info[4]
title = info[5]
publish_time = info[6]
link = info[7]
# 写入detaframe
# 创建一个字典,其中包含当前行的数据
row = {
'公众号': site_name,
'公众号编码': info_source_code,
'标题': title,
'发布时间': publish_time,
'链接': link
}
fail_list.append(row)
return fail_list
if __name__ == "__main__":
# 创建一个空的DataFrame,其中包含你需要的列名
while True:
# 计算今天的时间
now = datetime.datetime.now()
print(now)
time.sleep(1)
print(datetime.datetime.now().replace(hour=23, minute=59, second=59, microsecond=0))
if now >= datetime.datetime.now().replace(hour=23, minute=59, second=59, microsecond=0):
pass
else:
continue
today_time = str(now.strftime("%Y-%m-%d 00:00:00"))
print(today_time)
result = get_failed_list(today_time)
result_null = get_null_list(today_time)
if result:
fail_list = get_info(result)
result_df = pd.DataFrame(fail_list)
result_df.to_excel(f"./database/{today_time[:10]}_微信公众号采集失败列表.xlsx", index=False)
sendEmail(f"./database/{today_time[:10]}_微信公众号采集失败列表.xlsx", "fujunxue@ciglobal.cn", "微信公众号采集失败列表")
sendEmail(f"./database/{today_time[:10]}_微信公众号采集失败列表.xlsx", "mr@ciglobal.cn", "微信公众号采集失败列表")
else:
log.info(f'{today_time} 没有采集失败的文章')
if result_null:
null_list = get_info(result_null)
null_df = pd.DataFrame(null_list)
null_df.to_excel(f"./database/{today_time[:10]}_微信公众号文章内容为空列表.xlsx", index=False)
sendEmail(f"./database/{today_time[:10]}_微信公众号文章内容为空列表.xlsx", "fujunxue@ciglobal.cn", "微信公众号文章内容为空列表")
sendEmail(f"./database/{today_time[:10]}_微信公众号文章内容为空列表.xlsx", "mr@ciglobal.cn", "微信公众号文章内容为空列表")
else:
log.info(f'{today_time} 没有采集到空的文章')
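The __main__ loop above wakes every second and only proceeds once the clock passes 23:59:59. If the busy wait ever becomes a problem, the same gate can be computed as a single sleep; a sketch under that assumption (not what the committed code does):

import datetime
import time

now = datetime.datetime.now()
send_at = now.replace(hour=23, minute=59, second=59, microsecond=0)
if send_at > now:
    time.sleep((send_at - now).total_seconds())   # wake up once, right at the send window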
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
''' '''
成功100 发送数据失败200 请求失败400 文章内容为空500 处理style标签失败700 成功100 发送数据失败200 请求失败400 文章内容为空500 处理style标签失败700 发布内容不存在800 图片处理失败300、600
''' '''
import re import re
...@@ -118,6 +118,10 @@ def get_info(dict_json, linkid): ...@@ -118,6 +118,10 @@ def get_info(dict_json, linkid):
# updatewxLink(url_news, info_source_code, 400) # updatewxLink(url_news, info_source_code, 400)
return False return False
soup_news = BeautifulSoup(res_news.content, 'html.parser') soup_news = BeautifulSoup(res_news.content, 'html.parser')
if '此内容发送失败无法查看' in soup_news.text or '该页面不存在' in soup_news.text or '该内容已被发布者删除' in soup_news.text or '此内容因违规无法查看' in soup_news.text:
log.info(f'--errorCode:800--{origin}---{news_date}---{news_title}----内容无法查看')
updatewxLink(url_news, info_source_code, 800)
return False
try: try:
news_html = soup_news.find('div', {'id': 'js_content'}) news_html = soup_news.find('div', {'id': 'js_content'})
news_html['style'] = 'width: 814px ; margin: 0 auto;' news_html['style'] = 'width: 814px ; margin: 0 auto;'
...@@ -228,7 +232,7 @@ def get_info(dict_json, linkid): ...@@ -228,7 +232,7 @@ def get_info(dict_json, linkid):
} }
for nnn in range(0, 3): for nnn in range(0, 3):
try: try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092']) producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 7, 0))
kafka_result = producer.send("crawlerInfo", json.dumps(dic_info, ensure_ascii=False).encode('utf8')) kafka_result = producer.send("crawlerInfo", json.dumps(dic_info, ensure_ascii=False).encode('utf8'))
kafka_time_out = kafka_result.get(timeout=10) kafka_time_out = kafka_result.get(timeout=10)
# add_url(sid, url_news) # add_url(sid, url_news)
...@@ -252,7 +256,7 @@ def get_info(dict_json, linkid): ...@@ -252,7 +256,7 @@ def get_info(dict_json, linkid):
} }
for nnn2 in range(0, 3): for nnn2 in range(0, 3):
try: try:
producer2 = KafkaProducer(bootstrap_servers=['114.115.159.144:9092']) producer2 = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2,7,0))
kafka_result2 = producer2.send("collectionAndDispatcherInfo", kafka_result2 = producer2.send("collectionAndDispatcherInfo",
json.dumps(dic_info2, ensure_ascii=False).encode('utf8')) json.dumps(dic_info2, ensure_ascii=False).encode('utf8'))
break break
......
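Both producers above now pin api_version=(2, 7, 0). With kafka-python this skips the broker version probe on connect, which is a common cause of timeouts on restrictive networks. A minimal self-contained sketch of the same producer call (the payload is illustrative):

import json
from kafka import KafkaProducer

producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 7, 0))
future = producer.send("crawlerInfo", json.dumps({"demo": "payload"}, ensure_ascii=False).encode('utf8'))
record_metadata = future.get(timeout=10)   # raises if the broker did not acknowledge in time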
(This commit removes the file's previous scratch content — commented-out Excel-append and Redis-counter experiments plus a small "for i in range(0, 24, 5): print(i)" loop — and replaces it with the following.)

import re
import requests
from bs4 import BeautifulSoup
from retry import retry
from base.BaseCore import BaseCore

baseCore = BaseCore()


@retry(tries=3, delay=2)
def getrequest(url_news):
    ip = baseCore.get_proxy()
    res_news = requests.get(url_news, proxies=ip, timeout=20)
    if res_news.status_code != 200:
        raise
    return res_news


def rm_style_attr(soup):
    # 查找所有含有style属性的标签
    style_tags = soup.find_all(style=True)
    # 遍历每个style标签
    for style_tag in style_tags:
        try:
            # 使用正则表达式替换
            styleattr = style_tag['style']
            styleattr = re.sub(r'visibility:(?s).{1,}?;', '', styleattr)
            styleattr = re.sub(r'font-family:(?s).{1,}?;', '', styleattr)
            styleattr = re.sub(r'color:(?s).{1,}?;', '', styleattr)
            styleattr = re.sub(r'font-size:(?s).{1,}?;', '', styleattr)
            style_tag['style'] = styleattr
        except:
            continue

    # first_div = soup.select('div[id="js_content"]')
    # # 设置style属性
    # first_div['style'] = 'width: 814px ; margin: 0 auto;'
    first_div = soup.select('div[id="js_content"]')
    if first_div:
        first_div = first_div[0]  # 获取第一个匹配的元素
        first_div['style'] = 'width: 814px ; margin: 0 auto;'  # 设置style属性

    return soup


if __name__ == "__main__":
    # url_news = "http://mp.weixin.qq.com/s?__biz=MjM5NDMxOTMwNg==&mid=2653175413&idx=1&sn=8c0853ddab6e27799c4452e0b6e63156&chksm=bd5900d08a2e89c698de51f102b7423b33a27522966ca2218ca1b8ef290837b0087173c74bcb#rd"
    url_news = "http://mp.weixin.qq.com/s?__biz=MzU4ODQwNTIxMw==&mid=2247528290&idx=4&sn=370655b44dfd31b99984e2eeeb4868e0&chksm=fddf6fd0caa8e6c63a0b5e4fece250415fcb56f03f305450b1434978769b443eaa416342326e#rd"
    # 修改请求方法,retry 3次
    try:
        res_news = getrequest(url_news)
        print(res_news)
    except:
        try:
            res_news = requests.get(url_news, timeout=20)
            print('请求成功')
        except:
            res_news = None
            pass
soup_news = BeautifulSoup(res_news.content, 'html.parser')
if '此内容发送失败无法查看' in soup_news.text or '该页面不存在' in soup_news.text or '该内容已被发布者删除' in soup_news.text or '此内容因违规无法查看' in soup_news.text:
print('失败')
try:
news_html = soup_news.find('div', {'id': 'js_content'})
news_html['style'] = 'width: 814px ; margin: 0 auto;'
#del news_html['style']
news_html = rm_style_attr(news_html)
# print(news_html)
del news_html['id']
del news_html['class']
except Exception as e:
print(e)
news_html = None
# print(news_html)
news_content = news_html.text
list_img = news_html.find_all('img')
for num_img in range(len(list_img)):
img_one = list_img[num_img]
url_src = img_one.get('data-src')
# print(url_src)
if url_src and 'gif' in url_src:
url_img = ''
img_one.extract()
else:
try:
try:
name_img = url_src.split('/')[-2] + '.' + url_src.split('wx_fmt=')[1]
except:
img_one.extract()
continue
try:
res = requests.get(url_src, timeout=20)
except:
img_one.extract()
continue
except Exception as e:
print(f'--error--{url_news}-----------{e}')
for tag in news_html.descendants:
try:
del tag['style']
except:
pass
list_section = news_html.find_all('section')
for section in list_section:
section.name = 'div'
print(news_html)
\ No newline at end of file
...@@ -37,7 +37,7 @@ element.getparent() #获取给定元素的父元素 ...@@ -37,7 +37,7 @@ element.getparent() #获取给定元素的父元素
# print(res) # print(res)
"""测试中国执行信息公开网 模拟浏览器""" """测试中国执行信息公开网 模拟浏览器"""
import ddddocr # import ddddocr
from PIL import Image from PIL import Image
import re import re
...@@ -144,34 +144,34 @@ import json ...@@ -144,34 +144,34 @@ import json
import requests import requests
import pymongo import pymongo
from base import BaseCore from base import BaseCore
baseCore = BaseCore.BaseCore() # baseCore = BaseCore.BaseCore()
log = baseCore.getLogger() # log = baseCore.getLogger()
#
#
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').中科软[ # db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').中科软[
'数据源_0504'] # '数据源_0504']
#
datas = db_storage.find({'postCode':'2'}).limit(1) # datas = db_storage.find({'postCode':'2'}).limit(1)
for data in datas: # for data in datas:
title = data['titleForeign'] # title = data['titleForeign']
contentWithTag = data['richTextForeign'] # contentWithTag = data['richTextForeign']
summary = data['contentForeign'] # summary = data['contentForeign']
dic_info = { # dic_info = {
'title':title, # 'title':title,
# 'summary':summary, # # 'summary':summary,
'contentWithTag':contentWithTag # 'contentWithTag':contentWithTag
} # }
headers = { # headers = {
'Content-Type': 'application/json', # 'Content-Type': 'application/json',
} # }
dic_info_ = json.dumps(dic_info) # dic_info_ = json.dumps(dic_info)
# print(dic_info_) # # print(dic_info_)
# with open('./data.json','w') as f: # # with open('./data.json','w') as f:
# f.write(dic_info_) # # f.write(dic_info_)
# break # # break
# req = requests.post('http://192.168.1.236:5000/translate',data=dic_info_,headers=headers) # # req = requests.post('http://192.168.1.236:5000/translate',data=dic_info_,headers=headers)
req = requests.post('http://117.78.23.14:5000/translate',data=dic_info_,headers=headers) # req = requests.post('http://117.78.23.14:5000/translate',data=dic_info_,headers=headers)
log.info(req.text) # log.info(req.text)
# import re, datetime # import re, datetime
# #
...@@ -237,4 +237,228 @@ for data in datas: ...@@ -237,4 +237,228 @@ for data in datas:
# if __name__ == "__main__": # if __name__ == "__main__":
# publishtime_ = '1小时17分钟前' # publishtime_ = '1小时17分钟前'
# publish_time = paserTime(publishtime_).strftime("%Y-%m-%d") # publish_time = paserTime(publishtime_).strftime("%Y-%m-%d")
# print(publish_time) # print(publish_time)
\ No newline at end of file
# import pandas as pd
#
# # 创建一个示例DataFrame
# df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
#
# # 要追加的行
# new_rows = pd.DataFrame({'A': [4, 5], 'B': [7, 8]})
#
# # 追加行到原DataFrame
# df = pd.concat([df, new_rows], ignore_index=True)
#
# print(df)
# import pandas as pd
#
# # 假设我们有两个DataFrame
# df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
# df2 = pd.DataFrame({'A': [7, 8, 9], 'B': [10, 11, 12]})
#
# # 从df1中取出第一行数据
# row_to_append = df1.iloc[0].to_frame().T
# # print(row_to_append)
#
# # 将这一行数据追加到df2中
# # 注意:这里使用ignore_index=True来忽略索引,并重新设置索引
# # df2 = df2.append(row_to_append, ignore_index=True)
# df2 = pd.concat([df2, row_to_append], ignore_index=True)
#
# # 打印结果
# print(df2)
# import openpyxl
# import redis
#
# # 连接到Redis服务器
# redis_client = redis.Redis(host="114.116.90.53", port=6380, password='clbzzsn', db=6)
#
# # 打开Excel文件
# workbook = openpyxl.load_workbook('D:\\kkwork\\企业数据\\年报数据任务.xlsx')
#
# # 选择要读取的工作表
# worksheet = workbook['系统批量导出年报名单']
#
# # 选择要读取的列
# column_index = 0 # 选择第2列
#
# # 遍历指定列的单元格,并将值放入Redis列表
# for row in worksheet.iter_rows(values_only=True):
# try:
# cell_value = row[1] + "|" + row[2]
# except:
# print(row[1])
# continue
# # cell_value = row[1]
# redis_client.rpush('NianBao:socialcode', cell_value)
#
# # 关闭Excel文件
# workbook.close()
# def find_empty_id_sequences(id_list):
# empty_id_count = 0
# empty_id_sequences = []
# current_sequence = []
#
# for id in id_list:
# if id is None: # 假设 id 为 None 表示空
# empty_id_count += 1
# current_sequence.append(empty_id_count)
# else:
# if empty_id_count > 0:
# empty_id_sequences.append(current_sequence)
# current_sequence = []
# empty_id_count = 0
#
# if empty_id_count > 0:
# empty_id_sequences.append(current_sequence)
#
# return empty_id_sequences
#
# # 示例使用
# id_list = [1, None, 2, None, None, 3, None, None, None, 4]
# empty_id_sequences = find_empty_id_sequences(id_list)
# print(empty_id_sequences)
# def process_results(results):
# # 初始化输出字符串和上一个结果
# final_output = ""
# previous_res = None
#
# # 初始化连续空的结果索引列表
# empty_indices = []
# empty_id_sequences = []
# all_result = []
# # 对结果进行排序
# sorted_results = sorted(results.items(), key=lambda x: int(x[0].split(".")[0]))
#
# # 遍历排序后的结果
# for index, (_, res) in enumerate(sorted_results):
# if res == '':
# # 如果是空结果,将索引添加到空结果索引列表中
# empty_indices.append(_)
#
# else:
# # 如果结果非空,并且连续空的结果列表不为空,
# # 则添加连续空的结果索引列表到输出中
# if empty_indices:
# # final_output += " ".join(map(str, empty_indices)) + '\n'
# empty_id_sequences.append(empty_indices)
# all_result.append(empty_indices)
# empty_indices = []
# # 如果是非空结果并且与上一个结果不同,则添加到输出中
# if res != previous_res:
# final_output += res + '\n'
# all_result.append(res)
# previous_res = res
#
# return all_result, empty_id_sequences
#
# # 示例使用
# results = {
# "1.0": "",
# "2.0": "",
# "3.0": "result1",
# "4.0": "",
# "5.0": "",
# "6.0": "result2",
# "7.0": "result2",
# "8.0": "",
# "9.0": "result3"
# }
#
# all_result, empty_id_sequences, dic_index = process_results(results)
# print(all_result)
# print(empty_id_sequences)
# print(dic_index)
import Levenshtein
def same_rule(same_list):
# 记录最长的一个
max_len = 0
for item in same_list:
if len(item) > max_len:
max_len = len(item)
char_map_list = []
char_map_score = []
for i in range(max_len):
char_map_list.append(dict())
char_map_score.append(dict())
for index in range(max_len):
for i in range(len(same_list)):
if index < len(same_list[i]):
char = same_list[i][index]
score = same_list[i][index]
else:
char = ""
score = 1
if char not in char_map_list[index]:
char_map_list[index][char] = 1
else:
char_map_list[index][char] += 1
if char not in char_map_score[index]:
char_map_score[index][char] = score
else:
char_map_score[index][char] += score
print(char_map_list)
print(char_map_score)
#返回个数多的;当个数相同时,返回打分高的
result = []
for i in range(max_len):
print('---------------')
print(char_map_list[i].items())
print(sorted(char_map_list[i].items(), key=lambda item:item[1], reverse=True))
result.append(sorted(char_map_list[i].items(), key=lambda item:item[1],reverse=True)[0][0])
return "".join(result)
def aaaaa(final_output):
finall_list = []
same_list = []
# 处理相似的
for result in final_output:
print(f"result:{result}")
if len(same_list) > 0:
ratio = Levenshtein.ratio(result, same_list[-1])
if ratio < 0.5:
# 差异大于0.5
# 对相似的做处理
if len(same_list) > 1:
result_ = max(same_list, key=lambda x: (len(x), x))
same_list = [result_]
finall_list[-1] = result_
finall_list.append(result)
else:
same_list = [result]
finall_list.append(result)
else:
same_list.append(result)
else:
same_list.append(result)
finall_list.append(result)
print(finall_list)
if __name__ == '__main__':
same_list = ['让我们从一次时光旅行', '开启植物天堂的故事', '地球的午夜', '是在火山喷发中度过的', '到了凌晨三四点', '在海洋深处有了生命的迹象', '清晨6点多', '更加壮丽的生命乐章开始了', '更加壮丽的生命乐草开始了', '更加壮丽的生命乐章开始了', '更加壮丽的生命乐草开始了', '更加壮丽的生命乐章开始了', '种蓝藻细菌', '一种蓝藻细菌', '学会利用二氧化碳水和阳光', '制造生命所需能量', '同时释放出了氧气', '这个被称为光合作用的过程', '为植物世界打开了大门', '此时', '中国的陆地', '也逐渐从海洋露出形成岛屿', '但在相当长的时间里', '陆地十分荒凉没有生机', '这些岩石坚硬', '无法储存水分', '是当时陆地环境的写照', '直到晚上九点多', '也就是四亿年前左右', '些矮小的生命', '开始征服陆地', '她们用一种近似于根的构造', '固定在岩石上', '苔藓', '是陆地最早的拓荒者之', '小', '她们死后的身体', '形成了肥沃的土壤', '让更多的植物可以在这里生存', '从此', '绿色成为植物天堂的底色']
# aaa = aaaaa(same_list)
#
# for i in range(len(same_list)):
# print(i, same_list[i])
#
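The grouping above hinges on Levenshtein.ratio staying at or above 0.5 for lines that are near-duplicates (e.g. OCR variants of the same subtitle). A tiny self-contained check of that threshold:

import Levenshtein

a = '更加壮丽的生命乐章开始了'
b = '更加壮丽的生命乐草开始了'            # one-character OCR-style variant
print(Levenshtein.ratio(a, b))            # ≈ 0.92, so both land in the same group
print(Levenshtein.ratio(a, '地球的午夜'))  # far below 0.5, starts a new group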
# -*- coding: utf-8 -*-
import datetime
import json
import re
import time import time
import pandas as pd
import pymongo
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
import urllib3
from selenium.webdriver.support.wait import WebDriverWait
db_storageInsert = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').jixie[
'企业基本信息']
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
'天眼查登录信息']
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from dateutil.relativedelta import relativedelta
import sys
# sys.path.append('D:\\KK\\zzsn_spider\\base')
# sys.path.append('D:\\kkwork\\zzsn_spider\\base')
# import BaseCore
from base import BaseCore
baseCore = BaseCore.BaseCore()
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
log = baseCore.getLogger()
from classtool import Token, File, Tag
token = Token()
file = File()
tag = Tag()
from selenium import webdriver from selenium import webdriver
from selenium.webdriver.chrome.service import Service from selenium.webdriver.common.by import By
from base.BaseCore import BaseCore
def create_driver():
path = r'D:\soft\msedgedriver.exe'
# options = webdriver.EdgeOptions()
options = {
"browserName": "MicrosoftEdge",
"ms:edgeOptions": {
"extensions": [], "args": ["--start-maximized"] # 添加最大化窗口运作参数
}
}
session = webdriver.Edge(executable_path=path, capabilities=options)
return session
# 检查登陆状态
def checklogin(key):
t = int(time.time())
# url = 'https://www.tianyancha.com/search?key=%E4%B8%AD%E5%9B%BD%E7%9F%B3%E6%B2%B9%E5%8C%96%E5%B7%A5%E9%9B%86%E5%9B%A2%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8&sessionNo=1706594186.22975563'
url = f'https://www.tianyancha.com/search?key={key}&sessionNo={t}'
driver.get(url)
time.sleep(2)
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
return soup
# 采集准备
def redaytowork(com_name):
log.info(f'----当前企业{com_name}--开始处理---')
count = 0
soup = checklogin(com_name)
if not soup:
log.info("登录失效===重新放入redis")
baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode_jixie', com_name)
token.updateTokeen(id_cookie, 2)
# log.info('=====已重新放入redis,失效cookies已删除======')
time.sleep(20)
return count
else:
try:
searchinfo = soup.find('div', class_='index_content-tool-title__K1Z6C').find('span',
class_='index_title-count__lDSjB').text
except:
try:
# todo:可能是搜不到该企业
errormessage = soup.find('div', class_='index_no-data-reason-title__V3gFY').text
if '抱歉' in errormessage:
log.info('=====搜索不到该企业====')
return count
except:
log.info("登录失效===重新放入redis")
baseCore.r.lpush('UpdateBasdeInfo:SocialCode_CompanyName_jixie', com_name)
token.updateTokeen(id_cookie, 2)
# log.info('=====已重新放入redis,cookies已封号======')
time.sleep(20)
return count
else:
# 开始采集
try:
if spiderwork(soup, com_name):
count += 1
log.info(f'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time, time.time())}')
token.updateTokeen(id_cookie, 3)
return count
else:
return count
except Exception as e:
log.info(f'====={com_name}=====获取基本信息失败,重新放入redis=====')
baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode_jixie', com_name)
token.updateTokeen(id_cookie, 2)
log.info('=====已重新放入redis,cookies已封号======')
return count
def ifbeforename(company_url):
driver.get(company_url)
time.sleep(2)
com_soup = BeautifulSoup(driver.page_source, 'html.parser')
try:
businessinfo = com_soup.find('table', {'class': 'index_tableBox__ZadJW'})
except:
businessinfo = ''
if businessinfo:
try:
name = businessinfo.find('span', class_='index_history-gray-tags__o8mkl').text
value = \
businessinfo.find('span', class_='index_copy-text__ri7W6').text.replace('展开', '').replace(' ',
'').replace(
'…', '').replace('\n', '').replace('复制', '').split('(')[0]
except:
name = '曾用名'
value = ''
return value
else:
return ''
# 采集基本信息和工商信息
def spiderinfo(company_url):
qccid = company_url.split('company/')[1]
log.info(f'====={qccid}=====')
driver.get(company_url)
page_source_detail = driver.page_source
com_soup = BeautifulSoup(page_source_detail, 'html.parser')
script = com_soup.find('script', attrs={'id': '__NEXT_DATA__'}).text
script = json.loads(script)
script = script['props']['pageProps']['dehydratedState']['queries'][0]['state']['data']['data']
companyName = script['name']
updateTime = int(script['updateTimes'])
updateTime = datetime.datetime.fromtimestamp(updateTime / 1000).strftime('%Y-%m-%d %H:%M:%S')
creditCode = script['creditCode']
operName = script['legalPersonName']
phoneNumber = script['phoneNumber']
webSite = script['websiteList']
try:
email = script['emailList'][0]
except:
email = None
desc = script['baseInfo']
status = script['regStatus']
startDate = int(script['estiblishTime'])
startDate = datetime.datetime.fromtimestamp(startDate / 1000).strftime('%Y-%m-%d %H:%M:%S')
registCapi = script['regCapital']
recCap = script['actualCapital']
checkDate = int(script['approvedTime'])
checkDate = datetime.datetime.fromtimestamp(checkDate / 1000).strftime('%Y-%m-%d %H:%M:%S')
orgNo = script['orgNumber']
No = script['regNumber']
taxpayerNo = script['taxNumber']
econKind = script['companyOrgType']
termStart = int(script['fromTime'])
termStart = datetime.datetime.fromtimestamp(termStart / 1000).strftime('%Y-%m-%d %H:%M:%S')
termEnd = script['toTime']
termEnd = datetime.datetime.fromtimestamp(termEnd / 1000).strftime('%Y-%m-%d %H:%M:%S')
taxpayerType = script['taxQualification']
subIndustry = script['industryInfo']['nameLevel3']
belogOrg = script['regInstitute']
info = script['staffNumRange']
canbao = script['socialStaffNum']
try:
originalName = script['historyNames']
originalName = originalName.split('\n')[0]
except:
originalName = None
englishName = script['property3']
address = script['taxAddress']
scope = script['businessScope']
aa_dic = {
'name': companyName, # 企业名称
'shortName': None, # 企业简称
'socialCreditCode': creditCode, # 统一社会信用代码
'legalPerson': operName, # 法定代表人
'officialPhone': phoneNumber, # 电话
'officialUrl': webSite, # 官网
'officialEmail': email, # 邮箱
'briefInfo': desc, # 简介
'registerStatus': status, # 登记状态
'incorporationDate': startDate, # 成立日期
'capital': registCapi, # 注册资本
'paidCapital': recCap, # 实缴资本
'approvalDate': checkDate, # 核准日期
'organizationCode': orgNo, # 组织机构代码
'registerNo': No, # 工商注册号
'taxpayerNo': taxpayerNo, # 纳税人识别号
'type': econKind, # 企业类型
'businessStartDate': termStart, # 营业期限自
'businessEndDate': termEnd, # 营业期限至
'taxpayerQualification': taxpayerType, # 纳税人资质
'industry': subIndustry, # 所属行业
'region': None,
'province': None, # 所属省
'city': None, # 所属市
'county': None, # 所属县
'registerDepartment': belogOrg, # 登记机关
'scale': info, # 人员规模
'insured': canbao, # 参保人数
'beforeName': originalName, # 曾用名
'englishName': englishName, # 英文名
'importExportEnterpriseCode': None, # 进出口企业代码
'address': address, # 地址
'businessRange': scope, # 经营范围
'status': 0, # 状态
'sourceUpdateTime': updateTime, # 更新时间
'qccId': qccid,
'ynDomestic': '',
'countryName': '',
'securitiesCode': '',
'securitiesShortName': '',
'listingDate': '',
'category': '',
'exchange': '',
'listingType': '',
}
for key, value in aa_dic.items():
if value == 'None':
aa_dic[key] = None
db_storageInsert.insert_one(aa_dic)
def remove_parentheses(text):
# 清除中文小括号
    text = re.sub(r'（|）', '', text)
# 清除英文小括号
text = re.sub(r'\(|\)', '', text)
return text.replace(' ', '')
# 判断名称是否统一
def spiderwork(soup, receptname):
company_url = ''
try:
company_list = soup.find_all('div', class_='index_search-box__7YVh6')
except:
log.info(f'====={com_name}=====获取基本信息失败,重新放入redis=====')
baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode_jixie', com_name)
token.updateTokeen(id_cookie, 2)
log.info('=====已重新放入redis,cookies已封号======')
return False
# receptname = '小米通讯技术有限公司'
for compamy in company_list:
info_t = compamy.find('div', class_='index_name__qEdWi')
getname = info_t.find('span').text
log.info(f'接收到的企业名称--{receptname}---采到的企业名称--{getname}')
if receptname and getname == receptname:
company_url = info_t.find('a')['href']
break
elif not receptname:
company_url = info_t.find('a')['href']
break
else:
jian_name = remove_parentheses(baseCore.hant_2_hans(getname))
if remove_parentheses(receptname) == jian_name:
log.info(f'接收到的企业名称--{receptname}---转化成简体字的企业名称--{jian_name}')
company_url = info_t.find('a')['href']
break
else:
continue
if company_url:
# company_url = 'https://www.qcc.com/firm/80af5085726bb6b9c7770f1e4d0580f4.html'
# company_url = 'https://www.qcc.com/firm/50f75e8a8859e609ec37976f8abe827d.html'
# 采集基本信息和工商信息
spiderinfo(company_url)
else:
# 判断是否是曾用名
getname = ''
for child in company_list[0].find_all():
if child.has_attr('class'):
print(child['class'])
if 'index_name' in child['class'][0]:
getname = child.text
company_url = child.find('a')['href']
break
if getname:
log.info(f'------可能是曾用名------接收到的企业名称--{receptname}---采到的企业名称--{getname}')
beforename = ifbeforename(company_url)
if beforename == receptname:
spiderinfo(company_url)
else:
# 没有搜到相同的企业名称
log.info('没有搜索到相同企业名称')
baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode_jixie_no',com_name)
return False
else:
# 没有搜到相同的企业名称
log.info('没有搜索到相同企业名称')
baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode_jixie_no',com_name)
return False
return True
(This commit also removes the file's earlier scratch tail — a commented-out local chromedriver/stealth.min.js setup and a Yahoo Finance press-releases fetch for gpdm = '9021.T' — and appends the login flow and main loop below.)

def login():
    # time.sleep(10)
    cookies_list, id_cookie, user_name = token.get_cookies()
    log.info(f'=====当前使用的是{user_name}的cookie======')
    for cookie in cookies_list:
        driver.add_cookie(cookie)
    time.sleep(5)
    driver.refresh()
    # url_test = 'https://www.qcc.com/firm/a5f5bb3776867b3e273cd034d6fb4baa.html'
    # driver.get(url_test)
    # # driver.get('https://www.qcc.com/')
    time.sleep(5)
    return driver, id_cookie


if __name__ == '__main__':
    taskType = '基本信息/天眼查'
    driver = create_driver()
    url = 'https://www.tianyancha.com/'
    driver.get(url)
    driver.maximize_window()
    while True:
        driver, id_cookie = login()
        nowtime = baseCore.getNowTime(1).replace('-', '')[:8]
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Cookie': 'TYCID=6f6298905d3011ee96146793e725899d; ssuid=3467188160; _ga=GA1.2.1049062268.1697190322; HWWAFSESID=2eb035742bde209aa60; HWWAFSESTIME=1706586308439; csrfToken=bT_looAjInHGeAnvjjl12L9v; bannerFlag=true; jsid=SEO-BAIDU-ALL-SY-000001; bdHomeCount=0; tyc-user-phone=%255B%252216603863075%2522%252C%2522152%25203756%25200528%2522%252C%2522159%25200367%25203315%2522%255D; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22310689501%22%2C%22first_id%22%3A%2218ad696a2ef680-0ae5cd9293a1538-26031f51-921600-18ad696a2f0dc5%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMThhZDY5NmEyZWY2ODAtMGFlNWNkOTI5M2ExNTM4LTI2MDMxZjUxLTkyMTYwMC0xOGFkNjk2YTJmMGRjNSIsIiRpZGVudGl0eV9sb2dpbl9pZCI6IjMxMDY4OTUwMSJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%22310689501%22%7D%2C%22%24device_id%22%3A%2218ad696a2ef680-0ae5cd9293a1538-26031f51-921600-18ad696a2f0dc5%22%7D; tyc-user-info=%7B%22state%22%3A%220%22%2C%22vipManager%22%3A%220%22%2C%22mobile%22%3A%2218703752600%22%2C%22userId%22%3A%22310689501%22%7D; tyc-user-info-save-time=1707008605562; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxODcwMzc1MjYwMCIsImlhdCI6MTcwNzAwODYwNSwiZXhwIjoxNzA5NjAwNjA1fQ.i8WEUrXjG2X__SnGGlnjwNXyOEdXlslrnvzvKZ_xlVA0rdjdsYHdaieAzkmIjoKbuv6Lc4Eqpb70hWIlq2zeoQ; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1705286979,1706586312; searchSessionId=1707118324.99879267;'
        }
        start_time = time.time()
        # 获取企业信息
        com_name = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode_jixie')
        if com_name == '' or com_name is None:
            flag = True
            while flag:
log.info('--------已没有数据---------')
time.sleep(30)
if not baseCore.check_mysql_conn(cnx_):
# 144数据库
cnx_ = baseCore.cnx
cursor_ = cnx_.cursor()
log.info('===11数据库重新连接成功===')
com_name = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode_jixie')
if com_name:
flag = False
log.info("-----已添加数据------")
# baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
continue
continue
count = redaytowork(com_name)
time.sleep(10)
break
baseCore.close()