企查查脚本维护

f7886002 · 薛凌堃 · 08e4725c · f7886002 · f7886002
--- a/comData/BaseInfo_qcc/baseinfo1113.py
+++ b/comData/BaseInfo_qcc/baseinfo1113.py
@@ -292,7 +292,7 @@ def dic_handle(result_dic):
    return aa_dict
 # 采集准备
-def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name):
+def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
    # if social_code:
    #     dic_info = baseCore.getInfomation(social_code)
@@ -338,7 +338,7 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
        else:
            # 开始采集
            try:
-                if spiderwork(soup, com_name, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name):
+                if spiderwork(soup, com_name, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
                    count += 1
                    log.info(f'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time, time.time())}')
                    token.updateTokeen(id_cookie,3)
@@ -373,7 +373,7 @@ def ifbeforename(company_url):
        return ''
 # 采集基本信息和工商信息
-def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name):
+def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
    qccid = company_url.split('firm/')[1].split('.html')[0]
    # 将采集到的企查查id更新
    updateSql = f"update EnterpriseInfo set QCCID = '{qccid}' where SocialCode = '{social_code}'"
@@ -463,7 +463,7 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
        aa_dic['listingDate'] = listingDate
        aa_dic['category'] = category
        aa_dic['exchange'] = exchange
+        aa_dic['listingType'] = listType
        # print(aa_dic)
        sendkafka(aa_dic)
@@ -482,11 +482,11 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
        aa_dic['listingDate'] = listingDate
        aa_dic['category'] = category
        aa_dic['exchange'] = exchange
+        aa_dic['listingType'] = listType
        sendkafka(aa_dic)
 # 判断名称是否统一
-def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name):
+def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
    company_url = ''
    try:
        company_list = soup.find('table', class_='app-ltable ntable ntable-list ntable ntable-list')
@@ -516,7 +516,7 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
        # company_url = 'https://www.qcc.com/firm/80af5085726bb6b9c7770f1e4d0580f4.html'
        # company_url = 'https://www.qcc.com/firm/50f75e8a8859e609ec37976f8abe827d.html'
        # 采集基本信息和工商信息
-        spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name)
+        spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name)
    else:
        # 判断是否是曾用名
        tr = tr_list[:1][0]
@@ -526,7 +526,7 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
        company_url = info_t.find('a')['href']
        beforename = ifbeforename(company_url)
        if beforename == receptname:
-            spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name)
+            spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange,listType, ynDomestic, countryName, file_name)
        else:
            #没有搜到相同的企业名称
            data = [com_name, social_code]
@@ -549,6 +549,7 @@ if __name__ == '__main__':
        else:
            log.info('==========已无cookies==========')
            time.sleep(30)
            continue
        id_cookie = cookieinfo[0]
        cookie_ = json.loads(cookieinfo[1])
@@ -579,8 +580,8 @@ if __name__ == '__main__':
        }
        start_time = time.time()
        # 获取企业信息
-        company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
+        # company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
-        # company_field = '91220101606092819L||'
+        company_field = '913300007125582210||'
        if company_field == 'end':
            # 本轮处理完毕，需要发送邮件，并且进入下一轮
            baseCore.sendEmail(file_name)
@@ -595,6 +596,11 @@ if __name__ == '__main__':
            while flag:
                log.info('--------已没有数据---------')
                time.sleep(30)
+                if not baseCore.check_mysql_conn(cnx_):
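+                    # 144数据库 (NOTE(review): 下方日志写的是"11数据库"，请确认此连接实际指向哪台数据库 / the log line below says "11" — confirm which DB host this is)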
+                    cnx_ = baseCore.cnx
+                    cursor_ = cnx_.cursor()
+                    log.info('===11数据库重新连接成功===')
                company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
                if company_field:
                    flag = False
@@ -604,26 +610,28 @@ if __name__ == '__main__':
            continue
        social_code = company_field.split('|')[0]
-        com_name = company_field.split('|')[2].replace(' ', '')
+        com_name = company_field.split('|')[1].replace(' ', '')
-        ynDomestic = company_field.split('|')[15]
+        # ynDomestic = company_field.split('|')[15]
-        countryName = company_field.split('|')[16]
+        # countryName = company_field.split('|')[16]
-        securitiesCode = company_field.split('|')[17]
+        # securitiesCode = company_field.split('|')[17]
-        securitiesShortName = company_field.split('|')[18]
+        # securitiesShortName = company_field.split('|')[18]
-        listingDate = company_field.split('|')[21]
+        # listingDate = company_field.split('|')[21]
-        category = company_field.split('|')[19]
+        # category = company_field.split('|')[19]
-        exchange = company_field.split('|')[20]
+        # exchange = company_field.split('|')[20]
-        # ynDomestic = ''
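+        # listType = company_field.split('|')[21]  # NOTE(review): index 21 is also used for listingDate above — confirm the listType field index before re-enabling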
-        # countryName = ''
+        ynDomestic = ''
-        # securitiesCode = ''
+        countryName = ''
-        # securitiesShortName = ''
+        securitiesCode = ''
-        # listingDate = ''
+        securitiesShortName = ''
-        # category = ''
+        listingDate = ''
-        # exchange = ''
+        category = ''
+        exchange = ''
-        count = redaytowork(com_name, social_code, securitiesCode, securitiesShortName, listingDate, category, exchange,ynDomestic, countryName, file_name)
+        listType = ''
+        count = redaytowork(com_name, social_code, securitiesCode, securitiesShortName, listingDate, category, exchange,listType, ynDomestic, countryName, file_name)
        time.sleep(2)
-        # break
+        break
        # baseCore.r.close()
        # baseCore.sendEmail(file_name)
        # 信息采集完成后将该企业的采集次数更新

--- a/comData/BaseInfo_qcc/baseinfo1122.py
+++ b/comData/BaseInfo_qcc/baseinfo1122.py
@@ -516,7 +516,7 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
        # company_url = 'https://www.qcc.com/firm/80af5085726bb6b9c7770f1e4d0580f4.html'
        # company_url = 'https://www.qcc.com/firm/50f75e8a8859e609ec37976f8abe827d.html'
        # 采集基本信息和工商信息
-        spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name)
+        spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name)
    else:
        # 判断是否是曾用名
        tr = tr_list[:1][0]
@@ -526,7 +526,7 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
        company_url = info_t.find('a')['href']
        beforename = ifbeforename(company_url)
        if beforename == receptname:
-            spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange,listType, ynDomestic, countryName, file_name)
+            spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name)
        else:
            #没有搜到相同的企业名称
            data = [com_name, social_code]