天眼查基本信息维护

52e227da · 薛凌堃 · a86fe277 · 52e227da · 52e227da
--- a/comData/BaseInfo_qcc/classtool.py
+++ b/comData/BaseInfo_qcc/classtool.py
@@ -49,8 +49,8 @@ class File():
 class Token():
    # 获取token
    def getToken(self):
-        cursor.execute(f"select id,cookies from QCC_token  where fenghao_time < DATE_SUB(NOW(), INTERVAL 2 HOUR) order by update_time asc limit 1")
-        # cursor.execute(f" select id, cookies from QCC_token")
+        # cursor.execute(f"select id,cookies from QCC_token  where fenghao_time < DATE_SUB(NOW(), INTERVAL 2 HOUR) order by update_time asc limit 1")
+        cursor.execute(f" select id, cookies from QCC_token where id = 63")
        # rows = cursor.fetchall()
        # cnx.commit()
        # if rows:

--- a/comData/Tyc/baseinfo0130_tyc.py
+++ b/comData/Tyc/baseinfo0130_tyc.py
@@ -81,15 +81,30 @@ def baseinfo(com_soup):
        # print(info)
        value = cominfo.text.replace('', '').replace('\ue657', '').replace('\ue655', '')
        if name == '法定代表人':
-            value = cominfo.find('a').text
+            try:
+                value = cominfo.find('a').text
+            except:
+                value = None
        if name == '电话':
-            value = cominfo.find('span').text
+            try:
+                value = cominfo.find('span').text
+            except:
+                value = None
        if name == '邮箱':
-            value = cominfo.find('a').text
+            try:
+                value = cominfo.find('a').text
+            except:
+                value = None
        if name == '网址':
-            value = cominfo.find('a').text
+            try:
+                value = cominfo.find('a').text
+            except:
+                value = None
        if name == '地址':
-            value = cominfo.find('span').text
+            try:
+                value = cominfo.find('span').text
+            except:
+                value = None

        data[name] = value
        # print("==================")
@@ -141,7 +156,10 @@ def dic_handle(result_dic):
    try:
        Status = result_dic['经营状态']
    except:
-        Status = None
+        try:
+            Status = result_dic['公司现状']
+        except:
+            Status = None

    try:
        StartDate = result_dic['成立日期']
@@ -198,31 +216,31 @@ def dic_handle(result_dic):
    except:
        TaxpayerType = None

-    # try:
-    #     SubIndustry = result_dic['国标行业']
-    # except:
-    #     SubIndustry = ''
-
    try:
-        region = result_dic['所属地区']
+        SubIndustry = result_dic['国标行业']
    except:
-        region = None
-    try:
-        pattern = r'^(.*?省|.*?自治区)?(.*?市|.*?自治州)?(.*?区|.*?县|.*?自治县|.*?市辖区)?(.*?区|.*?县|.*?自治县|.*?市辖区)?$'
-        matches = re.match(pattern, region)
-        Province = matches.group(1)
-        City = matches.group(2)
-        County = matches.group(3)
-        if Province is None:
-            for zxs in zxss:
-                if zxs in region:
-                    Province = zxs
-                    break
+        SubIndustry = None

-    except:
-        Province = None
-        City = None
-        County = None
+    # try:
+    #     region = result_dic['所属地区']
+    # except:
+    #     region = None
+    # try:
+    #     pattern = r'^(.*?省|.*?自治区)?(.*?市|.*?自治州)?(.*?区|.*?县|.*?自治县|.*?市辖区)?(.*?区|.*?县|.*?自治县|.*?市辖区)?$'
+    #     matches = re.match(pattern, region)
+    #     Province = matches.group(1)
+    #     City = matches.group(2)
+    #     County = matches.group(3)
+    #     if Province is None:
+    #         for zxs in zxss:
+    #             if zxs in region:
+    #                 Province = zxs
+    #                 break
+
+    # except:
+    #     Province = None
+    #     City = None
+    #     County = None

    try:
        BelongOrg = result_dic['登记机关']
@@ -285,11 +303,11 @@ def dic_handle(result_dic):
        'businessStartDate': TermStart,  # 营业期限自
        'businessEndDate': TeamEnd,  # 营业期限至
        'taxpayerQualification': TaxpayerType,  # 纳税人资质
-        'industry': None,  # 所属行业
-        'region': region,
-        'province': Province,  # 所属省
-        'city': City,  # 所属市
-        'county': County,  # 所属县
+        'industry': SubIndustry,  # 所属行业
+        'region': None,
+        'province': None,  # 所属省
+        'city': None,  # 所属市
+        'county': None,  # 所属县
        'registerDepartment': BelongOrg,  # 登记机关
        'scale': Info,  # 人员规模
        'insured': can_bao,  # 参保人数
@@ -326,7 +344,7 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
    log.info(f'----当前企业{social_code}-{com_name}--开始处理---')
    count = 0
    # 如果没有信用代码 就通过名字搜索 如果有信用代码 就通过信用代码
-    if social_code:
+    if social_code and 'ZZSN' not in social_code and 'ZD' not in social_code:
        soup = checklogin(social_code)
    else:
        soup = checklogin(com_name)
@@ -410,7 +428,6 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
            # print(td_count)
            td_list = tr.find_all('td')
            td_count = len(td_list)
-
            name_list = [td_list[i].text for i in range(td_count) if i % 2 == 0]
            # print(name_list)
            # value_list = [td_list[i].text for i in range(td_count) if i % 2 != 0]
@@ -428,7 +445,6 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
                    tag.deletep(value_tag, 'span', 'class', 'index_branch-report__Nyf_Y')
                    # for value_tag in value_tag_list:
                    value_list.append(value_tag.text.replace('\xa0', ''))
-
            # print(value_list)
            if len(name_list) == len(value_list):
                for i in range(len(name_list)):
@@ -439,10 +455,30 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
                    if name_list[i] == '法定代表人':
                        value_list[i] = value_list[i].split('任职')[0]
                        dic_buseniss[name_list[i]] = value_list[i]
-        del dic_buseniss['天眼评分']
+        try:
+            del dic_buseniss['天眼评分']
+        except:
+            pass
        # print(dic_buseniss)
        result_dict = getinfo(dic_buseniss, data_baseinfo)
-
+        # 主要针对香港台湾企业，社会信用代码传为给定的
+        try:
+            result_dict['统一社会信用代码']
+        except:
+            # log.info('未获取到统一社会信用代码')
+            if social_code:
+                result_dict['统一社会信用代码'] = social_code
+            else:
+                # 如果未给定社会信用代码，则返回
+                return False
+        if result_dict['企业名称'].startswith('(') and result_dict['企业名称'].endswith(')'):
+            result_dict['企业名称'] = result_dict['企业名称'][1:-1]
+        if result_dict['企业名称'] == '-' and com_name:
+            result_dict['企业名称'] = com_name
+        elif not com_name:
+            return False
+        else:
+            pass
        # print(result_dict)
        # 采集成功的企业
        data = [com_name, result_dict['企业名称'], social_code, result_dict['统一社会信用代码']]
@@ -460,9 +496,28 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
        aa_dic['listingType'] = listType
        # print(aa_dic)
        sendkafka(aa_dic)
+        # print(aa_dic)

    else:
        data_baseinfo = baseinfo(com_soup)
+        # 主要针对香港台湾企业，社会信用代码传为给定的
+        try:
+            data_baseinfo['统一社会信用代码']
+        except:
+            log.info('未获取到统一社会信用代码')
+            if social_code:
+                data_baseinfo['统一社会信用代码'] = social_code
+            else:
+                # 如果未给定社会信用代码，则返回
+                return False
+        if data_baseinfo['企业名称'].startswith('(') and data_baseinfo['企业名称'].endswith(')'):
+            data_baseinfo['企业名称'] = data_baseinfo['企业名称'][1:-1]
+        if data_baseinfo['企业名称'] == '-' and com_name:
+            data_baseinfo['企业名称'] = com_name
+        elif not com_name:
+            return False
+        else:
+            pass
        # 采集成功的企业
        data = [com_name, data_baseinfo['企业名称'], social_code, data_baseinfo['统一社会信用代码']]
        file.appenddata(file_name, '获取基本信息成功企业', data)
@@ -479,11 +534,18 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
        aa_dic['listingType'] = listType
        sendkafka(aa_dic)

+def remove_parentheses(text):
+    # Strip the full-width (Chinese) parentheses （ and ） from text.
+    text = re.sub(r'（|）', '', text)
+    # Strip the ASCII parentheses ( and ); the return below also drops ASCII spaces.
+    text = re.sub(r'\(|\)', '', text)
+    return text.replace(' ', '')
+
 # 判断名称是否统一
 def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
    company_url = ''
    try:
-        company_list = soup.find('div', class_='index_search-box__7YVh6')
+        company_list = soup.find_all('div', class_='index_search-box__7YVh6')
    except:
        log.info(f'====={social_code}=====获取基本信息失败，重新放入redis=====')
        baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
@@ -496,7 +558,6 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
        info_t = compamy.find('div', class_='index_name__qEdWi')
        getname = info_t.find('span').text
        log.info(f'接收到的企业名称--{receptname}---采到的企业名称--{getname}')
-
        if receptname and getname == receptname:
            company_url = info_t.find('a')['href']
            break
@@ -504,7 +565,13 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
            company_url = info_t.find('a')['href']
            break
        else:
-            continue
+            jian_name = remove_parentheses(baseCore.hant_2_hans(getname))
+            if remove_parentheses(receptname) == jian_name:
+                log.info(f'接收到的企业名称--{receptname}---转化成简体字的企业名称--{jian_name}')
+                company_url = info_t.find('a')['href']
+                break
+            else:
+                continue
    if company_url:
        # company_url = 'https://www.qcc.com/firm/80af5085726bb6b9c7770f1e4d0580f4.html'
        # company_url = 'https://www.qcc.com/firm/50f75e8a8859e609ec37976f8abe827d.html'
@@ -512,30 +579,33 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
        spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name)
    else:
        # 判断是否是曾用名
+        getname = ''
        for child in company_list[0].find_all():
            if child.has_attr('class'):
                print(child['class'])
-                if 'index_name' in child['class']:
+                if 'index_name' in child['class'][0]:
                    getname = child.text
                    company_url = child.find('a')['href']
                    break
-        else:
-            # 没有搜到相同的企业名称
-            data = [com_name, social_code]
-            file.appenddata(file_name, '需处理企业', data)
-            time.sleep(2)
-            return False
        # tr = company_list[:1][0]
        # info_t = tr.find('div', class_='index_name__qEdWi')
        # getname = info_t.find('span').text
-        log.info(f'------可能是曾用名------接收到的企业名称--{receptname}---采到的企业名称--{getname}')
-        beforename = ifbeforename(company_url)
-        if beforename == receptname:
-            spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name)
+        if getname:
+            log.info(f'------可能是曾用名------接收到的企业名称--{receptname}---采到的企业名称--{getname}')
+            beforename = ifbeforename(company_url)
+            if beforename == receptname:
+                spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType,
+                           ynDomestic, countryName, file_name)
+            else:
+                # 没有搜到相同的企业名称
+                data = [com_name, social_code]
+                file.appenddata(file_name, '需处理企业', data)
+                time.sleep(2)
+                return False
        else:
-            #没有搜到相同的企业名称
+            # 没有搜到相同的企业名称
            data = [com_name, social_code]
-            file.appenddata(file_name, '需处理企业',data)
+            file.appenddata(file_name, '需处理企业', data)
            time.sleep(2)
            return False
    return True
@@ -546,7 +616,7 @@ if __name__ == '__main__':
    # driver, id_cookie = login()
    while True:
        nowtime = baseCore.getNowTime(1).replace('-', '')[:8]
-        file_name = f'./国内企业基本信息采集情况.xlsx'
+        file_name = f'./data/国内企业基本信息采集情况.xlsx'
        file.createFile(file_name)
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
@@ -564,8 +634,9 @@ if __name__ == '__main__':
        s.cookies.update(cookies)
        start_time = time.time()
        # 获取企业信息
-        # company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
-        company_field = '91110000710925016E||'
+        company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
+        # company_field = '|北京华信瑞德信息技术有限公司|北京华信瑞德信息技术有限公司|||||||||||||1|中国内地|||||||'
+
        if company_field == 'end':
            # 本轮处理完毕，需要发送邮件，并且进入下一轮
            baseCore.sendEmail(file_name)
@@ -592,26 +663,26 @@ if __name__ == '__main__':
                    baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
                    continue
            continue
-
+        # company_field_ = f'|{company_field}'
        social_code = company_field.split('|')[0]
-        com_name = company_field.split('|')[1].replace(' ', '')
-
-        # ynDomestic = company_field.split('|')[15]
-        # countryName = company_field.split('|')[16]
-        # securitiesCode = company_field.split('|')[17]
-        # securitiesShortName = company_field.split('|')[18]
-        # listingDate = company_field.split('|')[21]
-        # category = company_field.split('|')[19]
-        # exchange = company_field.split('|')[20]
-        # listType = company_field.split('|')[21]
-        ynDomestic = None
-        countryName = None
-        securitiesCode = None
-        securitiesShortName = None
-        listingDate = None
-        category = None
-        exchange = None
-        listType = None
+        com_name = company_field.split('|')[2].replace(' ', '')
+
+        ynDomestic = company_field.split('|')[15]
+        countryName = company_field.split('|')[16]
+        securitiesCode = company_field.split('|')[17]
+        securitiesShortName = company_field.split('|')[18]
+        listingDate = company_field.split('|')[21]
+        category = company_field.split('|')[19]
+        exchange = company_field.split('|')[20]
+        listType = company_field.split('|')[21]
+        # ynDomestic = None
+        # countryName = None
+        # securitiesCode = None
+        # securitiesShortName = None
+        # listingDate = None
+        # category = None
+        # exchange = None
+        # listType = None

        count = redaytowork(com_name, social_code, securitiesCode, securitiesShortName, listingDate, category, exchange,
                            listType, ynDomestic, countryName, file_name)
@@ -622,5 +693,5 @@ if __name__ == '__main__':
        # 信息采集完成后将该企业的采集次数更新
        # runType = 'BaseInfoRunCount'
        # baseCore.updateRun(social_code, runType, count)
-        break
+        # break
    baseCore.close()
\ No newline at end of file