12/21

08e4725c · 薛凌堃 · 8f2915d4 · 08e4725c · 08e4725c · 08e4725c
--- a/REITs专题数据/BaseCore.py
+++ b/REITs专题数据/BaseCore.py
--- a/REITs专题数据/DisInfo-shanghai.py
+++ b/REITs专题数据/DisInfo-shanghai.py
--- a/REITs专题数据/FundAnncmnt-shenzhen.py
+++ b/REITs专题数据/FundAnncmnt-shenzhen.py
--- a/REITs专题数据/FundsList-shenzhen.py
+++ b/REITs专题数据/FundsList-shenzhen.py
--- a/REITs专题数据/LawRules_2_shenzhen.py
+++ b/REITs专题数据/LawRules_2_shenzhen.py
--- a/REITs专题数据/LawRules_shenzhen.py
+++ b/REITs专题数据/LawRules_shenzhen.py
--- a/REITs专题数据/MarketOverview-shenzhen.py
+++ b/REITs专题数据/MarketOverview-shenzhen.py
--- a/REITs专题数据/ProductQuotes-shanghai.py
+++ b/REITs专题数据/ProductQuotes-shanghai.py
--- a/REITs专题数据/ProjectDynamics-shanghai.py
+++ b/REITs专题数据/ProjectDynamics-shanghai.py
--- a/REITs专题数据/ProjectDynamics-shenzhen.py
+++ b/REITs专题数据/ProjectDynamics-shenzhen.py
--- a/REITs专题数据/REITsDailyFund-shanghai.py
+++ b/REITs专题数据/REITsDailyFund-shanghai.py
--- a/REITs专题数据/RuleGuide_shanghai.py
+++ b/REITs专题数据/RuleGuide_shanghai.py
--- a/REITs专题数据/RuleGuide_shenzhen.py
+++ b/REITs专题数据/RuleGuide_shenzhen.py
--- a/REITs专题数据/cushman.py
+++ b/REITs专题数据/cushman.py
--- a/REITs专题数据/info-shanghai.py
+++ b/REITs专题数据/info-shanghai.py
--- a/REITs专题数据/policy_beijing.py
+++ b/REITs专题数据/policy_beijing.py
--- a/REITs专题数据/policy_chongqing.py
+++ b/REITs专题数据/policy_chongqing.py
--- a/REITs专题数据/policy_fujian.py
+++ b/REITs专题数据/policy_fujian.py
--- a/REITs专题数据/policy_guangdong.py
+++ b/REITs专题数据/policy_guangdong.py
--- a/REITs专题数据/policy_guangxi.py
+++ b/REITs专题数据/policy_guangxi.py
--- a/REITs专题数据/policy_gwy.py
+++ b/REITs专题数据/policy_gwy.py
--- a/REITs专题数据/policy_hainan.py
+++ b/REITs专题数据/policy_hainan.py
--- a/REITs专题数据/policy_heilongjiang.py
+++ b/REITs专题数据/policy_heilongjiang.py
--- a/REITs专题数据/policy_hubei.py
+++ b/REITs专题数据/policy_hubei.py
--- a/REITs专题数据/policy_jiangsu.py
+++ b/REITs专题数据/policy_jiangsu.py
--- a/REITs专题数据/policy_jiangxi.py
+++ b/REITs专题数据/policy_jiangxi.py
--- a/REITs专题数据/policy_jilin.py
+++ b/REITs专题数据/policy_jilin.py
--- a/REITs专题数据/policy_liaoning.py
+++ b/REITs专题数据/policy_liaoning.py
--- a/REITs专题数据/policy_neimenggu.py
+++ b/REITs专题数据/policy_neimenggu.py
--- a/REITs专题数据/policy_shandong.py
+++ b/REITs专题数据/policy_shandong.py
--- a/REITs专题数据/policy_shanghai.py
+++ b/REITs专题数据/policy_shanghai.py
--- a/REITs专题数据/policy_shanxi.py
+++ b/REITs专题数据/policy_shanxi.py
--- a/REITs专题数据/policy_sichuan.py
+++ b/REITs专题数据/policy_sichuan.py
--- a/REITs专题数据/policy_tianjin.py
+++ b/REITs专题数据/policy_tianjin.py
--- a/REITs专题数据/policy_yunnan.py
+++ b/REITs专题数据/policy_yunnan.py
--- a/REITs专题数据/policy_zhejiang.py
+++ b/REITs专题数据/policy_zhejiang.py
--- a/REITs专题数据/reits.py
+++ b/REITs专题数据/reits.py
--- a/REITs专题数据/start.py
+++ b/REITs专题数据/start.py
-import reits
+import reits
 import reits
-import policy_beijing, policy_chongqing, policy_fujian, policy_guangdong
+import policy_chongqing, policy_fujian, policy_guangdong
 import policy_guangxi, policy_gwy, policy_hainan, policy_heilongjiang, policy_hubei, policy_jiangsu

 import policy_jiangxi, policy_jilin, policy_liaoning, policy_neimenggu, policy_shandong, policy_hubei
 import policy_shanxi, policy_sichuan, policy_tianjin, policy_yunnan, policy_zhejiang
 import RuleGuide_shanghai, RuleGuide_shenzhen
 import LawRules_shenzhen, LawRules_2_shenzhen
+from REITs_policyData.policy_beijing import beijing
+

 if __name__ == "__mian__":
-    policy_beijing.beijing()
+    beijing()
    reits.sse()
    reits.reform()
    reits.hebei()

--- a/REITs专题数据/国际市场/Singapore Exchange.py
+++ b/REITs专题数据/国际市场/Singapore Exchange.py
--- a/comData/BaseInfo_qcc/baseinfo1122.py
+++ b/comData/BaseInfo_qcc/baseinfo1122.py
 # -*- coding: utf-8 -*-
-
-"""
-模拟点击的方法不行，涉及到需要账号登录
-"""
 import json
 import re
 import time
@@ -296,7 +292,7 @@ def dic_handle(result_dic):
    return aa_dict

 # 采集准备
-def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name):
+def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):

    # if social_code:
    #     dic_info = baseCore.getInfomation(social_code)
@@ -342,7 +338,7 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
        else:
            # 开始采集
            try:
-                if spiderwork(soup, com_name, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name):
+                if spiderwork(soup, com_name, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
                    count += 1
                    log.info(f'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time, time.time())}')
                    token.updateTokeen(id_cookie,3)
@@ -377,7 +373,7 @@ def ifbeforename(company_url):
        return ''

 # 采集基本信息和工商信息
-def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name):
+def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
    qccid = company_url.split('firm/')[1].split('.html')[0]
    # 将采集到的企查查id更新
    updateSql = f"update EnterpriseInfo set QCCID = '{qccid}' where SocialCode = '{social_code}'"
@@ -467,7 +463,7 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
        aa_dic['listingDate'] = listingDate
        aa_dic['category'] = category
        aa_dic['exchange'] = exchange
-
+        aa_dic['listingType'] = listType
        # print(aa_dic)
        sendkafka(aa_dic)

@@ -486,11 +482,11 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
        aa_dic['listingDate'] = listingDate
        aa_dic['category'] = category
        aa_dic['exchange'] = exchange
-
+        aa_dic['listingType'] = listType
        sendkafka(aa_dic)

 # 判断名称是否统一
-def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name):
+def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
    company_url = ''
    try:
        company_list = soup.find('table', class_='app-ltable ntable ntable-list ntable ntable-list')
@@ -530,7 +526,7 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
        company_url = info_t.find('a')['href']
        beforename = ifbeforename(company_url)
        if beforename == receptname:
-            spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name)
+            spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange,listType, ynDomestic, countryName, file_name)
        else:
            #没有搜到相同的企业名称
            data = [com_name, social_code]
@@ -544,7 +540,7 @@ if __name__ == '__main__':

    while True:
        nowtime = baseCore.getNowTime(1).replace('-', '')[:8]
-        file_name = f'./data/国内企业基本信息采集情况_{nowtime}.xlsx'
+        file_name = f'./data/国内企业基本信息采集情况.xlsx'
        file.createFile(file_name)

        cookieinfo = token.getToken()
@@ -553,6 +549,7 @@ if __name__ == '__main__':
        else:
            log.info('==========已无cookies==========')
            time.sleep(30)
+
            continue
        id_cookie = cookieinfo[0]
        cookie_ = json.loads(cookieinfo[1])
@@ -599,6 +596,11 @@ if __name__ == '__main__':
            while flag:
                log.info('--------已没有数据---------')
                time.sleep(30)
+                if not baseCore.check_mysql_conn(cnx_):
+                    # 144数据库
+                    cnx_ = baseCore.cnx
+                    cursor_ = cnx_.cursor()
+                    log.info('===11数据库重新连接成功===')
                company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
                if company_field:
                    flag = False
@@ -608,7 +610,7 @@ if __name__ == '__main__':
            continue

        social_code = company_field.split('|')[0]
-        com_name = company_field.split('|')[2].replace(' ', '')
+        com_name = company_field.split('|')[1].replace(' ', '')

        ynDomestic = company_field.split('|')[15]
        countryName = company_field.split('|')[16]
@@ -617,6 +619,7 @@ if __name__ == '__main__':
        listingDate = company_field.split('|')[21]
        category = company_field.split('|')[19]
        exchange = company_field.split('|')[20]
+        listType = company_field.split('|')[21]
        # ynDomestic = ''
        # countryName = ''
        # securitiesCode = ''
@@ -625,8 +628,8 @@ if __name__ == '__main__':
        # category = ''
        # exchange = ''

-        count = redaytowork(com_name, social_code, securitiesCode, securitiesShortName, listingDate, category, exchange,ynDomestic, countryName, file_name)
-        time.sleep(40)
+        count = redaytowork(com_name, social_code, securitiesCode, securitiesShortName, listingDate, category, exchange,listType, ynDomestic, countryName, file_name)
+        time.sleep(2)
        # break
        # baseCore.r.close()
        # baseCore.sendEmail(file_name)

--- a/comData/bond_zjh/zhaiquan.py
+++ b/comData/bond_zjh/zhaiquan.py
--- a/comData/policylaw/gwyparts.py
+++ b/comData/policylaw/gwyparts.py
@@ -94,7 +94,7 @@ def get_content2():
                    child_type = content_dict['childtype']  # 主题分类
                except:
                    child_type = ''
-                # # 判断是否已经爬取过
+                # 判断是否已经爬取过
                is_href = baseTool.db_storage.find_one({'网址': href})
                if is_href:
                    num += 1
@@ -102,6 +102,7 @@ def get_content2():
                    time.sleep(1)
                    continue
                try:
+                    # href = 'https://www.gov.cn/zhengce/zhengceku/202312/content_6921452.htm'
                    resp = requests.get(url=href, headers=baseTool.headers, verify=False)
                    resp.encoding = resp.apparent_encoding
                    resp_text = resp.text
@@ -120,9 +121,7 @@ def get_content2():
                        except Exception as e:
                            log.info(f'---{href}--------{e}-------')
                            continue
-                        if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
-                                or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
-                                or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href or '.odf' in file_href:
+                        if '.ofd' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href or '.pdf' in file_href:
                            file_name = file.text.strip()
                            category = os.path.splitext(file_href)[1]
                            if category not in file_name:

--- a/comData/zhuanli/tyc_zhuanli.py
+++ b/comData/zhuanli/tyc_zhuanli.py
+import requests,time,re,random
+from base import BaseCore
+import pandas as pd
+from bs4 import BeautifulSoup as bs
+from comData.Tyc.getTycId import getTycIdByXYDM
+# Shared project helper; this script uses it for the database connection and
+# cursor, the logger, proxy lookup, the Redis queue helpers and task logging.
+baseCore = BaseCore.BaseCore()
+cnx = baseCore.cnx
+cursor = baseCore.cursor
+log = baseCore.getLogger()
+# Task label passed to baseCore.recordLog when processing a company fails.
+taskType = '天眼查专利/国内上市'
+
+
+def spider_zhuanli(com_name, social_code, tycid, page, list_all_info):
+    """Fetch one page of a company's patents from Tianyancha and store new rows.
+
+    Each patent on the page is looked up in ``zhuanli_sh_tyc`` by its
+    application number; rows that are not present yet are inserted and
+    committed one by one.
+
+    Args:
+        com_name: Company name stored with every patent row.
+        social_code: Unified social credit code stored with every row.
+        tycid: Tianyancha company id used in the request URL.
+        page: 1-based page number to request (100 items per page).
+        list_all_info: Unused; kept so existing callers keep working.
+
+    Returns:
+        ``page`` when the page contained at least one patent, ``0`` when the
+        page was empty (the caller uses ``0`` as its stop signal).
+
+    Raises:
+        Exception: the last request/JSON error when the proxied attempt and
+            all three direct retries fail. Previously this case surfaced as an
+            ``UnboundLocalError`` on ``res_j``.
+    """
+    start_time = time.time()
+    log.info(f'===正在处理第{page}页===')
+
+    t = int(time.time() * 1000)
+    # NOTE(review): X-AUTH-TOKEN / X-TYCID are hard-coded credentials; they
+    # should come from configuration rather than live in source control.
+    header = {
+        'Accept': 'application/json, text/plain, */*',
+        'Accept-Encoding': 'gzip, deflate, br',
+        'Accept-Language': 'zh-CN,zh;q=0.9',
+        'Connection': 'keep-alive',
+        'Content-Type': 'application/json',
+        'Host': 'capi.tianyancha.com',
+        'Origin': 'https://www.tianyancha.com',
+        'Referer': 'https://www.tianyancha.com/',
+        'Sec-Fetch-Dest': 'empty',
+        'Sec-Fetch-Mode': 'cors',
+        'Sec-Fetch-Site': 'same-site',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
+        'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxMzI3MzczNzEzMSIsImlhdCI6MTcwMzE1MjEzMSwiZXhwIjoxNzA1NzQ0MTMxfQ.3tF-UFhorC_mS4h2UIBOZamApfcaJEfjBbr8K11d2yHhELBM1pEvjd6yccxhLzVKRoyFdTn-1Cz6__ZpzgjnGg',
+        'X-TYCID': '6f6298905d3011ee96146793e725899d',
+        'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
+        'sec-ch-ua-mobile': '?0',
+        'sec-ch-ua-platform': '"Windows"',
+        'version': 'TYC-Web'
+    }
+    url = f'https://capi.tianyancha.com/cloud-intellectual-property/patent/patentListV6?_={t}&id={tycid}&pageSize=100&pageNum={page}&type=-100&lprs=-100&applyYear=-100&pubYear=-100&fullSearchText=&sortField=&sortType=-100'
+
+    # Proxy lookup gets one retry after a short pause.
+    try:
+        ip = baseCore.get_proxy()
+    except Exception:
+        time.sleep(2)
+        ip = baseCore.get_proxy()
+
+    # First attempt goes through the proxy; on failure fall back to up to
+    # three direct requests, stopping at the first one that succeeds.
+    try:
+        res_j = requests.get(url=url, headers=header, proxies=ip, verify=False).json()
+    except Exception as first_error:
+        last_error = first_error
+        for _ in range(3):
+            try:
+                res_j = requests.get(url=url, headers=header, verify=False).json()
+                break
+            except Exception as retry_error:
+                last_error = retry_error
+                time.sleep(2)
+        else:
+            # Every attempt failed: surface the real cause to the caller.
+            raise last_error
+
+    list_all = res_j['data']['items']
+    if not list_all:
+        return 0
+
+    for one_zhuanli in list_all:
+        # 'title' is the only mandatory field; all others default to ''.
+        title = one_zhuanli['title']
+        shenqingri = one_zhuanli.get('applicationTime', '')
+        shenqing_code = one_zhuanli.get('patentNum', '')
+        leixing = one_zhuanli.get('patentType', '')
+        status = one_zhuanli.get('lprs', '')
+        gongkairi = one_zhuanli.get('pubDate', '')
+        gongkai_code = one_zhuanli.get('pubnumber', '')
+        famingren = one_zhuanli.get('inventor', '')
+        shenqingren = one_zhuanli.get('applicantName', '')
+        gongneng = one_zhuanli.get('cat', '')
+        uuid = one_zhuanli.get('uuid', '')
+
+        # Key order must match the column order of the INSERT below,
+        # because the values are passed positionally.
+        dic_info = {
+            '企业名称': com_name,
+            '统一信用代码': social_code,
+            '专利名称': title,
+            '申请日': shenqingri,
+            '申请号': shenqing_code,
+            '专利类型': leixing,
+            '专利状态': status,
+            '公开日': gongkairi,
+            '公开号': gongkai_code,
+            '发明人': famingren,
+            '申请人': shenqingren,
+            '功能': gongneng,
+            '天眼查详情id': uuid,
+            # Guard against a null/empty application date from the API.
+            '年份': shenqingri[:4] if shenqingri else ''
+        }
+
+        # Parameterized query: the application number comes from scraped data.
+        selectSql = "select count(1) from zhuanli_sh_tyc where shenqing_code=%s"
+        cursor.execute(selectSql, (shenqing_code,))
+        count = cursor.fetchone()[0]
+        if count > 0:
+            log.info(f"{com_name}-------{shenqing_code}---已经存在")
+            continue
+
+        values_tuple = tuple(dic_info.values())
+        insertSql = "insert into zhuanli_sh_tyc(com_name,social_code,title,shenqingri,shenqing_code,leixing,status,gongkairi,gongkai_code,famingren,shenqingren,gongneng,uuid,year) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
+        cursor.execute(insertSql, values_tuple)
+        cnx.commit()
+        log.info(f"{com_name}-------{shenqing_code}---新增")
+        # Pause after each insert (already-existing rows skip the pause).
+        time.sleep(2)
+
+    log.info(f"【{page}】-----------end,耗时{baseCore.getTimeCost(start_time, time.time())}")
+    return page
+
+if __name__ == "__main__":
+    # Redis list this worker consumes. Codes that fail are pushed back onto
+    # the SAME list so they are retried by this task. The TYC-id failure
+    # paths previously re-queued onto 'NewsEnterprise:gnqy_socialCode',
+    # which dropped the code from the patent queue.
+    redis_key = 'ZhuanLi:gnshSocial_code'
+    while True:
+        list_all_info = []
+        # Pull the next social credit code; stop when the queue is empty.
+        social_code = baseCore.redicPullData(redis_key)
+        if social_code is None:
+            break
+        start = time.time()
+        try:
+            data = baseCore.getInfomation(social_code)
+            if len(data) == 0:
+                # No base info for this code yet: push it back and move on.
+                baseCore.rePutIntoR(redis_key, social_code)
+                continue
+            company_id = data[0]
+            com_name = data[1]
+            xydm = data[2]
+            tycid = data[11]
+            if tycid is None or tycid == '':
+                # No Tianyancha id stored: resolve it from the credit code.
+                try:
+                    retData = getTycIdByXYDM(xydm)
+                    if retData['tycData'] and retData['reput']:
+                        tycid = retData['tycData']['id']
+                        # Persist the resolved id (parameterized, the value
+                        # comes from an external service).
+                        updateSql = "update EnterpriseInfo set TYCID = %s where SocialCode = %s"
+                        cursor.execute(updateSql, (tycid, xydm))
+                        cnx.commit()
+                    elif not retData['tycData'] and retData['reput']:
+                        state = 0
+                        takeTime = baseCore.getTimeCost(start, time.time())
+                        baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
+                        log.info(f'======={social_code}====重新放入redis====')
+                        baseCore.rePutIntoR(redis_key, social_code)
+                        continue
+                    elif not retData['reput'] and not retData['tycData']:
+                        continue
+                except Exception:
+                    state = 0
+                    takeTime = baseCore.getTimeCost(start, time.time())
+                    baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
+                    baseCore.rePutIntoR(redis_key, social_code)
+                    continue
+            log.info(f"{company_id}---{xydm}----{tycid}----开始处理")
+            # Walk the pages until spider_zhuanli reports an empty page (0).
+            page = 1
+            while True:
+                page = spider_zhuanli(com_name, xydm, tycid, page, list_all_info)
+                if page != 0:
+                    page += 1
+                else:
+                    log.info(f"{company_id}---{xydm}----{tycid}----结束处理")
+                    break
+        except Exception as e:
+            log.info(f'==={social_code}=====获取企业信息失败==={e}=')
+            # Push the code back so it is retried, record the failure, then
+            # back off briefly before the next item.
+            baseCore.rePutIntoR(redis_key, social_code)
+            state = 0
+            takeTime = baseCore.getTimeCost(start, time.time())
+            baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
+            time.sleep(5)