Commit dd9d719d by 薛凌堃

10/8

Parent 222110f7
@@ -421,6 +421,7 @@ def NQEnterprise():
     nq_social_list = [item[0] for item in nq_result]
     for item in nq_social_list:
+        # NEEQ companies: financial data, listing info, and key personnel are already collected; company news and announcements are not yet collected; the announcement script is ready, but company news must be pushed into Redis daily
        # r.rpush('NQEnterprise:nq_Ipo', item)
        r.rpush('NQEnterprise:nq_finance',item)
        # r.rpush('NQEnterprise:nq_notice',item)
@@ -451,11 +452,26 @@ def omeng():

 #单项冠军 (single-item champion enterprises)
 def danxiangguanjun():
-    pass
+    cnx, cursor = connectSql()
+    query = "SELECT CompanyName FROM champion"
+    cursor.execute(query)
+    result = cursor.fetchall()
+    cnx.commit()
+    com_namelist = [item[0] for item in result]
+    for item in com_namelist:
+        r.rpush('champion:baseinfo',item)

 #科改示范 (sci-tech reform demonstration enterprises)
 def kegaishifan():
-    pass
+    cnx, cursor = connectSql()
+    query = "SELECT CompanyName FROM technological"
+    cursor.execute(query)
+    result = cursor.fetchall()
+    cnx.commit()
+    com_namelist = [item[0] for item in result]
+    for item in com_namelist:
+        r.rpush('technological:baseinfo',item)

 #双百企业 ("Double Hundred" enterprises)
 def shuangbaiqiye():
@@ -467,6 +483,8 @@ def zhuangjingtexind():

 if __name__ == "__main__":
     start = time.time()
+    # danxiangguanjun()
+    kegaishifan()
     # NoticeEnterprise()
     # AnnualEnterpriseIPO()
     # AnnualEnterprise()
@@ -477,7 +495,7 @@ if __name__ == "__main__":
     # FBS()
     # MengZhi()
     # NQEnterprise()
-    SEC_CIK()
+    # SEC_CIK()
     # omeng()
     # AnnualEnterpriseUS()
     # NoticeEnterprise_task()
"""
解析json数据 两个链接:
https://data.sec.gov/api/xbrl/companyfacts/CIK0000320193.json 数据值和gaap字段
https://www.sec.gov/Archives/edgar/data/320193/000032019322000108/MetaLinks.json html字段和gaap字段映射
step1:拼接链接
step2:
"""
import json
import time
import requests
from kafka import KafkaProducer
from operator import itemgetter
from itertools import groupby
from base.BaseCore import BaseCore
# import urllib3
# urllib3.disable_warnings()
baseCore = BaseCore()
log = baseCore.getLogger()
cnx = baseCore.cnx
cursor = baseCore.cursor
def fromcikgetinfo(cik):
    # parameterized query, so the cik value is not interpolated into the SQL
    query = "select * from mgzqyjwyh_list where cik=%s"
    cursor.execute(query, (cik,))
    data = cursor.fetchone()
    return data
def getRequest(url):
headers = {
'Host': 'data.sec.gov',
'Connection': 'keep-alive',
'Cache-Control': 'max-age=0',
'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-User': '?1',
'Sec-Fetch-Dest': 'document',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cookie': '_ga=GA1.2.784424676.1695174651; _4c_=%7B%22_4c_s_%22%3A%22lZFLT4QwFIX%2FyqRrILS0pbAzmBgXajQ%2BlhNpLwOZcUoKDo4T%2Fru3gMbHym5ov55zcjk9kaGGPcmpzARNuVRcxElAtnDsSH4irjH%2BcyA50awsDTUq1ElShZwZCMuKmbASSQUUKsYoIwF5w6w0ZpmIpeBKqTEgul0yTkRbA5hFs4iqKA6rDh39OxKuYty2zppX3a%2F7Y%2BtlA5SrzmzxwsCh0bAeGtPX3s8m%2BUJraDZ1jzhlE22dl0QC90OzN3b47Vvol0%2BkFGnp7NCB9xa1sy%2BwolQitlgEeZocfloHFTg3yfDUNb0ftAMdbexhAVjezMKZPTaemtV9cYf8%2Bhu5LW6uFtT6jv0YO6ufdz4UnyUgF2frh8tz%2F2%2BKc8ZlKqPPpxKUjHPfCJiksRAZldhnvyO5kjz2a5yTp%2FrpTzVXWfZXPbcQ%2Bulh%2Fx%2FrOH4A%22%7D; _ga_300V1CHKH1=GS1.1.1695174651.1.1.1695174684.0.0.0; ak_bmsc=91C6D28D093861656DB8C1FC1972DAB6~000000000000000000000000000000~YAAQlQ8kF2U6orCKAQAAgyl9uxX8kNk3C77pkMi6N6RxnsUqDbYEmIcNjtLSa8W6kfGL9cQMRHBUaYcbEA1+oXsvUwUF80G8hmH/F4S0ZOEnVCrlcBLx219N24l2qmoSKtVDH+VKe7c1bji9MHc7tO2R56R7juZJv9gceAdtKEuArkPfD8ijx/TyEgIrM+XruGtzCRmLnfq86UoJYP+j+tXcaWkc/qm1zHDReDNf/cHd6h2aRMs4lsES8+uh6YTjE7bfCp8h2DNJ2e07pm0ojcI/kdycUPHmuTqWPdTBEjUybad31E1hRNBAE8PbGjy2lvlPY/piuN3HX3Q5ifsmTqCNJzynN2kjGm6i4SHhmEAijUeIzNQXB11GrVmALJVV6pEjd/uu; bm_sv=FD8981426EA388050697DFB615BAFFE3~YAAQ1wcsF5K72ZSKAQAAsvl/uxUw0do3nknGCkllXH27UZBpM7kQUXm4crBNTAkhek5YSDKIrrm2uFWidfpBfyxbRSr+w7FH7Y0w4cXMAa7BELzcc/B9Uf8T6e2I2W29wjurKkBFtSseslHSqYD3BWx9/GidJMW+dFNrlzNUMd1dONUR9J1TDnYifPhE6A/zSLPHVrCTJl7xzg7VlW/05Ay0i+Bo7TynZdWgotfjET3vg2/ZVixVSGaWeQo4~1'
}
    response = None
    for m in range(0, 3):
        try:
            response = requests.get(url=url, headers=headers, verify=False)
            break
        except Exception as e:
            log.error(f"request exception-------{e}")
            continue
    # check the response status code
    if response is not None and response.status_code == 200:
        jsonData = response.json()
        return jsonData
    else:
        return False
if __name__ == '__main__':
    taskType = '财务数据/SEC'
    # zcfzb = balance sheet, lrb = income statement, xjllb = cash-flow statement
    zcfzb_mapping = {
        'AccountsAndOtherReceivablesNetCurrent': '指标1'
    }
    lrb_mapping = {
    }
    xjllb_mapping = {
    }
    while True:
        start_time = time.time()
        # TODO: pull the company CIK from Redis
        # cik = baseCore.redicPullData('sec_cik_US:uscik')
        cik = '320193'
        # look up the company info in the database by CIK
        data = fromcikgetinfo(cik)
        com_name = data[2]
        com_code = data[3]
        exchange = data[4]
        # the CIK in the URL must be zero-padded to 10 digits,
        # e.g. '320193' -> 'CIK0000320193'
        url_cik = cik.zfill(10)
        url = f'https://data.sec.gov/api/xbrl/companyfacts/CIK{url_cik}.json'
        jsonData = getRequest(url)
        if not jsonData:
            continue
        print(jsonData)
        try:
            us_gaap = jsonData['facts']['us-gaap']
        except:
            continue
        # iterate over the mapping keys
        Listzcfzb = []
        for key in zcfzb_mapping.keys():
            # all years and amounts for one financial metric
            usd_list = us_gaap[key]['units']['USD']
            # keep only annual-report records: form 10-K, fp FY
            for j in usd_list:
                form = j['form']
                fp = j['fp']
                if form != '10-K' or fp != 'FY':
                    continue
                date = j['end']
                # keep only quarter-end reporting dates
                if not date.endswith(('03-31', '06-30', '09-30', '12-31')):
                    continue
                val = j['val']
                zcfzb_dic = {
                    'zbname': key,
                    'riqi': date,
                    'jine': val,
                    'fp': fp,
                    'form': form
                }
                # all balance-sheet metrics across years
                Listzcfzb.append(zcfzb_dic)
        # itertools.groupby only groups adjacent items, so sort by date first
        Listzcfzb.sort(key=itemgetter('riqi'))
        groups = groupby(Listzcfzb, key=itemgetter('riqi'))
        # iterate over each group and print the grouped result
        for riqi, group in groups:
            print(f"riqi: {riqi}")
            # materialize the group iterator into a list
            listbydate = [item for item in group]
            print()
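One possible completion of "step 2" from the module docstring, sketched here because the script currently just prints each group; the rowfromgroup helper and the flat row shape are assumptions, not part of the original:

def rowfromgroup(riqi, listbydate):
    # collapse one date group into a flat row: {metric name: amount}
    row = {'riqi': riqi}
    for item in listbydate:
        row[item['zbname']] = item['jine']
    return row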
"""从html页面中抽取表格"""
import requests
from bs4 import BeautifulSoup
from base.BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
def getRequest(url):
headers = {
'Referer': 'https://www.sec.gov/ix?doc=/Archives/edgar/data/356037/000035603723000038/cspi-20230630x10q.htm',
'Sec-Ch-Ua': '"Microsoft Edge";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
'Sec-Ch-Ua-Mobile': '?0',
'Sec-Ch-Ua-Platform': '"Windows"',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.31',
}
    response = None
    for m in range(0, 3):
        try:
            response = requests.get(url=url, headers=headers, verify=False)
            break
        except Exception as e:
            log.error(f"request exception-------{e}")
            continue
    # check the response status code
    if response is not None and response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup
    else:
        return False
def getzcfztable(soup):
    table_list = soup.find_all('table')
    for table in table_list:
        # locate the balance-sheet table by its 'Current assets:' label
        aa = table.find_all(text='Current assets:')
        if aa:
            # print(table)
            trlist = table.find_all('tr')
            date1 = trlist[1].find_all('td')[1].text.replace('\n', '')
            date2 = trlist[1].find_all('td')[-1].text.replace('\n', '')
            print(date1, date2)
            # TODO: drop the <td> cells with empty content
            for tr in trlist[2:]:
                # calling a Tag is shorthand for find_all: strip the '$' cells first
                filtered_tags = tr(lambda tag: tag.name == 'td' and '$' in tag.text)
                for tag in filtered_tags:
                    tag.extract()
                # filtered_tags2 = tr(lambda tag: tag.name == 'td' and tag.text == ' ')
                filtered_tags2 = tr(lambda tag: tag.name == 'td' and tag.text == '')
                for tag in filtered_tags2:
                    tag.extract()
                try:
                    zbtag = tr.find_all('td')[0].text.replace('\n', '')
                except:
                    zbtag = ''
                try:
                    cash1 = tr.find_all('td')[1].text.replace('\n', '')
                except:
                    cash1 = ''
                try:
                    cash2 = tr.find_all('td')[2].text.replace('\n', '')
                except:
                    cash2 = ''
                if zbtag != '' and cash1 != '' and cash2 != '':
                    print(f'field: {zbtag}  value 1: {cash1}  value 2: {cash2}')
if __name__ == '__main__':
    url = 'https://www.sec.gov/Archives/edgar/data/320193/000032019321000105/aapl-20210925.htm'
    soup = getRequest(url)
    # parse the balance-sheet table from the HTML
    getzcfztable(soup)
# -*- coding: utf-8 -*-
import time
from urllib.parse import quote
import requests
import urllib3
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# headers = {
# 'Host': 'xcx.qcc.com',
# 'Connection': 'keep-alive',
# 'Qcc-Platform': 'mp-weixin',
# 'Qcc-Timestamp': '',
# 'Qcc-Version': '1.0.0',
# 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat',
# 'content-type': 'application/json',
# 'Referer': 'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html',
# 'Accept-Encoding': 'gzip, deflate, br,'
# }
headers = {
'Host': 'xcx.qcc.com',
'Connection': 'keep-alive',
'x-request-device-type': 'Android',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF XWEB/8391',
'Content-Type': 'application/json',
'Qcc-Version': '1.0.0',
'authMini': 'Bearer f51dae1a2fcb109fa9ec58bd4a85e5c5',
'xweb_xhr': '1',
'xcx-version': '2023.09.27',
'Qcc-Platform': 'mp-weixin',
'Qcc-CurrentPage': '/company-subpackages/business/index',
'Qcc-Timestamp': '1696661787803',
'Qcc-RefPage': '/company-subpackages/detail/index',
'Accept': '*/*',
'Sec-Fetch-Site': 'cross-site',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Dest': 'empty',
'Referer': 'https://servicewechat.com/wx395200814fcd7599/307/page-frame.html',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh'
}
# get the Qichacha (qcc.com) company id by company name or credit code
def find_id_by_name(start, token, name):
    urllib3.disable_warnings()
    qcc_key = name
    t = str(int(time.time()) * 1000)
    headers['Qcc-Timestamp'] = t
    url = f"https://xcx.qcc.com/mp-weixin/forwardApp/v3/base/advancedSearch?token={token}&t={t}&pageIndex=1&needGroup=yes&insuredCntStart=&insuredCntEnd=&startDateBegin=&startDateEnd=&registCapiBegin=&registCapiEnd=&countyCode=&province=&sortField=&isSortAsc=&searchKey={quote(qcc_key)}&searchIndex=default&industryV3="
    resp_dict = None
    for lll in range(1, 6):
        try:
            resp_dict = requests.get(url=url, headers=headers, verify=False).json()
            break
        except Exception as e:
            print(f'{e}-------------retrying')
            time.sleep(5)
            continue
    if resp_dict is None:
        return False
    time.sleep(2)
    # possible failures: {'status': 40101, 'message': '无效的sessionToken!'}
    #                    {'status': 401, 'message': '您的账号访问超频,请升级小程序版本'}
    if resp_dict['status'] == 40101:
        KeyNo = False
        log.info(f'====token expired====time {baseCore.getTimeCost(start, time.time())}')
        return KeyNo
    if resp_dict['status'] == 401:
        KeyNo = False
        log.info(f'=======account rate-limited, please upgrade the mini-program version=====time {baseCore.getTimeCost(start, time.time())}')
        return KeyNo
    try:
        if resp_dict['result']['Result']:
            result_dict = resp_dict['result']['Result'][0]
            KeyNo = result_dict['KeyNo']
            Name = result_dict['Name'].replace('<em>', '').replace('</em>', '').strip()
            if Name == '':
                KeyNo = 'null'
        else:
            KeyNo = 'null'
    except:
        KeyNo = False
        log.info(f'====token expired====time {baseCore.getTimeCost(start, time.time())}')
        return KeyNo
    log.info("{}, company KeyNo: {}".format(qcc_key, KeyNo))
    return KeyNo
\ No newline at end of file
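A minimal usage sketch (hedged: the token value and company name below are placeholders, and the start argument is simply the timestamp that getTimeCost logs against):

if __name__ == '__main__':
    token = '<mini-program session token>'  # placeholder, obtained from the QCC mini-program login
    KeyNo = find_id_by_name(time.time(), token, '小米科技有限责任公司')
    print(KeyNo)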
 import json
@@ -5,7 +5,9 @@ import requests
 from bs4 import BeautifulSoup
 from kafka import KafkaProducer
 from base import BaseCore
+from obs import ObsClient
+import fitz
+from urllib.parse import unquote

 baseCore = BaseCore.BaseCore()
 log = baseCore.getLogger()
@@ -16,7 +18,57 @@ cnx_ = baseCore.cnx_
 cursor_ = baseCore.cursor_

 taskType = '企业公告/证监会'
+obsClient = ObsClient(
+    access_key_id='VEHN7D0TJ9316H8AHCAV',  # your Huawei Cloud AK
+    secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY',  # your Huawei Cloud SK
+    server='https://obs.cn-north-1.myhuaweicloud.com'  # your bucket's endpoint
+)
+
+def uptoOBS(pdf_url, pdf_name, type_id, social_code):
+    headers = {}
+    retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': 'group1', 'path': '',
+               'full_path': '',
+               'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
+               'create_time': '', 'page_size': '', 'content': ''}
+    headers['User-Agent'] = baseCore.getRandomUserAgent()
+    resp_content = None
+    for i in range(0, 3):
+        try:
+            resp_content = requests.get(pdf_url, headers=headers, verify=False, timeout=20).content
+            break
+        except:
+            time.sleep(3)
+            continue
+    if resp_content is None:
+        # download failed after three attempts
+        return retData
+    page_size = 0
+    for i in range(0, 3):
+        try:
+            name = pdf_name + '.pdf'
+            result = obsClient.putContent('zzsn', 'ZJH/' + name, content=resp_content)
+            with fitz.open(stream=resp_content, filetype='pdf') as doc:
+                page_size = doc.page_count
+                for page in doc.pages():
+                    retData['content'] += page.get_text()
+            break
+        except:
+            time.sleep(3)
+            continue
+    if page_size < 1:
+        # PDF parsing failed
+        # print(f'======PDF parsing failed=====')
+        return retData
+    else:
+        try:
+            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+            retData['state'] = True
+            retData['path'] = result['body']['objectUrl'].split('/ZJH')[0]
+            retData['full_path'] = unquote(result['body']['objectUrl'])
+            retData['file_size'] = result['Uploaded size']
+            retData['create_time'] = time_now
+            retData['page_size'] = page_size
+        except:
+            return retData
+    return retData
 def secrchATT(item_id, name, type_id):
     sel_sql = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s '''
@@ -164,16 +216,20 @@ def getUrl(code, url_parms, Catagory2_parms):
     return dic_parms
-def InsterInto(short_name, social_code, pdf_url):
-    inster = False
+def ifInstert(short_name, social_code, pdf_url):
+    ifexist = True
     sel_sql = '''select social_credit_code,source_address from brpa_source_article where social_credit_code = %s and source_address = %s and origin='证监会' and type='1' '''
     cursor.execute(sel_sql, (social_code, pdf_url))
     selects = cursor.fetchone()
+    # skip if the record already exists in the database
     if selects:
-        print(f'com_name:{short_name}, {pdf_url} already exists')
-        return inster
+        ifexist = False
+        log.info(f'com_name:{short_name}, {pdf_url} already exists')
+        return ifexist
+    else:
+        return ifexist
+
+def InsterInto(short_name, social_code, pdf_url):
     # insert the record into the database
     try:
         insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,create_time) values(%s,%s,%s,%s,now())'''
@@ -197,8 +253,8 @@ def InsterInto(short_name, social_code, pdf_url):
 def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time, com_name, num):
-    # upload to the file server
-    retData = baseCore.upLoadToServe(pdf_url, 8, social_code)
+    # upload to Huawei Cloud OBS
+    retData = uptoOBS(pdf_url, pdf_name, 8, social_code)
     # insert the attachment into the att table
     if retData['state']:
         pass
@@ -323,10 +379,10 @@ def SpiderByZJH(url, payload, dic_info, start_time, num):  # dic_info: the database record
             year = pub_time[:4]
             report_type = td_list[4].text.strip()
-            # insert the record into the database
-            insert = InsterInto(short_name, social_code, name_pdf)
-            if insert:
+            # check whether this item already exists in the database
+            ifexist = ifInstert(short_name, social_code, pdf_url)
+            # if it does not exist, ifexist is True
+            if ifexist:
                 # # announcement list
                 # okCount = okCount + 1
                 # parse the PDF: get the link, download it, then record parse success/failure and transfer success/failure
import glob

import pandas as pd

# find all Excel research-report files under D:\机械项目研报
# (the old comment said ".txt files", but the glob pattern matches .xlsx)
excel_files = glob.glob(r"D:\机械项目研报\机械项目研报*.xlsx", recursive=True)
# create an empty DataFrame to hold the merged data
merged_data = pd.DataFrame()
# read each Excel file and append it to the merged DataFrame
for file in excel_files:
    data = pd.read_excel(file, dtype=str)
    # drop the last column
    # data = data.iloc[:, :-1]
    # DataFrame.append was removed in pandas 2.0; use pd.concat instead
    merged_data = pd.concat([merged_data, data], ignore_index=True)
sorted_df = merged_data.sort_values('industry')
grouped = merged_data.groupby('industry')
# write the merged data out, one sheet per industry group
# merged_data.to_csv(r"D:\hg\tmp\11.csv", encoding='gbk', index=False, quoting=1, quotechar='"', escapechar='\\')
# merged_data.to_excel(r"D:\机械项目研报\机械项目研报汇总.xlsx", index=False, engine='openpyxl')
with pd.ExcelWriter(r'D:\机械项目研报\机械项目研报汇总2.xlsx') as writer:
    for group_name, group_df in grouped:
        group_df.to_excel(writer, sheet_name=group_name, index=False)
\ No newline at end of file