Merge remote-tracking branch 'origin/master'

63cac106 · LiuLiYuan · 87ecb399 · 93e304a2 · 63cac106 · 63cac106
--- a/base/BaseCore.py
+++ b/base/BaseCore.py
@@ -5,22 +5,18 @@ import socket
 import sys
 import time
-import fitz
 import logbook
 import logbook.more
 import pandas as pd
 import requests
 import zhconv
-import pymysql
 import redis
 from selenium import webdriver
 from selenium.webdriver.chrome.service import Service
-from openpyxl import Workbook
 import langid
 #创建连接池
 import pymysql
-from pymysql import connections
 from DBUtils.PooledDB import PooledDB
 # import sys
 # sys.path.append('D://zzsn_spider//base//fdfs_client')
@@ -28,6 +24,15 @@ from DBUtils.PooledDB import PooledDB
 from fdfs_client.client import get_tracker_conf, Fdfs_client
 tracker_conf = get_tracker_conf('D:\\kkwork\\zzsn_spider\\base\\client.conf')
 client = Fdfs_client(tracker_conf)
+from obs import ObsClient
+import fitz
+from urllib.parse import unquote
+obsClient = ObsClient(
+        access_key_id='VEHN7D0TJ9316H8AHCAV',  # 你的华为云的ak码
+        secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY',  # 你的华为云的sk
+        server='https://obs.cn-north-1.myhuaweicloud.com'  # 你的桶的地址
+    )
 # 注意 程序退出前 调用BaseCore.close() 关闭相关资源
 class BaseCore:
@@ -659,12 +664,10 @@ class BaseCore:
            create_time = retData['create_time']
            order_by = num
            selects = self.secrchATT(item_id,year,type_id)
-            # sel_sql = '''select id,item_id from clb_sys_attachment where item_id = %s and year = %s and type_id=%s '''
-            # self.cursor.execute(sel_sql, (item_id, year,type_id))
-            # selects = self.cursor.fetchone()
            if selects:
-                self.getLogger().info(f'com_name:{com_name}已存在')
+                self.getLogger().info(f'com_name:{com_name}--{year}已存在')
-                id = selects[0]
+                id = ''
                return id
            else:
                Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
@@ -695,6 +698,80 @@ class BaseCore:
            log = self.getLogger()
            log.info('======保存企业CIK失败=====')
+    #上传至obs华为云服务器，并解析pdf的内容和页数
+    # 将文件字节数转换为带单位的可读大小
+    def convert_size(self,size_bytes):
+        # 定义不同单位的转换值
+        units = ['bytes', 'KB', 'MB', 'GB', 'TB']
+        i = 0
+        while size_bytes >= 1024 and i < len(units) - 1:
+            size_bytes /= 1024
+            i += 1
+        return f"{size_bytes:.2f} {units[i]}"
+    def obsexist(self,file_path):
+        # # 文件路径
+        # file_path = 'XQWAnnualReport/2023-10/浙江国祥股份有限公司首次公开发行股票并在主板上市暂缓发行公告.doc'
+        # 检查文件是否存在
+        response = obsClient.getObjectMetadata('zzsn', file_path)
+        if response.status >= 300:
+            self.getLogger().info('=====文件不存在obs=====')
+        else:
+            self.getLogger().info(f'=====文件存在obs========{file_path}')
+    def uptoOBS(self,pdf_url, name_pdf,type_id, social_code,pathType,taskType,start_time):
+        headers = {}
+        retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': 'group1', 'path': '',
+                   'full_path': '',
+                   'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
+                   'create_time': '', 'page_size': '', 'content': ''}
+        headers['User-Agent'] = self.getRandomUserAgent()
+        for i in range(0, 3):
+            try:
+                response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
+                file_size = int(response.headers.get('Content-Length'))
+                break
+            except:
+                time.sleep(3)
+                continue
+        page_size = 0
+        for i in range(0, 3):
+            try:
+                name = name_pdf + '.pdf'
+                now_time = time.strftime("%Y-%m")
+                result = obsClient.putContent('zzsn', f'{pathType}{now_time}/' + name, content=response.content)
+                with fitz.open(stream=response.content, filetype='pdf') as doc:
+                    page_size = doc.page_count
+                    for page in doc.pages():
+                        retData['content'] += page.get_text()
+                break
+            except:
+                time.sleep(3)
+                continue
+        if page_size < 1:
+            # pdf解析失败
+            # print(f'======pdf解析失败=====')
+            return retData
+        else:
+            try:
+                time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+                retData['state'] = True
+                retData['path'] = result['body']['objectUrl'].split('.com')[1]
+                retData['full_path'] = unquote(result['body']['objectUrl'])
+                retData['file_size'] = self.convert_size(file_size)
+                retData['create_time'] = time_now
+                retData['page_size'] = page_size
+            except Exception as e:
+                state = 0
+                takeTime = self.getTimeCost(start_time, time.time())
+                self.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
+                return retData
+            return retData

--- a/base/RedisPPData.py
+++ b/base/RedisPPData.py
@@ -475,7 +475,14 @@ def kegaishifan():
 #双百企业
 def shuangbaiqiye():
-    pass
+    cnx, cursor = connectSql()
+    query = "SELECT CompanyName FROM Hundred"
+    cursor.execute(query)
+    result = cursor.fetchall()
+    cnx.commit()
+    com_namelist = [item[0] for item in result]
+    for item in com_namelist:
+        r.rpush('hundred:baseinfo', item)
 #专精特新
 def zhuangjingtexind():
@@ -484,7 +491,8 @@ def zhuangjingtexind():
 if __name__ == "__main__":
    start = time.time()
    # danxiangguanjun()
-    kegaishifan()
+    # kegaishifan()
+    shuangbaiqiye()
    # NoticeEnterprise()
    # AnnualEnterpriseIPO()
    # AnnualEnterprise()

--- a/comData/Tyc/tyc_qydt_add.py
+++ b/comData/Tyc/tyc_qydt_add.py
 import json
+import random
 import requests, time, pymysql
 import jieba
 import sys
@@ -45,24 +47,21 @@ def beinWork(tyc_code, social_code,start_time):
    retData = {'total': 0, 'up_okCount': 0, 'up_errorCount': 0, 'up_repetCount': 0}
    t = time.time()
    url = f'https://capi.tianyancha.com/cloud-yq-news/company/detail/publicmsg/news/webSimple?_={t}&id={tyc_code}&ps={pageSize}&pn=1&emotion=-100&event=-100'
-    for m in range(0, 3):
    try:
+        for m in range(0, 3):
            ip = baseCore.get_proxy()
            headers['User-Agent'] = baseCore.getRandomUserAgent()
            response = requests.get(url=url, headers=headers, proxies=ip, verify=False)
-            # time.sleep(random.randint(3, 5))
+            time.sleep(random.randint(3, 5))
            break
-        except Exception as e:
-            pass
        if (response.status_code == 200):
            pass
-    else:
+    except Exception as e:
        log.error(f"{tyc_code}-----获取总数接口失败")
-        e = '获取总数接口失败'
+        error = '获取总数接口失败'
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
-        baseCore.recordLog(social_code, taskType, state, takeTime, url, e)
+        baseCore.recordLog(social_code, taskType, state, takeTime, url, f'{error}----{e}')
        return retData
    try:
        json_1 = json.loads(response.content.decode('utf-8'))
@@ -177,7 +176,7 @@ def beinWork(tyc_code, social_code,start_time):
                    pass
                continue
            try:
-                insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,create_time) values(%s,%s,%s,%s,now())'''
+                insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,publish_time,create_time) values(%s,%s,%s,%s,%s,now())'''
                # 动态信息列表
                up_okCount = up_okCount + 1
                list_info = [
@@ -185,6 +184,7 @@ def beinWork(tyc_code, social_code,start_time):
                    link,
                    '天眼查',
                    '2',
+                    time_format
                ]
                cursor_.execute(insert_sql, tuple(list_info))
                cnx_.commit()
@@ -214,10 +214,10 @@ def beinWork(tyc_code, social_code,start_time):
                }
            except Exception as e:
                log.info(f'传输失败:{social_code}----{link}')
-                e = '数据库传输失败'
+                error = '数据库传输失败'
                state = 0
                takeTime = baseCore.getTimeCost(start_time, time.time())
-                baseCore.recordLog(social_code, taskType, state, takeTime, link, e)
+                baseCore.recordLog(social_code, taskType, state, takeTime, link, f'{error}----{e}')
                continue
                # print(dic_news)
                # 将相应字段通过kafka传输保存

--- a/comData/annualReport/证监会-年报.py
+++ b/comData/annualReport/证监会-年报.py
 import json
@@ -21,6 +21,7 @@ tracker_conf = get_tracker_conf('./client.conf')
 client = Fdfs_client(tracker_conf)
 taskType = '企业年报/证监会'
+pathType = 'ZJHAnnualReport/'
 def RequestUrl(url, payload, item_id, start_time):
    # ip = get_proxy()[random.randint(0, 3)]
@@ -43,26 +44,26 @@ def RequestUrl(url, payload, item_id, start_time):
    return soup
-def tableUpdate(year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status,
+# def tableUpdate(year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status,
-                create_by, create_time, page_size):
+#                 create_by, create_time, page_size):
+#
-    sel_sql = '''select item_id from clb_sys_attachment where item_id = %s and year = %s and type_id=1'''
+#     sel_sql = '''select item_id from clb_sys_attachment where item_id = %s and year = %s and type_id=1'''
-    cursor_.execute(sel_sql, (item_id, year))
+#     cursor_.execute(sel_sql, (item_id, year))
-    selects = cursor_.fetchone()
+#     selects = cursor_.fetchone()
-    if selects:
+#     if selects:
-        print(f'{name_pdf},{year}已存在')
+#         print(f'{name_pdf},{year}已存在')
+#
-    else:
+#     else:
-        Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
+#         Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
+#
-        values = (
+#         values = (
-            year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status,
+#             year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status,
-            create_by,
+#             create_by,
-            create_time, page_size)
+#             create_time, page_size)
+#
-        cursor_.execute(Upsql, values)  # 插入
+#         cursor_.execute(Upsql, values)  # 插入
-        cnx.commit()  # 提交
+#         cnx.commit()  # 提交
-        print("更新完成:{}".format(Upsql))
+#         print("更新完成:{}".format(Upsql))
 # 采集信息
 def SpiderByZJH(url, payload, dic_info, num, start_time):
@@ -121,19 +122,24 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
                cursor_.execute(sel_sql, (item_id, year))
                selects = cursor_.fetchone()
                if selects:
-                    print(f'com_name:{short_name}、{year}已存在')
+                    log.info(f'com_name:{short_name}、{year}已存在')
                    continue
                else:
-                    retData = baseCore.upLoadToServe(pdf_url, 1, social_code)
+                    retData = baseCore.uptoOBS(pdf_url,name_pdf, 1, social_code,pathType,taskType,start_time)
+                    if retData['state']:
+                        pass
+                    else:
+                        log.info(f'====pdf解析失败====')
+                        return False
                    #插入数据库获取att_id
                    num = num + 1
                    att_id = baseCore.tableUpdate(retData, short_name, year, name_pdf, num)
-                    content = retData['content']
+                    if att_id:
-                    if retData['state']:
                        pass
                    else:
-                        log.info(f'====pdf解析失败====')
                        return False
+                    content = retData['content']
                    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    dic_news = {
                        'attachmentIds': att_id,
@@ -169,7 +175,7 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
                            'message': '操作成功',
                            'code': '200',
                        }
-                        print(dic_result)
+                        log.info(dic_result)
                        return True
                    except Exception as e:
                        dic_result = {
@@ -181,7 +187,7 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
                        state = 0
                        takeTime = baseCore.getTimeCost(start_time, time.time())
                        baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, 'Kafka操作失败')
-                        print(dic_result)
+                        log.info(dic_result)
                        return False
            else:
                    continue
@@ -311,7 +317,8 @@ if __name__ == '__main__':
            time.sleep(20)
            continue
        dic_info = baseCore.getInfomation(social_code)
-        count = dic_info[15]
+        count = dic_info[16]
+        log.info(f'====正在采集{social_code}=====')
        # 沪市http://eid.csrc.gov.cn/101111/index.html 深市http://eid.csrc.gov.cn/101811/index.html 北交所http://eid.csrc.gov.cn/102611/index.html
        # url 变量 翻页 栏目 http://eid.csrc.gov.cn/101811/index_3_f.html
        url_parms = ['101111', '101811', '102611']
@@ -322,7 +329,7 @@ if __name__ == '__main__':
        dic_parms = getUrl(code, url_parms, Catagory2_parms)
        SpiderByZJH(dic_parms["url"], dic_parms["payload"], dic_info, num, start_time)
        end_time = time.time()
-        print(f'{dic_info[4]} ---- 该企业耗时 ---- {end_time - start_time}')
+        log.info(f'{dic_info[4]} ---- 该企业耗时 ---- {end_time - start_time}')
        count += 1
        runType = 'AnnualReportCount'
        baseCore.updateRun(social_code, runType, count)

--- a/comData/annualReport/雪球网-年报.py
+++ b/comData/annualReport/雪球网-年报.py
 # -*- coding: utf-8 -*-
@@ -152,24 +152,23 @@ def spider_annual_report(dict_info,num):
            cursor.execute(sel_sql, (social_code, int(year)))
            selects = cursor.fetchone()
            if selects:
-                print(f'com_name:{com_name}、{year}已存在')
+                log.info(f'com_name:{com_name}、{year}已存在')
                continue
            else:
-                page_size = 0
+                #上传文件至obs服务器
-                #上传文件至文件服务器
+                retData = baseCore.uptoOBS(pdf_url,name_pdf,1,social_code,pathType,taskType,start_time)
-                retData = baseCore.upLoadToServe(pdf_url,1,social_code)
-                num = num + 1
-                try:
-                    att_id = baseCore.tableUpdate(retData,com_name,year,name_pdf,num)
-                    content = retData['content']
                if retData['state']:
                    pass
                else:
                    log.info(f'====pdf解析失败====')
                    return False
+                num = num + 1
+                try:
+                    att_id = baseCore.tableUpdate(retData,com_name,year,name_pdf,num)
+                    content = retData['content']
                    state = 1
                    takeTime = baseCore.getTimeCost(start_time, time.time())
-                    baseCore.recordLog(social_code, taskType, state, takeTime, year_url, '')
+                    baseCore.recordLog(social_code, taskType, state, takeTime, year_url, '成功')
                except:
                    exception = '数据库传输失败'
                    state = 0
@@ -236,6 +235,7 @@ def spider_annual_report(dict_info,num):
 if __name__ == '__main__':
    num = 0
    taskType = '企业年报/雪球网'
+    pathType = 'XQWAnnualReport/'
    while True:
        start_time = time.time()
        # 获取企业信息

--- a/comData/caiwushuju/RedisPPData.py
+++ b/comData/caiwushuju/RedisPPData.py
@@ -14,6 +14,12 @@ def conn11():
    cursor = conn.cursor()
    return conn,cursor
+def conn144():
+    conn = pymysql.Connect(host='114.115.159.144', port=3306, user='caiji', passwd='zzsn9988', db='caiji',
+                           charset='utf8')
+    cursor = conn.cursor()
+    return conn,cursor
 #企业公告
 def shizhiCodeFromSql():
    conn,cursor=conn11()
@@ -31,6 +37,7 @@ def shizhiCodeFromSql():
    finally:
        cursor.close()
        conn.close()
 #企业公告
 def yahooCodeFromSql():
    conn,cursor=conn11()
@@ -49,6 +56,25 @@ def yahooCodeFromSql():
        cursor.close()
        conn.close()
+#新浪纽交所股票对应的代码
+def sinausstockCodeFromSql():
+    conn,cursor=conn144()
+    try:
+        gn_query = "select ticker from mgzqyjwyh_list where state=2 and exchange='NYSE'; "
+        cursor.execute(gn_query)
+        gn_result = cursor.fetchall()
+        gn_social_list = [item[0] for item in gn_result]
+        print('sinausstockCodeFromSql开始将股票代码放入redis=======')
+        for item in gn_social_list:
+            r.rpush('sina_usstock:securities_code', item)
+        print('sinausstockCodeFromSql将股票代码放入redis结束')
+    except Exception as e:
+        log.info("数据查询异常")
+    finally:
+        cursor.close()
+        conn.close()
 def yahooCode_task():
    # 实例化一个调度器
    scheduler = BlockingScheduler()
@@ -58,9 +84,12 @@ def yahooCode_task():
    scheduler.add_job(yahooCodeFromSql, 'cron', day='*/3', hour=0, minute=0)
    # 每天执行一次
    scheduler.add_job(shizhiCodeFromSql, 'cron', hour=10,minute=0)
+    # 每3天执行一次
+    scheduler.add_job(sinausstockCodeFromSql, 'cron', day='*/3', hour=0, minute=0)
    try:
-        yahooCodeFromSql()  # 定时开始前执行一次
+        # yahooCodeFromSql()  # 定时开始前执行一次
-        shizhiCodeFromSql()  # 定时开始前执行一次
+        # shizhiCodeFromSql()  # 定时开始前执行一次
+        sinausstockCodeFromSql()  # 定时开始前执行一次
        scheduler.start()
    except Exception as e:
        print('定时采集异常', e)

--- a/comData/caiwushuju/YAHOO财务数据4.py
+++ b/comData/caiwushuju/YAHOO财务数据4.py
 # -*- coding: utf-8 -*-
@@ -373,6 +373,28 @@ class YahooCaiwu(object):
            currency=''
        return currency
+    #对比指标计算
+    def calculateIndexReq(self):
+        get_url = 'http://114.115.236.206:8088/sync/calculateIndex'
+        try:
+            params={
+                'type':2
+            }
+            resp = requests.get(get_url,params=params)
+            print(resp.text)
+            text=json.loads(resp.text)
+            codee=text['code']
+            while codee==-200:
+                time.sleep(600)
+                resp = requests.get(get_url)
+                print(resp.text)
+                text=json.loads(resp.text)
+                codee=text['code']
+                if  codee==-200:
+                    break
+            print('调用接口成功！！')
+        except:
+            print('调用失败！')
 if __name__ == '__main__':
    # parse_excel()
    #get_content1()
@@ -383,8 +405,11 @@ if __name__ == '__main__':
            securitiescode=yahoo.getCodeFromRedis()
            yahoo.get_content2(securitiescode)
        except Exception as e:
+            print('没有数据暂停5分钟')
+            yahoo.calculateIndexReq()
            if securitiescode:
                yahoo.r.rpush('NoticeEnterprise:securities_code',securitiescode)
            else:
                time.sleep(300)
+                print('没有数据暂停5分钟')
--- a/comData/caiwushuju/sina_usstock财务.py
+++ b/comData/caiwushuju/sina_usstock财务.py
 import configparser
@@ -20,6 +20,7 @@ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 from operator import itemgetter
 from itertools import groupby
 import datetime
+from decimal import Decimal
 class SinaUsstock(object):
@@ -54,13 +55,19 @@ class SinaUsstock(object):
            seriesValue=tddoc.find('td').text().split(' ')
            for i in range(0,len(pdate)):
                value=seriesValue[i]
+                try:
                    if '亿' in value:
-                    value = value.replace("亿", "*100000000")
+                        value = value.replace("亿", "").replace(",", "")
-                    value = eval(value)
+                        value = Decimal(value) * Decimal('100000000')
+                        # value = eval(value)
                    elif '万' in value:
-                    value = value.replace("万", "*10000")
+                        value = value.replace("万", "").replace(",", "")
-                    value = eval(value)
+                        value = Decimal(value) * Decimal('10000')
-                vvla=str(value)
+                        # value = eval(value)
+                except Exception as e:
+                    print(e)
+                    print(value)
+                vvla=str(value).replace(",", "")
                serisemsg={
                    'name':seriesName,
                    'value':vvla,
@@ -71,6 +78,31 @@ class SinaUsstock(object):
        return seriesList
+    # 判断股票代码是否存在
+    def check_code(self,com_code):
+        r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn',db=3)
+        res = r.exists('com_sinacaiwushuju_code::'+com_code)
+        #如果key存在 则不是第一次采集该企业， res = 1
+        if res:
+            return False  #表示不是第一次采集
+        else:
+            return True #表示是第一次采集
+    def check_date(self,com_code,info_date):
+        r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=3)
+        res = r.sismember('com_sinacaiwushuju_code::'+com_code, info_date)  # 注意是 保存set的方式
+        if res:
+            return True
+        else:
+            return False
+    # 将采集后的股票代码对应的报告期保存进redis
+    def add_date(self,com_code,date_list):
+        r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn',db=3)
+        #遍历date_list 放入redis
+        for date in date_list:
+            res = r.sadd('com_sinacaiwushuju_code::'+com_code,date)
    def getCodeFromRedis(self):
        securitiescode=self.r.lpop('sina_usstock:securities_code')
        securitiescode = securitiescode.decode('utf-8')
@@ -209,7 +241,7 @@ class SinaUsstock(object):
                    #转换数据格式发送接口
                annualzb=zbl1+zbl3+zbl5
-                annualzb=self.groupZbData(annualzb,stock,social_credit_code,'annual')
+                annualzb=self.groupZbData(annualzb,stock,social_credit_code,'year')
                self.sendToFinance(annualzb)
                quarterzb=zbl2+zbl4+zbl6
                quarterzb=self.groupZbData(quarterzb,stock,social_credit_code,'quarter')
@@ -228,15 +260,26 @@ class SinaUsstock(object):
    def sendToFinance(self,zbmsg):
            for zbb in zbmsg:
+                com_code=zbb['securitiesCode']
+                com_date=zbb['date']
+                #判断股票代码是否采集过
+                if self.check_code(com_code):
+                    zbb['ynFirst']=True
                if len(zbb) != 0:
                    # 调凯歌接口存储数据
                    data = json.dumps(zbb)
                    #暂无接口
-                    url_baocun = ''
+                    url_baocun = 'http://114.115.236.206:8088/sync/finance/sina'
                    # url_baocun = 'http://114.115.236.206:8088/sync/finance/df'
                    for nnn in range(0, 3):
                        try:
                            res_baocun = requests.post(url_baocun, data=data)
+                            #将采集到的股票代码和日期进行记录用来标记是否采集过
+                            com_date_list=[]
+                            com_date_list.append(com_date)
+                            self.add_date(com_code,com_date)
                            self.logger.info(res_baocun.text)
                            break
                        except:
@@ -309,7 +352,7 @@ class SinaUsstock(object):
 if __name__ == '__main__':
    sinaUsstock=SinaUsstock()
    # securitiescode= sinaUsstock.r.lpop('sina_usstock:securities_code')
-    securitiescode= sinaUsstock.getCodeFromRedis()
+    # securitiescode= sinaUsstock.getCodeFromRedis()
    securitiescode='AAPL'
    try:
        sinaUsstock.get_content2(securitiescode)

--- a/comData/newlist/champion/BaseCore.py
+++ b/comData/newlist/champion/BaseCore.py
@@ -541,7 +541,10 @@ class BaseCore:
        self.cursor.execute(query)
        token_list = self.cursor.fetchall()
        self.cnx.commit()
+        try:
            token = token_list[random.randint(0, len(token_list)-1)][0]
+        except:
+            token = ''
        return token
    # 删除失效的token

--- a/comData/newlist/hundred/BaseCore.py
+++ b/comData/newlist/hundred/BaseCore.py
+# 核心工具包
+import os
+import random
+import socket
+import sys
+import time
+import fitz
+import logbook
+import logbook.more
+import pandas as pd
+import requests
+import zhconv
+import pymysql
+import redis
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
+from openpyxl import Workbook
+import langid
+#创建连接池
+import pymysql
+from pymysql import connections
+from DBUtils.PooledDB import PooledDB
+# import sys
+# sys.path.append('D://zzsn_spider//base//fdfs_client')
+from fdfs_client.client import get_tracker_conf, Fdfs_client
+tracker_conf = get_tracker_conf('/base/client.conf')
+client = Fdfs_client(tracker_conf)
+# 注意 程序退出前 调用BaseCore.close() 关闭相关资源
+class BaseCore:
+    # Serial-number counter incremented by getNextSeq / getNextXydm
+    __seq = 0
+    # Proxy-pool database connection (currently disabled)
+    # __cnx_proxy =None
+    # __cursor_proxy = None
+    # Connection / cursor pairs and the Redis client; all are assigned in __init__
+    cnx = None
+    cursor = None
+    cnx_ = None
+    cursor_ = None
+    r = None
+    # agent 池
+    __USER_AGENT_LIST = [
+        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.29 Safari/525.13',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/531.4 (KHTML, like Gecko) Chrome/3.0.194.0 Safari/531.4',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.11 Safari/534.16',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.50 Safari/525.19',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.7 Safari/532.0',
+        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; Lunascape 5.0 alpha2)',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.7 Safari/532.2',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; ru-RU) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.11 Safari/534.16',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.10 Safari/532.0',
+        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; Maxthon;',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.169.0 Safari/530.1',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP; rv:1.7) Gecko/20040614 Firefox/0.9',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.810.0 Safari/535.1',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.0 Safari/532.0',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.6 (KHTML, like Gecko) Chrome/7.0.500.0 Safari/534.6',
+        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; TencentTraveler)',
+        'Mozilla/5.0 (Windows NT 6.0; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.4 (KHTML, like Gecko) Chrome/6.0.481.0 Safari/534.4',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.370.0 Safari/533.4',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.7.5) Gecko/20041107 Firefox/1.0',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.4.154.31 Safari/525.19',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.1.17) Gecko/20110123 (like Firefox/3.x) SeaMonkey/2.0.12',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB) AppleWebKit/534.1 (KHTML, like Gecko) Chrome/6.0.428.0 Safari/534.1',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; de-DE) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.540.0 Safari/534.10',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; de-DE) Chrome/4.0.223.3 Safari/532.2',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/12.0.702.0 Safari/534.24',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.42 Safari/525.19',
+        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.3 (KHTML, like Gecko) Chrome/4.0.227.0 Safari/532.3',
+        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.8 (KHTML, like Gecko) Chrome/16.0.912.63 Safari/535.8',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.460.0 Safari/534.3',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.463.0 Safari/534.3',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.9 (KHTML, like Gecko) Chrome/2.0.157.0 Safari/528.9',
+        'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.794.0 Safari/535.1',
+        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.694.0 Safari/534.24',
+        'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5',
+        'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20120427 Firefox/15.0a1',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.7.5) Gecko/20041107 Firefox/1.0',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6',
+        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Maxthon; .NET CLR 1.1.4322)',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.4 Safari/532.2',
+        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.65 Safari/535.11',
+        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.21 (KHTML, like Gecko) Chrome/11.0.682.0 Safari/534.21',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.0 (KHTML, like Gecko) Chrome/2.0.182.0 Safari/531.0',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.9 (KHTML, like Gecko) Chrome/7.0.531.0 Safari/534.9',
+        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; WOW64; Trident/6.0)',
+        'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.811.0 Safari/535.1',
+        'ozilla/5.0 (Windows; U; Windows NT 5.0; de-DE; rv:1.7.5) Gecko/20041108 Firefox/1.0',
+        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
+        'Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.127 Safari/533.4',
+        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E) QQBrowser/6.9.11079.201',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10',
+        'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2',
+        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; zh-cn) Opera 8.50',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/7.0.0 Safari/700.13',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.4 Safari/532.0',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.53 Safari/525.19',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.1 Safari/532.0',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.5) Gecko/20041107 Firefox/0.9.2 StumbleUpon/1.994',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.7.5) Gecko/20041110 Firefox/1.0',
+        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36',
+        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; en) Opera 8.0',
+        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b4pre) Gecko/20100815 Minefield/4.0b4pre',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.6 Safari/530.5',
+        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.0.3705)',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.21 Safari/532.0',
+        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.792.0 Safari/535.1',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.168.0 Safari/530.1',
+        'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3) Gecko/20040913 Firefox/0.10',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.8 (KHTML, like Gecko) Chrome/2.0.177.1 Safari/530.8',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8',
+        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.40 Safari/530.5',
+        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.24 Safari/532.0',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.10 (KHTML, like Gecko) Chrome/2.0.157.2 Safari/528.10',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.2 Safari/532.2',
+        'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.75 Safari/535.7',
+        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; T312461)',
+        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.461.0 Safari/534.3',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.0; rv:1.7.3) Gecko/20041001 Firefox/0.10.1',
+        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)',
+        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.2; de-DE) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.2 Safari/532.0',
+        'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0) Gecko/16.0 Firefox/16.0',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/531.3 (KHTML, like Gecko) Chrome/3.0.193.2 Safari/531.3',
+        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; .NET CLR 1',
+        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
+        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.864.0 Safari/535.2',
+        'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.813.0 Safari/535.1',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0',
+        'Mozilla/5.0 (Windows NT 5.1; rv:2.1.1) Gecko/20110415 Firefox/4.0.2pre Fennec/4.0.1',
+        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.801.0 Safari/535.1',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.212.0 Safari/532.0',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7',
+        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.697.0 Safari/534.24',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.548.0 Safari/534.10',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.17 (KHTML, like Gecko) Chrome/11.0.652.0 Safari/534.17',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.224 Safari/534.10 ChromePlus/1.5.2.0',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.0 Safari/532.1',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.7 Safari/532.0',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.2 (KHTML, like Gecko) Chrome/5.0.342.2 Safari/533.2',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.4 Safari/532.1',
+        'Mozilla/5.0 (Windows NT 6.0; rv:2.1.1) Gecko/20110415 Firefox/4.0.2pre Fennec/4.0.1',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.2.153.0 Safari/525.19',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; sv-SE; rv:1.7.5) Gecko/20041108 Firefox/1.0',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.462.0 Safari/534.3',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; de-DE; rv:1.7.5) Gecko/20041122 Firefox/1.0',
+        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; uZardWeb/1.0; Server_JP)',
+        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; HCI0449; .NET CLR 1.0.3705)',
+        'Mozilla/4.0 (compatible; MSIE 5.0; Windows 98; DigExt); Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1);',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.23 Safari/530.5',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.208.0 Safari/532.0',
+        'Mozilla/5.0 (Windows NT 6.0; rv:14.0) Gecko/20100101 Firefox/14.0.1',
+        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
+        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.7 (KHTML, like Gecko) Chrome/2.0.176.0 Safari/530.7',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.21 (KHTML, like Gecko) Chrome/11.0.678.0 Safari/534.21',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.21 Safari/532.0',
+        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
+        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; InfoPath.1',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.55 Safari/525.19',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0a1) Gecko/20110623 Firefox/7.0a1 Fennec/7.0a1',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.724.100 Safari/534.30',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.33 Safari/534.3 SE 2.X MetaSr 1.0',
+        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; WOW64; SV1; uZardWeb/1.0; Server_HK)',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1',
+        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
+        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)',
+        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
+        'Mozilla/5.0 (Windows NT 6.0) yi; AppleWebKit/345667.12221 (KHTML, like Gecko) Chrome/23.0.1271.26 Safari/453667.1221',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.2 (KHTML, like Gecko) Chrome/3.0.191.3 Safari/531.2',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.39 Safari/530.5',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.1 Safari/532.0',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.38 Safari/532.0',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.27 Safari/532.0',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8b) Gecko/20050118 Firefox/1.0+',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP; rv:1.7) Gecko/20040707 Firefox/0.9.2',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.0 Safari/532.0',
+        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.4 (KHTML, like Gecko) Chrome/2.0.171.0 Safari/530.4',
+        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648)',
+        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; nl-NL; rv:1.7.5) Gecko/20041202 Firefox/1.0',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.204.0 Safari/532.0',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.6 Safari/532.2',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/528.8 (KHTML, like Gecko) Chrome/1.0.156.0 Safari/528.8',
+        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/6.0)',
+        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 1.0.3705; .NET CLR 2.0.50727; .NET CLR 1.1.4322)',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.517.43 Safari/534.7',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.15 Safari/534.13',
+        'Mozilla/5.0 (ipad Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.6 (KHTML, like Gecko) Chrome/7.0.498.0 Safari/534.6',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.43 Safari/530.5',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.208.0 Safari/532.0',
+        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.19 (KHTML, like Gecko) Chrome/11.0.661.0 Safari/534.19',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-CA) AppleWebKit/534.13 (KHTML like Gecko) Chrome/9.0.597.98 Safari/534.13',
+        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.2 Safari/532.0',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.201.1 Safari/532.0',
+        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.201.1 Safari/532.0',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.213.1 Safari/532.1',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.6 (KHTML, like Gecko) Chrome/2.0.174.0 Safari/530.6',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.3.154.6 Safari/525.19',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.599.0 Safari/534.13',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.8 (KHTML, like Gecko) Chrome/7.0.521.0 Safari/534.8',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1b2pre) Gecko/20081015 Fennec/1.0a1',
+        'Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'
+    ]
+    #Android agent池
+    __USER_PHONE_AGENT_LIST = ['Mozilla/5.0 (Linux; Android 7.1.1; OPPO R9sk) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.111 Mobile Safari/537.36']
+    def __init__(self):
+        """Open two MySQL connections, a Redis client and a pooled MySQL connection set.
+
+        NOTE(review): hosts, user names and passwords are hard-coded below;
+        consider loading them from configuration or the environment instead.
+        """
+        # self.__cnx_proxy = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='clb_project',
+        #                                    charset='utf8mb4')
+        # self.__cursor_proxy = self.__cnx_proxy.cursor()
+        # Direct connection to the `caiji` database
+        self.cnx = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='caiji',
+                                   charset='utf8mb4')
+        self.cursor = self.cnx.cursor()
+        # Direct connection to the `clb_project` database on the 114.116.44.11 host
+        self.cnx_ = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project',
+                               charset='utf8mb4')
+        self.cursor_ = self.cnx_.cursor()
+        # Connect to Redis
+        self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
+        # Connection pool for the `caiji` database (used by getInfomation / updateRun / recordLog)
+        self.pool_caiji = PooledDB(
+            creator=pymysql,
+            maxconnections=5,
+            mincached=2,
+            maxcached=5,
+            blocking=True,
+            host='114.115.159.144',
+            port=3306,
+            user='caiji',
+            password='zzsn9988',
+            database='caiji',
+            charset='utf8mb4'
+        )
+    def close(self):
+        """Release the database and Redis resources opened by ``__init__``.
+
+        Each resource is closed independently and on a best-effort basis, so
+        one failing ``close()`` does not prevent the remaining resources from
+        being released.  Never raises.
+        """
+        # Cursors first, then their connections, then the pool and Redis.
+        for attr in ('cursor', 'cnx', 'cursor_', 'cnx_', 'pool_caiji', 'r'):
+            resource = getattr(self, attr, None)
+            if resource is None:
+                continue
+            try:
+                resource.close()
+            except Exception:
+                # Best effort: the resource may already be closed or broken.
+                pass
+    # 计算耗时
+    def getTimeCost(self,start, end):
+        """Format the elapsed time between two timestamps (in seconds) as text.
+
+        The largest non-zero unit decides the layout: hours, minutes, seconds,
+        and finally milliseconds when less than one whole second elapsed.
+        """
+        elapsed = end - start
+        whole_seconds = int(elapsed)
+        hours, remainder = divmod(whole_seconds, 3600)
+        minutes, secs = divmod(remainder, 60)
+        if hours > 0:
+            return "%d小时%d分钟%d秒" % (hours, minutes, secs)
+        if minutes > 0:
+            return "%d分钟%d秒" % (minutes, secs)
+        if whole_seconds > 0:
+            return "%d秒" % (secs)
+        # Below one second: report the duration in milliseconds.
+        return "%d毫秒" % (int(elapsed * 1000))
+    # Format the current time
+    # 1 : 2001-01-01 12:00:00 %Y-%m-%d %H:%M:%S
+    # 2 : 010101120000 %y%m%d%H%M%S
+    # 3 : 1690179526555  Unix timestamp in milliseconds
+    def getNowTime(self, type):
+        """Return the current local time in the representation selected by ``type``.
+
+        1 -> ``'%Y-%m-%d %H:%M:%S'`` string, 2 -> ``'%y%m%d%H%M%S'`` string,
+        3 -> integer Unix timestamp in milliseconds, anything else -> ``""``.
+        """
+        if type == 1:
+            return time.strftime("%Y-%m-%d %H:%M:%S")
+        if type == 2:
+            return time.strftime("%y%m%d%H%M%S")
+        if type == 3:
+            return int(time.time() * 1000)
+        return ""
+    # 获取流水号
+    def getNextSeq(self):
+        """Return the next serial number: ``yymmddHHMMSS`` plus a 3-digit counter.
+
+        The counter is kept in ``__seq`` and wraps back to 0 after 999 so the
+        result always has a fixed width of 15 characters.
+        """
+        self.__seq += 1
+        # Wrap after 999: the counter is zero-padded to three digits, so
+        # letting it reach 1000 would produce a four-digit suffix.
+        if self.__seq > 999:
+            self.__seq = 0
+        return self.getNowTime(2) + str(self.__seq).zfill(3)
+    # 获取信用代码
+    def getNextXydm(self):
+        """Return a generated code: ``ZZSN`` + ``yymmddHHMMSS`` + a 3-digit counter.
+
+        The counter is kept in ``__seq`` and wraps back to 0 after 999 so the
+        result always has a fixed width of 19 characters.
+        """
+        self.__seq += 1
+        # Wrap after 999: the counter is zero-padded to three digits, so
+        # letting it reach 1000 would produce a four-digit suffix.
+        if self.__seq > 999:
+            self.__seq = 0
+        return "ZZSN" + self.getNowTime(2) + str(self.__seq).zfill(3)
+    # 日志格式
+    def logFormate(self,record, handler):
+        """Render a log record as ``[time] [level] [file] [function] [line] message``.
+
+        ``handler`` is accepted because the formatter is called with it, but
+        it is not used.
+        """
+        bracketed = (
+            record.time,  # log time
+            record.level_name,  # log level
+            os.path.basename(record.filename),  # file name without its directory
+            record.func_name,  # function name
+            record.lineno,  # line number
+        )
+        prefix = " ".join(f"[{field}]" for field in bracketed)
+        return f"{prefix} {record.message}"
+    # 获取logger
+    def getLogger(self,fileLogFlag=True, stdOutFlag=True):
+        """Build a logbook logger named after the running script.
+
+        Log files are written to a ``logs`` directory next to the script
+        (``sys.argv[0]``), with ``.py`` removed from the script name and
+        ``.log`` appended.
+
+        :param fileLogFlag: attach a daily-rotating file handler when true.
+        :param stdOutFlag: attach a colorized stderr handler when true.
+        :return: the configured ``logbook.Logger``.
+
+        NOTE(review): every call builds new handlers; callers that log often
+        may want to keep the returned logger instead of calling this again.
+        """
+        dirname, filename = os.path.split(os.path.abspath(sys.argv[0]))
+        dirname = os.path.join(dirname, "logs")
+        filename = filename.replace(".py", "") + ".log"
+        if not os.path.exists(dirname):
+            # exist_ok closes the race where another process creates the
+            # directory between the existence check and the creation call.
+            os.makedirs(dirname, exist_ok=True)
+        logbook.set_datetime_format('local')
+        logger = logbook.Logger(filename)
+        logger.handlers = []
+        if fileLogFlag:  # write log records to a file
+            logFile = logbook.TimedRotatingFileHandler(os.path.join(dirname, filename), date_format='%Y-%m-%d',
+                                                       bubble=True, encoding='utf-8')
+            logFile.formatter = self.logFormate
+            logger.handlers.append(logFile)
+        if stdOutFlag:  # print log records to the screen
+            logStd = logbook.more.ColorizedStderrHandler(bubble=True)
+            logStd.formatter = self.logFormate
+            logger.handlers.append(logStd)
+        return logger
+    # 获取随机的userAgent
+    def getRandomUserAgent(self):
+        """Return one User-Agent string picked at random from the class-level pool."""
+        agent_pool = self.__USER_AGENT_LIST
+        return random.choice(agent_pool)
+    # 获取代理
+    def get_proxy(self):
+        """Pick one proxy at random from the ``clb_proxy`` table.
+
+        Each row's ``proxy`` value is split on ``-``; the first two parts are
+        used as host and port.  Rows without both parts are skipped.
+
+        :return: ``{"HTTP": url, "HTTPS": url}`` with ``url`` of the form
+            ``http://host:port``.
+        :raises IndexError: when the table yields no usable proxy.
+        """
+        sql = "select proxy from clb_proxy"
+        self.cursor.execute(sql)
+        proxy_list = []
+        for row in self.cursor.fetchall():
+            # Read the column value directly instead of parsing the row's repr.
+            parts = str(row[0]).split('-')
+            if len(parts) < 2:
+                continue
+            proxyMeta = "http://%(host)s:%(port)s" % {
+                "host": parts[0],
+                "port": parts[1],
+            }
+            # NOTE(review): requests appears to match proxy keys by lower-case
+            # scheme ("http"/"https"); the upper-case keys are kept so existing
+            # callers see the same dict -- confirm they are actually honoured.
+            proxy_list.append({
+                "HTTP": proxyMeta,
+                "HTTPS": proxyMeta
+            })
+        if not proxy_list:
+            raise IndexError("no usable proxy found in clb_proxy")
+        # Choose among all rows; a fixed 0..3 index fails with fewer than four.
+        return random.choice(proxy_list)
+    #字符串截取
+    def getSubStr(self,str,beginStr,endStr):
+        """Trim ``str`` to the span between the last ``beginStr`` and the last ``endStr``.
+
+        The result starts at the last occurrence of ``beginStr`` (or at the
+        beginning when it is empty or absent) and ends one character past the
+        position where the last ``endStr`` starts (or is left untrimmed when
+        ``endStr`` is empty or absent).
+        """
+        if beginStr != '':
+            start = str.rfind(beginStr)
+            # rfind yields -1 when missing; fall back to the whole string.
+            str = str[max(start, 0):]
+        if endStr != '':
+            stop = str.rfind(endStr)
+            if stop != -1:
+                str = str[:stop + 1]
+        return str
+    # 繁体字转简体字
+    def hant_2_hans(self,hant_str: str):
+        """Convert ``hant_str`` from Traditional to Simplified Chinese using zhconv."""
+        simplified = zhconv.convert(hant_str, 'zh-hans')
+        return simplified
+    # 判断字符串里是否含数字
+    def str_have_num(self,str_num):
+        """Return True when at least one character of ``str_num`` is a digit."""
+        # Every character is inspected, as before, and the flags are then combined.
+        digit_flags = [ch.isdigit() for ch in str_num]
+        return any(digit_flags)
+    # # 从Redis的List中获取并移除一个元素
+    # def redicPullData(self,type,key):
+    # #1 表示国内 2 表示国外
+    #     if type == 1:
+    #         gn_item = self.r.lpop(key)
+    #         return gn_item.decode() if gn_item else None
+    #     if type == 2:
+    #         gw_item = self.r.lpop(key)
+    #         return gw_item.decode() if gw_item else None
+    # 从Redis的List中获取并移除一个元素
+    def redicPullData(self,key):
+        """Pop the left-most element of the Redis list ``key`` and return it decoded.
+
+        Returns ``None`` when the pop yields nothing (or an empty value).
+        """
+        raw = self.r.lpop(key)
+        if not raw:
+            return None
+        return raw.decode()
+    # 获得脚本进程PID
+    def getPID(self):
+        """Return the process ID of the running script."""
+        return os.getpid()
+    # 获取本机IP
+    def getIP(self):
+        """Return the IP address that the local host name resolves to."""
+        host_name = socket.gethostname()
+        return socket.gethostbyname(host_name)
+    def mkPath(self,path):
+        """Create directory ``path`` (including missing parents) when nothing exists there yet."""
+        if not os.path.exists(path):
+            # exist_ok closes the race where another process creates the
+            # directory between the existence check and the creation call.
+            os.makedirs(path, exist_ok=True)
+    # 生成google模拟浏览器  必须传入值为googledriver位置信息
+    # headless用于决定是否为无头浏览器,初始默认为无头浏览器
+    # 正常浏览器可用于开始对页面解析使用或一些网站无头时无法正常采集
+    # 无头浏览器用于后续对信息采集时不会有浏览器一直弹出，
+    def buildDriver(self, path, headless=True):
+        """Create a Chrome WebDriver with automation switches hidden and a random User-Agent.
+
+        :param path: location of the chromedriver executable, passed to ``Service``.
+        :param headless: when true (default) add ``--headless`` and ``--disable-gpu``.
+        :return: the ``webdriver.Chrome`` instance.
+
+        The stealth.min.js injection that used to follow driver creation is
+        currently disabled.
+        """
+        service = Service(path)
+        options = webdriver.ChromeOptions()
+        if headless:
+            for flag in ('--headless', '--disable-gpu'):
+                options.add_argument(flag)
+        # Hide the usual automation markers from the page.
+        options.add_experimental_option("excludeSwitches", ["enable-automation"])
+        options.add_experimental_option('useAutomationExtension', False)
+        options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
+        options.add_argument('user-agent=' + self.getRandomUserAgent())
+        return webdriver.Chrome(options=options, service=service)
+    # Look up company information by company name
+    def getInfomation(self, com_name):
+        """Fetch one row of the ``Hundred`` table whose ``CompanyName`` equals ``com_name``.
+
+        :param com_name: company name to match.
+        :return: the matching row as a list, ``None`` when no row matches, or
+            ``[]`` when the database operation fails before a row is read.
+
+        Never raises; failures are logged.  The pooled connection is always
+        returned to the pool.
+        """
+        data = []
+        conn = None
+        cursor = None
+        try:
+            # Bind the name as a parameter so quotes in it cannot alter the SQL.
+            sql = "SELECT * FROM Hundred WHERE CompanyName = %s"
+            conn = self.pool_caiji.connection()
+            cursor = conn.cursor()
+            cursor.execute(sql, (com_name,))
+            row = cursor.fetchone()
+            conn.commit()
+            # Keep the historical "None when not found" result without
+            # relying on list(None) raising.
+            data = list(row) if row is not None else None
+        except Exception as exc:
+            log = self.getLogger()
+            log.error(f'=========数据库操作失败======== {exc!r}')
+        finally:
+            # Always hand the cursor and connection back, even on failure.
+            for resource in (cursor, conn):
+                if resource is not None:
+                    try:
+                        resource.close()
+                    except Exception:
+                        pass
+        return data
+    # 更新企业采集次数
+    def updateRun(self, social_code, runType, count):
+        """Set column `runType` of EnterpriseInfo to `count` for the row with SocialCode `social_code`.
+        `runType` must be a plain identifier because it names the column to update.
+        Failures (including an invalid column name) are logged, not raised.
+        """
+        try:
+            # A column name cannot be sent as a bound parameter, so it is
+            # validated before being placed in the statement text.
+            if not str(runType).isidentifier():
+                raise ValueError(f'invalid column name: {runType!r}')
+            sql_update = f"UPDATE EnterpriseInfo SET `{runType}` = %s WHERE SocialCode = %s"
+            conn = self.pool_caiji.connection()
+            try:
+                cursor = conn.cursor()
+                try:
+                    cursor.execute(sql_update, (count, social_code))
+                    conn.commit()
+                finally:
+                    cursor.close()
+            finally:
+                # Always hand the connection back to the pool, even on errors.
+                conn.close()
+        except Exception:
+            log = self.getLogger()
+            log.info('======更新数据库失败======')
+    # 保存日志入库
+    def recordLog(self, xydm, taskType, state, takeTime, url, e):
+        """Insert one task-log row into LogTable.
+        The row stores the arguments together with the current time from
+        `self.getNowTime(1)`, the host IP from `self.getIP()` and the process
+        id from `self.getPID()`. Failures are logged, not raised.
+        """
+        try:
+            createTime = self.getNowTime(1)
+            ip = self.getIP()
+            pid = self.getPID()
+            sql = "INSERT INTO LogTable(SocialCode,TaskType,state,TakeTime,url,CreateTime,ProcessIp,PID,Exception) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
+            values = [xydm, taskType, state, takeTime, url, createTime, ip, pid, e]
+            cnn = self.pool_caiji.connection()
+            try:
+                cursor = cnn.cursor()
+                try:
+                    cursor.execute(sql,values)
+                    cnn.commit()
+                finally:
+                    cursor.close()
+            finally:
+                # Always hand the connection back to the pool, even when the insert fails.
+                cnn.close()
+        except Exception:
+            log = self.getLogger()
+            log.info('======保存日志失败=====')
+    #获取企查查token
+    def GetToken(self):
+        """Return a randomly chosen token from the QCC_token table.
+        Returns an empty string when the table holds no tokens.
+        """
+        query = "select token from QCC_token "
+        self.cursor.execute(query)
+        token_list = self.cursor.fetchall()
+        self.cnx.commit()
+        # Test for the empty table explicitly instead of catching every exception.
+        if not token_list:
+            return ''
+        return random.choice(token_list)[0]
+    # 删除失效的token
+    def delete_token(self,token):
+        """Delete `token` from the QCC_token table and commit."""
+        # The token is sent as a bound parameter so its content cannot alter the statement.
+        deletesql = "delete from QCC_token where token=%s "
+        self.cursor.execute(deletesql, (token,))
+        self.cnx.commit()
+    #获取天眼查token
+    def GetTYCToken(self):
+        """Return the first token from the TYC_token table.
+        Returns an empty string when the table holds no tokens, matching
+        what GetToken returns in that situation.
+        """
+        query = 'select token from TYC_token'
+        self.cursor.execute(query)
+        row = self.cursor.fetchone()
+        self.cnx.commit()
+        # fetchone() yields None for an empty table; indexing it would raise TypeError.
+        return row[0] if row else ''
+    #检测语言
+    def detect_language(self, text):
+        """Classify `text` with langid and return the first element of the result.
+        Returns 'cn' when the classification result, or its first element,
+        is an empty string.
+        """
+        classified = langid.classify(text)
+        if classified == '' or classified[0] == '':
+            return 'cn'
+        return classified[0]
+    #追加接入excel
+    def writerToExcel(self,detailList,filename):
+        """Append the rows in `detailList` to the existing Excel workbook `filename`.
+        The workbook is read with every column as str, the new rows are placed
+        after the existing ones, and the file is rewritten without the index column.
+        """
+        existing_data = pd.read_excel(filename,engine='openpyxl',dtype=str)
+        new_data = pd.DataFrame(data=detailList)
+        # DataFrame.append was removed in pandas 2.0; pd.concat produces the same frame.
+        combined_data = pd.concat([existing_data, new_data], ignore_index=True)
+        combined_data.to_excel(filename, index=False)
+    #对失败或者断掉的企业 重新放入redis
+    def rePutIntoR(self,key,item):
+        """Append `item` to the tail of the Redis list `key` via `rpush`."""
+        redis_client = self.r
+        redis_client.rpush(key, item)
+    #增加计数器的值并返回增加后的值
+    def incrSet(self,key):
+        """Increment the Redis counter `key`, print the new value and return it."""
+        updated = self.r.incr(key)
+        print("增加后的值：", updated)
+        return updated
+    #获取key剩余的过期时间
+    def getttl(self,key):
+        """Print the remaining TTL of `key` and reset the key when the TTL is negative.
+        On a negative TTL the key is set to 0 with a 3600 second expiry and
+        the call then sleeps for 2 seconds. Nothing is returned.
+        """
+        remaining = self.r.ttl(key)
+        print("剩余过期时间：", remaining)
+        if remaining >= 0:
+            return
+        self.r.set(key, 0)
+        self.r.expire(key, 3600)
+        time.sleep(2)
+    #上传至文件服务器,并解析pdf的内容和页数
+    def upLoadToServe(self,pdf_url,type_id,social_code):
+        """Download a PDF, upload it to the file server and extract its text and page count.
+        Returns a dict describing the attachment. 'state' is True only when
+        the upload succeeded and the PDF parsed with at least one page; then
+        'path', 'full_path', 'file_size', 'create_time', 'page_size' and
+        'content' are filled in. Otherwise the dict keeps its initial values.
+        The download and the upload/parse step are each tried up to 3 times
+        with a 3 second pause between attempts.
+        """
+        retData = {'state':False,'type_id':type_id,'item_id':social_code,'group_name':'group1','path':'','full_path':'',
+                   'category':'pdf','file_size':'','status':1,'create_by':'XueLingKun',
+                   'create_time':'','page_size':'','content':''}
+        headers = {'User-Agent': self.getRandomUserAgent()}
+        resp_content = None
+        for _ in range(3):
+            try:
+                resp_content = requests.get(pdf_url, headers=headers, verify=False, timeout=20).content
+                break
+            except Exception:
+                time.sleep(3)
+        if resp_content is None:
+            # Every download attempt failed, so there is nothing to upload or parse.
+            print('======pdf解析失败=====')
+            return retData
+        page_size = 0
+        result = None
+        for _ in range(3):
+            try:
+                # Upload at most once, so a retry after a parse error does not store a duplicate file.
+                if result is None:
+                    result = client.upload_by_buffer(resp_content, file_ext_name='pdf')
+                with fitz.open(stream=resp_content, filetype='pdf') as doc:
+                    page_count = doc.page_count
+                    text = ''.join(page.get_text() for page in doc.pages())
+                # Publish the results only after the whole document parsed, so a
+                # failed attempt leaves neither partial text nor a page count behind.
+                retData['content'] = text
+                page_size = page_count
+                break
+            except Exception:
+                time.sleep(3)
+        if page_size < 1:
+            # Upload or PDF parsing failed.
+            print('======pdf解析失败=====')
+            return retData
+        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        remote_file_id = bytes.decode(result['Remote file_id'])
+        retData['state'] = True
+        retData['path'] = remote_file_id.replace('group1', '')
+        retData['full_path'] = remote_file_id
+        retData['file_size'] = result['Uploaded size']
+        retData['create_time'] = time_now
+        retData['page_size'] = page_size
+        return retData
+    def secrchATT(self,item_id,year,type_id):
+        """Return the first clb_sys_attachment `id` row matching item_id, year and type_id.
+        The result is whatever `self.cursor_.fetchone()` yields for the query.
+        """
+        query = '''select id from clb_sys_attachment where item_id = %s and year = %s and type_id=%s '''
+        params = (item_id, year, type_id)
+        self.cursor_.execute(query, params)
+        return self.cursor_.fetchone()
+    #插入到att表 返回附件id
+    def tableUpdate(self,retData,com_name,year,pdf_name,num):
+        """Insert an attachment row into clb_sys_attachment and return its id.
+        When `self.secrchATT` already finds a row for the same item_id, year
+        and type_id, nothing is inserted and that row's id is returned.
+        `retData` supplies the attachment fields; `num` is stored as order_by.
+        """
+        item_id = retData['item_id']
+        type_id = retData['type_id']
+        # Column order matches the insert statement below.
+        values = (
+            year, pdf_name, type_id, item_id, retData['group_name'], retData['path'],
+            retData['full_path'], retData['category'], retData['file_size'], num,
+            retData['status'], retData['create_by'],
+            retData['create_time'], retData['page_size'])
+        existing = self.secrchATT(item_id,year,type_id)
+        if existing:
+            self.getLogger().info(f'com_name:{com_name}已存在')
+            return existing[0]
+        Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
+        self.cursor_.execute(Upsql, values)
+        self.cnx_.commit()
+        self.getLogger().info("更新完成:{}".format(Upsql))
+        # Query again to read back the id of the row that was just inserted.
+        inserted = self.secrchATT(item_id,year,type_id)
+        return inserted[0]
+    # 更新企业的CIK
+    def updateCIK(self,social_code,cik):
+        """Store `cik` in the CIK column of the EnterpriseInfo row with SocialCode `social_code`.
+        Failures are logged, not raised.
+        """
+        try:
+            # Values are sent as bound parameters so their content cannot alter the statement.
+            sql = "UPDATE EnterpriseInfo SET CIK = %s WHERE SocialCode = %s"
+            cnn = self.pool_caiji.connection()
+            try:
+                cursor = cnn.cursor()
+                try:
+                    cursor.execute(sql, (cik, social_code))
+                    cnn.commit()
+                finally:
+                    cursor.close()
+            finally:
+                # Always hand the connection back to the pool, even on errors.
+                cnn.close()
+        except Exception:
+            log = self.getLogger()
+            log.info('======保存企业CIK失败=====')
--- a/comData/newlist/hundred/baseinfo_hundred.py
+++ b/comData/newlist/hundred/baseinfo_hundred.py
+# -*- coding: utf-8 -*-
+import pandas as pd
+import time
+import requests
+import json
+from kafka import KafkaProducer
+from BaseCore import BaseCore
+from getQccId import find_id_by_name
+# Shared BaseCore instance; its database connection, cursor and logger are
+# bound to module-level names used by the rest of this script.
+baseCore = BaseCore()
+cnx_ = baseCore.cnx
+cursor_ = baseCore.cursor
+log = baseCore.getLogger()
+# 通过企查查id获取企业基本信息
+def info_by_id(com_id,com_name):
+    """Fetch a company's basic information from QCC and return it as a one-element list.
+    Two endpoints are queried using the module-level `token` and `headers`
+    (both are set by the `__main__` block): `v1/ent/detail` and
+    `v6/base/getEntDetail`. When the first response has no `result.Company`
+    entry, `com_name` is pushed back onto the 'hundred:baseinfo' Redis list
+    and an empty list is returned. Optional fields that are missing or None
+    are reported as empty strings.
+    Raises KeyError/TypeError when the company entry lacks 'Name',
+    'CreditCode' or 'Address'; exceptions from `requests` also propagate.
+    """
+    aa_dict_list = []
+    t = str(int(time.time()) * 1000)
+    headers['Qcc-Timestamp'] = t
+    url = "https://xcx.qcc.com/mp-weixin/forwardApp/v1/ent/detail?token={}&t={}&unique={}".format(token, t, com_id)
+    resp_dict = requests.get(url=url, headers=headers, verify=False).json()
+    time.sleep(2)
+    com_jc_name = ''
+    try:
+        result_dict = resp_dict['result']['Company']
+    except (KeyError, IndexError, TypeError):
+        log.info(com_name + ":获取失败===========重新放入redis")
+        baseCore.rePutIntoR('hundred:baseinfo',com_name)
+        return aa_dict_list
+    # These three keys are required: a missing key raises to the caller.
+    company_name = result_dict['Name']
+    CreditCode = result_dict['CreditCode']
+    if CreditCode is None:
+        CreditCode = ''
+    Address = result_dict['Address']
+    if Address is None:
+        Address = ''
+    OperName = _lookup(result_dict, 'Oper', 'Name')
+    if baseCore.str_have_num(OperName):
+        OperName = ''
+    Status = _lookup(result_dict, 'ShortStatus')
+    StartDate = _lookup(result_dict, 'StartDate')
+    RegistCapi = _lookup(result_dict, 'RegistCapi')
+    EconKind = _lookup(result_dict, 'EconKind')
+    Province = _lookup(result_dict, 'Area', 'Province')
+    City = _lookup(result_dict, 'Area', 'City')
+    County = _lookup(result_dict, 'Area', 'County')
+    try:
+        region = Province + City + County
+    except TypeError:
+        region = ''
+    # Former names are joined with single spaces; any malformed entry yields ''.
+    try:
+        OriginalName = ' '.join(item['Name'] for item in result_dict['OriginalName']).strip()
+    except (KeyError, IndexError, TypeError):
+        OriginalName = ''
+    PhoneNumber = _lookup(result_dict, 'companyExtendInfo', 'Tel')
+    # Prefer the website from companyExtendInfo, then fall back to ContactInfo.
+    WebSite = _lookup(result_dict, 'companyExtendInfo', 'WebSite', default=None)
+    if WebSite is None:
+        WebSite = _lookup(result_dict, 'ContactInfo', 'WebSite', 0, 'Url')
+    Email = _lookup(result_dict, 'companyExtendInfo', 'Email')
+    Desc = _lookup(result_dict, 'companyExtendInfo', 'Desc')
+    Info = _lookup(result_dict, 'companyExtendInfo', 'Info')
+    company_name = baseCore.hant_2_hans(company_name)
+    # The second endpoint supplies the fields the first one no longer returns.
+    t = str(int(time.time()) * 1000)
+    headers['Qcc-Timestamp'] = t
+    url = "https://xcx.qcc.com/mp-weixin/forwardApp/v6/base/getEntDetail?token={}&t={}&unique={}".format(token, t,
+                                                                                                         com_id)
+    resp_dict2 = requests.get(url=url, headers=headers, verify=False).json()
+    time.sleep(1)
+    com2 = _lookup(resp_dict2, 'result', 'Company', default={})
+    Scope = _lookup(com2, 'Scope')
+    CheckDate = _lookup(com2, 'CheckDate')
+    TaxpayerType = _lookup(com2, 'TaxpayerType')
+    No = _lookup(com2, 'No')
+    IxCode = _lookup(com2, 'IxCode')
+    OrgNo = _lookup(com2, 'OrgNo')
+    TermStart = _lookup(com2, 'TermStart')
+    TeamEnd = _lookup(com2, 'TeamEnd')
+    RecCap = _lookup(com2, 'RecCap')
+    SubIndustry = _lookup(com2, 'IndustryArray', -1)
+    BelongOrg = _lookup(com2, 'BelongOrg')
+    EnglishName = _lookup(com2, 'EnglishName')
+    # The insured head count is the value of the CommonList entry labelled '参保人数'.
+    can_bao = ''
+    try:
+        for Common_t in com2['CommonList']:
+            try:
+                if Common_t['KeyDesc'] == '参保人数':
+                    can_bao = Common_t['Value']
+            except (KeyError, IndexError, TypeError):
+                pass
+    except (KeyError, IndexError, TypeError):
+        can_bao = ''
+    aa_dict = {
+        'qccId': com_id,  # QCC company id
+        'name': company_name,  # company name
+        'shortName': com_jc_name,  # short name
+        'socialCreditCode': CreditCode,  # unified social credit code
+        'legalPerson': OperName,  # legal representative
+        'officialPhone': PhoneNumber,  # phone
+        'officialUrl': WebSite,  # website
+        'officialEmail': Email,  # e-mail
+        'briefInfo': Desc,  # description
+        'registerStatus': Status,  # registration status
+        'incorporationDate': StartDate,  # date of incorporation
+        'capital': RegistCapi,  # registered capital
+        'paidCapital': RecCap,  # paid-in capital
+        'approvalDate': CheckDate,  # approval date
+        'organizationCode': OrgNo,  # organization code
+        'registerNo': No,  # business registration number
+        'taxpayerNo': CreditCode,  # taxpayer identification number
+        'type': EconKind,  # company type
+        'businessStartDate': TermStart,  # business term start
+        'businessEndDate': TeamEnd,  # business term end
+        'taxpayerQualification': TaxpayerType,  # taxpayer qualification
+        'industry': SubIndustry,  # industry
+        'region': region,
+        'province': Province,  # province
+        'city': City,  # city
+        'county': County,  # county
+        'registerDepartment': BelongOrg,  # registration authority
+        'scale': Info,  # staff size
+        'insured': can_bao,  # insured head count
+        'beforeName': OriginalName,  # former names
+        'englishName': EnglishName,  # English name
+        'importExportEnterpriseCode': IxCode,  # import/export enterprise code
+        'address': Address,  # address
+        'businessRange': Scope,  # business scope
+        'status': 0,  # status
+    }
+    aa_dict_list.append(aa_dict)
+    log.info(company_name + "：爬取完成")
+    return aa_dict_list
+# Follow `path` (dict keys / list indexes) into nested JSON data, returning `default` when a step is missing or the value is None
+def _lookup(data, *path, default=''):
+    try:
+        for step in path:
+            data = data[step]
+    except (KeyError, IndexError, TypeError):
+        return default
+    return default if data is None else data
+if __name__ == '__main__':
+    # Worker loop: pop company names from the 'hundred:baseinfo' Redis list,
+    # resolve each company's QCC id, fetch its basic information and send it to Kafka.
+    taskType = '基本信息/企查查/单项双百企业冠军'
+    # Request headers for the QCC mini-program API; info_by_id reads this module-level name.
+    headers = {
+        'Host': 'xcx.qcc.com',
+        'Connection': 'keep-alive',
+        'Qcc-Platform': 'mp-weixin',
+        'Qcc-Timestamp': '',
+        'Qcc-Version': '1.0.0',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat',
+        'content-type': 'application/json',
+        'Referer': 'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html',
+        'Accept-Encoding': 'gzip, deflate, br,'
+    }
+    list_weicha = []
+    name_list = []
+    # NOTE(review): this loop has no exit, so the Excel export below it is never reached.
+    while True:
+        # The token is read from the database on every iteration; info_by_id reads this module-level name.
+        token = baseCore.GetToken()
+        if not token:
+            log.info('==========已无token==========')
+            time.sleep(30)
+            continue
+        start_time = time.time()
+        # Next company name to process
+        com_name = baseCore.redicPullData('hundred:baseinfo')
+        if com_name == '' or com_name is None:
+            time.sleep(20)
+            continue
+        dic_info = baseCore.getInfomation(com_name)
+        log.info(f'----当前企业{com_name}--开始处理---')
+        if not dic_info:
+            # No row for this name (or the lookup failed): indexing below would crash the worker.
+            log.info(f'====={com_name}=====未查询到企业信息，跳过=====')
+            continue
+        social_code = dic_info[5]
+        # QCC company id
+        company_id = dic_info[6]
+        # Search by credit code when one is known, otherwise by company name
+        if company_id is None:
+            if social_code:
+                company_id = find_id_by_name(start_time,token,social_code)
+            else:
+                company_id = find_id_by_name(start_time,token,com_name)
+            if company_id == 'null':
+                log.info('=====搜索不到该企业====')
+                # TODO: a company that cannot be found has no credit code to send; one needs to be generated
+                baseCore.rePutIntoR('hundred:baseinfo', com_name + '：搜索不到')
+                continue
+            if not company_id:
+                log.info(com_name + "：企业ID获取失败===重新放入redis")
+                list_weicha.append(com_name + "：企业ID获取失败")
+                baseCore.rePutIntoR('hundred:baseinfo',com_name)
+                baseCore.delete_token(token)
+                log.info('=====已重新放入redis,失效token已删除======')
+                time.sleep(20)
+                continue
+            else:
+                log.info(f'====={com_name}===={company_id}=====获取企业id成功=====')
+                # Persist the resolved id; values are bound parameters, not formatted into the SQL text.
+                updateqccid = "update Hundred set qccid = %s where CompanyName = %s"
+                cursor_.execute(updateqccid, (company_id, com_name))
+                cnx_.commit()
+        try:
+            post_data_list = info_by_id(company_id, com_name)
+        except Exception:
+            log.info(f'====={social_code}=====获取基本信息失败，重新放入redis=====')
+            # Re-queue on the key this worker reads from ('hundred:baseinfo').
+            baseCore.rePutIntoR('hundred:baseinfo', com_name)
+            baseCore.delete_token(token)
+            log.info('=====已重新放入redis,失效token已删除======')
+            continue
+        if not post_data_list:
+            time.sleep(20)
+            continue
+        for post_data in post_data_list:
+            if post_data is None:
+                print(com_name + "：企业信息获取失败")
+                list_weicha.append(com_name + "：企业信息获取失败")
+                continue
+            get_name = post_data['name']
+            get_socialcode = post_data['socialCreditCode']
+            # Store the credit code that was found; values are bound parameters.
+            updatesocialcode = "update Hundred set SocialCode = %s where CompanyName = %s"
+            cursor_.execute(updatesocialcode, (get_socialcode, com_name))
+            cnx_.commit()
+            name_compile = {
+                'yuan_name':com_name,
+                'get_name':get_name
+            }
+            name_list.append(name_compile)
+            log.info(f'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time,time.time())}')
+            producer = None
+            try:
+                producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 0, 2))
+                kafka_result = producer.send("regionInfo", json.dumps(post_data, ensure_ascii=False).encode('utf8'))
+                print(kafka_result.get(timeout=10))
+            except Exception:
+                exception = 'kafka传输失败'
+                state = 0
+                takeTime = baseCore.getTimeCost(start_time, time.time())
+                baseCore.recordLog(get_socialcode, taskType, state, takeTime, '', exception)
+                log.info(f"{get_name}--{get_socialcode}--kafka传输失败")
+            finally:
+                # A producer is created per message, so release it here.
+                if producer is not None:
+                    producer.close()
+    nowtime = baseCore.getNowTime(1).replace('-','_')[:10]
+    companyName = pd.DataFrame(name_list)
+    companyName.to_excel(f'./data/企业名称对比_{nowtime}.xlsx',index=False)
+    false_com = pd.DataFrame(list_weicha)
+    false_com.to_excel(f'./data/采集失败企业名单_{nowtime}.xlsx',index=False)
--- a/comData/newlist/hundred/getQccId.py
+++ b/comData/newlist/hundred/getQccId.py
+# -*- coding: utf-8 -*-
+import time
+from urllib.parse import quote
+import requests
+import urllib3
+from BaseCore import BaseCore
+# Shared BaseCore instance and its logger, used by find_id_by_name.
+baseCore = BaseCore()
+log = baseCore.getLogger()
+# Previous header set, kept for reference:
+# headers = {
+#         'Host': 'xcx.qcc.com',
+#         'Connection': 'keep-alive',
+#         'Qcc-Platform': 'mp-weixin',
+#         'Qcc-Timestamp': '',
+#         'Qcc-Version': '1.0.0',
+#         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat',
+#         'content-type': 'application/json',
+#         'Referer': 'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html',
+#         'Accept-Encoding': 'gzip, deflate, br,'
+#     }
+# Request headers sent with the QCC search request. 'Qcc-Timestamp' is
+# overwritten by find_id_by_name on every call; the 'authMini' bearer value
+# is hard-coded here.
+headers = {
+        'Host': 'xcx.qcc.com',
+        'Connection': 'keep-alive',
+        'x-request-device-type': 'Android',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF XWEB/8391',
+        'Content-Type': 'application/json',
+        'Qcc-Version': '1.0.0',
+        'authMini': 'Bearer f51dae1a2fcb109fa9ec58bd4a85e5c5',
+        'xweb_xhr': '1',
+        'xcx-version': '2023.09.27',
+        'Qcc-Platform': 'mp-weixin',
+        'Qcc-CurrentPage': '/company-subpackages/business/index',
+        'Qcc-Timestamp': '1696661787803',
+        'Qcc-RefPage': '/company-subpackages/detail/index',
+        'Accept': '*/*',
+        'Sec-Fetch-Site': 'cross-site',
+        'Sec-Fetch-Mode': 'cors',
+        'Sec-Fetch-Dest': 'empty',
+        'Referer': 'https://servicewechat.com/wx395200814fcd7599/307/page-frame.html',
+        'Accept-Encoding': 'gzip, deflate, br',
+        'Accept-Language': 'zh-CN,zh'
+}
+# 通过企业名称或信用代码获取企查查id
+def find_id_by_name(start,token,name):
+    """Search QCC for `name` (a company name or credit code) and return the first hit's KeyNo.
+    Returns the KeyNo string on success, the string 'null' when the search
+    has no usable hit, and False when the request failed on every attempt,
+    the token was rejected (status 40101), the account was rate limited
+    (status 401) or the response could not be parsed.
+    `start` is only used to log the elapsed time.
+    """
+    urllib3.disable_warnings()
+    qcc_key = name
+    t = str(int(time.time()) * 1000)
+    headers['Qcc-Timestamp'] = t
+    url = f"https://xcx.qcc.com/mp-weixin/forwardApp/v3/base/advancedSearch?token={token}&t={t}&pageIndex=1&needGroup=yes&insuredCntStart=&insuredCntEnd=&startDateBegin=&startDateEnd=&registCapiBegin=&registCapiEnd=&countyCode=&province=&sortField=&isSortAsc=&searchKey={quote(qcc_key)}&searchIndex=default&industryV3="
+    resp_dict = None
+    for _ in range(5):
+        try:
+            resp_dict = requests.get(url=url, headers=headers, verify=False).json()
+            break
+        except Exception as e:
+            print(f'{e}-------------重试')
+            time.sleep(5)
+    time.sleep(2)
+    if resp_dict is None:
+        # Every attempt failed; report failure instead of using an unbound response.
+        log.info(f'====请求失败====时间{baseCore.getTimeCost(start, time.time())}')
+        return False
+    # Known error payloads:
+    # {'status': 40101, 'message': '无效的sessionToken!'} {'status': 401, 'message': '您的账号访问超频，请升级小程序版本'}
+    status = resp_dict.get('status') if isinstance(resp_dict, dict) else None
+    if status == 40101:
+        log.info(f'====token失效====时间{baseCore.getTimeCost(start, time.time())}')
+        return False
+    if status == 401:
+        log.info(f'=======您的账号访问超频，请升级小程序版本=====时间{baseCore.getTimeCost(start, time.time())}')
+        return False
+    try:
+        hits = resp_dict['result']['Result']
+        if hits:
+            first_hit = hits[0]
+            KeyNo = first_hit['KeyNo']
+            # Highlight markup is stripped before testing whether the hit has a name.
+            Name = first_hit['Name'].replace('<em>', '').replace('</em>', '').strip()
+            if Name == '':
+                KeyNo = 'null'
+        else:
+            KeyNo = 'null'
+    except Exception:
+        log.info(f'====token失效====时间{baseCore.getTimeCost(start,time.time())}')
+        return False
+    log.info("{}，企业代码为:{}".format(qcc_key, KeyNo))
+    return KeyNo
\ No newline at end of file
--- a/comData/newlist/technological/BaseCore.py
+++ b/comData/newlist/technological/BaseCore.py
@@ -541,7 +541,10 @@ class BaseCore:
        self.cursor.execute(query)
        token_list = self.cursor.fetchall()
        self.cnx.commit()
+        try:
            token = token_list[random.randint(0, len(token_list)-1)][0]
+        except:
+            token = ''
        return token
    # 删除失效的token

--- a/comData/noticeReport/证监会-公告.py
+++ b/comData/noticeReport/证监会-公告.py
 import json
@@ -18,12 +18,23 @@ cnx_ = baseCore.cnx_
 cursor_ = baseCore.cursor_
 taskType = '企业公告/证监会'
 obsClient = ObsClient(
        access_key_id='VEHN7D0TJ9316H8AHCAV',  # 你的华为云的ak码
        secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY',  # 你的华为云的sk
        server='https://obs.cn-north-1.myhuaweicloud.com'  # 你的桶的地址
    )
+#获取文件大小
+def convert_size(size_bytes):
+    """Format a byte count as a two-decimal string, scaling by 1024 through bytes, KB, MB, GB and TB."""
+    labels = ('bytes', 'KB', 'MB', 'GB', 'TB')
+    amount, idx = size_bytes, 0
+    while idx < len(labels) - 1 and amount >= 1024:
+        amount /= 1024
+        idx += 1
+    return f"{amount:.2f} {labels[idx]}"
 def uptoOBS(pdf_url,pdf_name,type_id,social_code):
    headers = {}
    retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': 'group1', 'path': '',
@@ -33,7 +44,8 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code):
    headers['User-Agent'] = baseCore.getRandomUserAgent()
    for i in range(0, 3):
        try:
-            resp_content = requests.get(pdf_url, headers=headers, verify=False, timeout=20).content
+            response = requests.get(pdf_url, headers=headers,verify=False, timeout=20)
+            file_size = int(response.headers.get('Content-Length'))
            break
        except:
            time.sleep(3)
@@ -42,8 +54,9 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code):
    for i in range(0, 3):
        try:
            name = pdf_name + '.pdf'
-            result = obsClient.putContent('zzsn', 'ZJH/'+name, content=resp_content)
+            now_time = time.strftime("%Y-%m")
-            with fitz.open(stream=resp_content, filetype='pdf') as doc:
+            result = obsClient.putContent('zzsn', f'ZJH/{now_time}/'+name, content=response.content)
+            with fitz.open(stream=response.content, filetype='pdf') as doc:
                page_size = doc.page_count
                for page in doc.pages():
                    retData['content'] += page.get_text()
@@ -60,23 +73,25 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code):
        try:
            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            retData['state'] = True
-            retData['path'] = result['body']['objectUrl'].split('/ZJH')[0]
+            retData['path'] = result['body']['objectUrl'].split('.com')[1]
            retData['full_path'] = unquote(result['body']['objectUrl'])
-            retData['file_size'] = result['Uploaded size']
+            retData['file_size'] = convert_size(file_size)
            retData['create_time'] = time_now
            retData['page_size'] = page_size
-        except:
+        except Exception as e:
+            state = 0
+            takeTime = baseCore.getTimeCost(start_time, time.time())
+            baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
            return retData
        return retData
-def secrchATT(item_id, name, type_id):
+def secrchATT(item_id, name, type_id,order_by):
-    sel_sql = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s '''
+    sel_sql = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s and order_by=%s '''
-    cursor_.execute(sel_sql, (item_id, name, type_id))
+    cursor_.execute(sel_sql, (item_id, name, type_id,order_by))
    selects = cursor_.fetchone()
    return selects
 # 插入到att表 返回附件id
 def tableUpdate(retData, com_name, year, pdf_name, num):
    item_id = retData['item_id']
@@ -91,13 +106,13 @@ def tableUpdate(retData, com_name, year, pdf_name, num):
    page_size = retData['page_size']
    create_time = retData['create_time']
    order_by = num
-    selects = secrchATT(item_id, pdf_name, type_id)
+    # selects = secrchATT(item_id, pdf_name, type_id)
+    #
-    if selects:
+    # if selects:
-        log.info(f'com_name:{com_name}已存在')
+    #     log.info(f'pdf_name:{pdf_name}已存在')
-        id = selects[0]
+    #     id = ''
-        return id
+    #     return id
-    else:
+    # else:
    Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
    values = (
@@ -108,7 +123,7 @@ def tableUpdate(retData, com_name, year, pdf_name, num):
    cursor_.execute(Upsql, values)  # 插入
    cnx_.commit()  # 提交
    log.info("更新完成:{}".format(Upsql))
-        selects = secrchATT(item_id, pdf_name, type_id)
+    selects = secrchATT(item_id, pdf_name, type_id,order_by)
    id = selects[0]
    return id
@@ -125,6 +140,7 @@ def RequestUrl(url, payload, social_code,start_time):
            pass
    # 检查响应状态码
+    try:
        if response.status_code == 200:
            # 请求成功，处理响应数据
            # print(response.text)
@@ -137,6 +153,12 @@ def RequestUrl(url, payload, social_code,start_time):
            takeTime = baseCore.getTimeCost(start_time, time.time())
            baseCore.recordLog(social_code, taskType, state, takeTime, url, '请求失败')
            soup = ''
+    except:
+        log.error('请求失败:', url)
+        state = 0
+        takeTime = baseCore.getTimeCost(start_time, time.time())
+        baseCore.recordLog(social_code, taskType, state, takeTime, url, '请求失败')
+        soup = ''
    return soup
 def getUrl(code, url_parms, Catagory2_parms):
@@ -215,7 +237,6 @@ def getUrl(code, url_parms, Catagory2_parms):
        }
    return dic_parms
 def ifInstert(short_name, social_code, pdf_url):
    ifexist = True
@@ -229,16 +250,19 @@ def ifInstert(short_name, social_code, pdf_url):
        return ifexist
    else:
        return ifexist
-def InsterInto(short_name, social_code, pdf_url):
+def InsterInto(social_code, pdf_url,pub_time):
+    insert = False
    # 信息插入数据库
    try:
-        insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,create_time) values(%s,%s,%s,%s,now())'''
+        insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,publish_time,create_time) values(%s,%s,%s,%s,%s,now())'''
        list_info = [
            social_code,
            pdf_url,
            '证监会',
            '1',
+            pub_time,
        ]
        #144数据库
        cursor.execute(insert_sql, tuple(list_info))
@@ -251,8 +275,18 @@ def InsterInto(short_name, social_code, pdf_url):
        baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '数据库传输失败')
        return insert
 def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_name,num):
+    #判断文件是否已经存在obs服务器中
+    # file_path = 'XQWAnnualReport/2023-10/浙江国祥股份有限公司首次公开发行股票并在主板上市暂缓发行公告.doc'
+    now_time = time.strftime("%Y-%m")
+    file_path = 'ZJH/'+now_time+'/'+pdf_name+'.pdf'
+    response = obsClient.getObjectMetadata('zzsn', file_path)
+    if response.status >= 300:
+        log.info('=====文件不存在obs=====')
+        pass
+    else:
+        log.info(f'=====文件存在obs========{file_path}')
+        return False
    #上传至华为云服务器
    retData = uptoOBS(pdf_url,pdf_name,8,social_code)
    #附件插入att数据库
@@ -263,12 +297,11 @@ def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_na
        return False
    num = num + 1
    att_id = tableUpdate(retData,com_name,year,pdf_name,num)
-    content = retData['content']
+    if att_id:
-    if retData['state']:
        pass
    else:
-        log.info(f'====pdf解析失败====')
        return False
+    content = retData['content']
    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    dic_news = {
@@ -304,7 +337,7 @@ def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_na
            'message': '操作成功',
            'code': '200',
        }
-        print(dic_result)
+        log.info(dic_result)
        return True
    except Exception as e:
        dic_result = {
@@ -316,14 +349,11 @@ def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_na
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, 'Kafka操作失败')
-        print(dic_result)
+        log.info(dic_result)
        return False
 # 采集信息
 def SpiderByZJH(url, payload, dic_info, start_time,num):  # dic_info 数据库中获取到的基本信息
-    okCount = 0
-    errorCount = 0
    social_code = dic_info[2]
    short_name = dic_info[4]
    com_name = dic_info[1]
@@ -335,26 +365,26 @@ def SpiderByZJH(url, payload, dic_info, start_time,num):  # dic_info 数据库
    try:
        is_exist = soup.find('div',class_='con').text
        if is_exist == '没有查询到数据':
-            state = 1
+            state = 0
            takeTime = baseCore.getTimeCost(start_time, time.time())
-            baseCore.recordLog(social_code, taskType, state, takeTime, url, '')
+            baseCore.recordLog(social_code, taskType, state, takeTime, url, '没有查询到数据')
            return
    except:
        pass
-    # 先获取页数
+    # # 先获取页数
-    page = soup.find('div', class_='pages').find('ul', class_='g-ul').text
+    # page = soup.find('div', class_='pages').find('ul', class_='g-ul').text
+    #
-    total = re.findall(r'\d+', page)[0]
+    # total = re.findall(r'\d+', page)[0]
+    #
-    r_page = int(total) % 15
+    # r_page = int(total) % 15
-    if r_page == 0:
+    # if r_page == 0:
-        Maxpage = int(total) // 15
+    #     Maxpage = int(total) // 15
-    else:
+    # else:
-        Maxpage = int(total) // 15 + 1
+    #     Maxpage = int(total) // 15 + 1
-    log.info(f'{short_name}====={code}===========一共{total}条,{Maxpage}页')
+    # log.info(f'{short_name}====={code}===========一共{total}条,{Maxpage}页')
-    # 首页和其他页不同，遍历 如果是首页 修改一下链接
+    # # 首页和其他页不同，遍历 如果是首页 修改一下链接
-    for i in range(1, Maxpage + 1):
+    for i in range(1,51):
        log.info(f'==========正在采集第{i}页=========')
        if i == 1:
            href = url
@@ -366,9 +396,9 @@ def SpiderByZJH(url, payload, dic_info, start_time,num):  # dic_info 数据库
        if soup == '':
            continue
        tr_list = soup.find('div', id='txt').find_all('tr')
-        pageIndex = 0
+        # pageIndex = 0
        for tr in tr_list[1:]:
-            pageIndex += 1
+            # pageIndex += 1
            td_list = tr.find_all('td')
            pdf_url_info = td_list[2]
            # print(pdf_url)
@@ -376,6 +406,12 @@ def SpiderByZJH(url, payload, dic_info, start_time,num):  # dic_info 数据库
            name_pdf = pdf_url_info['onclick'].strip('downloadPdf1(').split('\',')[1].strip('\'')
            pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split('\',')[2].strip('\'')
+            #todo:判断发布日期是否是日期格式
+            pattern = r"^\d{4}-\d{2}-\d{2}$"  # 正则表达式匹配YYYY-MM-DD格式的日期
+            if re.match(pattern, pub_time):
+                pass
+            else:
+                continue
            year = pub_time[:4]
            report_type = td_list[4].text.strip()
@@ -383,30 +419,22 @@ def SpiderByZJH(url, payload, dic_info, start_time,num):  # dic_info 数据库
            ifexist = ifInstert(short_name, social_code, pdf_url)
            #如果不存在 ifexist = True
            if ifexist:
-                #     # 公告信息列表
-                #     okCount = okCount + 1
                # 解析PDF内容，先获取PDF链接 下载 解析成功，解析失败 ，传输成功，传输失败
-                log.info(f'======={short_name}========{code}===插入公告库成功')
                result = GetContent(pdf_url, name_pdf, social_code, year, pub_time, start_time,com_name,num)
                if result:
                    # 公告信息列表
-                    okCount = okCount + 1
                    log.info(f'{short_name}==============解析传输操作成功')
                    state = 1
                    takeTime = baseCore.getTimeCost(start_time, time.time())
-                    baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '')
+                    baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '成功')
+                    #发送kafka成功之后 再插入数据库
+                    insert = InsterInto(social_code,pdf_url,pub_time)
+                    if insert:
+                        log.info(f'===={social_code}========{name_pdf}=====插入库成功')
                    pass
                else:
-                    errorCount += 1
-                    # time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-                    log.error(f'{short_name}=============解析或传输操作失败')
-                    # try:
-                    #     insert_err_sql = f"insert into dt_err(xydm,`from`,url,title,pub_date,zhaiyao,create_date,state,pageNo,pageIndex,type) values('{social_code}','证监会','{pdf_url}','{name_pdf}','{pub_time}',' ',now(),1,{i},{pageIndex},'1')"
-                    #     cursor_.execute(insert_err_sql)
-                    #     cnx_.commit()
-                    # except:
-                    #     pass
                    continue
            else:
                log.info(f'======={short_name}========{code}===已存在')
@@ -449,14 +477,15 @@ if __name__ == '__main__':
    while True:
        start_time = time.time()
        # 获取企业信息
-        social_code = baseCore.redicPullData('NoticeEnterprise:gnqy_socialCode')
+        # social_code = baseCore.redicPullData('NoticeEnterprise:gnqy_socialCode')
-        # social_code = '9110000071092841XX'
+        social_code = '91440500617540496Q'
        # 判断 如果Redis中已经没有数据，则等待
        if social_code == None:
            time.sleep(20)
            continue
        dic_info = baseCore.getInfomation(social_code)
-        count = dic_info[16]
+        count = dic_info[17]
        # 沪市http://eid.csrc.gov.cn/101111/index.html 深市http://eid.csrc.gov.cn/101811/index.html 北交所http://eid.csrc.gov.cn/102611/index.html
        # url 变量 翻页 栏目 http://eid.csrc.gov.cn/101811/index_3_f.html
@@ -474,11 +503,14 @@ if __name__ == '__main__':
        com_name = dic_info[1]
        dic_parms = getUrl(code, url_parms, Catagory2_parms)
        dic_parms_ls = getUrl(code, url_parms_ls, Catagory2_parms_ls)
        if dic_parms:
            start_time_cj = time.time()
+            log.info(f'======开始处理{com_name}=====发行公告=======')
            SpiderByZJH(dic_parms["url"], dic_parms["payload"], dic_info, start_time,num)
            log.info(f'{code}==========={short_name},{com_name},发行公告,耗时{baseCore.getTimeCost(start_time_cj, time.time())}')
            start_time_ls = time.time()
+            log.info(f'======开始处理{com_name}=====临时报告=======')
            SpiderByZJH(dic_parms_ls['url'], dic_parms_ls['payload'], dic_info, start_time,num)
            log.info(f'{code}==========={short_name},{com_name},临时报告,耗时{baseCore.getTimeCost(start_time_ls, time.time())}')
            # UpdateInfoSql(retData,retData_ls,social_code)
@@ -487,11 +519,7 @@ if __name__ == '__main__':
            log.info(f'{short_name} ---- 该企业耗时 ---- {baseCore.getTimeCost(start_time, end_time)}-----------')
            count += 1
            runType = 'NoticeReportCount'
-            baseCore.updateRun(code, runType, count)
+            baseCore.updateRun(social_code, runType, count)
    cursor.close()
    cnx.close()
-    # cursor_.close()
-    # cnx_.close()
-    # 释放资源
    baseCore.close()
--- a/comData/policylaw/BaseCore.py
+++ b/comData/policylaw/BaseCore.py
@@ -11,24 +11,28 @@ import logbook.more
 import pandas as pd
 import requests
 import zhconv
-import pymysql
 import redis
-from docx import Document
-from selenium import webdriver
-from selenium.webdriver.chrome.service import Service
-from openpyxl import Workbook
 import langid
 #创建连接池
 import pymysql
-from pymysql import connections
 from DBUtils.PooledDB import PooledDB
 # import sys
 # sys.path.append('D://zzsn_spider//base//fdfs_client')
 from fdfs_client.client import get_tracker_conf, Fdfs_client
-tracker_conf = get_tracker_conf('E:\\kkwork\\zzsn_spider\\comData\\policylaw\\client.conf')
+tracker_conf = get_tracker_conf('D:\\kkwork\\zzsn_spider\\comData\\policylaw\\client.conf')
 client = Fdfs_client(tracker_conf)
+from obs import ObsClient
+import fitz
+from urllib.parse import unquote
+# SECURITY: cloud credentials must not live in source control. The values are
+# read from the environment first; the literals below remain only as a
+# backward-compatible fallback for existing deployments and should be rotated
+# and removed once OBS_ACCESS_KEY_ID / OBS_SECRET_ACCESS_KEY are configured.
+import os
+obsClient = ObsClient(
+        access_key_id=os.environ.get('OBS_ACCESS_KEY_ID', 'VEHN7D0TJ9316H8AHCAV'),  # Huawei Cloud access key (AK)
+        secret_access_key=os.environ.get('OBS_SECRET_ACCESS_KEY', 'heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY'),  # Huawei Cloud secret key (SK)
+        server=os.environ.get('OBS_SERVER', 'https://obs.cn-north-1.myhuaweicloud.com')  # OBS endpoint that hosts the bucket
+    )
 # 注意 程序退出前 调用BaseCore.close() 关闭相关资源
 class BaseCore:
@@ -437,9 +441,9 @@ class BaseCore:
    #解析word文件页数
-    def doc_page(self,file_path):
+    # def doc_page(self,file_path):
-        doc = Document(file_path)
+    #     doc = Document(file_path)
-        return len(doc.sections)
+    #     return len(doc.sections)
    def pdf_content(self,resp_content):
        # 解析pdf文件内容
        content = ''
@@ -507,9 +511,9 @@ class BaseCore:
        # retData['page_size'] = page_size
        return retData
-    def secrchATT(self,item_id,file_name,type_id):
+    def secrchATT(self,item_id,file_name,type_id,order_by):
-        sel_sql = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s '''
+        sel_sql = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s and order_by=%s '''
-        self.cursor_.execute(sel_sql, (item_id, file_name, type_id))
+        self.cursor_.execute(sel_sql, (item_id, file_name, type_id,order_by))
        selects = self.cursor_.fetchone()
        return selects
@@ -527,13 +531,8 @@ class BaseCore:
            page_size = retData['page_size']
            create_time = retData['create_time']
            order_by = num
-            selects = self.secrchATT(item_id,file_name,type_id)
-            if selects:
-                self.getLogger().info(f'com_name:{com_name}已存在')
-                id = selects[0]
-                return id,full_path
-            else:
            Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
            values = (
@@ -544,11 +543,71 @@ class BaseCore:
            self.cursor_.execute(Upsql, values)  # 插入
            self.cnx_.commit()  # 提交
            self.getLogger().info("更新完成:{}".format(Upsql))
-                selects = self.secrchATT(item_id,file_name,type_id)
+            selects = self.secrchATT(item_id,file_name,type_id,order_by)
            id = selects[0]
            return id,full_path
+    # Format a byte count as a human-readable size string
+    def convert_size(self, size_bytes):
+        """Return ``size_bytes`` scaled to the largest fitting unit, e.g. '1.50 KB'.
+
+        The value is divided by 1024 until it drops below 1024 or the last
+        unit (TB) is reached, then rendered with two decimals.
+        """
+        unit_names = ('bytes', 'KB', 'MB', 'GB', 'TB')
+        amount = size_bytes
+        # Walk every unit except the last; TB is the catch-all below.
+        for unit in unit_names[:-1]:
+            if amount >= 1024:
+                amount /= 1024
+                continue
+            return f"{amount:.2f} {unit}"
+        return f"{amount:.2f} {unit_names[-1]}"
+    def uptoOBS(self, file_href, item_id, pathType, file_name):
+        """Download an attachment and upload it to the 'zzsn' OBS bucket.
+
+        Args:
+            file_href: URL of the attachment to download.
+            item_id: Value stored under ``item_id`` in the returned record.
+            pathType: Object-key prefix inside the bucket (e.g. 'policy/beijing/').
+            file_name: Name of the attachment; the extension taken from
+                ``file_href`` is appended when the name does not contain it.
+
+        Returns:
+            dict: Attachment record. ``state`` is True only when the file was
+            downloaded, uploaded, and the object URL could be read from the
+            upload result; otherwise the record is returned with ``state``
+            left False and the path fields empty.
+        """
+        # splitext keeps the leading dot, e.g. '.pdf'
+        category = os.path.splitext(file_href)[1]
+        retData = {'state': False, 'type_id': 7, 'item_id': item_id, 'group_name': 'group1', 'path': '',
+                   'full_path': '',
+                   'category': category, 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
+                   'create_time': '', 'page_size': '', 'content': ''}
+        headers = {'User-Agent': self.getRandomUserAgent()}
+        # Download with up to three attempts.
+        response = None
+        for _ in range(3):
+            try:
+                response = requests.get(file_href, headers=headers, verify=False, timeout=20)
+                break
+            except Exception as e:
+                self.getLogger().error(f'download failed:{file_href}:{e}')
+                time.sleep(3)
+        if response is None:
+            # Nothing was downloaded, so there is nothing to upload.
+            return retData
+        # Content-Length is optional (e.g. chunked responses); fall back to the
+        # real body length instead of treating its absence as a failure.
+        content_length = response.headers.get('Content-Length')
+        if content_length and str(content_length).isdigit():
+            file_size = int(content_length)
+        else:
+            file_size = len(response.content)
+        # category already carries its dot, so append it as-is (no extra '.').
+        if category not in file_name:
+            file_name = file_name + category
+        # Upload with up to three attempts.
+        result = None
+        for _ in range(3):
+            try:
+                result = obsClient.putContent('zzsn', f'{pathType}' + file_name, content=response.content)
+                break
+            except Exception as e:
+                self.getLogger().error(f'upload failed:{file_name}:{e}')
+                time.sleep(3)
+        if result is None:
+            return retData
+        try:
+            object_url = result['body']['objectUrl']
+            path = object_url.split('.com')[1]
+            full_path = unquote(object_url)
+            size_text = self.convert_size(file_size)
+        except Exception as e:
+            # The upload result carried no usable object URL; report failure.
+            self.getLogger().error(f'error:{e}')
+            return retData
+        retData['path'] = path
+        retData['full_path'] = full_path
+        retData['file_size'] = size_text
+        retData['create_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        # Mark success last so a partial failure can never report state=True.
+        retData['state'] = True
+        return retData

--- a/comData/policylaw/policy.py
+++ b/comData/policylaw/policy.py
@@ -224,6 +224,7 @@ def get_content1():
                        # 判断是否已经爬取过
                        is_href = db_storage.find_one({'网址': href})
                        if is_href:
+                            num+=1
                            log.info('已采集----------跳过')
                            continue
                        try:
@@ -383,6 +384,7 @@ def get_content2():
                        # # 判断是否已经爬取过
                        is_href = db_storage.find_one({'网址': href})
                        if is_href:
+                            num+=1
                            log.info('已采集----------跳过')
                            continue
                        try:
@@ -563,6 +565,7 @@ def get_content3():
                pub_time = li.split('<span>[')[1].split(']</span>')[0]
                is_href = db_storage.find_one({'网址': href})
                if is_href:
+                    num+=1
                    log.info('已采集----------跳过')
                    continue
                sendContent(href, headers,title,pub_time,num)
@@ -591,6 +594,7 @@ def get_content3():
                    # 判断是否已经爬取过
                    is_href = db_storage.find_one({'网址': href})
                    if is_href:
+                        num+=1
                        log.info('已采集----------跳过')
                        continue
                    title = doc_item('a').attr('title')
@@ -612,6 +616,7 @@ def get_content3():
 def bei_jing():
    num = 0
    start_time = time.time()
+    pathType = 'policy/beijing/'
    # 有反爬需要使用selenium
    # service = Service(r'D:/chrome/113/chromedriver.exe')
    # 配置selenium
@@ -664,6 +669,7 @@ def bei_jing():
            # 判断是否已经爬取过
            is_href = db_storage.find_one({'网址': href[0]})
            if is_href:
+                num+=1
                log.info('已采集----------跳过')
                continue
            # 对获取信息页面发送请求
@@ -712,7 +718,7 @@ def bei_jing():
                        or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                        or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                    file_name = file.text.strip()
-                    retData = baseCore.uploadToserver(file_href, '1667')
+                    retData = baseCore.uptoOBS(file_href, '1667',pathType,file_name)
                    if retData['state']:
                        pass
                    else:
@@ -721,7 +727,7 @@ def bei_jing():
                    id_list.append(att_id)
                    # todo:将返回的地址更新到soup
-                    file['href'] = 'http://114.115.215.96/' + full_path
+                    file['href'] = full_path
            # id_ = redefid(id_list)
            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
@@ -754,7 +760,7 @@ def bei_jing():
            # id_list.append(id)
            num += 1
        end_time = time.time()
-        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+        log.info(f'共抓取{num}条数据,共耗时{end_time - start_time}')
        bro.quit()
    except Exception as e:
        log.info(e)
@@ -763,6 +769,7 @@ def bei_jing():
 # 内蒙古
 def nei_meng_gu():
    start = time.time()
+    pathType = 'policy/neimenggu/'
    num = 0
    url = 'http://gzw.nmg.gov.cn/zfxxgk/zcfg/index.html'
    try:
@@ -780,6 +787,7 @@ def nei_meng_gu():
            # todo:测试用 注释掉判重
            is_href = db_storage.find_one({'网址': real_href})
            if is_href:
+                num+=1
                continue
            try:
                # 获取所需信息
@@ -831,16 +839,16 @@ def nei_meng_gu():
                            fu_jian_re = str(real_href).split('/t')[0] + '/' + str(fu_jian_re).split('./')[1]
                            fu_jian_href = fu_jian_re
+                            # print(fu_jian_href)
                            # todo:附件上传至文件服务器
-                            retData = baseCore.uploadToserver(fu_jian_href, '1669')
+                            retData = baseCore.uptoOBS(fu_jian_href, '1669',pathType,title)
                            if retData['state']:
                                pass
                            else:
                                continue
                            att_id, full_path = baseCore.tableUpdate(retData, '内蒙古自治区国资委', title, num)
                            id_list.append(att_id)
-                            # # todo:将返回的地址更新到soup
-                            # fu_jian_link['href'] = 'http://114.115.215.96/' + full_path
                print(title)
                time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
@@ -881,6 +889,7 @@ def nei_meng_gu():
 # 吉林
 def ji_lin():
+    pathType = 'policy/jilin/'
    start = time.time()
    num = 0
    url = 'http://gzw.jl.gov.cn/zwgk/zcwj/'
@@ -902,6 +911,7 @@ def ji_lin():
            title = a.find('a').text.replace('\n', '')
            is_href = db_storage.find_one({'网址': real_href})
            if is_href:
+                num+=1
                continue
            try:
                # real_href = 'http://gzw.jl.gov.cn/zwgk/zcwj//201906/t20190624_2310742.html'
@@ -972,16 +982,17 @@ def ji_lin():
                                    or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
                                    or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                                file_name = fu_jian_href.text.strip()
-                                retData = baseCore.uploadToserver(fu_jian_href, '1670')
+                                # print(fu_jian_href)
+                                retData = baseCore.uptoOBS(fu_jian_href, '1670',pathType,file_name)
                                if retData['state']:
                                    pass
                                else:
                                    continue
                                att_id, full_path = baseCore.tableUpdate(retData, '吉林市国资委', file_name, num)
                                id_list.append(att_id)
+                                #
-                                # todo:将返回的地址更新到soup
+                                # # todo:将返回的地址更新到soup
-                                li.find('a')['href'] = 'http://114.115.215.96/' + full_path
+                                li.find('a')['href'] = full_path
                            else:
                                continue
                else:
@@ -1009,16 +1020,17 @@ def ji_lin():
                        if '.pdf' in fj_href or '.wps' in fj_href or '.docx' in fj_href or '.doc' in fj_href or 'xls' in fj_href or '.zip' in fj_href \
                                or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
                                or '.XLS' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
-                            retData = baseCore.uploadToserver(fj_href, '1670')
+                            # print(fj_href)
+                            retData = baseCore.uptoOBS(fj_href, '1670',pathType,file_name)
                            if retData['state']:
                                pass
                            else:
                                continue
                            att_id, full_path = baseCore.tableUpdate(retData, '吉林省国资委', file_name, num)
                            id_list.append(att_id)
+                            #
-                            # todo:将返回的地址更新到soup
+                            # # todo:将返回的地址更新到soup
-                            fu_jian_href['href'] = 'http://114.115.215.96/' + full_path
+                            fu_jian_href['href'] = full_path
                        else:
                            continue
@@ -1062,7 +1074,7 @@ def ji_lin():
                        save_data(dic_news)
                    num = num + 1
            except Exception as e:
-                print(e)
+                log.info(e)
                pass
    except:
        pass
@@ -1073,6 +1085,7 @@ def ji_lin():
 def shang_hai():
    start = time.time()
+    pathType = 'policy/shanghai/'
    num = 0
    for page in range(1, 7):
@@ -1095,6 +1108,7 @@ def shang_hai():
                    href = 'https://www.gzw.sh.gov.cn' + href
                is_href = db_storage.find_one({'网址': href})
                if is_href:
+                    num+=1
                    continue
                try:
                    href = 'https://www.gzw.sh.gov.cn/shgzw_xxgk_zxgkxx/20230119/7c5e9691b2b54ff293e5d16d746d1a61.html'
@@ -1154,7 +1168,7 @@ def shang_hai():
                        if '.doc' in fu_jian_href or '.docx' in fu_jian_href or '.pdf' in fu_jian_href or '.xls' in fu_jian_href or '.zip' in fu_jian_href \
                                or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
                                or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
-                            retData = baseCore.uploadToserver(fu_jian_href, '1671')
+                            retData = baseCore.uptoOBS(fu_jian_href, '1671',pathType,file_name)
                            if retData['state']:
                                pass
                            else:
@@ -1163,7 +1177,7 @@ def shang_hai():
                            id_list.append(att_id)
                            # todo:将返回的地址更新到soup
-                            a['href'] = 'http://114.115.215.96/' + full_path
+                            a['href'] = full_path
                        else:
                            continue
@@ -1205,6 +1219,7 @@ def shang_hai():
 # 浙江
 def zhe_jiang():
    start = time.time()
+    pathType = 'policy/zhejiang/'
    num = 0
    url = 'http://gzw.zj.gov.cn/col/col1229430928/index.html'
    try:
@@ -1227,6 +1242,7 @@ def zhe_jiang():
                href = 'http://gzw.zj.gov.cn/' + href
            is_href = db_storage.find_one({'网址': href})
            if is_href:
+                num+=1
                continue
            try:
                href_text = requests.get(url=href, headers=headers, verify=False)
@@ -1325,6 +1341,7 @@ def zhe_jiang():
 # 福建
 def fu_jian():
    error_tag = str(404)
+    pathType = 'policy/fujian/'
    num = 0
    start_time = time.time()
    url = 'http://gzw.fujian.gov.cn/zwgk/zcfg/'
@@ -1373,6 +1390,7 @@ def fu_jian():
                # print(real_href)
                is_href = db_storage.find_one({'网址': real_href})
                if is_href:
+                    num+=1
                    continue
                try:
                    # 文章是远程pdf
@@ -1384,7 +1402,7 @@ def fu_jian():
                        content = baseCore.pdf_content(resp_content)
                        contentwithtag = ''
                        # 文件上传至服务器
-                        retData = baseCore.uploadToserver(real_href, '1673')
+                        retData = baseCore.uptoOBS(real_href, '1673',pathType,title)
                        if retData['state']:
                            pass
                        else:
@@ -1420,7 +1438,7 @@ def fu_jian():
                                        or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
                                        or '.XLS' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
                                    # 找到附件后 上传至文件服务器
-                                    retData = baseCore.uploadToserver(fj_href, '1673')
+                                    retData = baseCore.uptoOBS(fj_href, '1673',pathType,file_name)
                                    if retData['state']:
                                        pass
                                    else:
@@ -1428,7 +1446,7 @@ def fu_jian():
                                    att_id, full_path = baseCore.tableUpdate(retData, '福建省国资委', file_name, num)
                                    id_list.append(att_id)
                                    # 将文件服务器的链接替换
-                                    fu_jian['href'] = 'http://114.115.215.96/' + full_path
+                                    fu_jian['href'] = full_path
                            source_ = str(i_soup.find('div', attrs={'class': 'xl_tit2_l'}).text)
                            pub_source = source_.split('来源：')[1].split('发布时间：')[0].strip().lstrip()
@@ -1499,6 +1517,7 @@ def shan_dong():
                href = li.find('a')['href']
                is_href = db_storage.find_one({'网址': href})
                if is_href:
+                    num+=1
                    continue
                try:
                    href_text = requests.get(url=href, headers=headers, verify=False)
@@ -1593,6 +1612,7 @@ def shan_dong():
 # 广东
 def guang_dong():
    start = time.time()
+    pathType = 'policy/guangdong/'
    num = 0
    url = 'http://gzw.gd.gov.cn/zcfg/index.html'
    try:
@@ -1620,6 +1640,7 @@ def guang_dong():
                href = doc_item('a').attr('href')
                is_href = db_storage.find_one({'网址': href})
                if is_href:
+                    num+=1
                    continue
                try:
                    # print(href)
@@ -1644,7 +1665,7 @@ def guang_dong():
                                or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
                                or '.xlsx' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
                            # 附件上传至文件服务器
-                            retData = baseCore.uploadToserver(fj_href, '1676')
+                            retData = baseCore.uptoOBS(fj_href, '1676',pathType,file_name)
                            if retData['state']:
                                pass
                            else:
@@ -1652,7 +1673,7 @@ def guang_dong():
                            att_id, full_path = baseCore.tableUpdate(retData, '广东省国资委', file_name, num)
                            id_list.append(att_id)
                            # 将文件服务器的链接替换
-                            fu_jian['href'] = 'http://114.115.215.96/' + full_path
+                            fu_jian['href'] = full_path
                    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    # todo:传kafka字段
@@ -1692,6 +1713,7 @@ def guang_dong():
 # 海南
 def hai_nan():
+    pathType = 'policy/hainan/'
    def hai_nan1():
        # 部门文件
        num = 0
@@ -1717,6 +1739,7 @@ def hai_nan():
                        href = href.replace('./', 'http://gzw.hainan.gov.cn/zwgk_23509/zfwj/bmwj/')
                    is_href = db_storage.find_one({'网址': href})
                    if is_href:
+                        num+=1
                        continue
                    try:
                        try:
@@ -1759,7 +1782,7 @@ def hai_nan():
                                        or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
                                        or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                                    # 上传至文件服务器
-                                    retData = baseCore.uploadToserver(fu_jian_href, '1677')
+                                    retData = baseCore.uptoOBS(fu_jian_href, '1677',pathType,file_name)
                                    if retData['state']:
                                        pass
                                    else:
@@ -1767,7 +1790,7 @@ def hai_nan():
                                    att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num)
                                    id_list.append(att_id)
                                    # 将文件服务器的链接替换
-                                    fu_jian['href'] = 'http://114.115.215.96/' + full_path
+                                    fu_jian['href'] = full_path
                        except:
                            try:
                                # print(href)
@@ -1801,7 +1824,7 @@ def hai_nan():
                                                or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                                            # print(f'----附件：{fu_jian_href}-----filename:{file_name}')
                                            # 附件上传至文件服务器
-                                            retData = baseCore.uploadToserver(fu_jian_href, '1677')
+                                            retData = baseCore.uptoOBS(fu_jian_href, '1677',pathType,file_name)
                                            if retData['state']:
                                                pass
                                            else:
@@ -1809,7 +1832,7 @@ def hai_nan():
                                            # 更新到数据库
                                            att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num)
                                            id_list.append(att_id)
-                                            fu_jian['href'] = 'http://114.115.215.96/' + full_path
+                                            fu_jian['href'] = full_path
                                except:
                                    continue
@@ -1888,6 +1911,7 @@ def hai_nan():
                # print(title,href)
                is_href = db_storage.find_one({'网址': href})
                if is_href:
+                    num+=1
                    continue
                try:
                    # print(href)
@@ -1959,6 +1983,7 @@ def hai_nan():
                # print(title,href)
                is_href = db_storage.find_one({'网址': href})
                if is_href:
+                    num+=1
                    continue
                try:
                    # print(href)
@@ -2007,7 +2032,7 @@ def hai_nan():
                                or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
                                or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                            # 上传至文件服务器
-                            retData = baseCore.uploadToserver(fu_jian_href, '1677')
+                            retData = baseCore.uptoOBS(fu_jian_href, '1677',pathType,file_name)
                            if retData['state']:
                                pass
                            else:
@@ -2015,7 +2040,7 @@ def hai_nan():
                            att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num)
                            id_list.append(att_id)
                            # todo:将返回的地址更新到soup
-                            fu_jian['href'] = 'http://114.115.215.96/' + full_path
+                            fu_jian['href'] = full_path
                            # print(f'附件：{fu_jian_href}')
                    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    # todo:传kafka字段
@@ -2065,6 +2090,7 @@ def hai_nan():
                # print(title,href)
                is_href = db_storage.find_one({'网址': href})
                if is_href:
+                    num+=1
                    continue
                try:
                    href_text = requests.get(url=href, headers=headers, verify=False)
@@ -2113,14 +2139,14 @@ def hai_nan():
                                    or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
                                    or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                                # 上传至文件服务器
-                                retData = baseCore.uploadToserver(fu_jian_href, '1677')
+                                retData = baseCore.uptoOBS(fu_jian_href, '1677',pathType,file_name)
                                if retData['state']:
                                    pass
                                else:
                                    continue
                                att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num)
                                id_list.append(att_id)
-                                fu_jian['href'] = 'http://114.115.215.96/' + full_path
+                                fu_jian['href'] = full_path
                                print(f'----附件：{fu_jian_href}')
                    else:
                        pass
@@ -2175,10 +2201,13 @@ def hai_nan():
                try:
                    is_href = db_storage.find_one({'网址': i_href})
                    if is_href:
+                        num+=1
                        continue
                    if i_href == 'https://www.gov.cn/jrzg/2013-11/27/content_2536600.htm':
+                        num+=1
                        continue
                    if i_href == 'https://www.gov.cn/jrzg/2013-09/28/content_2497241.htm':
+                        num+=1
                        continue
                    # print(f'中央----{i_href}----')
                    href_text = requests.get(url=i_href, headers=headers, verify=False)
@@ -2330,6 +2359,7 @@ def hai_nan():
 # 四川
 def si_chuan():
    num = 0
+    pathType = 'policy/sichuan/'
    start_time = time.time()
    for page in range(1, 3):
        if page == 1:
@@ -2349,9 +2379,10 @@ def si_chuan():
                    href = 'http://gzw.sc.gov.cn' + doc_item('a').attr('href')
                is_href = db_storage.find_one({'网址': href})
                if is_href:
+                    num+=1
                    continue
                try:
-                    print(href)
+                    # print(href)
                    href_text = requests.get(url=href, headers=headers, verify=False).text
                    doc_href = pq(href_text)
                    title = str(doc_href('.xxgkzn_title').text()).replace('\n', '').replace('\r', '')
@@ -2374,14 +2405,14 @@ def si_chuan():
                                or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
                                or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                            # 对附件上传至文件服务器
-                            retData = baseCore.uploadToserver(fu_jian_href, '1678')
+                            retData = baseCore.uptoOBS(fu_jian_href, '1678',pathType,file_name)
-                            if retData['stste']:
+                            if retData['state']:
                                pass
                            else:
                                continue
                            att_id, full_path = baseCore.tableUpdate(retData, '四川省国资委', file_name, num)
                            id_list.append(att_id)
-                            fu_jian['href'] = 'http://114.115.215.96/' + full_path
+                            fu_jian['href'] = full_path
                            # fu_jian_href_list.append(fu_jian_href)
                    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
@@ -2423,6 +2454,7 @@ def si_chuan():
 # 广西
 def guang_xi():
    num = 0
+    pathType = 'policy/guangxi/'
    start_time = time.time()
    url_all = """
    http://gzw.gxzf.gov.cn/wjzx/2023nwj/  1
@@ -2463,6 +2495,7 @@ def guang_xi():
                    href = url.split('index')[0] + doc_item('a').attr('href').replace('./', '')
                    is_href = db_storage.find_one({'网址': href})
                    if is_href:
+                        num+=1
                        continue
                    try:
                        # print(href)
@@ -2498,7 +2531,7 @@ def guang_xi():
                                    or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
                                    or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                                # 附件上传至文件服务器
-                                retData = baseCore.uploadToserver(fu_jian_href, '1692')
+                                retData = baseCore.uptoOBS(fu_jian_href, '1692',pathType,file_name)
                                if retData['state']:
                                    pass
                                else:
@@ -2507,7 +2540,7 @@ def guang_xi():
                                att_id, full_path = baseCore.tableUpdate(retData, '广西壮族自治区国资委', file_name, num)
                                id_list.append(att_id)
                                # 将附件链接替换
-                                fu_jian['href'] = 'http://114.115.215.96/' + full_path
+                                fu_jian['href'] = full_path
                        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                        # todo:传kafka字段
@@ -2550,6 +2583,7 @@ def gui_zhou():
    http://gzw.guizhou.gov.cn/zwgk/xxgkml/zcwj/  11
    http://gzw.guizhou.gov.cn/zwgk/xxgkml/qlqdhzrqd/  1
    """
+    pathType = 'policy/guizhou/'
    num = 0
    start_time = time.time()
    for page in range(0, 11):
@@ -2566,6 +2600,7 @@ def gui_zhou():
                href = doc_item('a').attr('href')
                is_href = db_storage.find_one({'网址': href})
                if is_href:
+                    num+=1
                    continue
                try:
                    # print(href)
@@ -2606,7 +2641,7 @@ def gui_zhou():
                                or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
                                or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                            # 附件上传至文件服务器
-                            retData = baseCore.uploadToserver(fu_jian_href, '1694')
+                            retData = baseCore.uptoOBS(fu_jian_href, '1694',pathType,file_name)
                            if retData['state']:
                                pass
                            else:
@@ -2615,7 +2650,7 @@ def gui_zhou():
                            att_id, full_path = baseCore.tableUpdate(retData, '贵州省国资委', file_name, num)
                            id_list.append(att_id)
                            # 将附件链接替换
-                            fu_jian['href'] = 'http://114.115.215.96/' + full_path
+                            fu_jian['href'] = full_path
                    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    # todo:传kafka字段
@@ -2655,6 +2690,7 @@ def gui_zhou():
 # 云南
 def yun_nan():
+    pathType = 'policy/yunnan/'
    def yun_nan1():
        """
        http://gzw.yn.gov.cn/yngzw/c100093/zfxxgk_gkgz.shtml  9
@@ -2679,6 +2715,7 @@ def yun_nan():
                        href = 'http://gzw.yn.gov.cn' + doc_item('a').attr('href')
                    is_href = db_storage.find_one({'网址': href})
                    if is_href:
+                        num+=1
                        continue
                    try:
                        fu_jian_href_list = []
@@ -2710,7 +2747,7 @@ def yun_nan():
                                        or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                                    try:
                                        # 附件上传至文件服务器
-                                        retData = baseCore.uploadToserver(fu_jian_href, '1679')
+                                        retData = baseCore.uptoOBS(fu_jian_href, '1679',pathType,file_name)
                                        if retData['state']:
                                            pass
                                        else:
@@ -2719,7 +2756,7 @@ def yun_nan():
                                        att_id, full_path = baseCore.tableUpdate(retData, '云南省国资委', file_name, num)
                                        id_list.append(att_id)
                                        # 将附件链接替换
-                                        fu_jian['href'] = 'http://114.115.215.96/' + full_path
+                                        fu_jian['href'] = full_path
                                    except:
                                        continue
                            href_resp.close()
@@ -2788,6 +2825,7 @@ def yun_nan():
                    href = 'http://gzw.yn.gov.cn' + li.find('a').get('href').replace(' ', '')
                    is_href = db_storage.find_one({'网址': href})
                    if is_href:
+                        num+=1
                        continue
                    try:
                        print(href)
@@ -2822,7 +2860,7 @@ def yun_nan():
                                    print(fu_jian_href)
                                    try:
                                        # 附件上传至文件服务器
-                                        retData = baseCore.uploadToserver(fu_jian_href, '1679')
+                                        retData = baseCore.uptoOBS(fu_jian_href, '1679',pathType,file_name)
                                        if retData['state']:
                                            pass
                                        else:
@@ -2831,7 +2869,7 @@ def yun_nan():
                                        att_id, full_path = baseCore.tableUpdate(retData, '云南省国资委', file_name, num)
                                        id_list.append(att_id)
                                        # 将附件链接替换
-                                        fu_jian['href'] = 'http://114.115.215.96/' + full_path
+                                        fu_jian['href'] = full_path
                                    except:
                                        continue
                            res_.close()
@@ -2890,6 +2928,7 @@ def chong_qing():
    http://gzw.cq.gov.cn/zwgk_191/fdzdgknr/zcwj/qtwj/  2
    """
    num = 0
+    pathType = 'policy/chongqing/'
    start_time = time.time()
    for page in range(0, 4):
        if page == 0:
@@ -2913,6 +2952,7 @@ def chong_qing():
                        href = url.split('index')[0] + title_item('a').attr('href').replace('./', '')
                    is_href = db_storage.find_one({'网址': href})
                    if is_href:
+                        num+=1
                        continue
                    try:
                        print(href)
@@ -2960,7 +3000,7 @@ def chong_qing():
                                    or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                                try:
                                    # 附件上传至文件服务器
-                                    retData = baseCore.uploadToserver(fu_jian_href, '1693')
+                                    retData = baseCore.uptoOBS(fu_jian_href, '1693',pathType,file_name)
                                    if retData['state']:
                                        pass
                                    else:
@@ -2969,7 +3009,7 @@ def chong_qing():
                                    att_id, full_path = baseCore.tableUpdate(retData, '重庆市国资委', file_name, num)
                                    id_list.append(att_id)
                                    # 将附件链接替换
-                                    fu_jian['href'] = 'http://114.115.215.96/' + full_path
+                                    fu_jian['href'] = full_path
                                except:
                                    continue
                        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
@@ -3011,6 +3051,7 @@ def chong_qing():
 # 天津
 def tian_jin():
+    pathType = 'policy/tianjin/'
    def tian_jin1():
        num = 0
        start_time = time.time()
@@ -3038,6 +3079,7 @@ def tian_jin():
                        href = i_href
                    is_href = db_storage.find_one({'网址': href})
                    if is_href:
+                        num+=1
                        continue
                    try:
                        # href_text = requests.get(url=href, headers=headers, verify=False).content.decode('utf-8')
@@ -3082,7 +3124,7 @@ def tian_jin():
                                    or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                                file_name = file.text.strip()
-                                retData = baseCore.uploadToserver(file_href, '1683')
+                                retData = baseCore.uptoOBS(file_href, '1683',pathType,file_name)
                                if retData['state']:
                                    pass
                                else:
@@ -3090,7 +3132,7 @@ def tian_jin():
                                att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num)
                                id_list.append(att_id)
                                # todo:将返回的地址更新到soup
-                                file['href'] = 'http://114.115.215.96/' + full_path
+                                file['href'] = full_path
                        # id_ = redefid(id_list)
                        contentWithTag = str(soup.prettify())
                        if len(contentWithTag) < 1:
@@ -3160,6 +3202,7 @@ def tian_jin():
                        href = url.split('index')[0] + href.replace('./', '')
                    is_href = db_storage.find_one({'网址': href})
                    if is_href:
+                        num+=1
                        continue
                    try:
                        # href_text = requests.get(url=href, headers=headers, verify=False).content.decode('utf-8')
@@ -3205,7 +3248,7 @@ def tian_jin():
                                    or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                                file_name = file.text.strip()
-                                retData = baseCore.uploadToserver(file_href, '1683')
+                                retData = baseCore.uptoOBS(file_href, '1683',pathType,file_name)
                                if retData['state']:
                                    pass
                                else:
@@ -3213,7 +3256,7 @@ def tian_jin():
                                att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num)
                                id_list.append(att_id)
                                # todo:将返回的地址更新到soup
-                                file['href'] = 'http://114.115.215.96/' + full_path
+                                file['href'] = full_path
                        # id_ = redefid(id_list)
                        contentWithTag = str(soup.prettify())
                        if len(contentWithTag) < 1:
@@ -3284,6 +3327,7 @@ def tian_jin():
                        href = href.replace('./', 'https://sasac.tj.gov.cn/ZWGK1142/zcwj/gjjwj/')
                    is_href = db_storage.find_one({'网址': href})
                    if is_href:
+                        num+=1
                        continue
                    try:
@@ -3332,7 +3376,7 @@ def tian_jin():
                                    or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                                file_name = file.text.strip()
-                                retData = baseCore.uploadToserver(file_href, '1683')
+                                retData = baseCore.uptoOBS(file_href, '1683',pathType,file_name)
                                if retData['state']:
                                    pass
                                else:
@@ -3340,7 +3384,7 @@ def tian_jin():
                                att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num)
                                id_list.append(att_id)
                                # todo:将返回的地址更新到soup
-                                file['href'] = 'http://114.115.215.96/' + full_path
+                                file['href'] = full_path
                        # id_ = redefid(id_list)
                        contentWithTag = str(soup.prettify())
                        if len(contentWithTag) < 1:
@@ -3388,6 +3432,7 @@ def tian_jin():
 # 新疆
 def xin_jiang():
+    pathType = 'policy/xinjiang/'
    def xin_jiang1():
        num = 0
        start_time = time.time()
@@ -3407,6 +3452,7 @@ def xin_jiang():
                        continue
                    is_href = db_storage.find_one({'网址': href})
                    if is_href:
+                        num+=1
                        continue
                    #         href = 'http://gzw.xinjiang.gov.cn/gzw/zcwj/201909/559cf77b5a954d028bd187d6c6e46747.shtml'
                    try:
@@ -3432,7 +3478,7 @@ def xin_jiang():
                                    or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                                file_name = file.text.strip()
-                                retData = baseCore.uploadToserver(file_href, '1682')
+                                retData = baseCore.uptoOBS(file_href, '1682',pathType,file_name)
                                if retData['state']:
                                    pass
                                else:
@@ -3440,7 +3486,7 @@ def xin_jiang():
                                att_id, full_path = baseCore.tableUpdate(retData, '新疆维吾尔自治区国资委', file_name, num)
                                id_list.append(att_id)
                                # todo:将返回的地址更新到soup
-                                file['href'] = 'http://114.115.215.96/' + full_path
+                                file['href'] = full_path
                        # id_ = redefid(id_list)
                        contentWithTag = str(soup.prettify())
                        if len(contentWithTag) < 1:
@@ -3509,6 +3555,7 @@ def xin_jiang():
                        href = 'http://gyzc.xjbt.gov.cn' + href
                    is_href = db_storage.find_one({'网址': href})
                    if is_href:
+                        num+=1
                        continue
                    try:
                        href_res = requests.get(url=href, headers=headers, verify=False)
@@ -3530,7 +3577,7 @@ def xin_jiang():
                                    or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                                file_name = file.text.strip()
-                                retData = baseCore.uploadToserver(file_href, '1682')
+                                retData = baseCore.uptoOBS(file_href, '1682',pathType,file_name)
                                if retData['state']:
                                    pass
                                else:
@@ -3538,7 +3585,7 @@ def xin_jiang():
                                att_id, full_path = baseCore.tableUpdate(retData, '新疆维吾尔自治区国资委', file_name, num)
                                id_list.append(att_id)
                                # todo:将返回的地址更新到soup
-                                file['href'] = 'http://114.115.215.96/' + full_path
+                                file['href'] = full_path
                        # id_ = redefid(id_list)
                        contentWithTag = str(soup.prettify())
                        if len(contentWithTag) < 1:
@@ -3594,6 +3641,7 @@ def xin_jiang():
 # 山西
 def shan_xi():
+    pathType = 'policy/shanxi/'
    num = 0
    start_time = time.time()
    for page in range(1, 7):
@@ -3618,6 +3666,7 @@ def shan_xi():
                publishDate = tr.xpath('./td[2]/span/text()')[0]
                is_href = db_storage.find_one({'网址': href})
                if is_href:
+                    num+=1
                    continue
                try:
                    if ".pdf" in href:
@@ -3648,7 +3697,7 @@ def shan_xi():
                                or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                            file_name = file.text.strip()
-                            retData = baseCore.uploadToserver(file_href, '1684')
+                            retData = baseCore.uptoOBS(file_href, '1684',pathType,file_name)
                            if retData['state']:
                                pass
                            else:
@@ -3656,7 +3705,7 @@ def shan_xi():
                            att_id, full_path = baseCore.tableUpdate(retData, '山西省国资委', file_name, num)
                            id_list.append(att_id)
                            # todo:将返回的地址更新到soup
-                            file['href'] = 'http://114.115.215.96/' + full_path
+                            file['href'] = full_path
                    # id_ = redefid(id_list)
                    contentWithTag = str(soup.prettify())
                    if len(contentWithTag) < 1:
@@ -3707,6 +3756,7 @@ def shan_xi():
 # 辽宁
 def liao_ning():
+    pathType = 'policy/liaoning/'
    num = 0
    start_time = time.time()
    for page in range(1, 3):
@@ -3727,6 +3777,7 @@ def liao_ning():
                        href = 'https://gzw.ln.gov.cn/' + href
                is_href = db_storage.find_one({'网址': href})
                if is_href:
+                    num+=1
                    continue
                try:
                    href_text = requests.get(url=href, headers=headers, verify=False)
@@ -3758,7 +3809,7 @@ def liao_ning():
                                or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                            file_name = file.text.strip()
-                            retData = baseCore.uploadToserver(file_href, '1685')
+                            retData = baseCore.uptoOBS(file_href, '1685',pathType,file_name)
                            if retData['state']:
                                pass
                            else:
@@ -3766,7 +3817,7 @@ def liao_ning():
                            att_id, full_path = baseCore.tableUpdate(retData, '辽宁省国资委', file_name, num)
                            id_list.append(att_id)
                            # todo:将返回的地址更新到soup
-                            file['href'] = 'http://114.115.215.96/' + full_path
+                            file['href'] = full_path
                    # id_ = redefid(id_list)
                    contentWithTag = str(soup.prettify())
                    if len(contentWithTag) < 1:
@@ -3816,6 +3867,7 @@ def liao_ning():
 # 黑龙江
 def hei_long_jiang():
+    pathType = 'policy/heilongjiang/'
    num = 0
    start_time = time.time()
    for page in range(1, 3):
@@ -3837,6 +3889,7 @@ def hei_long_jiang():
                        pub_hao = ''
                    is_href = db_storage.find_one({'网址': href})
                    if is_href:
+                        num+=1
                        continue
                    try:
                        contentWithTag = text['data']['results'][row]['contentHtml']
@@ -3861,7 +3914,7 @@ def hei_long_jiang():
                                    or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                                file_name = file.text.strip()
-                                retData = baseCore.uploadToserver(file_href, '1687')
+                                retData = baseCore.uptoOBS(file_href, '1687',pathType,file_name)
                                if retData['state']:
                                    pass
                                else:
@@ -3869,7 +3922,7 @@ def hei_long_jiang():
-                                att_id, full_path = baseCore.tableUpdate(retData, '江苏省国资委', file_name, num)
+                                att_id, full_path = baseCore.tableUpdate(retData, '黑龙江省国资委', file_name, num)
                                id_list.append(att_id)
                                # todo:将返回的地址更新到soup
-                                file['href'] = 'http://114.115.215.96/' + full_path
+                                file['href'] = full_path
                        contentWithTag = str(soup.prettify())
                        content = soup.text
@@ -3912,6 +3965,7 @@ def hei_long_jiang():
 # 江苏
 def jiang_su():
    num = 0
+    pathType = 'policy/jiangsu/'
    start_time = time.time()
    pagestart = 1
    pageend = 45
@@ -3940,6 +3994,7 @@ def jiang_su():
                title = a.text
                is_href = db_storage.find_one({'网址': href})
                if is_href:
+                    num+=1
                    continue
                try:
                    href_text = requests.get(url=href, headers=headers, verify=False)
@@ -3967,7 +4022,7 @@ def jiang_su():
                                or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                            file_name = file.text.strip()
-                            retData = baseCore.uploadToserver(file_href, '1687')
+                            retData = baseCore.uptoOBS(file_href, '1687',pathType,file_name)
                            if retData['state']:
                                pass
                            else:
@@ -3975,7 +4030,7 @@ def jiang_su():
                            att_id, full_path = baseCore.tableUpdate(retData, '江苏省国资委', file_name, num)
                            id_list.append(att_id)
                            # todo:将返回的地址更新到soup
-                            file['href'] = 'http://114.115.215.96/' + full_path
+                            file['href'] = full_path
                    contentWithTag = str(soup.prettify())
                    content = soup.text
@@ -4022,6 +4077,7 @@ def jiang_su():
 # 安徽
 def an_hui():
+    pathType = 'policy/anhui/'
    def an_hui1():
        num = 0
        start_time = time.time()
@@ -4037,6 +4093,7 @@ def an_hui():
                    href = doc_item('a').attr('href')
                    is_href = db_storage.find_one({'网址': href})
                    if is_href:
+                        num+=1
                        continue
                    try:
                        href_text = requests.get(url=href, headers=headers, verify=False)
@@ -4068,7 +4125,7 @@ def an_hui():
                                    or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                                file_name = file.text.strip()
-                                retData = baseCore.uploadToserver(file_href, '1688')
+                                retData = baseCore.uptoOBS(file_href, '1688',pathType,file_name)
                                if retData['state']:
                                    pass
                                else:
@@ -4076,7 +4133,7 @@ def an_hui():
                                att_id, full_path = baseCore.tableUpdate(retData, '安徽省国资委', file_name, num)
                                id_list.append(att_id)
                                # todo:将返回的地址更新到soup
-                                file['href'] = 'http://114.115.215.96/' + full_path
+                                file['href'] = full_path
                        contentWithTag = str(soup.prettify())
                        content = soup.text
@@ -4164,7 +4221,7 @@ def an_hui():
                                    or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                                file_name = file.text.strip()
-                                retData = baseCore.uploadToserver(file_href, '1688')
+                                retData = baseCore.uptoOBS(file_href, '1688',pathType,file_name)
                                if retData['state']:
                                    pass
                                else:
@@ -4172,7 +4229,7 @@ def an_hui():
                                att_id, full_path = baseCore.tableUpdate(retData, '安徽省国资委', file_name, num)
                                id_list.append(att_id)
                                # todo:将返回的地址更新到soup
-                                file['href'] = 'http://114.115.215.96/' + full_path
+                                file['href'] = full_path
                        contentWithTag = str(soup.prettify())
                        content = soup.text
@@ -4223,6 +4280,7 @@ def jiang_xi():
    121-164
    """
    num = 0
+    pathType = 'policy/jiangxi/'
    start_time = time.time()
    startrecord = 1
    endrecord = 60
@@ -4248,6 +4306,7 @@ def jiang_xi():
            for href in href_list:
                is_href = db_storage.find_one({'网址': href})
                if is_href:
+                    num+=1
                    continue
                try:
                    href_res = requests.get(url=href, headers=headers, verify=False)
@@ -4289,7 +4348,7 @@ def jiang_xi():
                                or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                            file_name = file.text.strip()
-                            retData = baseCore.uploadToserver(file_href, '1689')
+                            retData = baseCore.uptoOBS(file_href, '1689',pathType,file_name)
                            if retData['state']:
                                pass
                            else:
@@ -4297,7 +4356,7 @@ def jiang_xi():
                            att_id, full_path = baseCore.tableUpdate(retData, '江西省国资委', file_name, num)
                            id_list.append(att_id)
                            # todo:将返回的地址更新到soup
-                            file['href'] = 'http://114.115.215.96/' + full_path
+                            file['href'] = full_path
                    contentWithTag = str(soup.prettify())
                    content = soup.text
@@ -4346,6 +4405,7 @@ def jiang_xi():
 # 河南
 def he_nan():
    num = 0
+    pathType = 'policy/henan/'
    start_time = time.time()
    for page in range(0, 7):
        if page == 0:
@@ -4361,6 +4421,7 @@ def he_nan():
                href = doc_item('a').attr('href')
                is_href = db_storage.find_one({'网址': href})
                if is_href:
+                    num+=1
                    continue
                href_res = requests.get(url=href, headers=headers, verify=False)
                href_res.encoding = href_res.apparent_encoding
@@ -4383,7 +4444,7 @@ def he_nan():
                            or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                            or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                        file_name = file.text.strip()
-                        retData = baseCore.uploadToserver(file_href, '1690')
+                        retData = baseCore.uptoOBS(file_href, '1690',pathType,file_name)
                        if retData['state']:
                            pass
                        else:
@@ -4391,7 +4452,7 @@ def he_nan():
                        att_id, full_path = baseCore.tableUpdate(retData, '河南省国资委', file_name, num)
                        id_list.append(att_id)
                        # todo:将返回的地址更新到soup
-                        file['href'] = 'http://114.115.215.96/' + full_path
+                        file['href'] =  full_path
                contentWithTag = str(soup.prettify())
                content = soup.text
@@ -4438,6 +4499,7 @@ def he_nan():
 # 湖南
 def hu_nan():
    num = 0
+    pathType = 'policy/hunan/'
    start_time = time.time()
    for page in range(1, 7):
        if page == 1:
@@ -4454,6 +4516,7 @@ def hu_nan():
                publishDate = doc_item('td:nth-child(3)').text()
                is_href = db_storage.find_one({'网址': href})
                if is_href:
+                    num+=1
                    continue
                # href = 'http://gzw.hunan.gov.cn/gzw/xxgk_71571/zcfg/201109/t20110920_1942364.html'
                try:
@@ -4490,7 +4553,7 @@ def hu_nan():
                                or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                            file_name = file.text.strip()
-                            retData = baseCore.uploadToserver(file_href, '1691')
+                            retData = baseCore.uptoOBS(file_href, '1691',pathType,file_name)
                            if retData['state']:
                                pass
                            else:
@@ -4498,7 +4561,7 @@ def hu_nan():
                            att_id, full_path = baseCore.tableUpdate(retData, '湖南省国资委', file_name, num)
                            id_list.append(att_id)
                            # todo:将返回的地址更新到soup
-                            file['href'] = 'http://114.115.215.96/' + full_path
+                            file['href'] = full_path
                    contentWithTag = str(soup.prettify())
                    content = soup.text
@@ -4538,6 +4601,7 @@ def hu_nan():
 # 甘肃
 def gan_su():
+    pathType = 'policy/gansu/'
    def gan_su1():
        num = 0
        start_time = time.time()
@@ -4581,6 +4645,7 @@ def gan_su():
                    publishDate = dd['publishDate']
                    is_href = db_storage.find_one({'网址': href})
                    if is_href:
+                        num+=1
                        continue
                    for i in range(0, 4):
                        bro.get(href)
@@ -4609,7 +4674,7 @@ def gan_su():
                                or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                            file_name = file.text.strip()
-                            retData = baseCore.uploadToserver(file_href, '1696')
+                            retData = baseCore.uptoOBS(file_href, '1696',pathType,file_name)
                            if retData['state']:
                                pass
                            else:
@@ -4617,7 +4682,7 @@ def gan_su():
                            att_id, full_path = baseCore.tableUpdate(retData, '甘肃省国资委', file_name, num)
                            id_list.append(att_id)
                            # todo:将返回的地址更新到soup
-                            file['href'] = 'http://114.115.215.96/' + full_path
+                            file['href'] =  full_path
                    # id_ = redefid(id_list)
                    contentWithTag = str(soup.prettify())
                    content = soup.text
@@ -4688,6 +4753,7 @@ def gan_su():
                    publishDate = dd['publishDate']
                    is_href = db_storage.find_one({'网址': href})
                    if is_href:
+                        num+=1
                        continue
                    bro.get(href)
                    try:
@@ -4743,7 +4809,7 @@ def gan_su():
                                or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                            file_name = file.text.strip()
-                            retData = baseCore.uploadToserver(file_href, '1696')
+                            retData = baseCore.uptoOBS(file_href, '1696',pathType,file_name)
                            if retData['state']:
                                pass
                            else:
@@ -4751,7 +4817,7 @@ def gan_su():
                            att_id, full_path = baseCore.tableUpdate(retData, '甘肃省国资委', file_name, num)
                            id_list.append(att_id)
                            # todo:将返回的地址更新到soup
-                            file['href'] = 'http://114.115.215.96/' + full_path
+                            file['href'] = full_path
                    contentWithTag = str(soup.prettify())
                    content = soup.text
@@ -4849,6 +4915,7 @@ def gan_su():
                publishDate = dd['publishDate']
                is_href = db_storage.find_one({'网址': href})
                if is_href:
+                    num+=1
                    continue
                try:
                    bro.get(href)
@@ -4900,7 +4967,7 @@ def gan_su():
                                or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                            file_name = file.text.strip()
-                            retData = baseCore.uploadToserver(file_href, '1696')
+                            retData = baseCore.uptoOBS(file_href, '1696',pathType,file_name)
                            if retData['state']:
                                pass
                            else:
@@ -4908,7 +4975,7 @@ def gan_su():
                            att_id, full_path = baseCore.tableUpdate(retData, '甘肃省国资委', file_name, num)
                            id_list.append(att_id)
                            # todo:将返回的地址更新到soup
-                            file['href'] = 'http://114.115.215.96/' + full_path
+                            file['href'] = full_path
                    contentWithTag = str(soup.prettify())
                    content = soup.text
@@ -4958,6 +5025,7 @@ def gan_su():
 # 宁夏
 def ning_xia():
    num = 0
+    pathType = 'policy/ningxia/'
    start_time = time.time()
    for page in range(0, 3):
        if page == 0:
@@ -4976,6 +5044,7 @@ def ning_xia():
                publishDate = li.find('span', attrs={'class': 'stdnewslistspan'}).text
                is_href = db_storage.find_one({'网址': href})
                if is_href:
+                    num+=1
                    continue
                try:
                    href_res = requests.get(url=href, headers=headers, verify=False)
@@ -5001,7 +5070,7 @@ def ning_xia():
                                or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                            file_name = file.text.strip()
-                            retData = baseCore.uploadToserver(file_href, '1697')
+                            retData = baseCore.uptoOBS(file_href, '1697',pathType,file_name)
                            if retData['state']:
                                pass
                            else:
@@ -5009,7 +5078,7 @@ def ning_xia():
                            att_id, full_path = baseCore.tableUpdate(retData, '宁夏回族自治区国资委', file_name, num)
                            id_list.append(att_id)
                            # todo:将返回的地址更新到soup
-                            file['href'] = 'http://114.115.215.96/' + full_path
+                            file['href'] = full_path
                    # id_ = redefid(id_list)
                    contentWithTag = str(soup.prettify())
                    content = soup.text
@@ -5052,6 +5121,7 @@ def ning_xia():
 # 陕西
 def shanxi():
    num = 0
+    pathType = 'policy/shan_xi/'
    start_time = time.time()
    url = 'https://sxgz.shaanxi.gov.cn/newstyle/pub_newschannel.asp?chid=100127'
    # url = 'https://sxgz.shaanxi.gov.cn/newstyle/pub_newschannel.asp?chid=100127'
@@ -5072,6 +5142,7 @@ def shanxi():
                href = 'https://sxgz.shaanxi.gov.cn/' + href
            is_href = db_storage.find_one({'网址': href})
            if is_href:
+                num+=1
                continue
            try:
                res_href = requests.get(url=href, headers=headers)
@@ -5101,7 +5172,7 @@ def shanxi():
                            or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                            or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                        file_name = file.text.strip()
-                        retData = baseCore.uploadToserver(file_href, '1680')
+                        retData = baseCore.uptoOBS(file_href, '1680',pathType,file_name)
                        if retData['state']:
                            pass
                        else:
@@ -5109,7 +5180,7 @@ def shanxi():
                        att_id, full_path = baseCore.tableUpdate(retData, '陕西省国资委', file_name, num)
                        id_list.append(att_id)
                        # todo:将返回的地址更新到soup
-                        file['href'] = 'http://114.115.215.96/' + full_path
+                        file['href'] = full_path
                # id_ = redefid(id_list)
                contentWithTag = str(soup.prettify())
                content = soup.text
@@ -5152,6 +5223,7 @@ def shanxi():
 # 西藏
 def xi_zang():
    start_time = time.time()
+    pathType = 'policy/xizang/'
    url_list = ['http://gzw.lasa.gov.cn/gzw/zccfg/common_list.shtml',
                'http://gzw.lasa.gov.cn/gzw/wjzl/common_list.shtml', ]
    for url in url_list:
@@ -5169,6 +5241,7 @@ def xi_zang():
                title = li.find('a').text
                is_href = db_storage.find_one({'网址': href})
                if is_href:
+                    num+=1
                    continue
                try:
                    res_href = requests.get(url=href, headers=headers)
@@ -5194,7 +5267,7 @@ def xi_zang():
                                or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                            file_name = file.text.strip()
-                            retData = baseCore.uploadToserver(file_href, '1695')
+                            retData = baseCore.uptoOBS(file_href, '1695',pathType,file_name)
                            if retData['state']:
                                pass
                            else:
@@ -5202,7 +5275,7 @@ def xi_zang():
                            att_id, full_path = baseCore.tableUpdate(retData, '西藏自治区国资委', file_name, num)
                            id_list.append(att_id)
                            # todo:将返回的地址更新到soup
-                            file['href'] = 'http://114.115.215.96/' + full_path
+                            file['href'] = full_path
                    # id_ = redefid(id_list)
                    contentWithTag = str(soup.prettify())
                    # todo:替换完成之后，将附件上传至文件服务器
@@ -5242,6 +5315,7 @@ def xi_zang():
 # 青海
 def qing_hai():
+    pathType = 'policy/qinghai/'
    def qing_hai1():
        num = 0
        start_time = time.time()
@@ -5259,6 +5333,7 @@ def qing_hai():
                    durl = tr.find('a').get('href')
                    is_href = db_storage.find_one({'网址': durl})
                    if is_href:
+                        num+=1
                        log.info('已采集----------跳过')
                        continue
                    title = tr.find('a').text
@@ -5297,7 +5372,7 @@ def qing_hai():
                                att_id, full_path = baseCore.tableUpdate(retData, '青海省国资委', file_name, num)
                                id_list.append(att_id)
                                # todo:将返回的地址更新到soup
-                                file['href'] = 'http://114.115.215.96/' + full_path
+                                file['href'] = full_path
                        # id_ = redefid(id_list)
                        contentWithTag = str(soup.prettify())
                        # todo:替换完成之后，将附件上传至文件服务器
@@ -5659,41 +5734,41 @@ def hu_bei():
    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
 if __name__ == '__main__':
-    get_content1()
+    # get_content1()
-    get_content2()
+    # get_content2()
-    get_content3()
+    # get_content3()
-    bei_jing()
+    # bei_jing()
-    nei_meng_gu()
+    # nei_meng_gu()
    ji_lin()
-    shang_hai()
+    # shang_hai()
-    zhe_jiang()
+    # zhe_jiang()
-    fu_jian()
+    # fu_jian()
-    shan_dong()
+    # shan_dong()
-    guang_dong()
+    # guang_dong()
-    hai_nan()
+    # hai_nan()
-    si_chuan()
+    # si_chuan()
-    guang_xi()
+    # guang_xi()
-    gui_zhou()
+    # gui_zhou()
-    yun_nan()
+    # yun_nan()
-    chong_qing()
+    # chong_qing()
-    tian_jin()
+    # tian_jin()
-    xin_jiang()
+    # xin_jiang()
-    shan_xi()
+    # shan_xi()
-    liao_ning()
+    # liao_ning()
-    hei_long_jiang()
+    # hei_long_jiang()
-    jiang_su()
+    # jiang_su()
-    an_hui()
+    # an_hui()
-    jiang_xi()
+    # jiang_xi()
-    he_nan()
+    # he_nan()
-    hu_nan()
+    # hu_nan()
-    gan_su()
+    # gan_su()
-    ning_xia()
+    # ning_xia()
-    xi_zang()
+    # xi_zang()
-    shanxi()
+    # shanxi()
-    qing_hai()
+    # qing_hai()
-    he_bei()
+    # he_bei()
-    qing_hai()
+    # qing_hai()
-    current_time = datetime.datetime.now()
+    # current_time = datetime.datetime.now()
-    midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
+    # midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
-    sleep_seconds = (midnight_time - current_time).total_seconds()
+    # sleep_seconds = (midnight_time - current_time).total_seconds()
-    time.sleep(sleep_seconds)
+    # time.sleep(sleep_seconds)
--- a/test/裁判文书网.js
+++ b/test/裁判文书网.js
+function r(size){
+/**
+ * Build a random alphanumeric string.
+ *
+ * @param {number} size  number of characters to generate
+ * @returns {string} `size` characters drawn from 0-9, a-z, A-Z
+ */
+function r(size){
+	var alphabet = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
+	var str = "";
+	for(var i=0; i<size; i++){
+		// Math.floor over the full length gives every character the same
+		// probability; Math.round(random * (length-1)) selected the first and
+		// last characters only half as often as the others.
+		str += alphabet.charAt(Math.floor(Math.random() * alphabet.length));
+	}
+	return str;
+}
+/**
+ * Render a string as space-separated binary character codes.
+ *
+ * @param {string} str  text to convert
+ * @returns {string} the base-2 char code of every character, joined by " "
+ */
+function strTobinary(str) {
+	var chars = str.split("");
+	var codes = [];
+	for (var idx = 0; idx < chars.length; idx++) {
+		codes.push(chars[idx].charCodeAt(0).toString(2));
+	}
+	// Joining with a space is the same as pushing a " " before every item but the first.
+	return codes.join(" ");
+}
+/**
+ * Produce the 24-character random salt for one request.
+ *
+ * The previous body also built a timestamp and a yyyymmdd string that were
+ * never used or returned (and its month branch dropped the +1 for months
+ * 10-12); that dead code is removed. Only the salt was ever returned.
+ *
+ * @returns {string} result of r(24)
+ */
+function cipher() {
+	return r(24);
+}
+/**
+ * Assemble the request ciphertext.
+ *
+ * @param {string} salt  random salt
+ * @param {string} iv    initialisation-vector string
+ * @param {string} enc   already-encrypted payload supplied by the caller
+ * @returns {string} strTobinary(salt + iv + enc)
+ */
+function des(salt,iv,enc) {
+	// enc is computed outside this function (formerly: des3(timestamp, salt, iv).toString()).
+	return strTobinary(salt + iv + enc);
+}
+/**
+ * Generate a 24-character random alphanumeric token.
+ *
+ * The body used to be a verbatim copy of r() with size fixed to 24; it now
+ * delegates to r so the alphabet and selection logic live in one place.
+ *
+ * @returns {string} result of r(24)
+ */
+function token(){
+	return r(24);
+}
+/**
+ * Generate a 32-character random alphanumeric page id.
+ *
+ * @returns {string} 32 characters drawn uniformly from A-Z, a-z, 0-9
+ */
+function pageid() {
+    var possible = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
+    var picked = [];
+    while (picked.length < 32) {
+        var pos = Math.floor(Math.random() * possible.length);
+        picked.push(possible.charAt(pos));
+    }
+    return picked.join("");
+}
+// console.log(cipher());
\ No newline at end of file
--- a/test/裁判文书网列表正文.py
+++ b/test/裁判文书网列表正文.py
+import base64
+import base64
+import json
+import random
+import time
+import execjs
+import requests
+import urllib3
+from Crypto.Cipher import DES3
+from base.BaseCore import BaseCore
+# Silence urllib3's InsecureRequestWarning (getDoc below posts with verify=False).
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+# One shared BaseCore instance; its logger, DB connection and cursor are
+# reused by every function in this module.
+baseCore=BaseCore()
+log=baseCore.getLogger()
+cnx_ = baseCore.cnx
+cursor_ = baseCore.cursor
+# Save an error-log record
+def insertBadSql(error):
+    """Insert one failure row into ``cpwsw_log`` and commit.
+
+    :param error: iterable of six values in column order
+        (code, description, success, user, keyword, msg); ``create_time``
+        is filled by the database through ``now()``.
+    """
+    params = tuple(error)
+    cursor_.execute(
+        "insert into cpwsw_log (code,description,success,create_time,user,keyword,msg) values (%s,%s,%s,now(),%s,%s,%s)",
+        params,
+    )
+    cnx_.commit()
+# Cookie bookkeeping
+def updateCookie(cookie,type):
+    """Update the ``cpwsw_user`` row of a cookie according to ``type``, then commit.
+
+    :param cookie: cookie string identifying the row
+    :param type: 1 = normal use (refresh ``update_time``),
+        2 = session expired (delete the row),
+        3 = unknown failure (stamp ``fenghao_time``);
+        any other value only commits.
+    """
+    actions = (
+        # session expired: drop the token
+        (2, "delete from cpwsw_user  where cookie=%s"),
+        # normal use
+        (1, "update cpwsw_user set update_time=now()   where cookie=%s"),
+        # unknown failure
+        (3, "update cpwsw_user set fenghao_time=now()   where cookie=%s"),
+    )
+    for code, statement in actions:
+        if type == code:
+            cursor_.execute(statement, [cookie])
+    cnx_.commit()
+# DES3 encryption/decryption wrapped in a class
+class EncryptDate:
+    """Triple-DES (CBC mode) helper that exchanges base64 text.
+
+    A CBC cipher object is stateful, so sharing one between calls makes the
+    instance effectively single-use (PyCryptodome refuses ``decrypt()`` after
+    ``encrypt()`` on the same object). ``encrypt`` and ``decrypt`` therefore
+    build a fresh cipher from the stored key and IV on every call; the first
+    call on an instance produces the same output as before.
+
+    :param pianyi: IV as text; encoded to UTF-8 bytes
+    :param key: secret key, handed unchanged to ``DES3.new``
+    """
+    def __init__(self, pianyi, key):
+        self.key = key  # secret key
+        self.iv = bytes(pianyi,encoding='utf8')  # initialisation vector
+        self.length = DES3.block_size  # block size used by pad()
+        # Built eagerly so an invalid key or IV still fails at construction
+        # time, and kept as an attribute for code that reads ``des3``.
+        self.des3 = self._new_cipher()
+        # Strip the padding: the last character's code point is the pad length.
+        self.unpad = lambda date: date[0:-ord(date[-1])]
+    def _new_cipher(self):
+        """Return a fresh DES3 CBC cipher for the stored key and IV."""
+        return DES3.new(self.key, DES3.MODE_CBC, self.iv)
+    def pad(self, text):
+        """Pad ``text`` so its UTF-8 length is a multiple of the block size.
+
+        Appends ``add`` copies of ``chr(add)``, where ``add`` is the number of
+        bytes missing to the next block boundary (a full block when already
+        aligned).
+        """
+        count = len(text.encode('utf-8'))
+        add = self.length - (count % self.length)
+        entext = text + (chr(add) * add)
+        return entext
+    def encrypt(self, encrData):
+        """Pad and encrypt ``encrData``; return the result as base64 text."""
+        res = self._new_cipher().encrypt(self.pad(encrData).encode("utf8"))
+        msg = str(base64.b64encode(res), encoding="utf8")
+        # msg =  res.hex()
+        return msg
+    def decrypt(self, decrData):
+        """Decode base64 ``decrData``, decrypt it and strip the padding."""
+        res = base64.decodebytes(decrData.encode("utf8"))
+        # res = bytes.fromhex(decrData)
+        msg = self._new_cipher().decrypt(res).decode("utf8")
+        return self.unpad(msg)
+# Read the companion JavaScript helpers; the relative path is resolved
+# against the current working directory.
+with open('裁判文书网.js', 'r', encoding='utf-8') as f:
+    jstext = f.read()
+# Compile the JS source so Python can invoke its functions via ctx.call(...)
+ctx = execjs.compile(jstext)
+print("ok")
+# Endpoint that getDoc and getList POST to
+url = 'https://wenshu.court.gov.cn/website/parse/rest.q4w'
+# Fetch a login cookie
+def getCookie():
+    """Pick one usable account from ``cpwsw_user``.
+
+    Selects the row whose ``fenghao_time`` is more than two hours old and
+    whose ``update_time`` is oldest.
+
+    :return: the ``(user, cookie)`` row, or ``False`` when no row matches
+    """
+    cursor_.execute(
+        "select user,cookie from cpwsw_user  where fenghao_time < DATE_SUB(NOW(), INTERVAL 2 HOUR) order by update_time asc limit 1")
+    rows = cursor_.fetchall()
+    if not rows:
+        # no token found
+        log.info("没有拿到token")
+        return False
+    return rows[0]
+# Fetch the document body
+def getDoc(info_id,userCookie):
+    """Request one document by id and return its decrypted body.
+
+    :param info_id: document id sent as ``docId``
+    :param userCookie: cookie header value of the logged-in account
+    :return: decrypted response text, or ``""`` when the response code is
+        not 1 or decryption fails (both cases are logged)
+    """
+    headers = {
+        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
+        'Cookie': userCookie,
+        'Host': 'wenshu.court.gov.cn',
+        'Referer': 'https://wenshu.court.gov.cn',
+        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
+    }
+    salt = ctx.call('cipher')
+    date_now = time.strftime("%Y%m%d",time.localtime())
+    t = time.time()
+    # Today's date is the IV, the salt returned by the JS cipher() is the key.
+    eg = EncryptDate(date_now,salt)
+    des = eg.encrypt(str(t))  # DES3-encrypt the current timestamp
+    ciphertext = ctx.call("des",salt,date_now,des)
+    token = ctx.call("token")
+    data_info = {
+        'docId':info_id,
+        'ciphertext':ciphertext,
+        'cfg':'com.lawyee.judge.dc.parse.dto.SearchDataDsoDTO@docInfoSearch',
+        '__RequestVerificationToken':token,
+        'wh':'250',
+        'ww':'1536',
+        'cs':'0'
+    }
+    ip = baseCore.get_proxy()
+    res_info = requests.post(url=url,headers=headers,data=data_info,proxies=ip, verify=False)
+    # Sample failure responses:
+    #{'code': -12, 'description': None, 'secretKey': None, 'result': None, 'success': False}  wrong SESSION value
+    #{'code': 9, 'description': '没有权限请求接口,cfg=com.lawyee.judge.dc.parse.dto.SearchDataDsoDTO@docInfoSearch', 'secretKey': None, 'result': None, 'success': False}
+    #'{"code":1,"description":"权限已失效","secretKey":null,"result":null,"success":true}'
+    # Parse the body once instead of on every access.
+    res_json = res_info.json()
+    code = res_json["code"]
+    if code != 1:
+        log.error(f"正文获取失败：----{res_json}")
+        # abnormal response
+        return ""
+    try:
+        eg_jie = EncryptDate(date_now,res_json['secretKey'])
+        res_jie = eg_jie.decrypt(res_json['result'])  # DES3-decrypt
+    except Exception as e:
+        # Log before returning; the log call used to sit after the return
+        # and was never executed.
+        log.error(f"正文获取失败：----{e}")
+        return ""
+    return res_jie
+# Store one page of search results
+def insertCpwsList(keyword,page,list_info,userCookie):
+    """Store the new entries of one result page, fetching each body first.
+
+    For every entry not yet in ``cpwsw_list`` (matched on keyword + rowkey)
+    the body is fetched with ``getDoc``; entries with an empty body are
+    skipped. ``insertCount`` is incremented only after the insert has been
+    committed, so the logged "新增数" is the number of rows actually added.
+
+    :param keyword: search keyword the page belongs to
+    :param page: page number, used for logging only
+    :param list_info: list of result dicts from the search response
+    :param userCookie: cookie used for ``getDoc`` and refreshed on success
+    :return: ``True`` when paging should stop (empty page, or at least half
+        of the entries were already stored), otherwise ``False``
+    """
+    listCount = 0
+    repetCount = 0
+    insertCount = 0
+    selectCountSql = "select count(1) from cpwsw_list where keyword=%s and rowkey=%s"
+    insertSql = "insert into cpwsw_list (keyword,title,time,address,yuanyou,bianhao,rowkey,state,create_time,content) " \
+                "values (%s,%s,%s,%s,%s,%s,%s,0,now(),%s)"
+    for one_info in list_info:
+        listCount = listCount + 1
+        info_title = one_info['1']
+        info_time = one_info['31']
+        info_address = one_info['2']
+        info_yuanyou = one_info['26']
+        info_bianhao = one_info['7']
+        info_id = one_info['rowkey']
+        cursor_.execute(selectCountSql,[keyword,info_id])
+        count = cursor_.fetchone()[0]
+        if count > 0:
+            repetCount = repetCount + 1
+            continue
+        try:
+            # fetch the body
+            log.info("开始采集正文")
+            content = getDoc(info_id,userCookie)
+            log.info("结束采集正文，开始休眠")
+            time.sleep(random.randint(60, 180))
+            if content=='':
+                log.info("采集到的正文为空")
+                continue
+            cursor_.execute(insertSql, [keyword,info_title,info_time,info_address,info_yuanyou,info_bianhao,info_id,content])
+            cnx_.commit()
+            # Count only rows that were really written.
+            insertCount = insertCount + 1
+            updateCookie(userCookie, 1)
+        except Exception as e:
+            log.error(f"保存数据库失败：{e}")
+    log.info(f"---{keyword}--第{page}页----总数：{listCount}---重复数：{repetCount}---新增数：{insertCount}-------------")
+    if listCount == 0:
+        # an empty list means we are done
+        return True
+    if repetCount >= listCount / 2:
+        # half or more duplicates means we are done
+        return True
+    # not finished
+    return False
+def getList(keyword,page):
+    """Query one result page for ``keyword`` and store its entries.
+
+    Waits (polling every 60 s) until ``getCookie`` yields an account, posts
+    the search request and, when the response code is not 1, records the
+    failure, marks the cookie with ``updateCookie(..., 3)`` and tries again
+    with the next account. The retry is a loop rather than self-recursion so
+    a long run of failing accounts cannot exhaust the call stack.
+
+    :param keyword: search keyword (sent as ``s21``)
+    :param page: page number to request
+    :return: the result of ``insertCpwsList`` (``True`` = stop paging)
+    """
+    while True:
+        userAndCookie = getCookie()
+        if not userAndCookie:
+            log.info("没有拿到token,开始递归")
+            while True:
+                log.info("没有拿到token,开始休眠")
+                time.sleep(60)
+                log.info("没有拿到token,结束休眠")
+                userAndCookie = getCookie()
+                if userAndCookie:
+                    break
+        user = userAndCookie[0]
+        userCookie = userAndCookie[1]
+        log.info(f"获取到user----{user}")
+        headers = {
+            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
+            'Cookie': userCookie,
+            'Host': 'wenshu.court.gov.cn',
+            'Referer': 'https://wenshu.court.gov.cn',
+            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
+        }
+        salt = ctx.call('cipher')
+        date_now = time.strftime("%Y%m%d", time.localtime())
+        t = time.time()
+        # Today's date is the IV, the salt returned by the JS cipher() is the key.
+        eg = EncryptDate(date_now, salt)
+        des = eg.encrypt(str(t))  # DES3-encrypt the current timestamp
+        ciphertext = ctx.call("des", salt, date_now, des)
+        pageId = ctx.call("pageid")
+        token = ctx.call("token")
+        search_key = [{"key": "s21", "value": f"{keyword}"}]
+        data = {
+            'pageId':pageId,
+            's21': keyword,
+            'sortFields': 's51:desc',  # sort by judgment date
+            'ciphertext': ciphertext,
+            'pageNum': page,
+            'pageSize': '5',
+            # NOTE(review): str() yields the Python repr (single quotes), not
+            # JSON - confirm this is what the endpoint expects.
+            'queryCondition': str(search_key),
+            'cfg': 'com.lawyee.judge.dc.parse.dto.SearchDataDsoDTO@queryDoc',
+            '__RequestVerificationToken': token,
+            'wh': '403',
+            'ww': '1531',
+            'cs': '0'
+        }
+        res = requests.post(url=url, headers=headers, data=data)
+        # Parse the body once instead of on every access.
+        res_json = res.json()
+        if res_json["code"] == 1:
+            break
+        # Abnormal response: record it, flag the cookie, retry with another account.
+        error = [res_json["code"], res_json["description"],  res_json["success"], user, keyword,'']
+        insertBadSql(tuple(error))
+        updateCookie(userCookie, 3)
+    eg_jie = EncryptDate(date_now, res_json['secretKey'])
+    res_jie = eg_jie.decrypt(res_json['result'])
+    decoded = json.loads(res_jie)  # decrypted payload is JSON text
+    list_info = decoded['queryResult']['resultList']
+    return insertCpwsList(keyword, page,list_info,userCookie)
+# Crawl one keyword
+def doJob(keyword):
+    """Crawl up to five result pages for ``keyword``.
+
+    After every page the function sleeps a random 60-180 seconds; it stops
+    early as soon as ``getList`` reports that paging is finished.
+
+    :param keyword: search keyword to crawl
+    """
+    log.info(f"======{keyword}----开始采集=======")
+    page = 1
+    while page < 6:
+        finished = getList(keyword, page)
+        time.sleep(random.randint(60,180))
+        if finished:
+            # done with this keyword
+            break
+        page += 1
+    log.info(f"======{keyword}---------结束采集=======")
+def test():
+    """Placeholder hook; intentionally does nothing."""
+    return None
+if __name__=="__main__":
+    # Pull keywords from the 'cpwsqy' redis queue until it is empty.
+    # try/finally guarantees baseCore.close() runs even when a job raises,
+    # so the resources held by BaseCore are always released.
+    try:
+        while True:
+            keyword = baseCore.redicPullData('cpwsqy')
+            if keyword is None or keyword == 'None':
+                log.info("redis已经没有数据了，重新放置数据")
+                break
+            doJob(keyword)
+    finally:
+        baseCore.close()
\ No newline at end of file