Merge remote-tracking branch 'origin/master'

d82da41e · 丁双波 · f3122a28 · 63cac106 · d82da41e · d82da41e
--- a/base/BaseCore.py
+++ b/base/BaseCore.py
@@ -5,22 +5,18 @@ import socket
 import sys
 import time
-import fitz
 import logbook
 import logbook.more
 import pandas as pd
 import requests
 import zhconv
-import pymysql
 import redis
 from selenium import webdriver
 from selenium.webdriver.chrome.service import Service
-from openpyxl import Workbook
 import langid
 #创建连接池
 import pymysql
-from pymysql import connections
 from DBUtils.PooledDB import PooledDB
 # import sys
 # sys.path.append('D://zzsn_spider//base//fdfs_client')
@@ -28,6 +24,15 @@ from DBUtils.PooledDB import PooledDB
 from fdfs_client.client import get_tracker_conf, Fdfs_client
 tracker_conf = get_tracker_conf('D:\\kkwork\\zzsn_spider\\base\\client.conf')
 client = Fdfs_client(tracker_conf)
+from obs import ObsClient
+import fitz
+from urllib.parse import unquote
+# Module-level Huawei Cloud OBS client, created once at import time and shared by the methods below.
+# NOTE(review): the access key and secret key are hard-coded here and therefore committed to
+# version control — they should be rotated and loaded from configuration/environment instead.
+obsClient = ObsClient(
+        access_key_id='VEHN7D0TJ9316H8AHCAV',  # Huawei Cloud access key (AK)
+        secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY',  # Huawei Cloud secret key (SK)
+        server='https://obs.cn-north-1.myhuaweicloud.com'  # OBS endpoint that hosts the bucket
+    )
 # 注意 程序退出前 调用BaseCore.close() 关闭相关资源
 class BaseCore:
@@ -659,12 +664,10 @@ class BaseCore:
            create_time = retData['create_time']
            order_by = num
            selects = self.secrchATT(item_id,year,type_id)
-            # sel_sql = '''select id,item_id from clb_sys_attachment where item_id = %s and year = %s and type_id=%s '''
-            # self.cursor.execute(sel_sql, (item_id, year,type_id))
-            # selects = self.cursor.fetchone()
            if selects:
-                self.getLogger().info(f'com_name:{com_name}已存在')
+                self.getLogger().info(f'com_name:{com_name}--{year}已存在')
-                id = selects[0]
+                id = ''
                return id
            else:
                Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
@@ -695,6 +698,80 @@ class BaseCore:
            log = self.getLogger()
            log.info('======保存企业CIK失败=====')
+    # Upload to the Huawei Cloud OBS server, then parse the PDF's content and page count
+    # 获取文件大小
+    def convert_size(self,size_bytes):
+        """Format a byte count as a human-readable string with two decimals.
+
+        The value is divided by 1024 until it drops below 1024 or the largest
+        unit (TB) is reached, e.g. ``1536`` -> ``"1.50 KB"``.
+
+        Args:
+            size_bytes: Size in bytes (int or float).
+
+        Returns:
+            str: ``"<value with 2 decimals> <unit>"``, unit in bytes/KB/MB/GB/TB.
+        """
+        unit_names = ('bytes', 'KB', 'MB', 'GB', 'TB')
+        value = size_bytes
+        # Every unit except the last can overflow into the next larger one.
+        for unit in unit_names[:-1]:
+            if not value >= 1024:
+                return f"{value:.2f} {unit}"
+            value = value / 1024
+        # Anything still left is reported in the largest unit, however big.
+        return f"{value:.2f} {unit_names[-1]}"
+    def obsexist(self,file_path):
+        """Check whether an object exists in the ``zzsn`` OBS bucket.
+
+        The object's metadata is requested; a response status below 300 is
+        treated as "exists". The outcome is logged and also returned so that
+        callers can act on it (previously the method only logged and
+        returned ``None``).
+
+        Args:
+            file_path: Object key inside the bucket.
+
+        Returns:
+            bool: True when the metadata request succeeded, False otherwise.
+        """
+        response = obsClient.getObjectMetadata('zzsn', file_path)
+        exists = response.status < 300
+        if exists:
+            self.getLogger().info(f'=====文件存在obs========{file_path}')
+        else:
+            self.getLogger().info('=====文件不存在obs=====')
+        return exists
+    def uptoOBS(self,pdf_url, name_pdf,type_id, social_code,pathType,taskType,start_time):
+        """Download a PDF, upload it to the ``zzsn`` OBS bucket and extract its text.
+
+        Args:
+            pdf_url: URL the PDF is downloaded from.
+            name_pdf: Base name of the stored object; ``.pdf`` is appended.
+            type_id: Copied into the result as ``type_id``.
+            social_code: Copied into the result as ``item_id`` and passed to ``recordLog``.
+            pathType: Key prefix inside the bucket; ``<YYYY-MM>/`` is appended to it.
+            taskType: Passed to ``recordLog`` when building the result fails.
+            start_time: Start timestamp used to compute the elapsed time for ``recordLog``.
+
+        Returns:
+            dict: Attachment metadata. ``state`` is True only when download,
+            upload, PDF parsing and URL extraction all succeeded; on any
+            failure the dict is returned with ``state`` False.
+        """
+        retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': 'group1', 'path': '',
+                   'full_path': '',
+                   'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
+                   'create_time': '', 'page_size': '', 'content': ''}
+        headers = {'User-Agent': self.getRandomUserAgent()}
+
+        # Step 1: download, up to three attempts.
+        # NOTE(review): verify=False disables TLS certificate verification.
+        response = None
+        for _ in range(3):
+            try:
+                response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
+                break
+            except Exception:
+                time.sleep(3)
+        if response is None:
+            # Nothing was downloaded, so there is nothing to upload or parse.
+            return retData
+        pdf_bytes = response.content
+        # Content-Length can be absent (e.g. chunked transfer) or malformed;
+        # fall back to the size of the payload actually received.
+        try:
+            file_size = int(response.headers.get('Content-Length'))
+        except (TypeError, ValueError):
+            file_size = len(pdf_bytes)
+
+        # Step 2: upload, up to three attempts. Only the network call is retried.
+        object_key = f'{pathType}{time.strftime("%Y-%m")}/' + name_pdf + '.pdf'
+        result = None
+        for _ in range(3):
+            try:
+                result = obsClient.putContent('zzsn', object_key, content=pdf_bytes)
+                if result.status < 300:
+                    break
+            except Exception:
+                result = None
+            time.sleep(3)
+        if result is None:
+            return retData
+
+        # Step 3: parse once. The bytes do not change between attempts, and
+        # retrying used to append the same pages to ``content`` repeatedly.
+        try:
+            with fitz.open(stream=pdf_bytes, filetype='pdf') as doc:
+                page_size = doc.page_count
+                content = ''.join(page.get_text() for page in doc.pages())
+        except Exception:
+            page_size = 0
+            content = ''
+        if page_size < 1:
+            # PDF could not be parsed (or has no pages).
+            return retData
+        retData['content'] = content
+
+        # Step 4: fill in the metadata. ``state`` is flipped last so that a
+        # failure here is never reported as success.
+        try:
+            object_url = result['body']['objectUrl']
+            retData['path'] = object_url.split('.com')[1]
+            retData['full_path'] = unquote(object_url)
+            retData['file_size'] = self.convert_size(file_size)
+            retData['create_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+            retData['page_size'] = page_size
+            retData['state'] = True
+        except Exception as e:
+            takeTime = self.getTimeCost(start_time, time.time())
+            self.recordLog(social_code, taskType, 0, takeTime, pdf_url, f'{e}')
+        return retData

--- a/base/RedisPPData.py
+++ b/base/RedisPPData.py
@@ -475,7 +475,14 @@ def kegaishifan():
 #双百企业
 def shuangbaiqiye():
-    pass
+    """Push every ``CompanyName`` of the ``Hundred`` table onto the ``hundred:baseinfo`` Redis list."""
+    cnx, cursor = connectSql()
+    cursor.execute("SELECT CompanyName FROM Hundred")
+    rows = cursor.fetchall()
+    cnx.commit()
+    # The query selects a single column, so the name is the first field of each row.
+    company_names = [row[0] for row in rows]
+    for company_name in company_names:
+        r.rpush('hundred:baseinfo', company_name)
 #专精特新
 def zhuangjingtexind():
@@ -484,7 +491,8 @@ def zhuangjingtexind():
 if __name__ == "__main__":
    start = time.time()
    # danxiangguanjun()
-    kegaishifan()
+    # kegaishifan()
+    shuangbaiqiye()
    # NoticeEnterprise()
    # AnnualEnterpriseIPO()
    # AnnualEnterprise()

--- a/comData/newlist/champion/BaseCore.py
+++ b/comData/newlist/champion/BaseCore.py
@@ -541,7 +541,10 @@ class BaseCore:
        self.cursor.execute(query)
        token_list = self.cursor.fetchall()
        self.cnx.commit()
+        try:
            token = token_list[random.randint(0, len(token_list)-1)][0]
+        except:
+            token = ''
        return token
    # 删除失效的token

--- a/comData/newlist/hundred/BaseCore.py
+++ b/comData/newlist/hundred/BaseCore.py
+# 核心工具包
+import os
+import random
+import socket
+import sys
+import time
+import fitz
+import logbook
+import logbook.more
+import pandas as pd
+import requests
+import zhconv
+import pymysql
+import redis
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
+from openpyxl import Workbook
+import langid
+#创建连接池
+import pymysql
+from pymysql import connections
+from DBUtils.PooledDB import PooledDB
+# import sys
+# sys.path.append('D://zzsn_spider//base//fdfs_client')
+from fdfs_client.client import get_tracker_conf, Fdfs_client
+# Module-level fdfs_client instance, built at import time from the tracker
+# configuration file at the absolute path /base/client.conf.
+tracker_conf = get_tracker_conf('/base/client.conf')
+client = Fdfs_client(tracker_conf)
+# 注意 程序退出前 调用BaseCore.close() 关闭相关资源
+class BaseCore:
+    # Sequence counter used when generating serial numbers
+    __seq = 0
+    # Proxy-pool database connection (currently disabled)
+    # __cnx_proxy =None
+    # __cursor_proxy = None
+    # Class-level placeholders; __init__ replaces them with live connections/cursors
+    cnx = None
+    cursor = None
+    cnx_ = None
+    cursor_ = None
+    # Redis client placeholder, also set in __init__
+    r = None
+    # agent 池
+    __USER_AGENT_LIST = [
+        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.29 Safari/525.13',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/531.4 (KHTML, like Gecko) Chrome/3.0.194.0 Safari/531.4',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.11 Safari/534.16',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.50 Safari/525.19',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.7 Safari/532.0',
+        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; Lunascape 5.0 alpha2)',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.7 Safari/532.2',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; ru-RU) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.11 Safari/534.16',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.10 Safari/532.0',
+        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; Maxthon;',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.169.0 Safari/530.1',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP; rv:1.7) Gecko/20040614 Firefox/0.9',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.810.0 Safari/535.1',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.0 Safari/532.0',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.6 (KHTML, like Gecko) Chrome/7.0.500.0 Safari/534.6',
+        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; TencentTraveler)',
+        'Mozilla/5.0 (Windows NT 6.0; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.4 (KHTML, like Gecko) Chrome/6.0.481.0 Safari/534.4',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.370.0 Safari/533.4',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.7.5) Gecko/20041107 Firefox/1.0',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.4.154.31 Safari/525.19',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.1.17) Gecko/20110123 (like Firefox/3.x) SeaMonkey/2.0.12',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB) AppleWebKit/534.1 (KHTML, like Gecko) Chrome/6.0.428.0 Safari/534.1',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; de-DE) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.540.0 Safari/534.10',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; de-DE) Chrome/4.0.223.3 Safari/532.2',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/12.0.702.0 Safari/534.24',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.42 Safari/525.19',
+        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.3 (KHTML, like Gecko) Chrome/4.0.227.0 Safari/532.3',
+        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.8 (KHTML, like Gecko) Chrome/16.0.912.63 Safari/535.8',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.460.0 Safari/534.3',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.463.0 Safari/534.3',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.9 (KHTML, like Gecko) Chrome/2.0.157.0 Safari/528.9',
+        'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.794.0 Safari/535.1',
+        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.694.0 Safari/534.24',
+        'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5',
+        'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20120427 Firefox/15.0a1',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.7.5) Gecko/20041107 Firefox/1.0',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6',
+        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Maxthon; .NET CLR 1.1.4322)',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.4 Safari/532.2',
+        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.65 Safari/535.11',
+        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.21 (KHTML, like Gecko) Chrome/11.0.682.0 Safari/534.21',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.0 (KHTML, like Gecko) Chrome/2.0.182.0 Safari/531.0',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.9 (KHTML, like Gecko) Chrome/7.0.531.0 Safari/534.9',
+        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; WOW64; Trident/6.0)',
+        'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.811.0 Safari/535.1',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.0; de-DE; rv:1.7.5) Gecko/20041108 Firefox/1.0',
+        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
+        'Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.127 Safari/533.4',
+        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E) QQBrowser/6.9.11079.201',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10',
+        'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2',
+        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; zh-cn) Opera 8.50',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/7.0.0 Safari/700.13',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.4 Safari/532.0',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.53 Safari/525.19',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.1 Safari/532.0',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.5) Gecko/20041107 Firefox/0.9.2 StumbleUpon/1.994',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.7.5) Gecko/20041110 Firefox/1.0',
+        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36',
+        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; en) Opera 8.0',
+        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b4pre) Gecko/20100815 Minefield/4.0b4pre',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.6 Safari/530.5',
+        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.0.3705)',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.21 Safari/532.0',
+        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.792.0 Safari/535.1',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.168.0 Safari/530.1',
+        'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3) Gecko/20040913 Firefox/0.10',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.8 (KHTML, like Gecko) Chrome/2.0.177.1 Safari/530.8',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8',
+        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.40 Safari/530.5',
+        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.24 Safari/532.0',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.10 (KHTML, like Gecko) Chrome/2.0.157.2 Safari/528.10',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.2 Safari/532.2',
+        'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.75 Safari/535.7',
+        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; T312461)',
+        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.461.0 Safari/534.3',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.0; rv:1.7.3) Gecko/20041001 Firefox/0.10.1',
+        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)',
+        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.2; de-DE) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.2 Safari/532.0',
+        'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0) Gecko/16.0 Firefox/16.0',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/531.3 (KHTML, like Gecko) Chrome/3.0.193.2 Safari/531.3',
+        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; .NET CLR 1',
+        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
+        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.864.0 Safari/535.2',
+        'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.813.0 Safari/535.1',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0',
+        'Mozilla/5.0 (Windows NT 5.1; rv:2.1.1) Gecko/20110415 Firefox/4.0.2pre Fennec/4.0.1',
+        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.801.0 Safari/535.1',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.212.0 Safari/532.0',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7',
+        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.697.0 Safari/534.24',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.548.0 Safari/534.10',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.17 (KHTML, like Gecko) Chrome/11.0.652.0 Safari/534.17',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.224 Safari/534.10 ChromePlus/1.5.2.0',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.0 Safari/532.1',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.7 Safari/532.0',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.2 (KHTML, like Gecko) Chrome/5.0.342.2 Safari/533.2',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.4 Safari/532.1',
+        'Mozilla/5.0 (Windows NT 6.0; rv:2.1.1) Gecko/20110415 Firefox/4.0.2pre Fennec/4.0.1',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.2.153.0 Safari/525.19',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; sv-SE; rv:1.7.5) Gecko/20041108 Firefox/1.0',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.462.0 Safari/534.3',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; de-DE; rv:1.7.5) Gecko/20041122 Firefox/1.0',
+        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; uZardWeb/1.0; Server_JP)',
+        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; HCI0449; .NET CLR 1.0.3705)',
+        'Mozilla/4.0 (compatible; MSIE 5.0; Windows 98; DigExt); Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1);',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.23 Safari/530.5',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.208.0 Safari/532.0',
+        'Mozilla/5.0 (Windows NT 6.0; rv:14.0) Gecko/20100101 Firefox/14.0.1',
+        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
+        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.7 (KHTML, like Gecko) Chrome/2.0.176.0 Safari/530.7',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.21 (KHTML, like Gecko) Chrome/11.0.678.0 Safari/534.21',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.21 Safari/532.0',
+        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
+        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; InfoPath.1',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.55 Safari/525.19',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0a1) Gecko/20110623 Firefox/7.0a1 Fennec/7.0a1',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.724.100 Safari/534.30',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.33 Safari/534.3 SE 2.X MetaSr 1.0',
+        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; WOW64; SV1; uZardWeb/1.0; Server_HK)',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1',
+        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
+        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)',
+        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
+        'Mozilla/5.0 (Windows NT 6.0) yi; AppleWebKit/345667.12221 (KHTML, like Gecko) Chrome/23.0.1271.26 Safari/453667.1221',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.2 (KHTML, like Gecko) Chrome/3.0.191.3 Safari/531.2',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.39 Safari/530.5',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.1 Safari/532.0',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.38 Safari/532.0',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.27 Safari/532.0',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8b) Gecko/20050118 Firefox/1.0+',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP; rv:1.7) Gecko/20040707 Firefox/0.9.2',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.0 Safari/532.0',
+        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.4 (KHTML, like Gecko) Chrome/2.0.171.0 Safari/530.4',
+        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648)',
+        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; nl-NL; rv:1.7.5) Gecko/20041202 Firefox/1.0',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.204.0 Safari/532.0',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.6 Safari/532.2',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/528.8 (KHTML, like Gecko) Chrome/1.0.156.0 Safari/528.8',
+        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/6.0)',
+        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 1.0.3705; .NET CLR 2.0.50727; .NET CLR 1.1.4322)',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.517.43 Safari/534.7',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.15 Safari/534.13',
+        'Mozilla/5.0 (ipad Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.6 (KHTML, like Gecko) Chrome/7.0.498.0 Safari/534.6',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.43 Safari/530.5',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.208.0 Safari/532.0',
+        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.19 (KHTML, like Gecko) Chrome/11.0.661.0 Safari/534.19',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-CA) AppleWebKit/534.13 (KHTML like Gecko) Chrome/9.0.597.98 Safari/534.13',
+        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.2 Safari/532.0',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.201.1 Safari/532.0',
+        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.201.1 Safari/532.0',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.213.1 Safari/532.1',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.6 (KHTML, like Gecko) Chrome/2.0.174.0 Safari/530.6',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.3.154.6 Safari/525.19',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.599.0 Safari/534.13',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.8 (KHTML, like Gecko) Chrome/7.0.521.0 Safari/534.8',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1b2pre) Gecko/20081015 Fennec/1.0a1',
+        'Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'
+    ]
+    #Android agent池
+    __USER_PHONE_AGENT_LIST = ['Mozilla/5.0 (Linux; Android 7.1.1; OPPO R9sk) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.111 Mobile Safari/537.36']
+    def __init__(self):
+        """Open the MySQL connections, the Redis client and the MySQL connection pool.
+
+        NOTE(review): hosts, user names and passwords are hard-coded here and
+        therefore end up in version control — consider loading them from
+        configuration or the environment.
+        """
+        caiji_host = '114.115.159.144'
+        # Direct connection to the `caiji` database and its cursor.
+        self.cnx = pymysql.connect(
+            host=caiji_host,
+            user='caiji',
+            password='zzsn9988',
+            db='caiji',
+            charset='utf8mb4',
+        )
+        self.cursor = self.cnx.cursor()
+        # Second MySQL server (the "11" database), `clb_project` schema.
+        self.cnx_ = pymysql.connect(
+            host='114.116.44.11',
+            user='caiji',
+            password='f7s0&7qqtK',
+            db='clb_project',
+            charset='utf8mb4',
+        )
+        self.cursor_ = self.cnx_.cursor()
+        # Redis client, database index 6.
+        self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
+        # Pooled connections to the same `caiji` database.
+        pool_settings = {
+            'creator': pymysql,
+            'maxconnections': 5,
+            'mincached': 2,
+            'maxcached': 5,
+            'blocking': True,
+            'host': caiji_host,
+            'port': 3306,
+            'user': 'caiji',
+            'password': 'zzsn9988',
+            'database': 'caiji',
+            'charset': 'utf8mb4',
+        }
+        self.pool_caiji = PooledDB(**pool_settings)
+    def close(self):
+        """Release every resource opened by ``__init__``.
+
+        Closes both cursors, both MySQL connections, the connection pool and
+        the Redis client. Each resource is closed independently and on a
+        best-effort basis: a failure (or a resource that was never opened)
+        does not prevent the remaining ones from being closed. Previously
+        only ``cursor``/``cnx`` were closed, and an error in ``cursor.close()``
+        skipped ``cnx.close()``.
+        """
+        for attr_name in ('cursor', 'cnx', 'cursor_', 'cnx_', 'pool_caiji', 'r'):
+            resource = getattr(self, attr_name, None)
+            closer = getattr(resource, 'close', None)
+            if closer is None:
+                # Not opened, or the object has no close() to call.
+                continue
+            try:
+                closer()
+            except Exception:
+                # Shutdown is best-effort; keep closing the rest.
+                pass
+    # 计算耗时
+    def getTimeCost(self,start, end):
+        """Render the time elapsed between two timestamps as Chinese text.
+
+        Args:
+            start: Start timestamp in seconds.
+            end: End timestamp in seconds.
+
+        Returns:
+            str: Hours/minutes/seconds, minutes/seconds or seconds depending
+            on the magnitude; durations under one whole second are reported
+            in milliseconds.
+        """
+        elapsed = end - start
+        whole_seconds = int(elapsed)
+        hours, remainder = divmod(whole_seconds, 3600)
+        minutes, secs = divmod(remainder, 60)
+        if hours > 0:
+            return "%d小时%d分钟%d秒" % (hours, minutes, secs)
+        if minutes > 0:
+            return "%d分钟%d秒" % (minutes, secs)
+        if whole_seconds > 0:
+            return "%d秒" % (secs)
+        # Less than one whole second elapsed: fall back to milliseconds.
+        return "%d毫秒" % (int(elapsed * 1000))
+    # Format the current time
+    # 1: 2001-01-01 12:00:00  (%Y-%m-%d %H:%M:%S)
+    # 2: 010101120000         (%y%m%d%H%M%S)
+    # 3: 1690179526555        (Unix timestamp in milliseconds)
+    def getNowTime(self, type):
+        """Return the current local time in the representation selected by ``type``.
+
+        Args:
+            type: 1 for ``"%Y-%m-%d %H:%M:%S"``, 2 for ``"%y%m%d%H%M%S"``,
+                3 for the Unix timestamp in milliseconds.
+
+        Returns:
+            str for types 1 and 2, int for type 3, and the empty string for
+            any other value.
+        """
+        if type == 1:
+            return time.strftime("%Y-%m-%d %H:%M:%S")
+        if type == 2:
+            return time.strftime("%y%m%d%H%M%S")
+        if type == 3:
+            return int(time.time() * 1000)
+        # Unknown selector: keep the historical empty-string result.
+        return ""
+    # 获取流水号
+    def getNextSeq(self):
+        """Return the next serial number: ``yymmddHHMMSS`` followed by a 3-digit counter.
+
+        The counter is shared with ``getNextXydm`` and wraps back to 0 after 999.
+        """
+        self.__seq += 1
+        # Wrap before a fourth digit is needed; the previous `> 1000` bound let
+        # 1000 through, which made the zero-padded suffix four characters long.
+        if self.__seq > 999:
+            self.__seq = 0
+        return self.getNowTime(2) + str(self.__seq).zfill(3)
+    # 获取信用代码
+    def getNextXydm(self):
+        """Return a generated credit code: ``ZZSN`` + ``yymmddHHMMSS`` + a 3-digit counter.
+
+        The counter is shared with ``getNextSeq`` and wraps back to 0 after 999.
+        """
+        self.__seq += 1
+        # Wrap before a fourth digit is needed; the previous `> 1000` bound let
+        # 1000 through, which made the zero-padded suffix four characters long.
+        if self.__seq > 999:
+            self.__seq = 0
+        return "ZZSN" + self.getNowTime(2) + str(self.__seq).zfill(3)
+    # 日志格式
+    def logFormate(self,record, handler):
+        """Render a log record as ``[time] [level] [file] [function] [line] message``.
+
+        *handler* is accepted because the formatter is called with it, but it is not used.
+        """
+        source_file = os.path.split(record.filename)[-1]  # file name without its directory
+        return (
+            f"[{record.time}] [{record.level_name}] [{source_file}] "
+            f"[{record.func_name}] [{record.lineno}] {record.message}"
+        )
+    # 获取logger
+    def getLogger(self,fileLogFlag=True, stdOutFlag=True):
+        """Build a logbook logger named after the running script.
+
+        The log file lives in a ``logs`` directory next to the script (created
+        when missing) and is named ``<script>.log``. *fileLogFlag* attaches a
+        date-rotated file handler, *stdOutFlag* a colorized stderr handler;
+        both use ``logFormate`` as formatter.
+        """
+        script_dir, script_name = os.path.split(os.path.abspath(sys.argv[0]))
+        log_dir = os.path.join(script_dir, "logs")
+        log_name = script_name.replace(".py", "") + ".log"
+        if not os.path.exists(log_dir):
+            os.mkdir(log_dir)
+        logbook.set_datetime_format('local')
+        handlers = []
+        if fileLogFlag:
+            # Write records to the rotating log file.
+            file_handler = logbook.TimedRotatingFileHandler(
+                os.path.join(log_dir, log_name), date_format='%Y-%m-%d', bubble=True, encoding='utf-8')
+            file_handler.formatter = self.logFormate
+            handlers.append(file_handler)
+        if stdOutFlag:
+            # Echo records to the console.
+            console_handler = logbook.more.ColorizedStderrHandler(bubble=True)
+            console_handler.formatter = self.logFormate
+            handlers.append(console_handler)
+        logger = logbook.Logger(log_name)
+        logger.handlers = handlers
+        return logger
+    # 获取随机的userAgent
+    def getRandomUserAgent(self):
+        """Pick one entry at random from the class's ``__USER_AGENT_LIST``."""
+        candidates = self.__USER_AGENT_LIST
+        return random.choice(candidates)
+    # 获取代理
+    def get_proxy(self):
+        """Return one randomly chosen proxy mapping built from the ``clb_proxy`` table.
+
+        Each row's ``proxy`` column is split on ``-`` into host and port.
+
+        Returns:
+            dict with keys ``"HTTP"`` and ``"HTTPS"``, both ``http://host:port``.
+
+        Raises:
+            IndexError: the table is empty, or a row has no ``-`` separator.
+        """
+        self.cursor.execute("select proxy from clb_proxy")
+        proxy_list = []
+        for row in self.cursor.fetchall():
+            # Read the column value directly instead of parsing the tuple's repr.
+            parts = str(row[0]).split('-')
+            proxyMeta = "http://%(host)s:%(port)s" % {
+                "host": parts[0],
+                "port": parts[1],
+            }
+            # NOTE(review): requests looks proxies up by lower-case scheme, so these
+            # upper-case keys are probably ignored when passed as `proxies=` --
+            # confirm with the callers before changing them.
+            proxy_list.append({
+                "HTTP": proxyMeta,
+                "HTTPS": proxyMeta
+            })
+        # Choose among all rows; the old fixed `randint(0, 3)` index failed with
+        # fewer than four proxies and never used any beyond the fourth.
+        return random.choice(proxy_list)
+    #字符串截取
+    def getSubStr(self,str,beginStr,endStr):
+        """Cut *str* down to the part between two markers.
+
+        The result starts at the last occurrence of *beginStr* (or at the start
+        when it is absent) and ends just after the first character of the last
+        occurrence of *endStr* (unchanged when it is absent). An empty marker
+        disables the corresponding cut.
+        """
+        if beginStr != '':
+            start_at = max(str.rfind(beginStr), 0)
+            str = str[start_at:]
+        if endStr != '':
+            stop_at = str.rfind(endStr)
+            if stop_at != -1:
+                str = str[:stop_at + 1]
+        return str
+    # 繁体字转简体字
+    def hant_2_hans(self,hant_str: str):
+        """Convert *hant_str* from Traditional to Simplified Chinese via ``zhconv``."""
+        simplified = zhconv.convert(hant_str, 'zh-hans')
+        return simplified
+    # 判断字符串里是否含数字
+    def str_have_num(self,str_num):
+        """Return True when at least one character of *str_num* is a digit."""
+        # The list is built in full first, so every character is inspected.
+        digit_flags = [char.isdigit() for char in str_num]
+        return any(digit_flags)
+    # # 从Redis的List中获取并移除一个元素
+    # def redicPullData(self,type,key):
+    # #1 表示国内 2 表示国外
+    #     if type == 1:
+    #         gn_item = self.r.lpop(key)
+    #         return gn_item.decode() if gn_item else None
+    #     if type == 2:
+    #         gw_item = self.r.lpop(key)
+    #         return gw_item.decode() if gw_item else None
+    # 从Redis的List中获取并移除一个元素
+    def redicPullData(self,key):
+        """Pop the head of the Redis list *key*; return it decoded, or None when nothing usable was popped."""
+        raw = self.r.lpop(key)
+        if not raw:
+            return None
+        return raw.decode()
+    # 获得脚本进程PID
+    def getPID(self):
+        """Return the process id of the running script."""
+        return os.getpid()
+    # 获取本机IP
+    def getIP(self):
+        """Return the address that the local host name resolves to."""
+        host_name = socket.gethostname()
+        return socket.gethostbyname(host_name)
+    def mkPath(self,path):
+        """Create directory *path* (including missing parents) unless something already exists there."""
+        if os.path.exists(path):
+            return
+        os.makedirs(path)
+    # 生成google模拟浏览器  必须传入值为googledriver位置信息
+    # headless用于决定是否为无头浏览器,初始默认为无头浏览器
+    # 正常浏览器可用于开始对页面解析使用或一些网站无头时无法正常采集
+    # 无头浏览器用于后续对信息采集时不会有浏览器一直弹出，
+    def buildDriver(self, path, headless=True):
+        """Create a Chrome WebDriver backed by the chromedriver binary at *path*.
+
+        With *headless* (the default) the browser gets ``--headless`` and
+        ``--disable-gpu``. The automation switch and extension are disabled,
+        the language list is set, and a random User-Agent is applied.
+        """
+        driver_service = Service(path)
+        options = webdriver.ChromeOptions()
+        if headless:
+            for flag in ('--headless', '--disable-gpu'):
+                options.add_argument(flag)
+        options.add_experimental_option("excludeSwitches", ["enable-automation"])
+        options.add_experimental_option('useAutomationExtension', False)
+        options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
+        options.add_argument('user-agent=' + self.getRandomUserAgent())
+        return webdriver.Chrome(options=options, service=driver_service)
+    # Look up a company's row in the Hundred table by company name
+    def getInfomation(self, com_name):
+        """Fetch the ``Hundred`` row whose ``CompanyName`` equals *com_name*.
+
+        Returns:
+            The row as a list, or ``[]`` when no row matches or the query fails
+            (a failure is logged).
+        """
+        data = []
+        conn = None
+        cursor = None
+        try:
+            # Bound parameter: company names may contain quotes.
+            sql = "SELECT * FROM Hundred WHERE CompanyName = %s"
+            conn = self.pool_caiji.connection()
+            cursor = conn.cursor()
+            cursor.execute(sql, (com_name,))
+            row = cursor.fetchone()
+            conn.commit()
+            if row is not None:
+                data = list(row)
+        except Exception:
+            log = self.getLogger()
+            log.info('=========数据库操作失败========')
+        finally:
+            # Return the connection to the pool even when the query failed;
+            # the pool blocks once all of its connections are checked out.
+            for resource in (cursor, conn):
+                if resource is not None:
+                    try:
+                        resource.close()
+                    except Exception:
+                        pass
+        return data
+    # 更新企业采集次数
+    def updateRun(self, social_code, runType, count):
+        """Set column *runType* of ``EnterpriseInfo`` to *count* for the company *social_code*.
+
+        Failures (including an invalid column name) are logged, never raised.
+        """
+        conn = None
+        cursor = None
+        try:
+            # A column name cannot be sent as a bound parameter, so it is
+            # validated and backtick-quoted; the values are bound normally.
+            if not str(runType).isidentifier():
+                raise ValueError(f'invalid column name: {runType!r}')
+            sql_update = f"UPDATE EnterpriseInfo SET `{runType}` = %s WHERE SocialCode = %s"
+            conn = self.pool_caiji.connection()
+            cursor = conn.cursor()
+            cursor.execute(sql_update, (count, social_code))
+            conn.commit()
+        except Exception:
+            log = self.getLogger()
+            log.info('======更新数据库失败======')
+        finally:
+            # Return the connection to the pool even when the statement failed.
+            for resource in (cursor, conn):
+                if resource is not None:
+                    try:
+                        resource.close()
+                    except Exception:
+                        pass
+    # 保存日志入库
+    def recordLog(self, xydm, taskType, state, takeTime, url, e):
+        """Insert one row into ``LogTable`` describing a collection task.
+
+        The row also records the current time, this host's IP and the process
+        id. *e* is the error description; an exception object is stored as its
+        string form. Failures are logged, never raised.
+        """
+        cnn = None
+        cursor = None
+        try:
+            if isinstance(e, BaseException):
+                # The driver cannot bind exception objects; store their text.
+                e = str(e)
+            createTime = self.getNowTime(1)
+            ip = self.getIP()
+            pid = self.getPID()
+            sql = "INSERT INTO LogTable(SocialCode,TaskType,state,TakeTime,url,CreateTime,ProcessIp,PID,Exception) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
+            values = [xydm, taskType, state, takeTime, url, createTime, ip, pid, e]
+            cnn = self.pool_caiji.connection()
+            cursor = cnn.cursor()
+            cursor.execute(sql,values)
+            cnn.commit()
+        except Exception:
+            log = self.getLogger()
+            log.info('======保存日志失败=====')
+        finally:
+            # Return the connection to the pool even when the insert failed.
+            for resource in (cursor, cnn):
+                if resource is not None:
+                    try:
+                        resource.close()
+                    except Exception:
+                        pass
+    #获取企查查token
+    def GetToken(self):
+        """Return a randomly chosen QCC token from ``QCC_token``, or ``''`` when the table is empty."""
+        query = "select token from QCC_token "
+        self.cursor.execute(query)
+        token_list = self.cursor.fetchall()
+        self.cnx.commit()
+        # Explicit emptiness check instead of a bare `except` around the random pick.
+        if not token_list:
+            return ''
+        return random.choice(token_list)[0]
+    # 删除失效的token
+    def delete_token(self,token):
+        """Delete *token* from ``QCC_token`` and commit."""
+        # Bound parameter instead of string interpolation.
+        deletesql = "delete from QCC_token where token=%s "
+        self.cursor.execute(deletesql, (token,))
+        self.cnx.commit()
+    #获取天眼查token
+    def GetTYCToken(self):
+        """Return the first token from ``TYC_token``, or ``''`` when the table is empty.
+
+        The empty-table result matches ``GetToken``; previously an empty table
+        raised ``TypeError`` from indexing ``None``.
+        """
+        query = 'select token from TYC_token'
+        self.cursor.execute(query)
+        row = self.cursor.fetchone()
+        self.cnx.commit()
+        if not row:
+            return ''
+        return row[0]
+    #检测语言
+    def detect_language(self, text):
+        """Classify *text* with ``langid`` and return the language code, falling back to ``'cn'`` when it is empty."""
+        result = langid.classify(text)
+        if result == '' or result[0] == '':
+            return 'cn'
+        return result[0]
+    # Append rows to an existing Excel file
+    def writerToExcel(self,detailList,filename):
+        """Append the rows in *detailList* to the existing workbook *filename*.
+
+        The workbook is read in full (all cells as strings), the new rows are
+        added at the end, and the file is rewritten without the index column.
+        """
+        existing_data = pd.read_excel(filename,engine='openpyxl',dtype=str)
+        new_data = pd.DataFrame(data=detailList)
+        # DataFrame.append was removed in pandas 2.0; concat gives the same
+        # result on both old and new pandas versions.
+        combined_data = pd.concat([existing_data, new_data], ignore_index=True)
+        combined_data.to_excel(filename, index=False)
+    #对失败或者断掉的企业 重新放入redis
+    def rePutIntoR(self,key,item):
+        """Push *item* onto the tail of the Redis list *key* (used to re-queue failed work)."""
+        redis_client = self.r
+        redis_client.rpush(key, item)
+    #增加计数器的值并返回增加后的值
+    def incrSet(self,key):
+        """Increment the Redis counter *key*, print the new value and return it."""
+        updated = self.r.incr(key)
+        print("增加后的值：", updated)
+        return updated
+    #获取key剩余的过期时间
+    def getttl(self,key):
+        """Print the TTL of *key*; when it is negative, reset the key to 0 with a one-hour expiry.
+
+        After a reset the call pauses for two seconds.
+        """
+        store = self.r
+        remaining = store.ttl(key)
+        print("剩余过期时间：", remaining)
+        if remaining < 0:
+            # Negative TTL: start a fresh counter window.
+            store.set(key, 0)
+            store.expire(key, 3600)
+            time.sleep(2)
+    #上传至文件服务器,并解析pdf的内容和页数
+    def upLoadToServe(self,pdf_url,type_id,social_code):
+        """Download a PDF, upload it to the file server, and extract its text and page count.
+
+        The download and the upload/parse step are each tried up to three times
+        with a three-second pause between attempts.
+
+        Returns:
+            The attachment dict. ``state`` is True only when the file was
+            uploaded and parsed with at least one page; in that case ``path``,
+            ``full_path``, ``file_size``, ``create_time``, ``page_size`` and
+            ``content`` are filled in.
+        """
+        retData = {'state':False,'type_id':type_id,'item_id':social_code,'group_name':'group1','path':'','full_path':'',
+                   'category':'pdf','file_size':'','status':1,'create_by':'XueLingKun',
+                   'create_time':'','page_size':'','content':''}
+        headers = {'User-Agent': self.getRandomUserAgent()}
+        resp_content = None
+        for _ in range(3):
+            try:
+                resp_content = requests.get(pdf_url, headers=headers, verify=False, timeout=20).content
+                break
+            except Exception:
+                time.sleep(3)
+        if resp_content is None:
+            # Nothing was downloaded, so there is nothing to upload or parse
+            # (previously this fell through to a retry loop that could only fail).
+            print(f'======pdf解析失败=====')
+            return retData
+        page_size = 0
+        result = None
+        for _ in range(3):
+            try:
+                if result is None:
+                    # Upload once: a retry caused by a parse error must not store a second copy.
+                    result = client.upload_by_buffer(resp_content, file_ext_name='pdf')
+                with fitz.open(stream=resp_content, filetype='pdf') as doc:
+                    pages = doc.page_count
+                    text = ''.join(page.get_text() for page in doc.pages())
+                # Commit the results only after the whole document was read, so
+                # a failed attempt leaves neither partial nor duplicated text.
+                page_size = pages
+                retData['content'] = text
+                break
+            except Exception:
+                time.sleep(3)
+        if page_size < 1:
+            # PDF could not be parsed (or has no pages).
+            print(f'======pdf解析失败=====')
+            return retData
+        file_id = bytes.decode(result['Remote file_id'])
+        retData['state'] = True
+        retData['path'] = file_id.replace('group1', '')
+        retData['full_path'] = file_id
+        retData['file_size'] = result['Uploaded size']
+        retData['create_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        retData['page_size'] = page_size
+        return retData
+    def secrchATT(self,item_id,year,type_id):
+        """Look up an attachment id in ``clb_sys_attachment`` by item, year and type.
+
+        Returns the first matching row as fetched by the cursor, or None when there is none.
+        """
+        query = '''select id from clb_sys_attachment where item_id = %s and year = %s and type_id=%s '''
+        params = (item_id, year, type_id)
+        db_cursor = self.cursor_
+        db_cursor.execute(query, params)
+        return db_cursor.fetchone()
+    #插入到att表 返回附件id
+    def tableUpdate(self,retData,com_name,year,pdf_name,num):
+        """Insert the attachment described by *retData* into ``clb_sys_attachment`` and return its id.
+
+        When a row for the same item, year and type already exists, nothing is
+        inserted and the existing id is returned. *num* is stored as ``order_by``.
+        """
+        field_names = ('item_id', 'type_id', 'group_name', 'path', 'full_path', 'category',
+                       'file_size', 'status', 'create_by', 'page_size', 'create_time')
+        (item_id, type_id, group_name, path, full_path, category,
+         file_size, status, create_by, page_size, create_time) = [retData[name] for name in field_names]
+        existing = self.secrchATT(item_id, year, type_id)
+        if existing:
+            self.getLogger().info(f'com_name:{com_name}已存在')
+            return existing[0]
+        Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
+        values = (
+            year, pdf_name, type_id, item_id, group_name, path, full_path, category, file_size, num,
+            status, create_by, create_time, page_size)
+        self.cursor_.execute(Upsql, values)
+        self.cnx_.commit()
+        self.getLogger().info("更新完成:{}".format(Upsql))
+        # Read the row back to obtain the id of the attachment just inserted.
+        inserted = self.secrchATT(item_id, year, type_id)
+        return inserted[0]
+    # 更新企业的CIK
+    def updateCIK(self,social_code,cik):
+        """Store *cik* in the ``CIK`` column of ``EnterpriseInfo`` for the company *social_code*.
+
+        Failures are logged, never raised.
+        """
+        cnn = None
+        cursor = None
+        try:
+            # Bound parameters instead of string interpolation.
+            sql = "UPDATE EnterpriseInfo SET CIK = %s WHERE SocialCode = %s"
+            cnn = self.pool_caiji.connection()
+            cursor = cnn.cursor()
+            cursor.execute(sql, (cik, social_code))
+            cnn.commit()
+        except Exception:
+            log = self.getLogger()
+            log.info('======保存企业CIK失败=====')
+        finally:
+            # Return the connection to the pool even when the update failed.
+            for resource in (cursor, cnn):
+                if resource is not None:
+                    try:
+                        resource.close()
+                    except Exception:
+                        pass
--- a/comData/newlist/hundred/baseinfo_hundred.py
+++ b/comData/newlist/hundred/baseinfo_hundred.py
+# -*- coding: utf-8 -*-
+import pandas as pd
+import time
+import requests
+import json
+from kafka import KafkaProducer
+from BaseCore import BaseCore
+from getQccId import find_id_by_name
+# Shared helper instance used for database, Redis and logging access throughout this script.
+baseCore = BaseCore()
+# Connection and cursor borrowed from the helper; the main section uses them for its UPDATE statements.
+cnx_ = baseCore.cnx
+cursor_ = baseCore.cursor
+# Module-wide logger.
+log = baseCore.getLogger()
+# 通过企查查id获取企业基本信息
+def info_by_id(com_id,com_name):
+    """Collect a company's basic registration data from two QCC mini-program endpoints.
+
+    Relies on the module globals ``headers`` and ``token``, which the script's
+    main section sets before calling.
+
+    Args:
+        com_id: QCC company id used as the ``unique`` query parameter.
+        com_name: company name; only used for logging and re-queueing.
+
+    Returns:
+        A one-element list holding the payload dict, or ``[]`` when the first
+        response carries no ``result.Company`` (the name is then pushed back
+        onto the ``hundred:baseinfo`` Redis list).
+
+    Raises:
+        KeyError / TypeError: the company record lacks ``Name``, ``CreditCode``
+            or ``Address``. Request and JSON decoding errors also propagate.
+    """
+    aa_dict_list = []
+    resp_dict = _qcc_get_json('v1/ent/detail', com_id)
+    time.sleep(2)
+    com_jc_name = ''
+    try:
+        result_dict = resp_dict['result']['Company']
+    except (KeyError, IndexError, TypeError):
+        log.info(com_name + ":获取失败===========重新放入redis")
+        baseCore.rePutIntoR('hundred:baseinfo',com_name)
+        return aa_dict_list
+    # These three fields are read unguarded, as before: when one is missing
+    # the exception propagates to the caller.
+    company_name = result_dict['Name']
+    CreditCode = result_dict['CreditCode']
+    if CreditCode is None:
+        CreditCode = ''
+    Address = result_dict['Address']
+    if Address is None:
+        Address = ''
+    OperName = _pick(result_dict, 'Oper', 'Name')
+    if baseCore.str_have_num(OperName):
+        # A legal-representative name containing digits is discarded.
+        OperName = ''
+    Status = _pick(result_dict, 'ShortStatus')
+    StartDate = _pick(result_dict, 'StartDate')
+    RegistCapi = _pick(result_dict, 'RegistCapi')
+    EconKind = _pick(result_dict, 'EconKind')
+    Province = _pick(result_dict, 'Area', 'Province', keep_none=True)
+    City = _pick(result_dict, 'Area', 'City', keep_none=True)
+    County = _pick(result_dict, 'Area', 'County', keep_none=True)
+    try:
+        region = Province + City + County
+    except TypeError:
+        # At least one part is not a string (e.g. None).
+        region = ''
+    try:
+        # Former names joined with single spaces. The old code computed
+        # `.strip()` but discarded its result, leaving a trailing space.
+        OriginalName = ''.join(
+            former['Name'] + ' ' for former in result_dict['OriginalName']).strip()
+    except (KeyError, IndexError, TypeError):
+        OriginalName = ''
+    PhoneNumber = _pick(result_dict, 'companyExtendInfo', 'Tel')
+    WebSite = _pick(result_dict, 'companyExtendInfo', 'WebSite', default=None, keep_none=True)
+    if WebSite is None:
+        # Fall back to the first URL listed in the contact info.
+        WebSite = _pick(result_dict, 'ContactInfo', 'WebSite', 0, 'Url', keep_none=True)
+    Email = _pick(result_dict, 'companyExtendInfo', 'Email')
+    Desc = _pick(result_dict, 'companyExtendInfo', 'Desc')
+    Info = _pick(result_dict, 'companyExtendInfo', 'Info')
+    company_name = baseCore.hant_2_hans(company_name)
+    # The second endpoint supplies the fields the first one no longer returns.
+    resp_dict2 = _qcc_get_json('v6/base/getEntDetail', com_id)
+    time.sleep(1)
+    com2 = _pick(resp_dict2, 'result', 'Company')
+    Scope = _pick(com2, 'Scope', keep_none=True)
+    CheckDate = _pick(com2, 'CheckDate')
+    TaxpayerType = _pick(com2, 'TaxpayerType')
+    # A single lookup: the old duplicated lookup overwrote the None-normalised value.
+    No = _pick(com2, 'No')
+    IxCode = _pick(com2, 'IxCode', keep_none=True)
+    OrgNo = _pick(com2, 'OrgNo', keep_none=True)
+    TermStart = _pick(com2, 'TermStart', keep_none=True)
+    TeamEnd = _pick(com2, 'TeamEnd', keep_none=True)
+    RecCap = _pick(com2, 'RecCap', keep_none=True)
+    SubIndustry = _pick(com2, 'IndustryArray', -1, keep_none=True)
+    BelongOrg = _pick(com2, 'BelongOrg', keep_none=True)
+    EnglishName = _pick(com2, 'EnglishName', keep_none=True)
+    can_bao = ''
+    try:
+        for common_item in com2['CommonList']:
+            try:
+                if common_item['KeyDesc'] == '参保人数':
+                    can_bao = common_item['Value']
+            except (KeyError, IndexError, TypeError):
+                pass
+    except (KeyError, IndexError, TypeError):
+        can_bao = ''
+    aa_dict = {
+        'qccId': com_id,  # QCC company id
+        'name': company_name,  # company name (Simplified Chinese)
+        'shortName': com_jc_name,  # short name (always empty here)
+        'socialCreditCode': CreditCode,  # unified social credit code
+        'legalPerson': OperName,  # legal representative
+        'officialPhone': PhoneNumber,  # phone
+        'officialUrl': WebSite,  # website
+        'officialEmail': Email,  # e-mail
+        'briefInfo': Desc,  # description
+        'registerStatus': Status,  # registration status
+        'incorporationDate': StartDate,  # date of establishment
+        'capital': RegistCapi,  # registered capital
+        'paidCapital': RecCap,  # paid-in capital
+        'approvalDate': CheckDate,  # approval date
+        'organizationCode': OrgNo,  # organization code
+        'registerNo': No,  # business registration number
+        'taxpayerNo': CreditCode,  # taxpayer id (the credit code is reused)
+        'type': EconKind,  # company type
+        'businessStartDate': TermStart,  # business term start
+        'businessEndDate': TeamEnd,  # business term end
+        'taxpayerQualification': TaxpayerType,  # taxpayer qualification
+        'industry': SubIndustry,  # industry
+        'region': region,
+        'province': Province,  # province
+        'city': City,  # city
+        'county': County,  # county
+        'registerDepartment': BelongOrg,  # registration authority
+        'scale': Info,  # staff size
+        'insured': can_bao,  # number of insured employees
+        'beforeName': OriginalName,  # former names
+        'englishName': EnglishName,  # English name
+        'importExportEnterpriseCode': IxCode,  # import/export enterprise code
+        'address': Address,  # address
+        'businessRange': Scope,  # business scope
+        'status': 0,  # status
+    }
+    aa_dict_list.append(aa_dict)
+    log.info(company_name + "：爬取完成")
+    return aa_dict_list
+def _pick(source, *path, default='', keep_none=False):
+    """Follow *path* (keys / indexes) into *source*; return *default* when a step fails.
+
+    A ``None`` result is also replaced by *default* unless *keep_none* is set.
+    """
+    try:
+        value = source
+        for step in path:
+            value = value[step]
+    except (KeyError, IndexError, TypeError):
+        return default
+    if value is None and not keep_none:
+        return default
+    return value
+def _qcc_get_json(endpoint, com_id):
+    """GET one QCC ``forwardApp`` endpoint for *com_id* and return the decoded JSON body."""
+    t = str(int(time.time()) * 1000)
+    headers['Qcc-Timestamp'] = t
+    url = "https://xcx.qcc.com/mp-weixin/forwardApp/{}?token={}&t={}&unique={}".format(endpoint, token, t, com_id)
+    return requests.get(url=url, headers=headers, verify=False).json()
+if __name__ == '__main__':
+    # Worker loop: pop company names from the Redis list `hundred:baseinfo`,
+    # resolve each to a QCC id, fetch its basic info and publish it to Kafka.
+    taskType = '基本信息/企查查/单项双百企业冠军'
+    headers = {
+        'Host': 'xcx.qcc.com',
+        'Connection': 'keep-alive',
+        'Qcc-Platform': 'mp-weixin',
+        'Qcc-Timestamp': '',
+        'Qcc-Version': '1.0.0',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat',
+        'content-type': 'application/json',
+        'Referer': 'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html',
+        'Accept-Encoding': 'gzip, deflate, br,'
+    }
+    list_weicha = []
+    name_list = []
+    while True:
+        # Tokens are read from the database; they have to be refreshed by hand
+        # roughly every two hours.
+        token = baseCore.GetToken()
+        if not token:
+            log.info('==========已无token==========')
+            time.sleep(30)
+            continue
+        start_time = time.time()
+        # Next company to process.
+        com_name = baseCore.redicPullData('hundred:baseinfo')
+        if com_name == '' or com_name is None:
+            time.sleep(20)
+            continue
+        dic_info = baseCore.getInfomation(com_name)
+        log.info(f'----当前企业{com_name}--开始处理---')
+        if not dic_info:
+            # getInfomation returns [] when the name has no row (or the query
+            # failed); indexing it below would crash the whole worker.
+            log.info(f'====={com_name}=====数据库中无该企业信息，跳过=====')
+            continue
+        social_code = dic_info[5]
+        # QCC company id
+        company_id = dic_info[6]
+        # No stored id yet: search by credit code when there is one, else by name.
+        if company_id is None:
+            search_key = social_code if social_code else com_name
+            company_id = find_id_by_name(start_time, token, search_key)
+            if company_id == 'null':
+                log.info('=====搜索不到该企业====')
+                # TODO: companies that cannot be found have no credit code to send; generate one
+                baseCore.rePutIntoR('hundred:baseinfo', com_name + '：搜索不到')
+                continue
+            if not company_id:
+                log.info(com_name + "：企业ID获取失败===重新放入redis")
+                list_weicha.append(com_name + "：企业ID获取失败")
+                baseCore.rePutIntoR('hundred:baseinfo',com_name)
+                baseCore.delete_token(token)
+                log.info('=====已重新放入redis,失效token已删除======')
+                time.sleep(20)
+                continue
+            else:
+                log.info(f'====={com_name}===={company_id}=====获取企业id成功=====')
+                # Persist the resolved id (bound parameters, not string interpolation).
+                cursor_.execute("update Hundred set qccid = %s where CompanyName = %s",
+                                (company_id, com_name))
+                cnx_.commit()
+        try:
+            post_data_list = info_by_id(company_id, com_name)
+        except Exception:
+            log.info(f'====={social_code}=====获取基本信息失败，重新放入redis=====')
+            # Same list the loop reads from; the old key 'hundred:baseInfo'
+            # differed in case, so re-queued names were never picked up again.
+            baseCore.rePutIntoR('hundred:baseinfo', com_name)
+            baseCore.delete_token(token)
+            log.info('=====已重新放入redis,失效token已删除======')
+            continue
+        if not post_data_list:
+            time.sleep(20)
+            continue
+        for post_data in post_data_list:
+            if post_data is None:
+                print(com_name + "：企业信息获取失败")
+                list_weicha.append(com_name + "：企业信息获取失败")
+                continue
+            get_name = post_data['name']
+            get_socialcode = post_data['socialCreditCode']
+            # Store the credit code that QCC returned.
+            cursor_.execute("update Hundred set SocialCode = %s where CompanyName = %s",
+                            (get_socialcode, com_name))
+            cnx_.commit()
+            name_compile = {
+                'yuan_name':com_name,
+                'get_name':get_name
+            }
+            name_list.append(name_compile)
+            log.info(f'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time,time.time())}')
+            try:
+                producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 0, 2))
+                kafka_result = producer.send("regionInfo", json.dumps(post_data, ensure_ascii=False).encode('utf8'))
+                print(kafka_result.get(timeout=10))
+            except Exception:
+                exception = 'kafka传输失败'
+                state = 0
+                takeTime = baseCore.getTimeCost(start_time, time.time())
+                baseCore.recordLog(get_socialcode, taskType, state, takeTime, '', exception)
+                log.info(f"{get_name}--{get_socialcode}--kafka传输失败")
+    # NOTE: not reached while the loop above has no `break`; kept for manual runs.
+    nowtime = baseCore.getNowTime(1).replace('-','_')[:10]
+    companyName = pd.DataFrame(name_list)
+    companyName.to_excel(f'./data/企业名称对比_{nowtime}.xlsx',index=False)
+    false_com = pd.DataFrame(list_weicha)
+    false_com.to_excel(f'./data/采集失败企业名单_{nowtime}.xlsx',index=False)
--- a/comData/newlist/hundred/getQccId.py
+++ b/comData/newlist/hundred/getQccId.py
+# -*- coding: utf-8 -*-
+import time
+from urllib.parse import quote
+import requests
+import urllib3
+from BaseCore import BaseCore
+# Shared helper instance; used in this module for elapsed-time formatting and logging.
+baseCore = BaseCore()
+# Module-wide logger.
+log = baseCore.getLogger()
+# headers = {
+#         'Host': 'xcx.qcc.com',
+#         'Connection': 'keep-alive',
+#         'Qcc-Platform': 'mp-weixin',
+#         'Qcc-Timestamp': '',
+#         'Qcc-Version': '1.0.0',
+#         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat',
+#         'content-type': 'application/json',
+#         'Referer': 'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html',
+#         'Accept-Encoding': 'gzip, deflate, br,'
+#     }
+# Request headers sent with the QCC search call below; find_id_by_name
+# overwrites 'Qcc-Timestamp' on every call.
+# NOTE(review): 'authMini' carries a hard-coded bearer value -- presumably
+# captured from a live session and likely to expire; confirm how it is refreshed.
+headers = {
+        'Host': 'xcx.qcc.com',
+        'Connection': 'keep-alive',
+        'x-request-device-type': 'Android',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF XWEB/8391',
+        'Content-Type': 'application/json',
+        'Qcc-Version': '1.0.0',
+        'authMini': 'Bearer f51dae1a2fcb109fa9ec58bd4a85e5c5',
+        'xweb_xhr': '1',
+        'xcx-version': '2023.09.27',
+        'Qcc-Platform': 'mp-weixin',
+        'Qcc-CurrentPage': '/company-subpackages/business/index',
+        'Qcc-Timestamp': '1696661787803',
+        'Qcc-RefPage': '/company-subpackages/detail/index',
+        'Accept': '*/*',
+        'Sec-Fetch-Site': 'cross-site',
+        'Sec-Fetch-Mode': 'cors',
+        'Sec-Fetch-Dest': 'empty',
+        'Referer': 'https://servicewechat.com/wx395200814fcd7599/307/page-frame.html',
+        'Accept-Encoding': 'gzip, deflate, br',
+        'Accept-Language': 'zh-CN,zh'
+}
+# 通过企业名称或信用代码获取企查查id
+def find_id_by_name(start,token,name):
+    """Search QCC for *name* (a company name or credit code) and return the first hit's ``KeyNo``.
+
+    Args:
+        start: ``time.time()`` value used only to report elapsed time in logs.
+        token: QCC session token placed in the query string.
+        name: search key.
+
+    Returns:
+        The ``KeyNo`` string of the first result; the string ``'null'`` when
+        the search has no usable hit; ``False`` when the token is rejected
+        (status 40101), the account is throttled (status 401), or the response
+        does not have the expected shape.
+
+    Raises:
+        The last request / JSON decoding error when all five attempts fail
+        (previously this surfaced as a NameError on ``resp_dict``).
+    """
+    urllib3.disable_warnings()
+    qcc_key = name
+    t = str(int(time.time()) * 1000)
+    headers['Qcc-Timestamp'] = t
+    url = f"https://xcx.qcc.com/mp-weixin/forwardApp/v3/base/advancedSearch?token={token}&t={t}&pageIndex=1&needGroup=yes&insuredCntStart=&insuredCntEnd=&startDateBegin=&startDateEnd=&registCapiBegin=&registCapiEnd=&countyCode=&province=&sortField=&isSortAsc=&searchKey={quote(qcc_key)}&searchIndex=default&industryV3="
+    resp_dict = None
+    fetched = False
+    last_error = None
+    for _ in range(5):
+        try:
+            resp_dict = requests.get(url=url, headers=headers, verify=False).json()
+            fetched = True
+            break
+        except Exception as e:
+            last_error = e
+            print(f'{e}-------------重试')
+            time.sleep(5)
+    if not fetched:
+        # Every attempt failed: surface the real cause instead of continuing
+        # with an unbound response.
+        raise last_error
+    time.sleep(2)
+    # Known error payloads:
+    #   {'status': 40101, 'message': '无效的sessionToken!'}
+    #   {'status': 401, 'message': '您的账号访问超频，请升级小程序版本'}
+    status = resp_dict.get('status') if isinstance(resp_dict, dict) else None
+    if status == 40101:
+        log.info(f'====token失效====时间{baseCore.getTimeCost(start, time.time())}')
+        return False
+    if status == 401:
+        log.info(f'=======您的账号访问超频，请升级小程序版本=====时间{baseCore.getTimeCost(start, time.time())}')
+        return False
+    try:
+        if resp_dict['result']['Result']:
+            result_dict = resp_dict['result']['Result'][0]
+            KeyNo = result_dict['KeyNo']
+            # Search hits wrap the matched text in <em> tags.
+            Name = result_dict['Name'].replace('<em>', '').replace('</em>', '').strip()
+            if Name == '':
+                KeyNo = 'null'
+        else:
+            KeyNo = 'null'
+    except (KeyError, IndexError, TypeError, AttributeError):
+        # Unexpected response shape is treated like an invalid token.
+        log.info(f'====token失效====时间{baseCore.getTimeCost(start,time.time())}')
+        return False
+    log.info("{}，企业代码为:{}".format(qcc_key, KeyNo))
+    return KeyNo
\ No newline at end of file
--- a/comData/newlist/technological/BaseCore.py
+++ b/comData/newlist/technological/BaseCore.py
@@ -541,7 +541,10 @@ class BaseCore:
        self.cursor.execute(query)
        token_list = self.cursor.fetchall()
        self.cnx.commit()
+        try:
            token = token_list[random.randint(0, len(token_list)-1)][0]
+        except:
+            token = ''
        return token
    # 删除失效的token

--- a/comData/policylaw/BaseCore.py
+++ b/comData/policylaw/BaseCore.py
@@ -11,24 +11,28 @@ import logbook.more
 import pandas as pd
 import requests
 import zhconv
-import pymysql
 import redis
-from docx import Document
-from selenium import webdriver
-from selenium.webdriver.chrome.service import Service
-from openpyxl import Workbook
 import langid
 #创建连接池
 import pymysql
-from pymysql import connections
 from DBUtils.PooledDB import PooledDB
 # import sys
 # sys.path.append('D://zzsn_spider//base//fdfs_client')
 from fdfs_client.client import get_tracker_conf, Fdfs_client
-tracker_conf = get_tracker_conf('E:\\kkwork\\zzsn_spider\\comData\\policylaw\\client.conf')
+tracker_conf = get_tracker_conf('D:\\kkwork\\zzsn_spider\\comData\\policylaw\\client.conf')
 client = Fdfs_client(tracker_conf)
+from obs import ObsClient
+import fitz
+from urllib.parse import unquote
+# Module-level Huawei Cloud OBS client; BaseCore.uptoOBS uploads through it.
+# NOTE(review): the access key and secret key below are committed in plain
+# text - consider loading them from configuration/environment and rotating them.
+obsClient = ObsClient(
+        access_key_id='VEHN7D0TJ9316H8AHCAV',  # your Huawei Cloud access key (AK)
+        secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY',  # your Huawei Cloud secret key (SK)
+        server='https://obs.cn-north-1.myhuaweicloud.com'  # address of your bucket
+    )
 # 注意 程序退出前 调用BaseCore.close() 关闭相关资源
 class BaseCore:
@@ -437,9 +441,9 @@ class BaseCore:
    #解析word文件页数
-    def doc_page(self,file_path):
+    # def doc_page(self,file_path):
-        doc = Document(file_path)
+    #     doc = Document(file_path)
-        return len(doc.sections)
+    #     return len(doc.sections)
    def pdf_content(self,resp_content):
        # 解析pdf文件内容
        content = ''
@@ -507,9 +511,9 @@ class BaseCore:
        # retData['page_size'] = page_size
        return retData
-    def secrchATT(self,item_id,file_name,type_id):
+    def secrchATT(self,item_id,file_name,type_id,order_by):
-        sel_sql = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s '''
+        sel_sql = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s and order_by=%s '''
-        self.cursor_.execute(sel_sql, (item_id, file_name, type_id))
+        self.cursor_.execute(sel_sql, (item_id, file_name, type_id,order_by))
        selects = self.cursor_.fetchone()
        return selects
@@ -527,13 +531,8 @@ class BaseCore:
            page_size = retData['page_size']
            create_time = retData['create_time']
            order_by = num
-            selects = self.secrchATT(item_id,file_name,type_id)
-            if selects:
-                self.getLogger().info(f'com_name:{com_name}已存在')
-                id = selects[0]
-                return id,full_path
-            else:
            Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
            values = (
@@ -544,11 +543,71 @@ class BaseCore:
            self.cursor_.execute(Upsql, values)  # 插入
            self.cnx_.commit()  # 提交
            self.getLogger().info("更新完成:{}".format(Upsql))
-                selects = self.secrchATT(item_id,file_name,type_id)
+            selects = self.secrchATT(item_id,file_name,type_id,order_by)
            id = selects[0]
            return id,full_path
+    # 获取文件大小
+    def convert_size(self,size_bytes):
+        """Render a byte count as a two-decimal string with a unit.
+
+        The value is divided by 1024 until it drops below 1024 or the largest
+        unit (TB) is reached, e.g. ``1536`` -> ``'1.50 KB'``.
+        """
+        labels = ('bytes', 'KB', 'MB', 'GB', 'TB')
+        amount = size_bytes
+        # Walk every unit except the last; TB is the catch-all below.
+        for label in labels[:-1]:
+            if amount >= 1024:
+                amount = amount / 1024
+                continue
+            return f"{amount:.2f} {label}"
+        return f"{amount:.2f} {labels[-1]}"
+    def uptoOBS(self,file_href,item_id,pathType,file_name):
+        """Download an attachment and store it in the 'zzsn' OBS bucket.
+
+        :param file_href: URL of the attachment to download.
+        :param item_id: copied unchanged into the returned record.
+        :param pathType: object-key prefix (callers pass e.g. 'policy/beijing/').
+        :param file_name: attachment name; the extension taken from
+            ``file_href`` is appended when the name does not contain it.
+        :return: attachment record dict. ``state`` is True only when both the
+            download and the upload succeeded; in that case ``path``,
+            ``full_path``, ``file_size`` and ``create_time`` are filled in.
+            ``page_size`` and ``content`` are not computed here and stay ''.
+        """
+        category = os.path.splitext(file_href)[1]
+        retData = {'state': False, 'type_id': 7, 'item_id': item_id, 'group_name': 'group1', 'path': '',
+                   'full_path': '',
+                   'category': category, 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
+                   'create_time': '', 'page_size': '', 'content': ''}
+        headers = {'User-Agent': self.getRandomUserAgent()}
+        # Download: up to three attempts; ``response`` stays None on total failure.
+        response = None
+        for _ in range(0, 3):
+            try:
+                response = requests.get(file_href, headers=headers, verify=False, timeout=20)
+                break
+            except Exception as e:
+                self.getLogger().info(f'附件下载失败:{file_href}----{e}')
+                time.sleep(3)
+        if response is None:
+            return retData
+        # Content-Length is absent for some responses (e.g. chunked transfer),
+        # so fall back to the size of the downloaded body.
+        content_length = response.headers.get('Content-Length')
+        if content_length is not None and str(content_length).strip().isdigit():
+            file_size = int(content_length)
+        else:
+            file_size = len(response.content)
+        # splitext() keeps the leading dot, so the suffix is appended as-is.
+        if category not in file_name:
+            file_name = file_name + category
+        # Upload: up to three attempts; ``result`` stays None on total failure.
+        result = None
+        for _ in range(0, 3):
+            try:
+                result = obsClient.putContent('zzsn', f'{pathType}' + file_name, content=response.content)
+                break
+            except Exception as e:
+                self.getLogger().info(f'附件上传OBS失败:{file_href}----{e}')
+                time.sleep(3)
+        if result is None:
+            return retData
+        try:
+            object_url = result['body']['objectUrl']
+            uploaded = {
+                'path': object_url.split('.com')[1],
+                'full_path': unquote(object_url),
+                'file_size': self.convert_size(file_size),
+                'create_time': time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
+            }
+        except Exception as e:
+            # The upload response did not carry a usable object URL.
+            print(f'error:{e}')
+            return retData
+        retData.update(uploaded)
+        # Flag success last so a half-filled record is never reported as good.
+        retData['state'] = True
+        return retData

--- a/comData/policylaw/policy.py
+++ b/comData/policylaw/policy.py
@@ -224,6 +224,7 @@ def get_content1():
                        # 判断是否已经爬取过
                        is_href = db_storage.find_one({'网址': href})
                        if is_href:
+                            num+=1
                            log.info('已采集----------跳过')
                            continue
                        try:
@@ -383,6 +384,7 @@ def get_content2():
                        # # 判断是否已经爬取过
                        is_href = db_storage.find_one({'网址': href})
                        if is_href:
+                            num+=1
                            log.info('已采集----------跳过')
                            continue
                        try:
@@ -563,6 +565,7 @@ def get_content3():
                pub_time = li.split('<span>[')[1].split(']</span>')[0]
                is_href = db_storage.find_one({'网址': href})
                if is_href:
+                    num+=1
                    log.info('已采集----------跳过')
                    continue
                sendContent(href, headers,title,pub_time,num)
@@ -591,6 +594,7 @@ def get_content3():
                    # 判断是否已经爬取过
                    is_href = db_storage.find_one({'网址': href})
                    if is_href:
+                        num+=1
                        log.info('已采集----------跳过')
                        continue
                    title = doc_item('a').attr('title')
@@ -612,6 +616,7 @@ def get_content3():
 def bei_jing():
    num = 0
    start_time = time.time()
+    pathType = 'policy/beijing/'
    # 有反爬需要使用selenium
    # service = Service(r'D:/chrome/113/chromedriver.exe')
    # 配置selenium
@@ -664,6 +669,7 @@ def bei_jing():
            # 判断是否已经爬取过
            is_href = db_storage.find_one({'网址': href[0]})
            if is_href:
+                num+=1
                log.info('已采集----------跳过')
                continue
            # 对获取信息页面发送请求
@@ -712,7 +718,7 @@ def bei_jing():
                        or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                        or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                    file_name = file.text.strip()
-                    retData = baseCore.uploadToserver(file_href, '1667')
+                    retData = baseCore.uptoOBS(file_href, '1667',pathType,file_name)
                    if retData['state']:
                        pass
                    else:
@@ -721,7 +727,7 @@ def bei_jing():
                    id_list.append(att_id)
                    # todo:将返回的地址更新到soup
-                    file['href'] = 'http://114.115.215.96/' + full_path
+                    file['href'] = full_path
            # id_ = redefid(id_list)
            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
@@ -754,7 +760,7 @@ def bei_jing():
            # id_list.append(id)
            num += 1
        end_time = time.time()
-        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+        log.info(f'共抓取{num}条数据,共耗时{end_time - start_time}')
        bro.quit()
    except Exception as e:
        log.info(e)
@@ -763,6 +769,7 @@ def bei_jing():
 # 内蒙古
 def nei_meng_gu():
    start = time.time()
+    pathType = 'policy/neimenggu/'
    num = 0
    url = 'http://gzw.nmg.gov.cn/zfxxgk/zcfg/index.html'
    try:
@@ -780,6 +787,7 @@ def nei_meng_gu():
            # todo:测试用 注释掉判重
            is_href = db_storage.find_one({'网址': real_href})
            if is_href:
+                num+=1
                continue
            try:
                # 获取所需信息
@@ -831,16 +839,16 @@ def nei_meng_gu():
                            fu_jian_re = str(real_href).split('/t')[0] + '/' + str(fu_jian_re).split('./')[1]
                            fu_jian_href = fu_jian_re
+                            # print(fu_jian_href)
                            # todo:附件上传至文件服务器
-                            retData = baseCore.uploadToserver(fu_jian_href, '1669')
+                            retData = baseCore.uptoOBS(fu_jian_href, '1669',pathType,title)
                            if retData['state']:
                                pass
                            else:
                                continue
                            att_id, full_path = baseCore.tableUpdate(retData, '内蒙古自治区国资委', title, num)
                            id_list.append(att_id)
-                            # # todo:将返回的地址更新到soup
-                            # fu_jian_link['href'] = 'http://114.115.215.96/' + full_path
                print(title)
                time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
@@ -881,6 +889,7 @@ def nei_meng_gu():
 # 吉林
 def ji_lin():
+    pathType = 'policy/jilin/'
    start = time.time()
    num = 0
    url = 'http://gzw.jl.gov.cn/zwgk/zcwj/'
@@ -902,6 +911,7 @@ def ji_lin():
            title = a.find('a').text.replace('\n', '')
            is_href = db_storage.find_one({'网址': real_href})
            if is_href:
+                num+=1
                continue
            try:
                # real_href = 'http://gzw.jl.gov.cn/zwgk/zcwj//201906/t20190624_2310742.html'
@@ -972,16 +982,17 @@ def ji_lin():
                                    or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
                                    or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                                file_name = fu_jian_href.text.strip()
-                                retData = baseCore.uploadToserver(fu_jian_href, '1670')
+                                # print(fu_jian_href)
+                                retData = baseCore.uptoOBS(fu_jian_href, '1670',pathType,file_name)
                                if retData['state']:
                                    pass
                                else:
                                    continue
                                att_id, full_path = baseCore.tableUpdate(retData, '吉林市国资委', file_name, num)
                                id_list.append(att_id)
+                                #
-                                # todo:将返回的地址更新到soup
+                                # # todo:将返回的地址更新到soup
-                                li.find('a')['href'] = 'http://114.115.215.96/' + full_path
+                                li.find('a')['href'] = full_path
                            else:
                                continue
                else:
@@ -1009,16 +1020,17 @@ def ji_lin():
                        if '.pdf' in fj_href or '.wps' in fj_href or '.docx' in fj_href or '.doc' in fj_href or 'xls' in fj_href or '.zip' in fj_href \
                                or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
                                or '.XLS' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
-                            retData = baseCore.uploadToserver(fj_href, '1670')
+                            # print(fj_href)
+                            retData = baseCore.uptoOBS(fj_href, '1670',pathType,file_name)
                            if retData['state']:
                                pass
                            else:
                                continue
                            att_id, full_path = baseCore.tableUpdate(retData, '吉林省国资委', file_name, num)
                            id_list.append(att_id)
+                            #
-                            # todo:将返回的地址更新到soup
+                            # # todo:将返回的地址更新到soup
-                            fu_jian_href['href'] = 'http://114.115.215.96/' + full_path
+                            fu_jian_href['href'] = full_path
                        else:
                            continue
@@ -1062,7 +1074,7 @@ def ji_lin():
                        save_data(dic_news)
                    num = num + 1
            except Exception as e:
-                print(e)
+                log.info(e)
                pass
    except:
        pass
@@ -1073,6 +1085,7 @@ def ji_lin():
 def shang_hai():
    start = time.time()
+    pathType = 'policy/shanghai/'
    num = 0
    for page in range(1, 7):
@@ -1095,6 +1108,7 @@ def shang_hai():
                    href = 'https://www.gzw.sh.gov.cn' + href
                is_href = db_storage.find_one({'网址': href})
                if is_href:
+                    num+=1
                    continue
                try:
                    href = 'https://www.gzw.sh.gov.cn/shgzw_xxgk_zxgkxx/20230119/7c5e9691b2b54ff293e5d16d746d1a61.html'
@@ -1154,7 +1168,7 @@ def shang_hai():
                        if '.doc' in fu_jian_href or '.docx' in fu_jian_href or '.pdf' in fu_jian_href or '.xls' in fu_jian_href or '.zip' in fu_jian_href \
                                or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
                                or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
-                            retData = baseCore.uploadToserver(fu_jian_href, '1671')
+                            retData = baseCore.uptoOBS(fu_jian_href, '1671',pathType,file_name)
                            if retData['state']:
                                pass
                            else:
@@ -1163,7 +1177,7 @@ def shang_hai():
                            id_list.append(att_id)
                            # todo:将返回的地址更新到soup
-                            a['href'] = 'http://114.115.215.96/' + full_path
+                            a['href'] = full_path
                        else:
                            continue
@@ -1205,6 +1219,7 @@ def shang_hai():
 # 浙江
 def zhe_jiang():
    start = time.time()
+    pathType = 'policy/zhejiang/'
    num = 0
    url = 'http://gzw.zj.gov.cn/col/col1229430928/index.html'
    try:
@@ -1227,6 +1242,7 @@ def zhe_jiang():
                href = 'http://gzw.zj.gov.cn/' + href
            is_href = db_storage.find_one({'网址': href})
            if is_href:
+                num+=1
                continue
            try:
                href_text = requests.get(url=href, headers=headers, verify=False)
@@ -1325,6 +1341,7 @@ def zhe_jiang():
 # 福建
 def fu_jian():
    error_tag = str(404)
+    pathType = 'policy/fujian/'
    num = 0
    start_time = time.time()
    url = 'http://gzw.fujian.gov.cn/zwgk/zcfg/'
@@ -1373,6 +1390,7 @@ def fu_jian():
                # print(real_href)
                is_href = db_storage.find_one({'网址': real_href})
                if is_href:
+                    num+=1
                    continue
                try:
                    # 文章是远程pdf
@@ -1384,7 +1402,7 @@ def fu_jian():
                        content = baseCore.pdf_content(resp_content)
                        contentwithtag = ''
                        # 文件上传至服务器
-                        retData = baseCore.uploadToserver(real_href, '1673')
+                        retData = baseCore.uptoOBS(real_href, '1673',pathType,title)
                        if retData['state']:
                            pass
                        else:
@@ -1420,7 +1438,7 @@ def fu_jian():
                                        or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
                                        or '.XLS' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
                                    # 找到附件后 上传至文件服务器
-                                    retData = baseCore.uploadToserver(fj_href, '1673')
+                                    retData = baseCore.uptoOBS(fj_href, '1673',pathType,file_name)
                                    if retData['state']:
                                        pass
                                    else:
@@ -1428,7 +1446,7 @@ def fu_jian():
                                    att_id, full_path = baseCore.tableUpdate(retData, '福建省国资委', file_name, num)
                                    id_list.append(att_id)
                                    # 将文件服务器的链接替换
-                                    fu_jian['href'] = 'http://114.115.215.96/' + full_path
+                                    fu_jian['href'] = full_path
                            source_ = str(i_soup.find('div', attrs={'class': 'xl_tit2_l'}).text)
                            pub_source = source_.split('来源：')[1].split('发布时间：')[0].strip().lstrip()
@@ -1499,6 +1517,7 @@ def shan_dong():
                href = li.find('a')['href']
                is_href = db_storage.find_one({'网址': href})
                if is_href:
+                    num+=1
                    continue
                try:
                    href_text = requests.get(url=href, headers=headers, verify=False)
@@ -1593,6 +1612,7 @@ def shan_dong():
 # 广东
 def guang_dong():
    start = time.time()
+    pathType = 'policy/guangdong/'
    num = 0
    url = 'http://gzw.gd.gov.cn/zcfg/index.html'
    try:
@@ -1620,6 +1640,7 @@ def guang_dong():
                href = doc_item('a').attr('href')
                is_href = db_storage.find_one({'网址': href})
                if is_href:
+                    num+=1
                    continue
                try:
                    # print(href)
@@ -1644,7 +1665,7 @@ def guang_dong():
                                or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
                                or '.xlsx' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
                            # 附件上传至文件服务器
-                            retData = baseCore.uploadToserver(fj_href, '1676')
+                            retData = baseCore.uptoOBS(fj_href, '1676',pathType,file_name)
                            if retData['state']:
                                pass
                            else:
@@ -1652,7 +1673,7 @@ def guang_dong():
                            att_id, full_path = baseCore.tableUpdate(retData, '广东省国资委', file_name, num)
                            id_list.append(att_id)
                            # 将文件服务器的链接替换
-                            fu_jian['href'] = 'http://114.115.215.96/' + full_path
+                            fu_jian['href'] = full_path
                    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    # todo:传kafka字段
@@ -1692,6 +1713,7 @@ def guang_dong():
 # 海南
 def hai_nan():
+    pathType = 'policy/hainan/'
    def hai_nan1():
        # 部门文件
        num = 0
@@ -1717,6 +1739,7 @@ def hai_nan():
                        href = href.replace('./', 'http://gzw.hainan.gov.cn/zwgk_23509/zfwj/bmwj/')
                    is_href = db_storage.find_one({'网址': href})
                    if is_href:
+                        num+=1
                        continue
                    try:
                        try:
@@ -1759,7 +1782,7 @@ def hai_nan():
                                        or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
                                        or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                                    # 上传至文件服务器
-                                    retData = baseCore.uploadToserver(fu_jian_href, '1677')
+                                    retData = baseCore.uptoOBS(fu_jian_href, '1677',pathType,file_name)
                                    if retData['state']:
                                        pass
                                    else:
@@ -1767,7 +1790,7 @@ def hai_nan():
                                    att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num)
                                    id_list.append(att_id)
                                    # 将文件服务器的链接替换
-                                    fu_jian['href'] = 'http://114.115.215.96/' + full_path
+                                    fu_jian['href'] = full_path
                        except:
                            try:
                                # print(href)
@@ -1801,7 +1824,7 @@ def hai_nan():
                                                or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                                            # print(f'----附件：{fu_jian_href}-----filename:{file_name}')
                                            # 附件上传至文件服务器
-                                            retData = baseCore.uploadToserver(fu_jian_href, '1677')
+                                            retData = baseCore.uptoOBS(fu_jian_href, '1677',pathType,file_name)
                                            if retData['state']:
                                                pass
                                            else:
@@ -1809,7 +1832,7 @@ def hai_nan():
                                            # 更新到数据库
                                            att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num)
                                            id_list.append(att_id)
-                                            fu_jian['href'] = 'http://114.115.215.96/' + full_path
+                                            fu_jian['href'] = full_path
                                except:
                                    continue
@@ -1888,6 +1911,7 @@ def hai_nan():
                # print(title,href)
                is_href = db_storage.find_one({'网址': href})
                if is_href:
+                    num+=1
                    continue
                try:
                    # print(href)
@@ -1959,6 +1983,7 @@ def hai_nan():
                # print(title,href)
                is_href = db_storage.find_one({'网址': href})
                if is_href:
+                    num+=1
                    continue
                try:
                    # print(href)
@@ -2007,7 +2032,7 @@ def hai_nan():
                                or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
                                or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                            # 上传至文件服务器
-                            retData = baseCore.uploadToserver(fu_jian_href, '1677')
+                            retData = baseCore.uptoOBS(fu_jian_href, '1677',pathType,file_name)
                            if retData['state']:
                                pass
                            else:
@@ -2015,7 +2040,7 @@ def hai_nan():
                            att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num)
                            id_list.append(att_id)
                            # todo:将返回的地址更新到soup
-                            fu_jian['href'] = 'http://114.115.215.96/' + full_path
+                            fu_jian['href'] = full_path
                            # print(f'附件：{fu_jian_href}')
                    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    # todo:传kafka字段
@@ -2065,6 +2090,7 @@ def hai_nan():
                # print(title,href)
                is_href = db_storage.find_one({'网址': href})
                if is_href:
+                    num+=1
                    continue
                try:
                    href_text = requests.get(url=href, headers=headers, verify=False)
@@ -2113,14 +2139,14 @@ def hai_nan():
                                    or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
                                    or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                                # 上传至文件服务器
-                                retData = baseCore.uploadToserver(fu_jian_href, '1677')
+                                retData = baseCore.uptoOBS(fu_jian_href, '1677',pathType,file_name)
                                if retData['state']:
                                    pass
                                else:
                                    continue
                                att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num)
                                id_list.append(att_id)
-                                fu_jian['href'] = 'http://114.115.215.96/' + full_path
+                                fu_jian['href'] = full_path
                                print(f'----附件：{fu_jian_href}')
                    else:
                        pass
@@ -2175,10 +2201,13 @@ def hai_nan():
                try:
                    is_href = db_storage.find_one({'网址': i_href})
                    if is_href:
+                        num+=1
                        continue
                    if i_href == 'https://www.gov.cn/jrzg/2013-11/27/content_2536600.htm':
+                        num+=1
                        continue
                    if i_href == 'https://www.gov.cn/jrzg/2013-09/28/content_2497241.htm':
+                        num+=1
                        continue
                    # print(f'中央----{i_href}----')
                    href_text = requests.get(url=i_href, headers=headers, verify=False)
@@ -2330,6 +2359,7 @@ def hai_nan():
 # 四川
 def si_chuan():
    num = 0
+    pathType = 'policy/sichuan/'
    start_time = time.time()
    for page in range(1, 3):
        if page == 1:
@@ -2349,9 +2379,10 @@ def si_chuan():
                    href = 'http://gzw.sc.gov.cn' + doc_item('a').attr('href')
                is_href = db_storage.find_one({'网址': href})
                if is_href:
+                    num+=1
                    continue
                try:
-                    print(href)
+                    # print(href)
                    href_text = requests.get(url=href, headers=headers, verify=False).text
                    doc_href = pq(href_text)
                    title = str(doc_href('.xxgkzn_title').text()).replace('\n', '').replace('\r', '')
@@ -2374,14 +2405,14 @@ def si_chuan():
                                or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
                                or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                            # 对附件上传至文件服务器
-                            retData = baseCore.uploadToserver(fu_jian_href, '1678')
+                            retData = baseCore.uptoOBS(fu_jian_href, '1678',pathType,file_name)
                            if retData['stste']:
                                pass
                            else:
                                continue
                            att_id, full_path = baseCore.tableUpdate(retData, '四川省国资委', file_name, num)
                            id_list.append(att_id)
-                            fu_jian['href'] = 'http://114.115.215.96/' + full_path
+                            fu_jian['href'] = full_path
                            # fu_jian_href_list.append(fu_jian_href)
                    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
@@ -2423,6 +2454,7 @@ def si_chuan():
 # 广西
 def guang_xi():
    num = 0
+    pathType = 'policy/guangxi/'
    start_time = time.time()
    url_all = """
    http://gzw.gxzf.gov.cn/wjzx/2023nwj/  1
@@ -2463,6 +2495,7 @@ def guang_xi():
                    href = url.split('index')[0] + doc_item('a').attr('href').replace('./', '')
                    is_href = db_storage.find_one({'网址': href})
                    if is_href:
+                        num+=1
                        continue
                    try:
                        # print(href)
@@ -2498,7 +2531,7 @@ def guang_xi():
                                    or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
                                    or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                                # 附件上传至文件服务器
-                                retData = baseCore.uploadToserver(fu_jian_href, '1692')
+                                retData = baseCore.uptoOBS(fu_jian_href, '1692',pathType,file_name)
                                if retData['state']:
                                    pass
                                else:
@@ -2507,7 +2540,7 @@ def guang_xi():
                                att_id, full_path = baseCore.tableUpdate(retData, '广西壮族自治区国资委', file_name, num)
                                id_list.append(att_id)
                                # 将附件链接替换
-                                fu_jian['href'] = 'http://114.115.215.96/' + full_path
+                                fu_jian['href'] = full_path
                        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                        # todo:传kafka字段
@@ -2550,6 +2583,7 @@ def gui_zhou():
    http://gzw.guizhou.gov.cn/zwgk/xxgkml/zcwj/  11
    http://gzw.guizhou.gov.cn/zwgk/xxgkml/qlqdhzrqd/  1
    """
+    pathType = 'policy/guizhou/'
    num = 0
    start_time = time.time()
    for page in range(0, 11):
@@ -2566,6 +2600,7 @@ def gui_zhou():
                href = doc_item('a').attr('href')
                is_href = db_storage.find_one({'网址': href})
                if is_href:
+                    num+=1
                    continue
                try:
                    # print(href)
@@ -2606,7 +2641,7 @@ def gui_zhou():
                                or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
                                or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                            # 附件上传至文件服务器
-                            retData = baseCore.uploadToserver(fu_jian_href, '1694')
+                            retData = baseCore.uptoOBS(fu_jian_href, '1694',pathType,file_name)
                            if retData['state']:
                                pass
                            else:
@@ -2615,7 +2650,7 @@ def gui_zhou():
                            att_id, full_path = baseCore.tableUpdate(retData, '贵州省国资委', file_name, num)
                            id_list.append(att_id)
                            # 将附件链接替换
-                            fu_jian['href'] = 'http://114.115.215.96/' + full_path
+                            fu_jian['href'] = full_path
                    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    # todo:传kafka字段
@@ -2655,6 +2690,7 @@ def gui_zhou():
 # 云南
 def yun_nan():
+    pathType = 'policy/yunnan/'
    def yun_nan1():
        """
        http://gzw.yn.gov.cn/yngzw/c100093/zfxxgk_gkgz.shtml  9
@@ -2679,6 +2715,7 @@ def yun_nan():
                        href = 'http://gzw.yn.gov.cn' + doc_item('a').attr('href')
                    is_href = db_storage.find_one({'网址': href})
                    if is_href:
+                        num+=1
                        continue
                    try:
                        fu_jian_href_list = []
@@ -2710,7 +2747,7 @@ def yun_nan():
                                        or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                                    try:
                                        # 附件上传至文件服务器
-                                        retData = baseCore.uploadToserver(fu_jian_href, '1679')
+                                        retData = baseCore.uptoOBS(fu_jian_href, '1679',pathType,file_name)
                                        if retData['state']:
                                            pass
                                        else:
@@ -2719,7 +2756,7 @@ def yun_nan():
                                        att_id, full_path = baseCore.tableUpdate(retData, '云南省国资委', file_name, num)
                                        id_list.append(att_id)
                                        # 将附件链接替换
-                                        fu_jian['href'] = 'http://114.115.215.96/' + full_path
+                                        fu_jian['href'] = full_path
                                    except:
                                        continue
                            href_resp.close()
@@ -2788,6 +2825,7 @@ def yun_nan():
                    href = 'http://gzw.yn.gov.cn' + li.find('a').get('href').replace(' ', '')
                    is_href = db_storage.find_one({'网址': href})
                    if is_href:
+                        num+=1
                        continue
                    try:
                        print(href)
@@ -2822,7 +2860,7 @@ def yun_nan():
                                    print(fu_jian_href)
                                    try:
                                        # 附件上传至文件服务器
-                                        retData = baseCore.uploadToserver(fu_jian_href, '1679')
+                                        retData = baseCore.uptoOBS(fu_jian_href, '1679',pathType,file_name)
                                        if retData['state']:
                                            pass
                                        else:
@@ -2831,7 +2869,7 @@ def yun_nan():
                                        att_id, full_path = baseCore.tableUpdate(retData, '云南省国资委', file_name, num)
                                        id_list.append(att_id)
                                        # 将附件链接替换
-                                        fu_jian['href'] = 'http://114.115.215.96/' + full_path
+                                        fu_jian['href'] = full_path
                                    except:
                                        continue
                            res_.close()
@@ -2890,6 +2928,7 @@ def chong_qing():
    http://gzw.cq.gov.cn/zwgk_191/fdzdgknr/zcwj/qtwj/  2
    """
    num = 0
+    pathType = 'policy/chongqing/'
    start_time = time.time()
    for page in range(0, 4):
        if page == 0:
@@ -2913,6 +2952,7 @@ def chong_qing():
                        href = url.split('index')[0] + title_item('a').attr('href').replace('./', '')
                    is_href = db_storage.find_one({'网址': href})
                    if is_href:
+                        num+=1
                        continue
                    try:
                        print(href)
@@ -2960,7 +3000,7 @@ def chong_qing():
                                    or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                                try:
                                    # 附件上传至文件服务器
-                                    retData = baseCore.uploadToserver(fu_jian_href, '1693')
+                                    retData = baseCore.uptoOBS(fu_jian_href, '1693',pathType,file_name)
                                    if retData['state']:
                                        pass
                                    else:
@@ -2969,7 +3009,7 @@ def chong_qing():
                                    att_id, full_path = baseCore.tableUpdate(retData, '重庆市国资委', file_name, num)
                                    id_list.append(att_id)
                                    # 将附件链接替换
-                                    fu_jian['href'] = 'http://114.115.215.96/' + full_path
+                                    fu_jian['href'] = full_path
                                except:
                                    continue
                        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
@@ -3011,6 +3051,7 @@ def chong_qing():
 # 天津
 def tian_jin():
+    pathType = 'policy/tianjin/'
    def tian_jin1():
        num = 0
        start_time = time.time()
@@ -3038,6 +3079,7 @@ def tian_jin():
                        href = i_href
                    is_href = db_storage.find_one({'网址': href})
                    if is_href:
+                        num+=1
                        continue
                    try:
                        # href_text = requests.get(url=href, headers=headers, verify=False).content.decode('utf-8')
@@ -3082,7 +3124,7 @@ def tian_jin():
                                    or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                                file_name = file.text.strip()
-                                retData = baseCore.uploadToserver(file_href, '1683')
+                                retData = baseCore.uptoOBS(file_href, '1683',pathType,file_name)
                                if retData['state']:
                                    pass
                                else:
@@ -3090,7 +3132,7 @@ def tian_jin():
                                att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num)
                                id_list.append(att_id)
                                # todo:将返回的地址更新到soup
-                                file['href'] = 'http://114.115.215.96/' + full_path
+                                file['href'] = full_path
                        # id_ = redefid(id_list)
                        contentWithTag = str(soup.prettify())
                        if len(contentWithTag) < 1:
@@ -3160,6 +3202,7 @@ def tian_jin():
                        href = url.split('index')[0] + href.replace('./', '')
                    is_href = db_storage.find_one({'网址': href})
                    if is_href:
+                        num+=1
                        continue
                    try:
                        # href_text = requests.get(url=href, headers=headers, verify=False).content.decode('utf-8')
@@ -3205,7 +3248,7 @@ def tian_jin():
                                    or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                                file_name = file.text.strip()
-                                retData = baseCore.uploadToserver(file_href, '1683')
+                                retData = baseCore.uptoOBS(file_href, '1683',pathType,file_name)
                                if retData['state']:
                                    pass
                                else:
@@ -3213,7 +3256,7 @@ def tian_jin():
                                att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num)
                                id_list.append(att_id)
                                # todo:将返回的地址更新到soup
-                                file['href'] = 'http://114.115.215.96/' + full_path
+                                file['href'] = full_path
                        # id_ = redefid(id_list)
                        contentWithTag = str(soup.prettify())
                        if len(contentWithTag) < 1:
@@ -3284,6 +3327,7 @@ def tian_jin():
                        href = href.replace('./', 'https://sasac.tj.gov.cn/ZWGK1142/zcwj/gjjwj/')
                    is_href = db_storage.find_one({'网址': href})
                    if is_href:
+                        num+=1
                        continue
                    try:
@@ -3332,7 +3376,7 @@ def tian_jin():
                                    or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                                file_name = file.text.strip()
-                                retData = baseCore.uploadToserver(file_href, '1683')
+                                retData = baseCore.uptoOBS(file_href, '1683',pathType,file_name)
                                if retData['state']:
                                    pass
                                else:
@@ -3340,7 +3384,7 @@ def tian_jin():
                                att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num)
                                id_list.append(att_id)
                                # todo:将返回的地址更新到soup
-                                file['href'] = 'http://114.115.215.96/' + full_path
+                                file['href'] = full_path
                        # id_ = redefid(id_list)
                        contentWithTag = str(soup.prettify())
                        if len(contentWithTag) < 1:
@@ -3388,6 +3432,7 @@ def tian_jin():
 # 新疆
 def xin_jiang():
+    pathType = 'policy/xinjiang/'
    def xin_jiang1():
        num = 0
        start_time = time.time()
@@ -3407,6 +3452,7 @@ def xin_jiang():
                        continue
                    is_href = db_storage.find_one({'网址': href})
                    if is_href:
+                        num+=1
                        continue
                    #         href = 'http://gzw.xinjiang.gov.cn/gzw/zcwj/201909/559cf77b5a954d028bd187d6c6e46747.shtml'
                    try:
@@ -3432,7 +3478,7 @@ def xin_jiang():
                                    or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                                file_name = file.text.strip()
-                                retData = baseCore.uploadToserver(file_href, '1682')
+                                retData = baseCore.uptoOBS(file_href, '1682',pathType,file_name)
                                if retData['state']:
                                    pass
                                else:
@@ -3440,7 +3486,7 @@ def xin_jiang():
                                att_id, full_path = baseCore.tableUpdate(retData, '新疆维吾尔自治区国资委', file_name, num)
                                id_list.append(att_id)
                                # todo:将返回的地址更新到soup
-                                file['href'] = 'http://114.115.215.96/' + full_path
+                                file['href'] = full_path
                        # id_ = redefid(id_list)
                        contentWithTag = str(soup.prettify())
                        if len(contentWithTag) < 1:
@@ -3509,6 +3555,7 @@ def xin_jiang():
                        href = 'http://gyzc.xjbt.gov.cn' + href
                    is_href = db_storage.find_one({'网址': href})
                    if is_href:
+                        num+=1
                        continue
                    try:
                        href_res = requests.get(url=href, headers=headers, verify=False)
@@ -3530,7 +3577,7 @@ def xin_jiang():
                                    or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                                file_name = file.text.strip()
-                                retData = baseCore.uploadToserver(file_href, '1682')
+                                retData = baseCore.uptoOBS(file_href, '1682',pathType,file_name)
                                if retData['state']:
                                    pass
                                else:
@@ -3538,7 +3585,7 @@ def xin_jiang():
                                att_id, full_path = baseCore.tableUpdate(retData, '新疆维吾尔自治区国资委', file_name, num)
                                id_list.append(att_id)
                                # todo:将返回的地址更新到soup
-                                file['href'] = 'http://114.115.215.96/' + full_path
+                                file['href'] = full_path
                        # id_ = redefid(id_list)
                        contentWithTag = str(soup.prettify())
                        if len(contentWithTag) < 1:
@@ -3594,6 +3641,7 @@ def xin_jiang():
 # 山西
 def shan_xi():
+    pathType = 'policy/shanxi/'
    num = 0
    start_time = time.time()
    for page in range(1, 7):
@@ -3618,6 +3666,7 @@ def shan_xi():
                publishDate = tr.xpath('./td[2]/span/text()')[0]
                is_href = db_storage.find_one({'网址': href})
                if is_href:
+                    num+=1
                    continue
                try:
                    if ".pdf" in href:
@@ -3648,7 +3697,7 @@ def shan_xi():
                                or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                            file_name = file.text.strip()
-                            retData = baseCore.uploadToserver(file_href, '1684')
+                            retData = baseCore.uptoOBS(file_href, '1684',pathType,file_name)
                            if retData['state']:
                                pass
                            else:
@@ -3656,7 +3705,7 @@ def shan_xi():
                            att_id, full_path = baseCore.tableUpdate(retData, '山西省国资委', file_name, num)
                            id_list.append(att_id)
                            # todo:将返回的地址更新到soup
-                            file['href'] = 'http://114.115.215.96/' + full_path
+                            file['href'] = full_path
                    # id_ = redefid(id_list)
                    contentWithTag = str(soup.prettify())
                    if len(contentWithTag) < 1:
@@ -3707,6 +3756,7 @@ def shan_xi():
 # 辽宁
 def liao_ning():
+    pathType = 'policy/liaoning/'
    num = 0
    start_time = time.time()
    for page in range(1, 3):
@@ -3727,6 +3777,7 @@ def liao_ning():
                        href = 'https://gzw.ln.gov.cn/' + href
                is_href = db_storage.find_one({'网址': href})
                if is_href:
+                    num+=1
                    continue
                try:
                    href_text = requests.get(url=href, headers=headers, verify=False)
@@ -3758,7 +3809,7 @@ def liao_ning():
                                or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                            file_name = file.text.strip()
-                            retData = baseCore.uploadToserver(file_href, '1685')
+                            retData = baseCore.uptoOBS(file_href, '1685',pathType,file_name)
                            if retData['state']:
                                pass
                            else:
@@ -3766,7 +3817,7 @@ def liao_ning():
                            att_id, full_path = baseCore.tableUpdate(retData, '辽宁省国资委', file_name, num)
                            id_list.append(att_id)
                            # todo:将返回的地址更新到soup
-                            file['href'] = 'http://114.115.215.96/' + full_path
+                            file['href'] = full_path
                    # id_ = redefid(id_list)
                    contentWithTag = str(soup.prettify())
                    if len(contentWithTag) < 1:
@@ -3816,6 +3867,7 @@ def liao_ning():
 # 黑龙江
 def hei_long_jiang():
+    pathType = 'policy/heilongjiang/'
    num = 0
    start_time = time.time()
    for page in range(1, 3):
@@ -3837,6 +3889,7 @@ def hei_long_jiang():
                        pub_hao = ''
                    is_href = db_storage.find_one({'网址': href})
                    if is_href:
+                        num+=1
                        continue
                    try:
                        contentWithTag = text['data']['results'][row]['contentHtml']
@@ -3861,7 +3914,7 @@ def hei_long_jiang():
                                    or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                                file_name = file.text.strip()
-                                retData = baseCore.uploadToserver(file_href, '1687')
+                                retData = baseCore.uptoOBS(file_href, '1687',pathType,file_name)
                                if retData['state']:
                                    pass
                                else:
@@ -3869,7 +3922,7 @@ def hei_long_jiang():
                                att_id, full_path = baseCore.tableUpdate(retData, '江苏省国资委', file_name, num)
                                id_list.append(att_id)
                                # todo:将返回的地址更新到soup
-                                file['href'] = 'http://114.115.215.96/' + full_path
+                                file['href'] = full_path
                        contentWithTag = str(soup.prettify())
                        content = soup.text
@@ -3912,6 +3965,7 @@ def hei_long_jiang():
 # 江苏
 def jiang_su():
    num = 0
+    pathType = 'policy/jiangsu/'
    start_time = time.time()
    pagestart = 1
    pageend = 45
@@ -3940,6 +3994,7 @@ def jiang_su():
                title = a.text
                is_href = db_storage.find_one({'网址': href})
                if is_href:
+                    num+=1
                    continue
                try:
                    href_text = requests.get(url=href, headers=headers, verify=False)
@@ -3967,7 +4022,7 @@ def jiang_su():
                                or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                            file_name = file.text.strip()
-                            retData = baseCore.uploadToserver(file_href, '1687')
+                            retData = baseCore.uptoOBS(file_href, '1687',pathType,file_name)
                            if retData['state']:
                                pass
                            else:
@@ -3975,7 +4030,7 @@ def jiang_su():
                            att_id, full_path = baseCore.tableUpdate(retData, '江苏省国资委', file_name, num)
                            id_list.append(att_id)
                            # todo:将返回的地址更新到soup
-                            file['href'] = 'http://114.115.215.96/' + full_path
+                            file['href'] = full_path
                    contentWithTag = str(soup.prettify())
                    content = soup.text
@@ -4022,6 +4077,7 @@ def jiang_su():
 # 安徽
 def an_hui():
+    pathType = 'policy/anhui/'
    def an_hui1():
        num = 0
        start_time = time.time()
@@ -4037,6 +4093,7 @@ def an_hui():
                    href = doc_item('a').attr('href')
                    is_href = db_storage.find_one({'网址': href})
                    if is_href:
+                        num+=1
                        continue
                    try:
                        href_text = requests.get(url=href, headers=headers, verify=False)
@@ -4068,7 +4125,7 @@ def an_hui():
                                    or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                                file_name = file.text.strip()
-                                retData = baseCore.uploadToserver(file_href, '1688')
+                                retData = baseCore.uptoOBS(file_href, '1688',pathType,file_name)
                                if retData['state']:
                                    pass
                                else:
@@ -4076,7 +4133,7 @@ def an_hui():
                                att_id, full_path = baseCore.tableUpdate(retData, '安徽省国资委', file_name, num)
                                id_list.append(att_id)
                                # todo:将返回的地址更新到soup
-                                file['href'] = 'http://114.115.215.96/' + full_path
+                                file['href'] = full_path
                        contentWithTag = str(soup.prettify())
                        content = soup.text
@@ -4164,7 +4221,7 @@ def an_hui():
                                    or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                                file_name = file.text.strip()
-                                retData = baseCore.uploadToserver(file_href, '1688')
+                                retData = baseCore.uptoOBS(file_href, '1688',pathType,file_name)
                                if retData['state']:
                                    pass
                                else:
@@ -4172,7 +4229,7 @@ def an_hui():
                                att_id, full_path = baseCore.tableUpdate(retData, '安徽省国资委', file_name, num)
                                id_list.append(att_id)
                                # todo:将返回的地址更新到soup
-                                file['href'] = 'http://114.115.215.96/' + full_path
+                                file['href'] = full_path
                        contentWithTag = str(soup.prettify())
                        content = soup.text
@@ -4223,6 +4280,7 @@ def jiang_xi():
    121-164
    """
    num = 0
+    pathType = 'policy/jiangxi/'
    start_time = time.time()
    startrecord = 1
    endrecord = 60
@@ -4248,6 +4306,7 @@ def jiang_xi():
            for href in href_list:
                is_href = db_storage.find_one({'网址': href})
                if is_href:
+                    num+=1
                    continue
                try:
                    href_res = requests.get(url=href, headers=headers, verify=False)
@@ -4289,7 +4348,7 @@ def jiang_xi():
                                or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                            file_name = file.text.strip()
-                            retData = baseCore.uploadToserver(file_href, '1689')
+                            retData = baseCore.uptoOBS(file_href, '1689',pathType,file_name)
                            if retData['state']:
                                pass
                            else:
@@ -4297,7 +4356,7 @@ def jiang_xi():
                            att_id, full_path = baseCore.tableUpdate(retData, '江西省国资委', file_name, num)
                            id_list.append(att_id)
                            # todo:将返回的地址更新到soup
-                            file['href'] = 'http://114.115.215.96/' + full_path
+                            file['href'] = full_path
                    contentWithTag = str(soup.prettify())
                    content = soup.text
@@ -4346,6 +4405,7 @@ def jiang_xi():
 # 河南
 def he_nan():
    num = 0
+    pathType = 'policy/henan/'
    start_time = time.time()
    for page in range(0, 7):
        if page == 0:
@@ -4361,6 +4421,7 @@ def he_nan():
                href = doc_item('a').attr('href')
                is_href = db_storage.find_one({'网址': href})
                if is_href:
+                    num+=1
                    continue
                href_res = requests.get(url=href, headers=headers, verify=False)
                href_res.encoding = href_res.apparent_encoding
@@ -4383,7 +4444,7 @@ def he_nan():
                            or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                            or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                        file_name = file.text.strip()
-                        retData = baseCore.uploadToserver(file_href, '1690')
+                        retData = baseCore.uptoOBS(file_href, '1690',pathType,file_name)
                        if retData['state']:
                            pass
                        else:
@@ -4391,7 +4452,7 @@ def he_nan():
                        att_id, full_path = baseCore.tableUpdate(retData, '河南省国资委', file_name, num)
                        id_list.append(att_id)
                        # todo:将返回的地址更新到soup
-                        file['href'] = 'http://114.115.215.96/' + full_path
+                        file['href'] =  full_path
                contentWithTag = str(soup.prettify())
                content = soup.text
@@ -4438,6 +4499,7 @@ def he_nan():
 # 湖南
 def hu_nan():
    num = 0
+    pathType = 'policy/hunan/'
    start_time = time.time()
    for page in range(1, 7):
        if page == 1:
@@ -4454,6 +4516,7 @@ def hu_nan():
                publishDate = doc_item('td:nth-child(3)').text()
                is_href = db_storage.find_one({'网址': href})
                if is_href:
+                    num+=1
                    continue
                # href = 'http://gzw.hunan.gov.cn/gzw/xxgk_71571/zcfg/201109/t20110920_1942364.html'
                try:
@@ -4490,7 +4553,7 @@ def hu_nan():
                                or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                            file_name = file.text.strip()
-                            retData = baseCore.uploadToserver(file_href, '1691')
+                            retData = baseCore.uptoOBS(file_href, '1691',pathType,file_name)
                            if retData['state']:
                                pass
                            else:
@@ -4498,7 +4561,7 @@ def hu_nan():
                            att_id, full_path = baseCore.tableUpdate(retData, '湖南省国资委', file_name, num)
                            id_list.append(att_id)
                            # todo:将返回的地址更新到soup
-                            file['href'] = 'http://114.115.215.96/' + full_path
+                            file['href'] = full_path
                    contentWithTag = str(soup.prettify())
                    content = soup.text
@@ -4538,6 +4601,7 @@ def hu_nan():
 # 甘肃
 def gan_su():
+    pathType = 'policy/gansu/'
    def gan_su1():
        num = 0
        start_time = time.time()
@@ -4581,6 +4645,7 @@ def gan_su():
                    publishDate = dd['publishDate']
                    is_href = db_storage.find_one({'网址': href})
                    if is_href:
+                        num+=1
                        continue
                    for i in range(0, 4):
                        bro.get(href)
@@ -4609,7 +4674,7 @@ def gan_su():
                                or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                            file_name = file.text.strip()
-                            retData = baseCore.uploadToserver(file_href, '1696')
+                            retData = baseCore.uptoOBS(file_href, '1696',pathType,file_name)
                            if retData['state']:
                                pass
                            else:
@@ -4617,7 +4682,7 @@ def gan_su():
                            att_id, full_path = baseCore.tableUpdate(retData, '甘肃省国资委', file_name, num)
                            id_list.append(att_id)
                            # todo:将返回的地址更新到soup
-                            file['href'] = 'http://114.115.215.96/' + full_path
+                            file['href'] =  full_path
                    # id_ = redefid(id_list)
                    contentWithTag = str(soup.prettify())
                    content = soup.text
@@ -4688,6 +4753,7 @@ def gan_su():
                    publishDate = dd['publishDate']
                    is_href = db_storage.find_one({'网址': href})
                    if is_href:
+                        num+=1
                        continue
                    bro.get(href)
                    try:
@@ -4743,7 +4809,7 @@ def gan_su():
                                or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                            file_name = file.text.strip()
-                            retData = baseCore.uploadToserver(file_href, '1696')
+                            retData = baseCore.uptoOBS(file_href, '1696',pathType,file_name)
                            if retData['state']:
                                pass
                            else:
@@ -4751,7 +4817,7 @@ def gan_su():
                            att_id, full_path = baseCore.tableUpdate(retData, '甘肃省国资委', file_name, num)
                            id_list.append(att_id)
                            # todo:将返回的地址更新到soup
-                            file['href'] = 'http://114.115.215.96/' + full_path
+                            file['href'] = full_path
                    contentWithTag = str(soup.prettify())
                    content = soup.text
@@ -4849,6 +4915,7 @@ def gan_su():
                publishDate = dd['publishDate']
                is_href = db_storage.find_one({'网址': href})
                if is_href:
+                    num+=1
                    continue
                try:
                    bro.get(href)
@@ -4900,7 +4967,7 @@ def gan_su():
                                or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                            file_name = file.text.strip()
-                            retData = baseCore.uploadToserver(file_href, '1696')
+                            retData = baseCore.uptoOBS(file_href, '1696',pathType,file_name)
                            if retData['state']:
                                pass
                            else:
@@ -4908,7 +4975,7 @@ def gan_su():
                            att_id, full_path = baseCore.tableUpdate(retData, '甘肃省国资委', file_name, num)
                            id_list.append(att_id)
                            # todo:将返回的地址更新到soup
-                            file['href'] = 'http://114.115.215.96/' + full_path
+                            file['href'] = full_path
                    contentWithTag = str(soup.prettify())
                    content = soup.text
@@ -4958,6 +5025,7 @@ def gan_su():
 # 宁夏
 def ning_xia():
    num = 0
+    pathType = 'policy/ningxia/'
    start_time = time.time()
    for page in range(0, 3):
        if page == 0:
@@ -4976,6 +5044,7 @@ def ning_xia():
                publishDate = li.find('span', attrs={'class': 'stdnewslistspan'}).text
                is_href = db_storage.find_one({'网址': href})
                if is_href:
+                    num+=1
                    continue
                try:
                    href_res = requests.get(url=href, headers=headers, verify=False)
@@ -5001,7 +5070,7 @@ def ning_xia():
                                or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                            file_name = file.text.strip()
-                            retData = baseCore.uploadToserver(file_href, '1697')
+                            retData = baseCore.uptoOBS(file_href, '1697',pathType,file_name)
                            if retData['state']:
                                pass
                            else:
@@ -5009,7 +5078,7 @@ def ning_xia():
                            att_id, full_path = baseCore.tableUpdate(retData, '宁夏回族自治区国资委', file_name, num)
                            id_list.append(att_id)
                            # todo:将返回的地址更新到soup
-                            file['href'] = 'http://114.115.215.96/' + full_path
+                            file['href'] = full_path
                    # id_ = redefid(id_list)
                    contentWithTag = str(soup.prettify())
                    content = soup.text
@@ -5052,6 +5121,7 @@ def ning_xia():
 # 陕西
 def shanxi():
    num = 0
+    pathType = 'policy/shan_xi/'
    start_time = time.time()
    url = 'https://sxgz.shaanxi.gov.cn/newstyle/pub_newschannel.asp?chid=100127'
    # url = 'https://sxgz.shaanxi.gov.cn/newstyle/pub_newschannel.asp?chid=100127'
@@ -5072,6 +5142,7 @@ def shanxi():
                href = 'https://sxgz.shaanxi.gov.cn/' + href
            is_href = db_storage.find_one({'网址': href})
            if is_href:
+                num+=1
                continue
            try:
                res_href = requests.get(url=href, headers=headers)
@@ -5101,7 +5172,7 @@ def shanxi():
                            or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                            or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                        file_name = file.text.strip()
-                        retData = baseCore.uploadToserver(file_href, '1680')
+                        retData = baseCore.uptoOBS(file_href, '1680',pathType,file_name)
                        if retData['state']:
                            pass
                        else:
@@ -5109,7 +5180,7 @@ def shanxi():
                        att_id, full_path = baseCore.tableUpdate(retData, '陕西省国资委', file_name, num)
                        id_list.append(att_id)
                        # todo:将返回的地址更新到soup
-                        file['href'] = 'http://114.115.215.96/' + full_path
+                        file['href'] = full_path
                # id_ = redefid(id_list)
                contentWithTag = str(soup.prettify())
                content = soup.text
@@ -5152,6 +5223,7 @@ def shanxi():
 # 西藏
 def xi_zang():
    start_time = time.time()
+    pathType = 'policy/xizang/'
    url_list = ['http://gzw.lasa.gov.cn/gzw/zccfg/common_list.shtml',
                'http://gzw.lasa.gov.cn/gzw/wjzl/common_list.shtml', ]
    for url in url_list:
@@ -5169,6 +5241,7 @@ def xi_zang():
                title = li.find('a').text
                is_href = db_storage.find_one({'网址': href})
                if is_href:
+                    num+=1
                    continue
                try:
                    res_href = requests.get(url=href, headers=headers)
@@ -5194,7 +5267,7 @@ def xi_zang():
                                or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                            file_name = file.text.strip()
-                            retData = baseCore.uploadToserver(file_href, '1695')
+                            retData = baseCore.uptoOBS(file_href, '1695',pathType,file_name)
                            if retData['state']:
                                pass
                            else:
@@ -5202,7 +5275,7 @@ def xi_zang():
                            att_id, full_path = baseCore.tableUpdate(retData, '西藏自治区国资委', file_name, num)
                            id_list.append(att_id)
                            # todo:将返回的地址更新到soup
-                            file['href'] = 'http://114.115.215.96/' + full_path
+                            file['href'] = full_path
                    # id_ = redefid(id_list)
                    contentWithTag = str(soup.prettify())
                    # todo:替换完成之后，将附件上传至文件服务器
@@ -5242,6 +5315,7 @@ def xi_zang():
 # 青海
 def qing_hai():
+    pathType = 'policy/qinghai/'
    def qing_hai1():
        num = 0
        start_time = time.time()
@@ -5259,6 +5333,7 @@ def qing_hai():
                    durl = tr.find('a').get('href')
                    is_href = db_storage.find_one({'网址': durl})
                    if is_href:
+                        num+=1
                        log.info('已采集----------跳过')
                        continue
                    title = tr.find('a').text
@@ -5297,7 +5372,7 @@ def qing_hai():
                                att_id, full_path = baseCore.tableUpdate(retData, '青海省国资委', file_name, num)
                                id_list.append(att_id)
                                # todo:将返回的地址更新到soup
-                                file['href'] = 'http://114.115.215.96/' + full_path
+                                file['href'] = full_path
                        # id_ = redefid(id_list)
                        contentWithTag = str(soup.prettify())
                        # todo:替换完成之后，将附件上传至文件服务器
@@ -5659,41 +5734,41 @@ def hu_bei():
    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
 if __name__ == '__main__':
-    get_content1()
+    # get_content1()
-    get_content2()
+    # get_content2()
-    get_content3()
+    # get_content3()
-    bei_jing()
+    # bei_jing()
-    nei_meng_gu()
+    # nei_meng_gu()
    ji_lin()
-    shang_hai()
+    # shang_hai()
-    zhe_jiang()
+    # zhe_jiang()
-    fu_jian()
+    # fu_jian()
-    shan_dong()
+    # shan_dong()
-    guang_dong()
+    # guang_dong()
-    hai_nan()
+    # hai_nan()
-    si_chuan()
+    # si_chuan()
-    guang_xi()
+    # guang_xi()
-    gui_zhou()
+    # gui_zhou()
-    yun_nan()
+    # yun_nan()
-    chong_qing()
+    # chong_qing()
-    tian_jin()
+    # tian_jin()
-    xin_jiang()
+    # xin_jiang()
-    shan_xi()
+    # shan_xi()
-    liao_ning()
+    # liao_ning()
-    hei_long_jiang()
+    # hei_long_jiang()
-    jiang_su()
+    # jiang_su()
-    an_hui()
+    # an_hui()
-    jiang_xi()
+    # jiang_xi()
-    he_nan()
+    # he_nan()
-    hu_nan()
+    # hu_nan()
-    gan_su()
+    # gan_su()
-    ning_xia()
+    # ning_xia()
-    xi_zang()
+    # xi_zang()
-    shanxi()
+    # shanxi()
-    qing_hai()
+    # qing_hai()
-    he_bei()
+    # he_bei()
-    qing_hai()
+    # qing_hai()
-    current_time = datetime.datetime.now()
+    # current_time = datetime.datetime.now()
-    midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
+    # midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
-    sleep_seconds = (midnight_time - current_time).total_seconds()
+    # sleep_seconds = (midnight_time - current_time).total_seconds()
-    time.sleep(sleep_seconds)
+    # time.sleep(sleep_seconds)
--- a/comData/sinafinance_news/nyse_news_gn.py
+++ b/comData/sinafinance_news/nyse_news_gn.py
+"""
+    新浪财经国内企业动态
+"""
+import json
+import re
+import time
+import jieba
+import requests
+from bs4 import BeautifulSoup
+from kafka import KafkaProducer
+from retry import retry
+from base.smart import smart_extractor
+from base.BaseCore import BaseCore
+# Module-level setup shared by every function below: word segmentation, content extractor
+# and the resources exposed by BaseCore.
+# NOTE(review): jieba.cut() appears to return a lazy generator that is never consumed here,
+# so this call may not actually pre-load the dictionary - consider jieba.initialize(); confirm.
+jieba.cut("必须加载jieba")
+# Content extractor created with the 'cn' language code.
+smart = smart_extractor.SmartExtractor('cn')
+baseCore = BaseCore()
+log = baseCore.getLogger()
+# presumably the MySQL connection / cursor and the Redis client owned by BaseCore - verify there
+cnx = baseCore.cnx
+cursor = baseCore.cursor
+r = baseCore.r
+# HTTP headers sent by getrequests(): desktop Chrome user agent, caching disabled.
+headers = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
+    'Cache-Control': 'no-cache',
+    'Pragma': 'no-cache'
+}
+# Task label passed to baseCore.recordLog for every log entry written by this module.
+taskType = '企业动态/新浪财经'
+# Matches "YYYY-MM-DD HH:MM" timestamps (date and time separated by whitespace).
+pattern = r"\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}"
+# Fetch a page and parse it
+@retry(tries=3, delay=1)
+def getrequests(url):
+    """Download ``url`` through a proxy and return it parsed as BeautifulSoup.
+
+    The proxy comes from ``baseCore.get_proxy()`` and the body is decoded with
+    the encoding detected from its content. Any exception (including a
+    timeout) is retried by ``@retry`` (tries=3, 1 s delay) before it
+    propagates to the caller.
+    """
+    ip = baseCore.get_proxy()
+    # requests applies no timeout by default; without one a stalled proxy
+    # would block the worker forever and @retry would never get a chance to run.
+    req = requests.get(url, headers=headers, proxies=ip, timeout=30)
+    req.encoding = req.apparent_encoding
+    return BeautifulSoup(req.text, 'html.parser')
+# Parse an article and publish it
+def getDic(social_code, title, href, pub_time):
+    """Extract one article, send it to Kafka and store its URL for de-duplication.
+
+    Args:
+        social_code: credit code of the company the article belongs to.
+        title: article title taken from the list page.
+        href: article link; links without 'http' get the Sina finance host prepended.
+        pub_time: publish time string; its first four characters become ``year``.
+
+    Returns:
+        1 when the article was sent to Kafka, 0 when extraction or sending failed.
+        A failed MySQL insert after a successful send is logged but still returns 1.
+    """
+    start_time = time.time()
+    if 'http' not in href:
+        href = 'https://finance.sina.com.cn' + href
+    href_ = href.replace('https', 'http')
+
+    def _record_failure(message):
+        # Log the failure and write a state=0 entry through baseCore.recordLog.
+        log.error(message)
+        takeTime = baseCore.getTimeCost(start_time, time.time())
+        baseCore.recordLog(social_code, taskType, 0, takeTime, href, message)
+
+    try:
+        # Fetch and extract once, then reuse the result for both fields.
+        article = smart.extract_by_url(href_)
+        # body with tags
+        contentText = article.text
+        # body without tags
+        content = article.cleaned_text
+    except Exception:
+        _record_failure(f'{href}===页面解析失败')
+        return 0
+    if content == '':
+        _record_failure(f'{href}===页面解析失败')
+        return 0
+    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    dic_news = {
+        'attachmentIds': '',
+        'author': '',
+        'content': content,
+        'contentWithTag': contentText,
+        'createDate': time_now,
+        'deleteFlag': '0',
+        'id': '',
+        'keyWords': '',
+        'lang': 'zh',
+        'origin': '新浪财经',
+        'publishDate': pub_time,
+        'sid': '1684032033495392257',
+        'sourceAddress': href,  # link to the original article
+        'summary': '',
+        'title': title,
+        'type': 2,
+        'socialCreditCode': social_code,
+        'year': pub_time[:4]
+    }
+    try:
+        sendKafka(dic_news, start_time)
+    except Exception:
+        # Nothing was delivered, so report the article as failed to the caller.
+        _record_failure(f'{href}===发送Kafka失败')
+        return 0
+    log.info(f'Kafka发送成功')
+    try:
+        insertMysql(social_code, href)
+        log.info(f'数据库保存成功')
+    except Exception:
+        # The article is already in Kafka; only the de-duplication record is missing.
+        _record_failure(f'{href}===数据入库失败')
+    return 1
+# Send the article to Kafka
+@retry(tries=3, delay=1)
+def sendKafka(dic_news, start_time):
+    """Publish ``dic_news`` as UTF-8 JSON to the ``researchReportTopic`` topic.
+
+    Waits up to 10 seconds for the send result, then writes a state=1 entry
+    through ``baseCore.recordLog``. Exceptions are retried by ``@retry``
+    (tries=3, 1 s delay) and then propagate to the caller.
+
+    Args:
+        dic_news: article dict; ``socialCreditCode`` and ``sourceAddress`` are read for the log entry.
+        start_time: ``time.time()`` value used to compute the elapsed time.
+    """
+    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
+    try:
+        kafka_result = producer.send("researchReportTopic",
+                                     json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
+        # Block on the future so a delivery failure surfaces here and triggers a retry.
+        print(kafka_result.get(timeout=10))
+    finally:
+        # A producer is created per call; close it so its connections are released.
+        producer.close()
+    # NOTE(review): 'ture' looks like a typo for 'true'; left as-is in case log consumers match on it.
+    dic_result = {
+        'success': 'ture',
+        'message': '操作成功',
+        'code': '200',
+    }
+    log.info(dic_result)
+    # Delivery succeeded: record it in the task log
+    state = 1
+    takeTime = baseCore.getTimeCost(start_time, time.time())
+    baseCore.recordLog(dic_news['socialCreditCode'], taskType, state, takeTime, dic_news['sourceAddress'], '')
+# Store the collected link; selectUrl() reads this table for de-duplication
+@retry(tries=3, delay=1)
+def insertMysql(social_code, link):
+    """Insert one (company, article URL) row into ``brpa_source_article`` and commit.
+
+    The row is written with origin '新浪财经' and type '2'; ``create_time`` is
+    set by the database via ``now()``.
+    """
+    row = (social_code, link, '新浪财经', '2')
+    cursor.execute(
+        '''insert into brpa_source_article(social_credit_code,source_address,origin,type,create_time) values(%s,%s,%s,%s,now())''',
+        row,
+    )
+    cnx.commit()
+# Check whether an article has already been collected
+@retry(tries=3, delay=1)
+def selectUrl(url, social_code):
+    """Return the matching ``brpa_source_article`` row for this URL/company, or ``None``.
+
+    Only rows with type '2' are considered; the result is whatever
+    ``cursor.fetchone()`` yields for the query.
+    """
+    cursor.execute(
+        '''select social_credit_code from brpa_source_article where source_address = %s and social_credit_code=%s and type='2' ''',
+        (url, social_code),
+    )
+    return cursor.fetchone()
+def doJob():
+    """Worker loop: pull company codes from Redis and collect their Sina finance news.
+
+    For each code the stock symbol is prefixed according to its exchange
+    (1 -> 'bj', 2 -> 'sh', 3 -> 'sz'), then every article on every list page is
+    handed to ``getDic`` until the pager no longer offers '下一页'. When the site
+    answers '拒绝访问' the code is pushed back to the queue and the worker sleeps
+    for 30 minutes. The loop never returns.
+    """
+    exchange_prefix = {1: 'bj', 2: 'sh', 3: 'sz'}
+    while True:
+        start_time = time.time()
+        social_code = baseCore.redicPullData('NewsEnterprise:gnqy_nyse_socialCode')
+        # social_code = '914403007261824992'
+        if not social_code or social_code == 'None':
+            print(f'============已没有数据============等待===============')
+            time.sleep(1800)
+            # Queue is empty: poll again instead of looking up an empty code.
+            continue
+        data = baseCore.getInfomation(social_code)
+        gpdm = data[3]
+        log.info(f'{social_code}==={gpdm}===开始采集')
+        exchange = data[10]
+        if not gpdm:
+            log.error(f'{social_code}===股票代码为空')
+            continue
+        # The symbol prefix depends on the exchange the stock is listed on
+        prefix = exchange_prefix.get(exchange)
+        if prefix is None:
+            log.info(f'{social_code}==={gpdm}===不在北京、上海、深圳交易所')
+            continue
+        gpdm_ = prefix + gpdm
+        page = 1
+        num_ok = 0
+        num_error = 0
+        while True:
+            url = f'http://vip.stock.finance.sina.com.cn/corp/view/vCB_AllNewsStock.php?symbol={gpdm_}&Page={page}'
+            try:
+                soup = getrequests(url)
+            except Exception as e:
+                # All retries failed: give up on this company but keep the worker alive.
+                log.error(f'{social_code}==={gpdm}===第{page}页请求失败==原因:{e}')
+                takeTime = baseCore.getTimeCost(start_time, time.time())
+                baseCore.recordLog(social_code, taskType, 0, takeTime, url, f'请求失败==原因:{e}')
+                break
+            if '拒绝访问' in soup.text:
+                log.error(f'{social_code}===ip封禁')
+                state = 0
+                takeTime = baseCore.getTimeCost(start_time, time.time())
+                baseCore.recordLog(social_code, taskType, state, takeTime, url, f'{social_code}===ip封禁')
+                # Put the code back so it is retried after the cool-down.
+                r.rpush('NewsEnterprise:gnqy_nyse_socialCode', social_code)
+                time.sleep(1800)
+                break
+            try:
+                ul = soup.find('div', class_='datelist').find('ul')
+                a_list = ul.find_all('a')
+                time_list = re.findall(pattern, str(ul))
+            except Exception:
+                log.error(f"{social_code}==={gpdm}===第{page}页获取信息列表失败")
+                state = 0
+                takeTime = baseCore.getTimeCost(start_time, time.time())
+                baseCore.recordLog(social_code, taskType, state, takeTime, url, f'获取信息列表失败')
+                a_list, time_list = [], []
+            for i, a in enumerate(a_list):
+                try:
+                    title = a.text.strip()
+                    if title == '':
+                        continue
+                    href = a.get('href')
+                    # Normalise before the duplicate check: the stored URL is the absolute one.
+                    if 'http' not in href:
+                        href = 'https://finance.sina.com.cn' + href
+                    if selectUrl(href, social_code):
+                        log.info(f'{href}===已采集')
+                        continue
+                    # Timestamps are matched positionally with the links; seconds are not published.
+                    pub_time = time_list[i].replace('\xa0', ' ') + ":00"
+                    if getDic(social_code, title, href, pub_time) == 0:
+                        num_error += 1
+                    else:
+                        num_ok += 1
+                    time.sleep(0.5)
+                except Exception as e:
+                    ee = e.__traceback__.tb_lineno
+                    log.error(f'{social_code}===信息采集失败==原因:{ee}行 {e}')
+                    state = 0
+                    takeTime = baseCore.getTimeCost(start_time, time.time())
+                    baseCore.recordLog(social_code, taskType, state, takeTime, url, f'信息采集失败==原因:{ee}行 {e}')
+            try:
+                next_flg = soup.select('#con02-7 > table > tr')[1].select('div')[2].text
+            except IndexError:
+                # Pager not found: treat this as the last page instead of crashing the worker.
+                break
+            if '下一页' not in next_flg:
+                break
+            page += 1
+        log.info(f'{social_code}==={gpdm}===企业整体耗时{baseCore.getTimeCost(start_time, time.time())}===成功{num_ok}条,失败{num_error}条')
+if __name__ == "__main__":
+    try:
+        doJob()
+    finally:
+        # BaseCore asks callers to invoke close() before the program exits so its
+        # resources are released, including when doJob() is interrupted or raises.
+        baseCore.close()
--- a/comData/sinafinance_news/nyse_news_xg.py
+++ b/comData/sinafinance_news/nyse_news_xg.py
+"""
+    新浪财经香港企业动态
+"""
+from datetime import datetime
+import json
+import re
+import time
+import jieba
+import requests
+from bs4 import BeautifulSoup
+from kafka import KafkaProducer
+from retry import retry
+from base.smart import smart_extractor
+from base.BaseCore import BaseCore
+# Module-level setup shared by every function below: word segmentation, content extractor
+# and the resources exposed by BaseCore.
+# NOTE(review): jieba.cut() appears to return a lazy generator that is never consumed here,
+# so this call may not actually pre-load the dictionary - consider jieba.initialize(); confirm.
+jieba.cut("必须加载jieba")
+# Content extractor created with the 'cn' language code.
+smart = smart_extractor.SmartExtractor('cn')
+baseCore = BaseCore()
+log = baseCore.getLogger()
+# presumably the MySQL connection / cursor and the Redis client owned by BaseCore - verify there
+cnx = baseCore.cnx
+cursor = baseCore.cursor
+r = baseCore.r
+# HTTP headers sent by getrequests(): desktop Chrome user agent, caching disabled.
+headers = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
+    'Cache-Control': 'no-cache',
+    'Pragma': 'no-cache'
+}
+# Task label passed to baseCore.recordLog for every log entry written by this module.
+taskType = '企业动态/新浪财经'
+# Normalise a publish-time string
+def format_time(time_str):
+    """Return ``time_str`` normalised to the ``%Y-%m-%d %H:%M:%S`` layout.
+
+    Accepts full timestamps (zero-padded or not), timestamps without seconds
+    and bare dates; missing parts default to zero. Surrounding whitespace and
+    non-breaking spaces are ignored.
+
+    Raises:
+        ValueError: if ``time_str`` matches none of the supported layouts.
+    """
+    target = "%Y-%m-%d %H:%M:%S"
+    cleaned = time_str.replace('\xa0', ' ').strip()
+    # Try the most specific layout first so complete timestamps keep their seconds.
+    for layout in (target, "%Y-%m-%d %H:%M", "%Y-%m-%d"):
+        try:
+            return datetime.strptime(cleaned, layout).strftime(target)
+        except ValueError:
+            continue
+    raise ValueError(f'unsupported time format: {time_str!r}')
+# Fetch a page and parse it
+@retry(tries=3, delay=1)
+def getrequests(url):
+    """Download ``url`` through a proxy and return it parsed as BeautifulSoup.
+
+    The proxy comes from ``baseCore.get_proxy()`` and the body is decoded with
+    the encoding detected from its content. Any exception (including a
+    timeout) is retried by ``@retry`` (tries=3, 1 s delay) before it
+    propagates to the caller.
+    """
+    ip = baseCore.get_proxy()
+    # requests applies no timeout by default; without one a stalled proxy
+    # would block the worker forever and @retry would never get a chance to run.
+    req = requests.get(url, headers=headers, proxies=ip, timeout=30)
+    req.encoding = req.apparent_encoding
+    return BeautifulSoup(req.text, 'html.parser')
+# Parse an article
+def getDic(social_code, title, href, pub_time):
+    """Extract one article and build its news record.
+
+    NOTE(review): publishing (sendKafka / insertMysql) is currently disabled
+    below, so the record is built but not delivered anywhere - confirm whether
+    this is intentional before running in production.
+
+    Args:
+        social_code: credit code of the company the article belongs to.
+        title: article title taken from the list page.
+        href: article link; links without 'http' get the Sina finance host prepended.
+        pub_time: publish time string; its first four characters become ``year``.
+
+    Returns:
+        1 when the article body was extracted, 0 when extraction failed or was empty.
+    """
+    start_time = time.time()
+    if 'http' not in href:
+        href = 'https://finance.sina.com.cn' + href
+    href_ = href.replace('https', 'http')
+
+    def _record_failure(message):
+        # Log the failure and write a state=0 entry through baseCore.recordLog.
+        log.error(message)
+        takeTime = baseCore.getTimeCost(start_time, time.time())
+        baseCore.recordLog(social_code, taskType, 0, takeTime, href, message)
+
+    try:
+        # Fetch and extract once, then reuse the result for both fields.
+        article = smart.extract_by_url(href_)
+        # body with tags
+        contentText = article.text
+        # body without tags
+        content = article.cleaned_text
+    except Exception:
+        _record_failure(f'{href}===页面解析失败')
+        return 0
+    if content == '':
+        _record_failure(f'{href}===页面解析失败')
+        return 0
+    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    dic_news = {
+        'attachmentIds': '',
+        'author': '',
+        'content': content,
+        'contentWithTag': contentText,
+        'createDate': time_now,
+        'deleteFlag': '0',
+        'id': '',
+        'keyWords': '',
+        'lang': 'zh',
+        'origin': '新浪财经',
+        'publishDate': pub_time,
+        'sid': '1684032033495392257',
+        'sourceAddress': href,  # link to the original article
+        'summary': '',
+        'title': title,
+        'type': 2,
+        'socialCreditCode': social_code,
+        'year': pub_time[:4]
+    }
+    # Publishing is switched off; re-enable the block below to deliver dic_news.
+    # try:
+    #     sendKafka(dic_news, start_time)
+    #     log.info(f'Kafka发送成功')
+    #     try:
+    #         insertMysql(social_code, href)
+    #         log.info(f'数据库保存成功')
+    #     except:
+    #         log.error(f'{href}===数据入库失败')
+    #         state = 0
+    #         takeTime = baseCore.getTimeCost(start_time, time.time())
+    #         baseCore.recordLog(social_code, taskType, state, takeTime, href, f'{href}===数据入库失败')
+    # except:
+    #     log.error(f'{href}===发送Kafka失败')
+    #     state = 0
+    #     takeTime = baseCore.getTimeCost(start_time, time.time())
+    #     baseCore.recordLog(social_code, taskType, state, takeTime, href, f'{href}===发送Kafka失败')
+    # doJob() counts an article as successful only when this function returns 1.
+    return 1
+# Send the article to Kafka
+@retry(tries=3, delay=1)
+def sendKafka(dic_news, start_time):
+    """Publish ``dic_news`` as UTF-8 JSON to the ``researchReportTopic`` topic.
+
+    Waits up to 10 seconds for the send result, then writes a state=1 entry
+    through ``baseCore.recordLog``. Exceptions are retried by ``@retry``
+    (tries=3, 1 s delay) and then propagate to the caller.
+
+    Args:
+        dic_news: article dict; ``socialCreditCode`` and ``sourceAddress`` are read for the log entry.
+        start_time: ``time.time()`` value used to compute the elapsed time.
+    """
+    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
+    try:
+        kafka_result = producer.send("researchReportTopic",
+                                     json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
+        # Block on the future so a delivery failure surfaces here and triggers a retry.
+        print(kafka_result.get(timeout=10))
+    finally:
+        # A producer is created per call; close it so its connections are released.
+        producer.close()
+    # NOTE(review): 'ture' looks like a typo for 'true'; left as-is in case log consumers match on it.
+    dic_result = {
+        'success': 'ture',
+        'message': '操作成功',
+        'code': '200',
+    }
+    log.info(dic_result)
+    # Delivery succeeded: record it in the task log
+    state = 1
+    takeTime = baseCore.getTimeCost(start_time, time.time())
+    baseCore.recordLog(dic_news['socialCreditCode'], taskType, state, takeTime, dic_news['sourceAddress'], '')
+# Store the collected link; selectUrl() reads this table for de-duplication
+@retry(tries=3, delay=1)
+def insertMysql(social_code, link):
+    """Insert one (company, article URL) row into ``brpa_source_article`` and commit.
+
+    The row is written with origin '新浪财经' and type '2'; ``create_time`` is
+    set by the database via ``now()``.
+    """
+    row = (social_code, link, '新浪财经', '2')
+    cursor.execute(
+        '''insert into brpa_source_article(social_credit_code,source_address,origin,type,create_time) values(%s,%s,%s,%s,now())''',
+        row,
+    )
+    cnx.commit()
+# Check whether an article has already been collected
+@retry(tries=3, delay=1)
+def selectUrl(url, social_code):
+    """Return the matching ``brpa_source_article`` row for this URL/company, or ``None``.
+
+    Only rows with type '2' are considered; the result is whatever
+    ``cursor.fetchone()`` yields for the query.
+    """
+    cursor.execute(
+        '''select social_credit_code from brpa_source_article where source_address = %s and social_credit_code=%s and type='2' ''',
+        (url, social_code),
+    )
+    return cursor.fetchone()
+def doJob():
+    """Collect Sina finance Hong Kong company news for one company.
+
+    The company code is currently hard-coded (the Redis pull is commented out).
+    The stock code is cut at the first '.', left-padded with zeros to five
+    characters and used to walk the paginated news list until the page
+    reports '暂无数据', the list container is missing, or the site answers
+    '拒绝访问'. Each new article is handed to ``getDic``.
+    """
+    # while True:
+    start_time = time.time()
+    # social_code = baseCore.redicPullData('NewsEnterprise:xgqy_nyse_socialCode')
+    social_code = '91330000747735638J'
+    if not social_code or social_code == 'None':
+        time.sleep(20)
+        # Nothing to process: stop instead of looking up an empty code.
+        return
+    data = baseCore.getInfomation(social_code)
+    gpdm = data[3]
+    log.info(f'{social_code}==={gpdm}===开始采集')
+    if not gpdm:
+        # Without a stock code the list URL cannot be built.
+        log.error(f'{social_code}===股票代码为空')
+        return
+    # zfill leaves codes of five or more characters untouched.
+    gpdm_ = gpdm.split('.')[0].zfill(5)
+    page = 1
+    num_ok = 0
+    num_error = 0
+    while True:
+        url = f'http://stock.finance.sina.com.cn/hkstock/go.php/CompanyNews/page/{page}/code/{gpdm_}/.phtml'
+        try:
+            soup = getrequests(url)
+        except Exception as e:
+            # All retries failed: stop paging but still report the totals below.
+            log.error(f'{social_code}==={gpdm}===第{page}页请求失败==原因:{e}')
+            takeTime = baseCore.getTimeCost(start_time, time.time())
+            baseCore.recordLog(social_code, taskType, 0, takeTime, url, f'请求失败==原因:{e}')
+            break
+        if '拒绝访问' in soup.text:
+            log.error(f'{social_code}===ip封禁')
+            state = 0
+            takeTime = baseCore.getTimeCost(start_time, time.time())
+            baseCore.recordLog(social_code, taskType, state, takeTime, url, f'{social_code}===ip封禁')
+            # r.rpush('NewsEnterprise:xgqy_nyse_socialCode',social_code)
+            time.sleep(1800)
+            break
+        part = soup.find('div', class_='part02')
+        # A missing container is treated like the "no data" marker: end of the list.
+        if part is None or '暂无数据' in part.text:
+            break
+        try:
+            li_list = soup.find('ul', class_='list01').find_all('li')
+        except Exception:
+            log.error(f"{social_code}==={gpdm}===第{page}页获取信息列表失败")
+            state = 0
+            takeTime = baseCore.getTimeCost(start_time, time.time())
+            baseCore.recordLog(social_code, taskType, state, takeTime, url, f'获取信息列表失败')
+            # Without a list there is no way to tell where the pages end; stop rather than loop forever.
+            break
+        for li in li_list:
+            try:
+                a = li.find('a')
+                if not a:
+                    continue
+                title = a.text
+                if title == '':
+                    continue
+                href = a.get('href')
+                # Normalise before the duplicate check so it matches the absolute URL getDic works with.
+                if 'http' not in href:
+                    href = 'https://finance.sina.com.cn' + href
+                if selectUrl(href, social_code):
+                    log.info(f'{href}===已采集过')
+                    continue
+                pub_time = format_time(li.find('span').text)
+                print(title)
+                flag = getDic(social_code, title, href, pub_time)
+                if flag == 1:
+                    num_ok += 1
+                else:
+                    num_error += 1
+                time.sleep(0.5)
+            except Exception as e:
+                ee = e.__traceback__.tb_lineno
+                log.error(f'{social_code}===信息采集失败==原因:{ee}行 {e}')
+                state = 0
+                takeTime = baseCore.getTimeCost(start_time, time.time())
+                baseCore.recordLog(social_code, taskType, state, takeTime, url, f'信息采集失败==原因:{ee}行 {e}')
+        page += 1
+    log.info(f'{social_code}==={gpdm}===企业整体耗时{baseCore.getTimeCost(start_time, time.time())}===成功{num_ok}条,失败{num_error}条')
+if __name__ == "__main__":
+    try:
+        doJob()
+    finally:
+        # Release BaseCore's resources even when doJob() raises.
+        baseCore.close()