11/10

3ad4e1eb · 薛凌堃 · ce4c997a · 3ad4e1eb · 3ad4e1eb · 3ad4e1eb
--- a/REITs专题数据/BaseCore.py
+++ b/REITs专题数据/BaseCore.py
-# 核心工具包
+# 核心工具包
@@ -524,7 +524,7 @@ class BaseCore:
                if category in file_name:
                    pass
                else:
-                    file_name = file_name + '.' + category
+                    file_name = file_name + category
                result = obsClient.putContent('zzsn', 'PolicyDocuments/' + file_name, content=response.content)
                break
            except:

--- a/REITs专题数据/reits.py
+++ b/REITs专题数据/reits.py
--- a/base/RedisPPData.py
+++ b/base/RedisPPData.py
@@ -564,13 +564,13 @@ if __name__ == "__main__":
    # kegaishifan()
    # shuangbaiqiye()
    # zhuangjingtexind()
-    NoticeEnterprise()
+    # NoticeEnterprise()
    # AnnualEnterpriseIPO()
    # AnnualEnterprise()
    # BaseInfoEnterprise()
    # BaseInfoEnterpriseAbroad()
    # NewsEnterprise_task()
-    # NewsEnterprise()
+    NewsEnterprise()
    # CorPerson()
    # china100()
    # global100()
@@ -585,8 +585,8 @@ if __name__ == "__main__":
    # dujioashou()
    # omeng()
    # AnnualEnterpriseUS()
-    NoticeEnterprise_task()
-    AnnualEnterprise_task()
+    # NoticeEnterprise_task()
+    # AnnualEnterprise_task()
    # FinanceFromEast()
    log.info(f'====={basecore.getNowTime(1)}=====添加数据成功======耗时：{basecore.getTimeCost(start,time.time())}===')

--- a/comData/BaseInfo_qcc/requestQCC.py
+++ b/comData/BaseInfo_qcc/requestQCC.py
+"""Simulate a QR-code scan login."""
+import time
+
+import requests
+from bs4 import BeautifulSoup
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+
+from base.BaseCore import BaseCore
+baseCore = BaseCore()
+log = baseCore.getLogger()
+def createDriver():
+    """Start a Chrome WebDriver using the chromedriver and Chrome binaries hard-coded below.
+
+    Returns:
+        The ``webdriver.Chrome`` instance that was started.
+    """
+    chrome_driver = r'D:\cmd100\chromedriver.exe'
+    path = Service(chrome_driver)
+    chrome_options = webdriver.ChromeOptions()
+    chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
+    # Proxy setup (disabled)
+    # proxy = "127.0.0.1:8080"  # proxy address and port
+    # chrome_options.add_argument('--proxy-server=http://' + proxy)
+    # The Selenium 4 API (the one that accepts ``service=``) takes the options object
+    # through ``options=``; the legacy ``chrome_options=`` keyword is deprecated there
+    # and no longer accepted by later 4.x releases.
+    driver = webdriver.Chrome(service=path, options=chrome_options)
+    return driver
+
+def flushAndGetToken():
+    """Refresh the module-level ``browser`` and print its cookies as a name -> value dict."""
+    log.info('======刷新浏览器=====')
+    browser.refresh()
+    # Collapse the list of cookie records into a flat mapping.
+    cookies = {entry['name']: entry['value'] for entry in browser.get_cookies()}
+    print(cookies)
+
+def getrequest_soup(headers,url):
+    """GET ``url`` with ``headers`` and return the body parsed with BeautifulSoup's ``html.parser``."""
+    response = requests.get(url=url, headers=headers)
+    return BeautifulSoup(response.content, 'html.parser')
+
+def dojob():
+    """Fetch one hard-coded qcc.com search page and parse it into ``soup``.
+
+    The parsed result is not used or returned yet; the function currently ends
+    in a bare ``pass``.
+    """
+    # Browser-like request headers, including a session cookie captured by hand.
+    # NOTE(review): the cookie is a fixed literal, so it presumably expires — confirm
+    # whether it should come from flushAndGetToken() instead.
+    headers = {
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+        'Accept-Encoding': 'gzip, deflate, br',
+        'Accept-Language': 'zh-CN,zh;q=0.9',
+        'Connection': 'keep-alive',
+        'Cookie': 'qcc_did=046d99c9-566e-4046-9094-689901b79748; UM_distinctid=18aac5b8c21810-046f8431aecf58-26031f51-1fa400-18aac5b8c22efd; CNZZDATA1254842228=109635008-1695108795-https%253A%252F%252Fwww.qcc.com%252F%7C1695113473; _uab_collina=169935323766710839405007; QCCSESSID=1d489139eea4830a062c3a1240; acw_tc=db9062ad16994955552435350e3b43e7e5cee64c77d9f807936897ab1f',
+        'Host': 'www.qcc.com',
+        'Referer': 'https://www.qcc.com/',
+        'Sec-Ch-Ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
+        'Sec-Ch-Ua-Mobile': '?0',
+        'Sec-Ch-Ua-Platform': '"Windows"',
+        'Sec-Fetch-Dest': 'document',
+        'Sec-Fetch-Mode': 'navigate',
+        'Sec-Fetch-Site': 'same-origin',
+        'Sec-Fetch-User': '?1',
+        'Upgrade-Insecure-Requests': '1',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
+    }
+    # Search URL with a URL-encoded company name as the ``key`` query parameter.
+    url = 'https://www.qcc.com/web/search?key=%E5%B0%8F%E7%B1%B3%E9%80%9A%E8%AE%AF%E6%8A%80%E6%9C%AF%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8'
+    soup = getrequest_soup(headers,url)
+
+    pass
+
+# Script entry point: open qcc.com, click the first "nav-item" element, wait, then dump cookies.
+if __name__ == "__main__":
+    urlqcc = 'https://www.qcc.com/'
+    # ``browser`` is a module-level name that flushAndGetToken() reads.
+    browser = createDriver()
+    browser.get(urlqcc)
+    # Wait up to 10 seconds for an element with class "nav-item" to be present.
+    wait = WebDriverWait(browser, 10)
+    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "nav-item")))
+    # page_source = browser.page_source
+    # soup = BeautifulSoup(page_source,'html.parser')
+    # print(soup)
+    browser.find_element(By.CLASS_NAME, 'nav-item').click()
+    # NOTE(review): presumably this pause gives the operator time to scan the login
+    # QR code by hand — confirm.
+    time.sleep(20)
+    flushAndGetToken()
+
+
+
+
+
--- a/comData/Tyc/newsbucai.py
+++ b/comData/Tyc/newsbucai.py
 #企业动态 从redis中获取数据
 import json
+import os
 import random
+import subprocess

 import requests, time, pymysql
 import jieba
@@ -50,7 +52,7 @@ headers = {
 cnx_ = baseCore.cnx
 cursor_ = baseCore.cursor

-taskType = '企业动态/天眼查/补采20W+'
+taskType = '企业动态/天眼查/补采专精特新'

 def reqDetailmsg(url,headers):

@@ -76,7 +78,7 @@ def beinWork(tyc_code, social_code,start_time):
    t = time.time()
    url = f'https://capi.tianyancha.com/cloud-yq-news/company/detail/publicmsg/news/webSimple?_={t}&id={tyc_code}&ps={pageSize}&pn=1&emotion=-100&event=-100'
    try:
-        for m in range(0, 3):
+        for m in range(0,3):
            ip = baseCore.get_proxy()
            headers['User-Agent'] = baseCore.getRandomUserAgent()
            response = requests.get(url=url, headers=headers, proxies=ip, verify=False)
@@ -85,11 +87,18 @@ def beinWork(tyc_code, social_code,start_time):
        if (response.status_code == 200):
            pass
    except Exception as e:
+        #todo:重新放入redis中
+        baseCore.rePutIntoR('NoticeEnterprise:gnqy_socialCode',social_code)
        log.error(f"{tyc_code}-----获取总数接口失败")
        error = '获取总数接口失败'
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, url, f'{error}----{e}')
+        #获取当前进程pid
+        current_pid = baseCore.getPID()
+        #todo: 重新启动新进程，杀死当前进程
+        subprocess.Popen([sys.executable] + sys.argv)
+        os.kill(current_pid,9)
        return retData
    try:
        json_1 = json.loads(response.content.decode('utf-8'))
@@ -126,7 +135,7 @@ def beinWork(tyc_code, social_code,start_time):
                ip = baseCore.get_proxy()
                headers['User-Agent'] = baseCore.getRandomUserAgent()
                response_page = requests.get(url=url_page, headers=headers, proxies=ip, verify=False)
-                time.sleep(1)
+                # time.sleep(3)
                break
            except:
                pass
@@ -172,43 +181,25 @@ def beinWork(tyc_code, social_code,start_time):
                retData['up_okCount'] = up_okCount
                retData['up_errorCount'] = up_errorCount
                retData['up_repetCount'] = up_repetCount
-                #return retData
-                continue
+                return retData
            try:
                time_struct = time.localtime(int(info_page['rtm'] / 1000))  # 首先把时间戳转换为结构化时间
                time_format = time.strftime("%Y-%m-%d %H:%M:%S", time_struct)  # 把结构化时间转换为格式化时间
            except:
                time_format = baseCore.getNowTime(1)
-            #记录时间 对比时间
-            #if time_format > '2023-09-25' and time_format < '2023-10-01':
-                #pass
-            #else:
-                #continue
            try:
                # 开始进行智能解析
                # lang = baseCore.detect_language(title)
                # smart = smart_extractor.SmartExtractor(lang)
-                # req = requests.get(url=link,headers=headers,timeout=10)
-                # html = BeautifulSoup(req.content,'html.parser')
-                raw_html = reqDetailmsg(link,headers)
-                if raw_html:
-
-                    # soup = BeautifulSoup(raw_html, 'html.parser')
-                    try:
-                        article = smart.extract_by_html(raw_html)
-                        content = article.cleaned_text
-                        contentText = article.text
-                    except Exception as e:
-                        log.info(f'抽取失败！！{e}')
-
-                # #带标签正文
-                # contentText = smart.extract_by_url(link).text
-                # #不带标签正文
-                # content = smart.extract_by_url(link).cleaned_text
-                # # time.sleep(3)
+                #带标签正文
+                contentText = smart.extract_by_url(link).text
+                #不带标签正文
+                content = smart.extract_by_url(link).cleaned_text
+                if len(content) < 300:
+                    continue
+                # time.sleep(3)
            except Exception as e:
                contentText = ''
-
            if contentText == '':
                log.error(f'获取正文失败：--------{tyc_code}--------{num}--------{link}')
                e = '获取正文失败'
@@ -253,7 +244,7 @@ def beinWork(tyc_code, social_code,start_time):
                    'lang': 'zh',
                    'origin': source,
                    'publishDate': time_format,
-                    #'sid': '1684032033495392257',
+                    # 'sid': '1684032033495392257',
                    'sid': '1714852232679067649',
                    'sourceAddress': link,  # 原文链接
                    'summary': info_page['abstracts'],
@@ -286,7 +277,7 @@ def beinWork(tyc_code, social_code,start_time):
                # 传输成功,写入日志中
                state = 1
                takeTime = baseCore.getTimeCost(start_time, time.time())
-                baseCore.recordLog(social_code, taskType, state, takeTime, link, '成功')
+                baseCore.recordLog(social_code, taskType, state, takeTime, link, '')
                # return True
            except Exception as e:
                dic_result = {
@@ -312,7 +303,7 @@ def doJob():
    while True:
        # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
        social_code = baseCore.redicPullData('NewsEnterprise:gnqybc_socialCode')
-        # social_code = '913205007764477744'
+        # social_code = '912301001275921118'
        # 判断 如果Redis中已经没有数据，则等待
        if social_code == None:
            time.sleep(20)
@@ -376,16 +367,12 @@ def doJob():
            takeTime = baseCore.getTimeCost(start, time.time())
            baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
            time.sleep(5)
-        #break

    cursor.close()
    cnx.close()
    # 释放资源
    baseCore.close()

-# Press the green button in the gutter to run the script.
-
 if __name__ == '__main__':
-
+    log.info(f'当前进程id为{baseCore.getPID()}')
    doJob()
-
--- a/comData/Tyc/newsbucai1.py
+++ b/comData/Tyc/newsbucai1.py
--- a/comData/noticeReport/公告补采1.py
+++ b/comData/noticeReport/公告补采1.py
+"""
+Elasticsearch installation:
+pip install elasticsearch==7.8.1 (this specific version)
+Reference articles for usage:
+https://blog.csdn.net/yangbisheng1121/article/details/128528112
+https://blog.csdn.net/qiuweifan/article/details/128610083
+"""
+import json
+import time
+import uuid
+import requests
+from retry import retry
+from elasticsearch import Elasticsearch
+from base import BaseCore
+from obs import ObsClient
+import fitz
+from urllib.parse import unquote
+# One shared BaseCore instance. The original built a second instance right after
+# fetching the logger, which only repeated whatever BaseCore() sets up.
+baseCore = BaseCore.BaseCore()
+log = baseCore.getLogger()
+
+# Connection/cursor pairs exposed by BaseCore.
+cnx = baseCore.cnx
+cursor = baseCore.cursor
+
+cnx_ = baseCore.cnx_
+cursor_ = baseCore.cursor_
+# Key prefix used for uploaded objects, and the task label used in recorded logs.
+pathType = 'QYNotice/'
+taskType = '企业公告/证监会'
+# SECURITY NOTE(review): the Huawei Cloud access key and secret key are committed in
+# plain text; they should be rotated and loaded from configuration or the environment.
+obsClient = ObsClient(
+        access_key_id='VEHN7D0TJ9316H8AHCAV',  # Huawei Cloud access key (AK)
+        secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY',  # Huawei Cloud secret key (SK)
+        server='https://obs.cn-north-1.myhuaweicloud.com'  # endpoint of the bucket
+    )
+
+class EsMethod(object):
+    """Thin wrapper around an Elasticsearch client for querying and patching documents."""
+
+    def __init__(self):
+        """Create the client and remember the default index name."""
+        # Create the Elasticsearch client with the account credentials
+        self.es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300)
+        self.index_name = 'researchreportdata'
+
+    def queryatt(self,index_name):
+       """Search ``index_name`` for documents whose attachment ids need to be rebuilt.
+
+       Matches documents with ``type`` "3" whose ``attachmentIds.keyword`` matches the
+       wildcard "911*", sorted by ``createDate`` ascending, at most 200 per call.
+
+       Returns:
+           The search response, reduced to the paths listed in ``filter_path``.
+       """
+       body = {
+           "_source": ["attachmentIds", "createDate", "sourceAddress", "labels.relationId", "title", "year",
+                       "publishDate", "createDate"],
+           "query": {
+               "bool": {
+                   "must": [
+                       {
+                           "match": {
+                               "type": "3"
+                           }
+                       },
+                       {
+                           "wildcard": {
+                               "attachmentIds.keyword": "911*"
+                           }
+                       }
+                   ]
+               }
+           },
+           "sort": [
+               {
+                   "createDate": {
+                       "order": "asc"
+                   }
+               }
+           ],
+           "track_total_hits": True,
+           "size": 200
+       }
+
+       # Restrict the response to the hit ids, the total count and the source fields used by the caller.
+       filter_path = ['hits.hits._id',
+                      'hits.total.value',
+                      'hits.hits._source.attachmentIds',  # field 1
+                      'hits.hits._source.title',
+                      'hits.hits._source.sourceAddress',
+                      'hits.hits._source.createDate',
+                      'hits.hits._source.labels.relationId',
+                      'hits.hits._source.publishDate',
+                      'hits.hits._source.year',
+                      'hits.hits._source.createDate',
+                      ]  # field 2
+       result = self.es.search(index=index_name
+                               , doc_type='_doc'
+                               , filter_path=filter_path
+                               , body=body)
+       log.info(result)
+       return result
+
+    def updateaunn(self,index_name,id,u_attid):
+        """Partially update document ``id`` so ``attachmentIds`` becomes ``[str(u_attid)]``.
+
+        Note that ``str(None)`` is ``'None'``, so passing None stores the literal
+        string 'None'. The update result is only logged, not returned.
+        """
+        body = {
+            'doc': {
+                'attachmentIds': [str(u_attid)]
+            }
+        }
+        result = self.es.update(index=index_name
+                                ,id=id
+                                ,body=body)
+        log.info('更新结果:%s' % result)
+
+
+def getuuid():
+    """Return a new time-based UUID (``uuid.uuid1()``)."""
+    return uuid.uuid1()
+
+#获取文件大小
+def convert_size(size_bytes):
+    """Format a byte count as a two-decimal string with the largest fitting unit (bytes..TB)."""
+    units = ('bytes', 'KB', 'MB', 'GB', 'TB')
+    value = size_bytes
+    # Walk up the units; every unit except the last may hand over to the next one.
+    for unit in units[:-1]:
+        if not value >= 1024:
+            return f"{value:.2f} {unit}"
+        value = value / 1024
+    return f"{value:.2f} {units[-1]}"
+
+def uptoOBS(pdf_url,pdf_name,type_id,social_code):
+    """Download a PDF, upload it to OBS and collect the attachment metadata.
+
+    Args:
+        pdf_url: URL the PDF is downloaded from.
+        pdf_name: Not used by this function; kept so existing calls keep working.
+        type_id: Stored as ``type_id`` in the returned dict.
+        social_code: Stored as ``item_id`` in the returned dict and used when recording logs.
+
+    Returns:
+        dict: the attachment metadata. ``state`` is True only when the download, the
+        OBS upload and the PDF parsing all succeeded and every metadata field was
+        filled in. ``content`` accumulates the text extracted from the PDF pages.
+    """
+    start_time = time.time()
+    retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
+               'full_path': '',
+               'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
+               'create_time': '', 'page_size': '', 'content': ''}
+    headers = {'User-Agent': baseCore.getRandomUserAgent()}
+    # Up to three download attempts; ``response`` stays None when all of them fail,
+    # so the failure is reported as a download error instead of an OBS error.
+    response = None
+    for _ in range(3):
+        try:
+            response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
+            break
+        except Exception:
+            time.sleep(3)
+    if response is None:
+        log.error(f'文件下载失败:{pdf_url}')
+        return retData
+    # Content-Length can be missing or malformed; fall back to the size of the body.
+    try:
+        file_size = int(response.headers.get('Content-Length'))
+    except (TypeError, ValueError):
+        file_size = len(response.content)
+    page_size = 0
+    name = str(getuuid()) + '.pdf'
+    try:
+        result = getOBSres(pathType, name, response)
+    except Exception:
+        log.error(f'OBS发送失败')
+        return retData
+    try:
+        with fitz.open(stream=response.content, filetype='pdf') as doc:
+            page_size = doc.page_count
+            for page in doc.pages():
+                retData['content'] += page.get_text()
+    except Exception:
+        log.error(f'文件损坏')
+        return retData
+
+    if page_size < 1:
+        # PDF parsing failed
+        return retData
+    try:
+        object_url = result['body']['objectUrl']
+        retData['path'] = object_url.split('.com')[1]
+        retData['full_path'] = object_url
+        retData['file_size'] = convert_size(file_size)
+        retData['create_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        retData['page_size'] = page_size
+        # Flag success last so a failure above never leaves state=True with empty fields.
+        retData['state'] = True
+    except Exception as e:
+        state = 0
+        takeTime = baseCore.getTimeCost(start_time, time.time())
+        baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
+    return retData
+
+@retry(tries=3, delay=1)
+def getOBSres(pathType,name, response):
+    """Upload ``response.content`` to bucket 'zzsn' under the key ``pathType + name`` and return the OBS result."""
+    object_key = pathType + name
+    # File-based alternative: obsClient.putFile('zzsn', object_key, file_path='<local path of the file to upload>')
+    return obsClient.putContent('zzsn', object_key, content=response.content)
+
+def secrchATT(item_id, retData, type_id,order_by):
+    """Query clb_sys_attachment by item id, ``retData['path']``, type id and order, and return ``fetchone()``'s result."""
+    query = '''select id from clb_sys_attachment where item_id = %s and path = %s and type_id=%s and order_by=%s '''
+    params = (item_id, retData['path'], type_id, order_by)
+    cursor_.execute(query, params)
+    return cursor_.fetchone()
+
+# 插入到att表 返回附件id
+def tableUpdate(retData, year, pdf_name, num,pub_time,origin):
+    """Insert one attachment row into clb_sys_attachment and return its id.
+
+    Args:
+        retData: metadata dict as produced by uptoOBS().
+        year: value for the ``year`` column.
+        pdf_name: value for the ``name`` column.
+        num: value for the ``order_by`` column.
+        pub_time: value for the ``publish_time`` column.
+        origin: value for the ``source`` column.
+
+    Returns:
+        The id of the row matching this attachment, or None when no such row can
+        be found (for example because the insert failed).
+    """
+    item_id = retData['item_id']
+    type_id = retData['type_id']
+    group_name = retData['group_name']
+    path = retData['path']
+    full_path = retData['full_path']
+    category = retData['category']
+    file_size = retData['file_size']
+    status = retData['status']
+    create_by = retData['create_by']
+    page_size = retData['page_size']
+    create_time = retData['create_time']
+    order_by = num
+    try:
+        Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size,object_key,bucket_name,publish_time,source) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
+
+        # object_key is the part of full_path after the bucket host prefix.
+        values = (
+            year, pdf_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
+            status, create_by,
+            create_time, page_size, full_path.split('https://zzsn.obs.cn-north-1.myhuaweicloud.com/')[1], 'zzsn',
+            pub_time, origin)
+        cursor_.execute(Upsql, values)  # insert
+        cnx_.commit()  # commit
+    except Exception as e:
+        log.info(e)
+    else:
+        # Report success only when the insert and commit actually went through.
+        log.info(f"更新完成:{item_id}===={pdf_name}")
+    selects = secrchATT(item_id, retData, type_id,order_by)
+    if not selects:
+        # No matching row: return None instead of failing on selects[0].
+        log.error(f'未查询到附件记录:{item_id}===={pdf_name}')
+        return None
+    return selects[0]
+
+def upload(sourceAddress,num):
+    """Upload the file behind ``sourceAddress`` and register it; return the attachment id or None.
+
+    Reads the module-level ``title``, ``social_code``, ``year`` and ``publishDate``
+    that the main loop assigns before each call.
+    """
+    # TODO: upload the linked file to OBS
+    retData = uptoOBS(sourceAddress, title + '.pdf', 8, social_code)
+    if not retData['state']:
+        log.info(f'====pdf解析失败====')
+        return None
+    # Insert the attachment into the attachment table
+    att_id = tableUpdate(retData, year, title + '.pdf', num + 1, publishDate, '证监会')
+    return att_id if att_id else None
+
+# Script entry point: repeatedly query matching documents, re-upload each source file
+# and write the new attachment id back to Elasticsearch until the query reports 0 hits.
+if __name__ == '__main__':
+    esMethod = EsMethod()
+    # esMethod.getFileds(index_name=esMethod.index_name)
+    # ``page`` only counts loop iterations for logging; the same query is re-run each time.
+    page = 1
+    while True:
+        result = esMethod.queryatt(index_name=esMethod.index_name)
+        total = result['hits']['total']['value']
+        if total==0:
+            log.info('++++已没有数据+++++')
+            break
+        msglist = result['hits']['hits']
+        log.info(f'---第{page}页{len(msglist)}条数据----共{total}条数据----')
+        # print(msglist)
+        # ``num`` is reset per batch and never incremented here, so upload() always receives 0.
+        num = 0
+        for mms in msglist:
+            # These module-level names are read by upload() as globals.
+            id = mms['_id']
+            title = mms['_source']['title']
+            sourceAddress = mms['_source']['sourceAddress']
+            social_code = mms['_source']['labels'][0]['relationId']
+            year = mms['_source']['year']
+            publishDate = mms['_source']['publishDate']
+            createDate = mms['_source']['createDate']
+            log.info(f'{id}---{title}--{sourceAddress}---{social_code}')
+            att_id = upload(sourceAddress,num)
+            u_attid = att_id
+            # NOTE(review): upload() returns None on failure, so the document's
+            # attachmentIds becomes ['None'] in that case — confirm this is intended.
+            esMethod.updateaunn(esMethod.index_name, str(id), u_attid)
+        page+=1
+
+
+
+    #     # esMethod.delete(esMethod.index_name,str(id))
+    #     print('跟新成功！！')
+
+
+
+
+
+
+
+
+
+
+
+
--- a/comData/noticeReport/公告补采2.py
+++ b/comData/noticeReport/公告补采2.py
+"""
+Elasticsearch installation:
+pip install elasticsearch==7.8.1 (this specific version)
+Reference articles for usage:
+https://blog.csdn.net/yangbisheng1121/article/details/128528112
+https://blog.csdn.net/qiuweifan/article/details/128610083
+"""
+import json
+import time
+import uuid
+import requests
+from retry import retry
+from elasticsearch import Elasticsearch
+from base import BaseCore
+from obs import ObsClient
+import fitz
+from urllib.parse import unquote
+# One shared BaseCore instance. The original built a second instance right after
+# fetching the logger, which only repeated whatever BaseCore() sets up.
+baseCore = BaseCore.BaseCore()
+log = baseCore.getLogger()
+
+# Connection/cursor pairs exposed by BaseCore.
+cnx = baseCore.cnx
+cursor = baseCore.cursor
+
+cnx_ = baseCore.cnx_
+cursor_ = baseCore.cursor_
+# Key prefix used for uploaded objects, and the task label used in recorded logs.
+pathType = 'QYNotice/'
+taskType = '企业公告/证监会'
+# SECURITY NOTE(review): the Huawei Cloud access key and secret key are committed in
+# plain text; they should be rotated and loaded from configuration or the environment.
+obsClient = ObsClient(
+        access_key_id='VEHN7D0TJ9316H8AHCAV',  # Huawei Cloud access key (AK)
+        secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY',  # Huawei Cloud secret key (SK)
+        server='https://obs.cn-north-1.myhuaweicloud.com'  # endpoint of the bucket
+    )
+
+class EsMethod(object):
+    """Thin wrapper around an Elasticsearch client for querying and patching documents."""
+
+    def __init__(self):
+        """Create the client and remember the default index name."""
+        # Create the Elasticsearch client with the account credentials
+        self.es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300)
+        self.index_name = 'researchreportdata'
+
+    def queryatt(self,index_name):
+       """Search ``index_name`` for documents whose attachment id is the string "None".
+
+       Matches documents with ``type`` "3" whose ``attachmentIds.keyword`` matches
+       "None", sorted by ``createDate`` descending, at most 200 per call.
+
+       Returns:
+           The search response, reduced to the paths listed in ``filter_path``.
+       """
+       body = {
+           "_source": ["attachmentIds", "createDate", "sourceAddress", "labels.relationId", "title", "year",
+                       "publishDate", "createDate"],
+           "query": {
+               "bool": {
+                   "must": [
+                       {
+                           "match": {
+                               "type": "3"
+                           }
+                       },
+                       {
+                           "wildcard": {
+                               "attachmentIds.keyword": "None"
+                           }
+                       }
+                   ]
+               }
+           },
+           "sort": [
+               {
+                   "createDate": {
+                       "order": "desc"
+                   }
+               }
+           ],
+           "track_total_hits": True,
+           "size": 200
+       }
+
+       # Restrict the response to the hit ids, the total count and the source fields used by the caller.
+       filter_path = ['hits.hits._id',
+                      'hits.total.value',
+                      'hits.hits._source.attachmentIds',  # field 1
+                      'hits.hits._source.title',
+                      'hits.hits._source.sourceAddress',
+                      'hits.hits._source.createDate',
+                      'hits.hits._source.labels.relationId',
+                      'hits.hits._source.publishDate',
+                      'hits.hits._source.year',
+                      'hits.hits._source.createDate',
+                      ]  # field 2
+       result = self.es.search(index=index_name
+                               , doc_type='_doc'
+                               , filter_path=filter_path
+                               , body=body)
+       log.info(result)
+       return result
+
+    def updateaunn(self,index_name,id,u_attid):
+        """Partially update document ``id`` so ``attachmentIds`` becomes ``[str(u_attid)]``.
+
+        Note that ``str(None)`` is ``'None'``, so passing None stores the literal
+        string 'None'. The update result is only logged, not returned.
+        """
+        body = {
+            'doc': {
+                'attachmentIds': [str(u_attid)]
+            }
+        }
+        result = self.es.update(index=index_name
+                                ,id=id
+                                ,body=body)
+        log.info('更新结果:%s' % result)
+
+
+def getuuid():
+    """Return a new time-based UUID (``uuid.uuid1()``)."""
+    return uuid.uuid1()
+
+#获取文件大小
+def convert_size(size_bytes):
+    """Format a byte count as a two-decimal string with the largest fitting unit (bytes..TB)."""
+    units = ('bytes', 'KB', 'MB', 'GB', 'TB')
+    value = size_bytes
+    # Walk up the units; every unit except the last may hand over to the next one.
+    for unit in units[:-1]:
+        if not value >= 1024:
+            return f"{value:.2f} {unit}"
+        value = value / 1024
+    return f"{value:.2f} {units[-1]}"
+
+def uptoOBS(pdf_url,pdf_name,type_id,social_code):
+    """Download a PDF, upload it to OBS and collect the attachment metadata.
+
+    Args:
+        pdf_url: URL the PDF is downloaded from.
+        pdf_name: Not used by this function; kept so existing calls keep working.
+        type_id: Stored as ``type_id`` in the returned dict.
+        social_code: Stored as ``item_id`` in the returned dict and used when recording logs.
+
+    Returns:
+        dict: the attachment metadata. ``state`` is True only when the download, the
+        OBS upload and the PDF parsing all succeeded and every metadata field was
+        filled in. ``content`` accumulates the text extracted from the PDF pages.
+    """
+    start_time = time.time()
+    retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
+               'full_path': '',
+               'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
+               'create_time': '', 'page_size': '', 'content': ''}
+    headers = {'User-Agent': baseCore.getRandomUserAgent()}
+    # Up to three download attempts; ``response`` stays None when all of them fail,
+    # so the failure is reported as a download error instead of an OBS error.
+    response = None
+    for _ in range(3):
+        try:
+            response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
+            break
+        except Exception:
+            time.sleep(3)
+    if response is None:
+        log.error(f'文件下载失败:{pdf_url}')
+        return retData
+    # Content-Length can be missing or malformed; fall back to the size of the body.
+    try:
+        file_size = int(response.headers.get('Content-Length'))
+    except (TypeError, ValueError):
+        file_size = len(response.content)
+    page_size = 0
+    name = str(getuuid()) + '.pdf'
+    try:
+        result = getOBSres(pathType, name, response)
+    except Exception:
+        log.error(f'OBS发送失败')
+        return retData
+    try:
+        with fitz.open(stream=response.content, filetype='pdf') as doc:
+            page_size = doc.page_count
+            for page in doc.pages():
+                retData['content'] += page.get_text()
+    except Exception:
+        log.error(f'文件损坏')
+        return retData
+
+    if page_size < 1:
+        # PDF parsing failed
+        return retData
+    try:
+        object_url = result['body']['objectUrl']
+        retData['path'] = object_url.split('.com')[1]
+        retData['full_path'] = object_url
+        retData['file_size'] = convert_size(file_size)
+        retData['create_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        retData['page_size'] = page_size
+        # Flag success last so a failure above never leaves state=True with empty fields.
+        retData['state'] = True
+    except Exception as e:
+        state = 0
+        takeTime = baseCore.getTimeCost(start_time, time.time())
+        baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
+    return retData
+
+@retry(tries=3, delay=1)
+def getOBSres(pathType,name, response):
+    """Upload ``response.content`` to bucket 'zzsn' under the key ``pathType + name`` and return the OBS result."""
+    object_key = pathType + name
+    # File-based alternative: obsClient.putFile('zzsn', object_key, file_path='<local path of the file to upload>')
+    return obsClient.putContent('zzsn', object_key, content=response.content)
+
+def secrchATT(item_id, retData, type_id,order_by):
+    """Query clb_sys_attachment by item id, ``retData['path']``, type id and order, and return ``fetchone()``'s result."""
+    query = '''select id from clb_sys_attachment where item_id = %s and path = %s and type_id=%s and order_by=%s '''
+    params = (item_id, retData['path'], type_id, order_by)
+    cursor_.execute(query, params)
+    return cursor_.fetchone()
+
+# 插入到att表 返回附件id
+def tableUpdate(retData, year, pdf_name, num,pub_time,origin):
+    """Insert one attachment row into clb_sys_attachment and return its id.
+
+    Args:
+        retData: metadata dict as produced by uptoOBS().
+        year: value for the ``year`` column.
+        pdf_name: value for the ``name`` column.
+        num: value for the ``order_by`` column.
+        pub_time: value for the ``publish_time`` column.
+        origin: value for the ``source`` column.
+
+    Returns:
+        The id of the row matching this attachment, or None when no such row can
+        be found (for example because the insert failed).
+    """
+    item_id = retData['item_id']
+    type_id = retData['type_id']
+    group_name = retData['group_name']
+    path = retData['path']
+    full_path = retData['full_path']
+    category = retData['category']
+    file_size = retData['file_size']
+    status = retData['status']
+    create_by = retData['create_by']
+    page_size = retData['page_size']
+    create_time = retData['create_time']
+    order_by = num
+    try:
+        Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size,object_key,bucket_name,publish_time,source) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
+
+        # object_key is the part of full_path after the bucket host prefix.
+        values = (
+            year, pdf_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
+            status, create_by,
+            create_time, page_size, full_path.split('https://zzsn.obs.cn-north-1.myhuaweicloud.com/')[1], 'zzsn',
+            pub_time, origin)
+        cursor_.execute(Upsql, values)  # insert
+        cnx_.commit()  # commit
+    except Exception as e:
+        log.info(e)
+    else:
+        # Report success only when the insert and commit actually went through.
+        log.info(f"更新完成:{item_id}===={pdf_name}")
+    selects = secrchATT(item_id, retData, type_id,order_by)
+    if not selects:
+        # No matching row: return None instead of failing on selects[0].
+        log.error(f'未查询到附件记录:{item_id}===={pdf_name}')
+        return None
+    return selects[0]
+
+def upload(sourceAddress,num):
+    """Upload the file behind ``sourceAddress`` and register it; return the attachment id or None.
+
+    Reads the module-level ``title``, ``social_code``, ``year`` and ``publishDate``
+    that the main loop assigns before each call.
+    """
+    # TODO: upload the linked file to OBS
+    retData = uptoOBS(sourceAddress, title + '.pdf', 8, social_code)
+    if not retData['state']:
+        log.info(f'====pdf解析失败====')
+        return None
+    # Insert the attachment into the attachment table
+    att_id = tableUpdate(retData, year, title + '.pdf', num + 1, publishDate, '证监会')
+    return att_id if att_id else None
+
+# Script entry point: repeatedly query matching documents, re-upload each source file
+# and write the new attachment id back to Elasticsearch until the query reports 0 hits.
+if __name__ == '__main__':
+    esMethod = EsMethod()
+    # esMethod.getFileds(index_name=esMethod.index_name)
+    # ``page`` only counts loop iterations for logging; the same query is re-run each time.
+    page = 1
+    while True:
+        result = esMethod.queryatt(index_name=esMethod.index_name)
+        total = result['hits']['total']['value']
+        if total==0:
+            log.info('++++已没有数据+++++')
+            break
+        msglist = result['hits']['hits']
+        log.info(f'---第{page}页{len(msglist)}条数据----共{total}条数据----')
+        # print(msglist)
+        # ``num`` is reset per batch and never incremented here, so upload() always receives 0.
+        num = 0
+        for mms in msglist:
+            # These module-level names are read by upload() as globals.
+            id = mms['_id']
+            title = mms['_source']['title']
+            sourceAddress = mms['_source']['sourceAddress']
+            social_code = mms['_source']['labels'][0]['relationId']
+            year = mms['_source']['year']
+            publishDate = mms['_source']['publishDate']
+            createDate = mms['_source']['createDate']
+            log.info(f'{id}---{title}--{sourceAddress}---{social_code}')
+            att_id = upload(sourceAddress,num)
+            u_attid = att_id
+            # NOTE(review): when upload() returns None the document is written back with
+            # attachmentIds ['None'], which the query above matches again, so documents
+            # that keep failing may prevent this loop from ever ending — confirm.
+            esMethod.updateaunn(esMethod.index_name, str(id), u_attid)
+        page+=1
+
+
+
+    #     # esMethod.delete(esMethod.index_name,str(id))
+    #     print('跟新成功！！')
+
+
+
+
+
+
+
+
+
+
+
+
--- a/comData/policylaw/policy.py
+++ b/comData/policylaw/policy.py
@@ -313,7 +313,7 @@ def get_content1():
    end_time = time.time()
    log.info(f'共抓取国务院文件{num}条数据，共耗时{end_time-start_time}')

-# 国务院部门文件
+# 国务院部委文件
 def get_content2():
    pathType = 'policy/gwybmwj/'
    def getTotalpage(bmfl,headers,session):