Commit db863d92  Author: 薛凌堃

02/05

Parent 7d517080
......@@ -582,22 +582,7 @@ def login():
url = 'https://www.qcc.com'
driver.get(url)
driver.maximize_window()
# from selenium.webdriver.support import expected_conditions as EC
# wait = WebDriverWait(driver, 10)
# wait.until(EC.presence_of_element_located((By.CLASS_NAME, "nav-item")))
# # page_source = browser.page_source
# # soup = BeautifulSoup(page_source,'html.parser')
# # print(soup)
# driver.find_element(By.CLASS_NAME, 'nav-item').click()
# time.sleep(10)
# wait = WebDriverWait(driver, 10)
# wait.until(EC.presence_of_element_located((By.CLASS_NAME, "login-change")))
# driver.find_element(By.CLASS_NAME, 'login-change').click()
# driver.find_element(By.XPATH, '//*[@id="loginModal"]/div/div/div/div[1]/div[1]/div[2]/a').click()
# driver.find_element(By.XPATH, '//*[@id="loginModal"]/div/div/div/div[1]/div[3]/form/div[1]/input').send_keys('18703752600')
# driver.find_element(By.XPATH, '//*[@id="loginModal"]/div/div/div/div[1]/div[3]/form/div[2]/input').send_keys('angel2468')
# driver.find_element(By.XPATH, '//*[@id="loginModal"]/div/div/div/div[1]/div[3]/form/div[4]/button').click()
# time.sleep(3)
# cookie_list = driver.get_cookies()
cookieinfo = token.getToken()
if cookieinfo:
......@@ -607,14 +592,20 @@ def login():
time.sleep(30)
return
id_cookie = cookieinfo[0]
cookie_ = json.loads(cookieinfo[1])
cookie_list= [{'domain': 'www.qcc.com', 'expiry': 1721815475, 'httpOnly': False, 'name': 'CNZZDATA1254842228', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': f'{cookie_["CNZZDATA1254842228"]}'}, {'domain': '.qcc.com', 'expiry': 1740650660, 'httpOnly': False, 'name': 'qcc_did', 'path': '/', 'sameSite': 'None', 'secure': True, 'value': 'bb480035-2a34-4270-9a8b-db8b7d9374b3'}, {'domain': '.qcc.com', 'expiry': 1706695474, 'httpOnly': True, 'name': 'QCCSESSID', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': 'ccf17b97219476a1faa8aaff79'}, {'domain': '.qcc.com', 'expiry': 1721815461, 'httpOnly': False, 'name': 'UM_distinctid', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': '18d3aed87f3552-01ba17134bcbe9-4c657b58-e1000-18d3aed87f4c5d'}, {'domain': 'www.qcc.com', 'expiry': 1706092459, 'httpOnly': True, 'name': 'acw_tc', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': '3d365a1c17060906591851865e848bfd116d30ed8d2ac3e144455c8ff8'}]
cookie_list = json.loads(cookieinfo[1])
# cookie_list = json.dumps(cookieinfo[1])
print(cookie_list)
# cookie_list= [{'domain': 'www.qcc.com', 'expiry': 1721815475, 'httpOnly': False, 'name': 'CNZZDATA1254842228', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': f'{cookie_["CNZZDATA1254842228"]}'}, {'domain': '.qcc.com', 'expiry': 1740650660, 'httpOnly': False, 'name': 'qcc_did', 'path': '/', 'sameSite': 'None', 'secure': True, 'value': 'bb480035-2a34-4270-9a8b-db8b7d9374b3'}, {'domain': '.qcc.com', 'expiry': 1706695474, 'httpOnly': True, 'name': 'QCCSESSID', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': 'ccf17b97219476a1faa8aaff79'}, {'domain': '.qcc.com', 'expiry': 1721815461, 'httpOnly': False, 'name': 'UM_distinctid', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': '18d3aed87f3552-01ba17134bcbe9-4c657b58-e1000-18d3aed87f4c5d'}, {'domain': 'www.qcc.com', 'expiry': 1706092459, 'httpOnly': True, 'name': 'acw_tc', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': '3d365a1c17060906591851865e848bfd116d30ed8d2ac3e144455c8ff8'}]
for cookie in cookie_list:
cookie['expiry'] = int(cookie['expiry'])
# del cookie['expiry']
driver.add_cookie(cookie)
time.sleep(5)
driver.refresh()
url_test = 'https://www.qcc.com/firm/a5f5bb3776867b3e273cd034d6fb4baa.html'
driver.get(url_test)
# driver.get('https://www.qcc.com/')
time.sleep(60)
return driver,id_cookie
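The hunk above replaces the hard-coded cookie_list with cookies loaded from the token table. A minimal sketch of that injection step, assuming the stored value is valid JSON (the helper name inject_cookies is illustrative, not part of the commit); Selenium only accepts cookies whose domain matches the page currently loaded, so the site is opened first:

import json
import time

def inject_cookies(driver, cookie_json, url='https://www.qcc.com'):
    # Open the target site first so the cookie domains match.
    driver.get(url)
    for cookie in json.loads(cookie_json):
        if 'expiry' in cookie:
            cookie['expiry'] = int(cookie['expiry'])
        try:
            driver.add_cookie(cookie)
        except Exception as e:
            # A stale or mismatched-domain cookie; skip it rather than abort the login.
            print(f"skipped cookie {cookie.get('name')}: {e}")
    driver.refresh()
    time.sleep(5)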
......@@ -695,7 +686,7 @@ if __name__ == '__main__':
# category = ''
# exchange = ''
count = redaytowork(com_name, social_code, securitiesCode, securitiesShortName, listingDate, category, exchange,listType, ynDomestic, countryName, file_name)
# count = redaytowork(com_name, social_code, securitiesCode, securitiesShortName, listingDate, category, exchange,listType, ynDomestic, countryName, file_name)
time.sleep(10)
# break
# baseCore.r.close()
......
......@@ -34,10 +34,12 @@ def flushAndGetToken():
cookie_list = browser.get_cookies()
cookies = {}
print(cookie_list)
for cookie in cookie_list:
cookies[cookie['name']] = cookie['value']
print(cookies)
return cookies
# for cookie in cookie_list:
# cookies[cookie['name']] = cookie['value']
# print(cookies)
# return cookies
print(type(cookie_list))
return cookie_list
if __name__ == "__main__":
urlqcc = 'https://www.qcc.com/'
......@@ -51,7 +53,10 @@ if __name__ == "__main__":
browser.find_element(By.CLASS_NAME, 'nav-item').click()
time.sleep(20)
cookies = flushAndGetToken()
cookies = json.dumps(cookies)
# print(cookies)
# cookies = json.dumps(cookies)
cookies = str(cookies)
# print(cookies)
insert = f"insert into QCC_token (cookies,create_time,fenghao_time,update_time) values ('{escape_string(cookies)}',now(),DATE_SUB(NOW(), INTERVAL 1 DAY),now())"
cursor_.execute(insert)
cnx_.commit()
......
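login() above parses the stored value with json.loads, so serializing the cookie list with json.dumps (rather than str()) keeps the writer and the reader consistent. A hedged sketch of the storing side, reusing the QCC_token columns from this hunk with a parameterized insert; save_cookies is an illustrative helper, not part of the commit:

import json

def save_cookies(browser, cursor, cnx):
    # Store the Selenium cookies as JSON so the consumer can json.loads() them back.
    cookie_json = json.dumps(browser.get_cookies(), ensure_ascii=False)
    sql = ("insert into QCC_token (cookies, create_time, fenghao_time, update_time) "
           "values (%s, now(), DATE_SUB(NOW(), INTERVAL 1 DAY), now())")
    cursor.execute(sql, (cookie_json,))
    cnx.commit()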
"""
Fetch all the matching records from ES
"""
import json
import threading
import time
import uuid
import redis
import requests
from retry import retry
from elasticsearch import Elasticsearch
from base import BaseCore
from obs import ObsClient
import fitz
from urllib.parse import unquote
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
baseCore = BaseCore.BaseCore()
# Use a connection pool
# cnx_ = baseCore.pool_11.connection()
# cursor_ = cnx_.cursor()
cnx_ = baseCore.cnx_
cursor_ = cnx_.cursor()
lock = threading.Lock()
pathType = 'QYNotice/'
taskType = '企业研报/东方财富网'
pool = redis.ConnectionPool(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
class EsMethod(object):
def __init__(self):
# Create the Elasticsearch client and supply the account credentials
self.es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300)
self.index_name = 'researchreportdata'
def queryatt(self,index_name,pnum):
body = {
"query": {
"bool": {
"must": [
{
"match_phrase": {
"content": "Error Times"
}
},
{
"match": {
"type": "3"
}
}
]
}
},
"track_total_hits": True,
"size": 200,
"from": pnum
}
filter_path = ['hits.hits._id',
'hits.total.value',
'hits.hits._source.attachmentIds',
'hits.hits._source.sourceAddress',
'hits.hits._source.labels.relationId',
] # fields to return
result = self.es.search(index=index_name
, doc_type='_doc'
, filter_path=filter_path
, body=body)
# log.info(result)
return result
def main(page, p, esMethod):
redis_conn = redis.Redis(connection_pool=pool)
result = esMethod.queryatt(index_name=esMethod.index_name, pnum=p)
total = result['hits']['total']['value']
# if total == 0:
# log.info('++++已没有数据+++++')
# return
try:
msglist = result['hits']['hits']
except:
log.info(f'error-----{result}')
return
log.info(f'---第{page}页{len(msglist)}条数据----共{total}条数据----')
for mms in msglist:
id = mms['_id']
sourceAddress = mms['_source']['sourceAddress']
att_id = mms['_source']['attachmentIds'][0]
social_code= mms['_source']['labels'][0]['relationId']
log.info(f'{id}--{att_id}---{sourceAddress}---')
item = f'{id}|{att_id}|{sourceAddress}|{social_code}'
redis_conn.lrem('Notice:id', 0, item)
redis_conn.lpush('Notice:id', item)
redis_conn.lrem('Notice:id111', 0, item)
redis_conn.lpush('Notice:id111', item)
def run_threads(num_threads,esMethod,j):
threads = []
for i in range(num_threads):
page = j + i + 1
p = j + i * 200
thread = threading.Thread(target=main, args=(page, p, esMethod))
threads.append(thread)
thread.start()
for thread in threads:
thread.join()
if __name__ == "__main__":
j = 0
for i in range(24):
esMethod = EsMethod()
# result = esMethod.queryatt(index_name=esMethod.index_name, pnum=p)
# total = result['hits']['total']['value']
# if total == 0:
# log.info('++++已没有数据+++++')
# break
start = time.time()
num_threads = 5
run_threads(num_threads, esMethod, j)
j += 1000
log.info(f'5线程 每个处理200条数据 总耗时{time.time() - start}秒')
\ No newline at end of file
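The from/size paging above walks far past 10,000 documents; if the index keeps Elasticsearch's default max_result_window, offsets beyond that will be rejected. A scroll cursor is one way to page through the full result set. A rough sketch against the same index and query (not part of the commit):

from elasticsearch import Elasticsearch

es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300)
body = {
    "query": {"bool": {"must": [
        {"match_phrase": {"content": "Error Times"}},
        {"match": {"type": "3"}}
    ]}},
    "size": 200
}

# Open a scroll context and keep pulling batches until an empty page comes back.
resp = es.search(index='researchreportdata', body=body, scroll='2m')
while resp['hits']['hits']:
    for hit in resp['hits']['hits']:
        print(hit['_id'])
    resp = es.scroll(scroll_id=resp['_scroll_id'], scroll='2m')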
"""
Push the records found by the ES query into Redis, then pop them from Redis,
re-download each link, update the file path in the attachment table, and update the content field in ES.
"""
import json
import os
import threading
import time
import uuid
import redis
import requests
from bs4 import BeautifulSoup
from obs import ObsClient
from retry import retry
from elasticsearch import Elasticsearch
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
baseCore = BaseCore.BaseCore()
import os
import subprocess
import uuid
import requests,time, json, sys
from retry import retry
from obs import ObsClient
import fitz
from urllib.parse import unquote
cnx_ = baseCore.cnx_
cursor_ = cnx_.cursor()
lock = threading.Lock()
obsClient = ObsClient(
access_key_id='VEHN7D0TJ9316H8AHCAV', # your Huawei Cloud access key (AK)
secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY', # your Huawei Cloud secret key (SK)
server='https://obs.cn-north-1.myhuaweicloud.com' # your bucket's endpoint
)
es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300)
index_name = 'researchreportdata'
taskType = '公告处理错误数据'
pathType = 'QYNotice/'
def updateaunn(index_name,id,content):
body = {
'doc': {
'content': [str(content)]
}
}
result = es.update(index=index_name
,id=id
,body=body)
log.info('更新结果:%s' % result)
# Get the file size as a human-readable string
def convert_size(size_bytes):
# Units for conversion
units = ['bytes', 'KB', 'MB', 'GB', 'TB']
i = 0
while size_bytes >= 1024 and i < len(units)-1:
size_bytes /= 1024
i += 1
return f"{size_bytes:.2f} {units[i]}"
def getuuid():
get_timestamp_uuid = uuid.uuid1() # time-based uuid, globally unique
return get_timestamp_uuid
def uptoOBS(pdf_url,type_id,social_code):
headers = {}
category = os.path.splitext(pdf_url)[1]
retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
'full_path': '',
'category': category, 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
'create_time': '', 'page_size': '', 'content': ''}
headers['User-Agent'] = baseCore.getRandomUserAgent()
if category == '.pdf':
try:
response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
if response.status_code != 200:
return retData
file_size = int(response.headers.get('Content-Length'))
with fitz.open(stream=response.content, filetype='pdf') as doc:
page_size = doc.page_count
for page in doc.pages():
retData['content'] += page.get_text()
# TODO: check whether the content was fetched successfully
if '<div class="K">403</div>' in retData['content'] or 'Error Times: ' in retData['content']:
return retData
else:
pass
except:
log.error(f'文件损坏')
return retData
else:
for i in range(0, 3):
try:
page_size = 1
response = requests.get(pdf_url, headers=headers,verify=False, timeout=20)
if response.status_code != 200:
return retData
file_size = int(response.headers.get('Content-Length'))
retData['content'] = response.text
# TODO: check whether the content was fetched successfully
if '<div class="K">403</div>' in retData['content'] or 'Error Times: ' in retData['content']:
return retData
else:
break
except:
time.sleep(3)
continue
name = str(getuuid()) + category
try:
result = getOBSres(pathType, name, response)
except:
log.error(f'OBS发送失败')
return retData
if page_size < 1:
# PDF parsing failed
# print(f'======pdf解析失败=====')
return retData
else:
try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = unquote(result['body']['objectUrl'].split('.com')[1])
retData['full_path'] = unquote(result['body']['objectUrl'])
retData['file_size'] = convert_size(file_size)
retData['create_time'] = time_now
retData['page_size'] = page_size
except Exception as e:
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
return retData
return retData
@retry(tries=3, delay=1)
def getOBSres(pathType,name, response):
result = obsClient.putContent('zzsn', pathType + name, content=response.content)
# resp = obsClient.putFile('zzsn', pathType + name, file_path='要上传的那个文件的本地路径')
return result
def tableUpdate(retData,att_id):
path = retData['path']
full_path = retData['full_path']
category = retData['category']
file_size = retData['file_size']
page_size = retData['page_size']
try:
# Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size,object_key,bucket_name) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
Upsql = """update clb_sys_attachment set path= %s, full_path= %s, category=%s,file_size=%s,page_size=%s,object_key=%s,bucket_name=%s where id=%s"""
values = (
path, full_path, category, file_size, page_size,full_path.split('https://zzsn.obs.cn-north-1.myhuaweicloud.com/')[1],'zzsn', att_id)
# lock.acquire()
cursor_.execute(Upsql, values) # 插入
cnx_.commit() # 提交
# lock.release()
except Exception as e:
log.info(e)
return False
log.info(f"更新完成:{att_id}====")
return True
def GetContent(pdf_url, social_code, start_time,att_id):
# Upload to the Huawei Cloud OBS server
retData = uptoOBS(pdf_url, 8, social_code)
# Insert the attachment record into the attachment table
if retData['state']:
pass
else:
log.info(f'====pdf解析失败====')
# Get the current process PID
current_pid = baseCore.getPID()
# TODO: restart a fresh process and kill the current one
subprocess.Popen([sys.executable] + sys.argv)
os.kill(current_pid, 9)
return False
att_id = tableUpdate(retData, att_id)
if att_id:
pass
else:
return False
content = retData['content']
return content
if __name__ == '__main__':
while True:
item = '23121403366|1731946331417709349|https://data.eastmoney.com/notices/detail/CG/AN202311081609279378.html|ZZSN230824151229535'
# item = baseCore.redicPullData('Notice:id')
log.info(item)
if item:
es_id = item.split('|')[0]
att_id = item.split('|')[1]
# Original article URL
href = item.split('|')[2]
social_code = item.split('|')[3]
art_code = href.split('/')[-1].split('.')[0]
t = int(time.time() * 1000)
json_url = f'https://np-cnotice-stock.eastmoney.com/api/content/ann?art_code={art_code}&client_source=web&page_index=1&_={t}'
for n1 in range(0, 3):
try:
ip = baseCore.get_proxy()
json_2 = requests.get(json_url, proxies=ip, verify=False).json()
break
except:
if n1 == 2:
sys.exit(0)
time.sleep(60)
continue
try:
pdf_url = json_2['data']['attach_url']
except:
pdf_url = ''
if pdf_url:
# TODO: download the PDF file
start_time = time.time()
content = GetContent(pdf_url, social_code, start_time, att_id)
if content:
# Announcement info list
log.info(f'{att_id}==============解析传输操作成功')
state = 1
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '成功')
# TODO: update ES
updateaunn(index_name, es_id, content)
time.sleep(2)
else:
baseCore.rePutIntoR('Noticeerror:id', item)
else:
log.info('######已无数据######')
break
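With the hard-coded test item and the commented-out redicPullData call above, the loop processes the same record on every pass; the production form would presumably pop items until the queue drains, along these lines (a sketch using the helpers already referenced in this file):

while True:
    item = baseCore.redicPullData('Notice:id')  # pop the next 'es_id|att_id|href|social_code' entry
    if not item:
        log.info('######已无数据######')
        break
    es_id, att_id, href, social_code = item.split('|')
    # ... same download / tableUpdate / updateaunn flow as above ...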
"""
Foreign think tanks - EU / OECD
"""
from bs4 import BeautifulSoup
import requests
from datetime import datetime
url = 'https://www.oecd-ilibrary.org/economics/oecd-policy-responses-on-the-impacts-of-the-war-in-ukraine_dc825602-en?page=1'
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0',
'Cookie': 'JSESSIONID=BHezogPwi8NJVECsKXCXqijdQ00-yMJHw_gR8wiC.ip-10-240-5-121; __cf_bm=c2byUypnSjXPS_UFDM7BMRGDxN6AQEkNVUjzw9HuSq8-1707054653-1-AbbI7JWWkfWKVGi8SKI06f0jGEjPdk5kvHAIRRpBHSSSnmxj1IcvGUT8+/O6R0U2RLZJECZdUzZIXAwFuEz5lPo=; _gcl_au=1.1.201344533.1707054655; _gid=GA1.2.557164000.1707054655; cb-enabled=enabled; cf_clearance=6tK6.WKHJbXXoV4NTgbyHRhetRxMdWPZofwlv01F65Y-1707054656-1-AfrYlWnLLZFC1sKxeFVQintPrZnjvjoJSZwRRhAYwqRHGdWbU5IFZQDJZJM21l20Tj6gk4JxNobWT0wGzp1Dgjw=; _ce.irv=new; cebs=1; _ce.clock_event=1; _ce.clock_data=72%2C123.149.3.159%2C1%2C9c1ce27f08b16479d2e17743062b28ed; custom_cookie_AB=1; AWSALB=I/eGQ0glcxuROskD1JKEl/dqsqElpmo/MnwLboJZJB2QthQFFWnLA3gzuJTskEaZxJD7VuWEEsqjhLVvhq4q2Wt0RebuRhukeHpKvgmGMelxpn/RiDmehyvxTOiS; AWSALBCORS=I/eGQ0glcxuROskD1JKEl/dqsqElpmo/MnwLboJZJB2QthQFFWnLA3gzuJTskEaZxJD7VuWEEsqjhLVvhq4q2Wt0RebuRhukeHpKvgmGMelxpn/RiDmehyvxTOiS; _gat_UA-1887794-2=1; _dc_gtm_UA-136634323-1=1; _ga_F5XZ540Q4V=GS1.1.1707054655.1.1.1707055119.7.0.0; _ga=GA1.1.1014316406.1707054655; _ga_F7KSNTXTRX=GS1.1.1707054655.1.1.1707055119.0.0.0; cebsp_=5; _ce.s=v~212f033193b9432855ae8335d6d3969cc1f8b751~lcw~1707055134688~lva~1707054658247~vpv~0~v11.fhb~1707054659602~v11.lhb~1707055126493~v11.cs~325107~v11.s~6d7ba630-c364-11ee-aba8-136dbbf9a447~v11.sla~1707055134688~v11.send~1707055135439~lcw~1707055135439',
'Referer': 'https://www.oecd-ilibrary.org/economics/oecd-policy-responses-on-the-impacts-of-the-war-in-ukraine_dc825602-en?page=2',
'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
'Sec-Ch-Ua-Mobile': '?0',
'Sec-Ch-Ua-Platform': '"Windows"',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}
req = requests.get(url=url, headers=headers)
soup = BeautifulSoup(req.content, 'html.parser')
div_part = soup.find_all('div', class_='col-xs-12 body-section')[1]
div_list = div_part.find_all('div', class_='row panel')
for div in div_list[:1]:
# div = div_.find_all('div')
# print(div)
title = div.find('div', class_='col-lg-7 col-xs-12 resume-item').find('p', class_='intro-item').find('strong', class_='book-title').text
href = 'https://www.oecd-ilibrary.org' + div.find('div', class_='col-lg-7 col-xs-12 resume-item').find('p', class_='intro-item').find('a')['href']
pubtime_ = div.find('div', class_='col-lg-7 col-xs-12 resume-item').find('p', class_='intro-item').find('strong', class_='book-title gray').text
# Format of the original date string
time_format = "%d %b %Y"
# Convert to a standard datetime
standard_time = datetime.strptime(pubtime_, time_format)
pdf_part = div.find('div', class_='col-lg-5 col-xs-12 actions-item').find('ul', class_='actions').find_all('li')[1].find('a').get('href')
# print(pdf_part)
pdf_url = 'https://www.oecd-ilibrary.org' + pdf_part
req_news = requests.get(url=href, headers=headers)
soup_news = BeautifulSoup(req_news.content, 'html.parser')
print(title, standard_time, pdf_url, href)
contentWithTag = soup_news.find('div', class_='description js-desc-fade show-all')
content = contentWithTag.get_text()
......@@ -2,10 +2,98 @@
中证智能财讯
"""
import json
import sys
import time
from obs import ObsClient
import fitz
import requests
from bs4 import BeautifulSoup
from retry import retry
from selenium.webdriver.common.by import By
from selenium import webdriver
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
obsClient = ObsClient(
access_key_id='VEHN7D0TJ9316H8AHCAV', # your Huawei Cloud access key (AK)
secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY', # your Huawei Cloud secret key (SK)
server='https://obs.cn-north-1.myhuaweicloud.com' # your bucket's endpoint
)
def create_driver():
path = r'D:\soft\msedgedriver.exe'
# options = webdriver.EdgeOptions()
options = {
"browserName": "MicrosoftEdge",
"ms:edgeOptions": {
"extensions": [], "args": ["--start-maximized"] # 添加最大化窗口运作参数
}
}
driver = webdriver.Edge(executable_path=path, capabilities=options)
return driver
@retry(tries=3, delay=1)
def getOBSres(pathType, name, response):
result = obsClient.putContent('zzsn', f'{pathType}/' + name, content=response.content)
# result = obsClient.putFile('zzsn', pathType+name, file_path=response)
return result
def uptoOBS(pdf_url, name_pdf, type_id, social_code, pathType, taskType, start_time,create_by):
headers = {}
retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
'full_path': '',
'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': create_by,
'create_time': '', 'page_size': '', 'content': ''}
headers['User-Agent'] = baseCore.getRandomUserAgent()
for i in range(0, 3):
try:
response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
file_size = int(response.headers.get('Content-Length'))
break
except:
time.sleep(3)
continue
page_size = 0
name = str(baseCore.getuuid()) + '.pdf'
now_time = time.strftime("%Y-%m")
try:
result = getOBSres(pathType, f'{now_time}/{name}', response) # getOBSres takes (pathType, name, response)
except:
log = baseCore.getLogger()
log.error(f'OBS发送失败')
return retData
try:
with fitz.open(stream=response.content, filetype='pdf') as doc:
page_size = doc.page_count
for page in doc.pages():
retData['content'] += page.get_text()
except:
log = baseCore.getLogger()
log.error(f'文件损坏')
return retData
if page_size < 1:
# PDF parsing failed
# print(f'======pdf解析失败=====')
return retData
else:
try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = result['body']['objectUrl'].split('.com')[1]
retData['full_path'] = result['body']['objectUrl']
retData['file_size'] = baseCore.convert_size(file_size)
retData['create_time'] = time_now
retData['page_size'] = page_size
except Exception as e:
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
return retData
return retData
def zzcx():
url = 'https://zzcx.cs.com.cn/dist/publishManuscript/listES'
......@@ -39,13 +127,50 @@ def zzcx():
for news in records:
title = news['title']
news_url = 'https://zzcx.cs.com.cn/app/zzb/detail?id=' + news['manuscriptId']
# Open the page with an automated browser
driver = create_driver()
driver.get(news_url)
div_ = driver.find_element(By.ID, 'line')
div = div_.find_element(By.XPATH, '..')
image_data = div.screenshot_as_base64
# TODO: upload to OBS and replace the tag with the returned link
baseCore.uptoOBS()
html = driver.page_source
news_req = requests.get(url=news_url, headers=headers)
news_soup = BeautifulSoup(news_req.content, 'html.parser')
detail_info = news_soup.find('div', class_='subTitle___svblj')
div_list = detail_info.find_all('div')
origin = div_list[0].text
publishDate = div_list[1].text
contentWithTag = news_soup.find('div', class_='editable___1EtCQ editor-editable')
content = contentWithTag.text
info_code = 'IN-20240129-0001'
result_dict = {
'id': '',
'sid': '1751787750127857666',
'title': title,
'organ': origin,
'origin': '国务院国有资产监督管理委员会',
# '摘要': zhaiyao,
'source': 16,
'content': content,
'contentWithTag': contentWithTag,
'publishDate': publishDate,
'sourceAddress': news_url,
}
log.info(f'{page}--{title}--{href}')
# info_list.append(result_dict)
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
try:
kafka_result = producer.send("crawlerInfo",
json.dumps(result_dict, ensure_ascii=False).encode('utf8'))
r.sadd(info_code + '-test', href)
log.info('发送kafka成功!')
except Exception as e:
log.info(e)
finally:
producer.close()
if __name__ == "__main__":
zzcx()
\ No newline at end of file
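One way to complete the TODO in the loop above (where baseCore.uptoOBS() is called without arguments): decode the element screenshot, push it to the same OBS bucket, and swap an <img> tag into the rendered page. A sketch under those assumptions; the object key prefix is made up:

import base64
from bs4 import BeautifulSoup

png_bytes = base64.b64decode(image_data)  # image_data is the base64 screenshot captured above
object_key = f'zzcx/{baseCore.getuuid()}.png'
result = obsClient.putContent('zzsn', object_key, content=png_bytes)
soup = BeautifulSoup(html, 'html.parser')  # html is driver.page_source from above
chart = soup.find('div', id='line')
if chart is not None and chart.parent is not None:
    # The screenshot was taken of the parent of #line, so replace that parent with the image.
    img = soup.new_tag('img', src=result['body']['objectUrl'])
    chart.parent.replace_with(img)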
......@@ -151,7 +151,7 @@ log = baseCore.getLogger()
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').中科软[
'数据源_0504']
datas = db_storage.find({'postCode':'2'}).limit(5)
datas = db_storage.find({'postCode':'2'}).limit(1)
for data in datas:
title = data['titleForeign']
contentWithTag = data['richTextForeign']
......@@ -170,5 +170,5 @@ for data in datas:
# f.write(dic_info_)
# break
# req = requests.post('http://192.168.1.236:5000/translate',data=dic_info_,headers=headers)
req = requests.post('http://117.78.23.14:5000/translate',data=dic_info_,headers=headers)
req = requests.post('http://117.78.23.14:5001/translate',data=dic_info_,headers=headers)
log.info(req.text)
\ No newline at end of file