11.29

3ad4b1e5 · 薛凌堃 · 5d788bc9 · 3ad4b1e5 · 3ad4b1e5 · 3ad4b1e5
--- a/REITs专题数据/reits.py
+++ b/REITs专题数据/reits.py
--- a/comData/YanBao/deletebyid.py
+++ b/comData/YanBao/deletebyid.py
@@ -32,7 +32,7 @@ class EsMethod(object):
    def __init__(self):
        # 创建Elasticsearch对象，并提供账号信息
        self.es = Elasticsearch(['http://114.116.19.92:9700'],  http_auth=('elastic', 'zzsn9988'),timeout=300 )
-        self.index_name='researchreportdata'
+        self.index_name='policy'
    '''
    删除
@@ -52,7 +52,10 @@ if __name__ == "__main__":
        if item:
            log.info(item)
            id = item.decode()
-            esMethod.delete(esMethod.index_name,id)
+            try:
+                esMethod.delete(esMethod.index_name,id)
+            except:
+                continue
        else:
            log.info('已删除完毕')
            break

--- a/comData/YanBao/get_doc_id.py
+++ b/comData/YanBao/get_doc_id.py
+import json
+import threading
+import time
+import uuid
+import redis
+import requests
+from retry import retry
+from elasticsearch import Elasticsearch
+from base import BaseCore
+from obs import ObsClient
+import fitz
+from urllib.parse import unquote
+baseCore = BaseCore.BaseCore()
+log = baseCore.getLogger()
+baseCore = BaseCore.BaseCore()
+# 使用连接池
+# cnx_ = baseCore.pool_11.connection()
+# cursor_ = cnx_.cursor()
+cnx_ = baseCore.cnx_
+cursor_ = cnx_.cursor()
+lock = threading.Lock()
+pathType = 'QYNotice/'
+taskType = '企业研报/东方财富网'
+pool = redis.ConnectionPool(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
+class EsMethod(object):
+    def __init__(self):
+        # 创建Elasticsearch对象，并提供账号信息
+        self.es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300)
+        self.index_name = 'policy'
+    def queryatt(self,index_name,pnum):
+       body = {
+           "query": {
+               "bool": {
+                   "must": [
+                       {
+                           "term": {
+                               "sid.keyword": {
+                                   "value": "1697458829758697473"
+                               }
+                           }
+                       },
+                       {
+                           "range": {
+                               "createDate": {
+                                   "gte": "2023-11-28T10:00:00",
+                                   "lte": "2023-11-29T10:00:00"
+                               }
+                           }
+                       }
+                   ]
+               }
+           },
+           "track_total_hits": True,
+           "size": 200,
+           "from": pnum
+       }
+       filter_path = ['hits.hits._id',
+                      'hits.total.value',
+                      'hits.hits._source.title',
+                      'hits.hits._source.sourceAddress',
+                      'hits.hits._source.createDate',
+                      ]  # 字段2
+       result = self.es.search(index=index_name
+                               , doc_type='_doc'
+                               , filter_path=filter_path
+                               , body=body)
+       # log.info(result)
+       return result
+def main(page, p, esMethod):
+    redis_conn = redis.Redis(connection_pool=pool)
+    result = esMethod.queryatt(index_name=esMethod.index_name, pnum=p)
+    total = result['hits']['total']['value']
+    if total == 0:
+        log.info('++++已没有数据+++++')
+        return
+    msglist = result['hits']['hits']
+    log.info(f'---第{page}页{len(msglist)}条数据----共{total}条数据----')
+    for mms in msglist:
+        id = mms['_id']
+        title = mms['_source']['title']
+        sourceAddress = mms['_source']['sourceAddress']
+        log.info(f'{id}--{title}--{sourceAddress}---')
+        if redis_conn.lrem('YanBao:id', 0, id) == 0:
+            redis_conn.lpush('YanBao:id', id)
+        else:
+            continue
+def run_threads(num_threads,esMethod,j):
+    threads = []
+    for i in range(num_threads):
+        page = j + i + 1
+        p = j + i * 200
+        thread = threading.Thread(target=main, args=(page, p, esMethod))
+        threads.append(thread)
+        thread.start()
+    for thread in threads:
+        thread.join()
+if __name__ == "__main__":
+    j = 0
+    for i in range(5):
+        esMethod = EsMethod()
+        # result = esMethod.queryatt(index_name=esMethod.index_name, pnum=p)
+        # total = result['hits']['total']['value']
+        # if total == 0:
+        #     log.info('++++已没有数据+++++')
+        #     break
+        start = time.time()
+        num_threads = 5
+        run_threads(num_threads, esMethod, j)
+        j += 1000
+        log.info(f'5线程 每个处理200条数据 总耗时{time.time() - start}秒')
\ No newline at end of file
--- a/comData/annualReport1023/updateyear.py
+++ b/comData/annualReport1023/updateyear.py
@@ -84,14 +84,15 @@ if __name__ == "__main__":
        id_ = redis_conn.lpop('YanBao:up')
        # id = "23112104300"
-        if id:
+        if id_:
            pass
        else:
            log.info('已无数据')
+            break
        id = id_.decode()
        result_ = esMethod.queryatt(index_name=esMethod.index_name, id=id)
        result = result_['hits']['hits'][0]
        num = 0
        publishDate = result['_source']['publishDate']
-        u_publishDate = '2023-08-31' #+ publishDate.split('T')[1]
+        u_publishDate = '2022-12-31' #+ publishDate.split('T')[1]
        esMethod.updateaunn(esMethod.index_name, str(id), u_publishDate)
--- a/comData/noticeReport/东方财富网-港股公告.py
+++ b/comData/noticeReport/东方财富网-港股公告.py
 import os
@@ -323,7 +323,10 @@ def spider(browser, code, social_code, com_name):
        # span_tag = browser.find_element(By.CLASS_NAME,'mbox')
        span_tag = browser.find_element(By.XPATH, '//div[@class="mbox"]/span[2]')
        current_page = int(span_tag.text)
-        totalpage = int(soup.find_all('div', class_='mbox')[-1].find_all('a')[-1].text)
+        try:
+            totalpage = int(soup.find_all('div', class_='mbox')[-1].find_all('a')[-1].text)
+        except:
+            totalpage = int(soup.find_all('div', class_='mbox')[-1].find_all('a')[-2].text)
        if current_page < totalpage:
            # 说明还未到最后一页
            span_tag.find_element(By.XPATH, './following-sibling::a[1]').click()

--- a/comData/policylaw/BaseCore.py
+++ b/comData/policylaw/BaseCore.py
@@ -508,7 +508,7 @@ class BaseCore:
            except:
                time.sleep(3)
                continue
-        page_size = 0
+        # page_size = 0
        for i in range(0, 3):
            try:
                # name = file_name
@@ -522,23 +522,23 @@ class BaseCore:
                time.sleep(3)
                continue
-        if page_size < 1:
+        # if page_size < 1:
-            # pdf解析失败
+        #     # pdf解析失败
-            # print(f'======pdf解析失败=====')
+        #     # print(f'======pdf解析失败=====')
+        #     return retData
+        # else:
+        try:
+            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+            retData['state'] = True
+            retData['path'] = result['body']['objectUrl'].split('.com')[1]
+            retData['full_path'] = unquote(result['body']['objectUrl'])
+            retData['file_size'] = self.convert_size(file_size)
+            retData['create_time'] = time_now
+        except Exception as e:
+            print(f'error:{e}')
            return retData
-        else:
-            try:
-                time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-                retData['state'] = True
-                retData['path'] = result['body']['objectUrl'].split('.com')[1]
-                retData['full_path'] = unquote(result['body']['objectUrl'])
-                retData['file_size'] = self.convert_size(file_size)
-                retData['create_time'] = time_now
-            except Exception as e:
-                print(f'error:{e}')
-                return retData
-            return retData
+        return retData

--- a/comData/policylaw/policy.py
+++ b/comData/policylaw/policy.py