提交 2aca66c0 作者: 薛凌堃

es操作

上级 61eecac3
......@@ -22,8 +22,7 @@ cnx_ = baseCore.cnx_
cursor_ = cnx_.cursor()
lock = threading.Lock()
pathType_ = 'QYResearchReport/'
taskType = '企业研报/东方财富网'
pool = redis.ConnectionPool(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
......
import pandas as pd
import urllib3
import BaseCore
from elasticsearch import Elasticsearch
......@@ -11,20 +10,25 @@ es = Elasticsearch([{'host': '114.115.215.250', 'port': '9700'}], http_auth=('el
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
r = baseCore.r
cnx = baseCore.cnx
cursor = baseCore.cursor
cnx_11 = baseCore.cnx_
cursor_11 = baseCore.cursor_
def getList():
    """Queue annual-report attachment tasks into Redis for the ES uploader.

    Reads (social_credit_code, year) pairs from the China-Top-500 spreadsheet,
    looks up matching annual-report attachments (type_id = 1) in
    clb_sys_attachment, and pushes "item_id|id|year" strings onto the Redis
    list "NianBao:info". Pairs with no matching attachment are logged.
    """
    # NOTE(review): a stale pre-diff block pushed EVERY type_id=1 row
    # unconditionally before this loop, duplicating the filtered rows below;
    # it has been removed.
    df = pd.read_excel('./中国500强榜单年报.xlsx', sheet_name='Sheet1')
    for i in range(len(df)):
        social_code = df['企业信用代码'][i]
        year = df['年报年份'][i]
        sql = 'Select id,item_id,year from clb_sys_attachment where type_id = 1 and item_id = %s and year = %s'
        cursor_11.execute(sql, (social_code, year))
        datas = cursor_11.fetchall()
        if datas:
            # "item_id|id|year" — the format main() splits on '|'.
            gg_social_list = [str(item[1]) + "|" + str(item[0]) + "|" + str(item[2]) for item in datas]
            for item in gg_social_list:
                r.rpush("NianBao:info", item)
        else:
            log.info(f'{social_code}|{year}======不存在')
"""
年报上传到es
content 需要重新解析
lang语言
origin 来源 从数据库中获取
title 从数据库中获取
dic_info = {
'attachmentIds': att_id,
'author': '',
'content': content,
'contentWithTag': '',
'createDate': time_now,
'deleteFlag': '0',
'id': '',
'keyWords': '',
'lang': lang,
'origin': origin,
'publishDate': datetime_string,
'sid': '1684032033495392257',
'sourceAddress': year_url, # 原文链接
'summary': '',
'title': name_pdf.replace('.pdf', ''),
'type': 1,
'socialCreditCode': social_code,
'year': year
}
"""
# 1.无年份和信用代码 另外存一个redis的key中
# 2.有信用代码 id 年份的 (1)es 中的id 需要更新为附件表中的id
# (2)通过信用代码 查出名称相同的个数 如果有两个的话,说明其中有一个没有在es库中 需要把es库中的id获取到,并删除没有在es库中的那个记录
......@@ -32,11 +6,14 @@ dic_info = {
#todo:查出有一条记录的,先更新 其他的先保存到另一个redis中
import json
import threading
import redis
import requests, re, time, pymysql, fitz
import urllib3
from kafka import KafkaProducer
from base import BaseCore
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
......@@ -51,45 +28,139 @@ lock = threading.Lock()
taskType = '企业年报'
pathType = 'QYYearReport/'
def secrchATT(type_id, xydm):
sel_sql = '''select * from clb_sys_attachment where item_id=%s '''
def sendKafka(dic_news):
    """Publish one report document to the 'researchReportTopic' Kafka topic.

    Args:
        dic_news: dict payload (see dic_info in main), serialised as
            UTF-8 JSON with non-ASCII characters preserved.

    Returns:
        True when the broker acknowledges the send within 10 seconds,
        False on any failure (the error is logged, not raised).
    """
    producer = None
    try:  # 114.116.116.241
        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'],
                                 max_request_size=1024 * 1024 * 20)
        kafka_result = producer.send("researchReportTopic",
                                     json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
        # Block until the broker acknowledges (raises after 10s).
        print(kafka_result.get(timeout=10))
        dic_result = {
            'success': 'true',  # fixed typo: was 'ture'
            'message': '操作成功',
            'code': '200',
        }
        log.info(dic_result)
        return True
    except Exception as e:
        dic_result = {
            'success': 'false',
            'message': '操作失败',
            'code': '204',
            'e': str(e),  # str() keeps the logged dict printable/serialisable
        }
        log.info(dic_result)
        return False
    finally:
        # The original created a producer per call and never released it,
        # leaking sockets/buffers; close it whether the send succeeded or not.
        if producer is not None:
            producer.close()
def getContent(file_href):
    """Download a PDF and return its concatenated plain text.

    Retries the download up to 3 times, sleeping 3s between attempts.

    Args:
        file_href: absolute URL of the PDF file.

    Returns:
        The extracted text of all pages, or '' when every download
        attempt fails.
    """
    headers = {'User-Agent': baseCore.getRandomUserAgent()}
    content = ''
    response = None
    for _ in range(3):
        try:
            response = requests.get(file_href, headers=headers, verify=False, timeout=20)
            break
        except Exception:
            time.sleep(3)
    if response is None:
        # Original code fell through with `response` unbound after three
        # failed attempts and crashed with NameError; fail soft instead.
        log.info(f'下载失败----{file_href}')
        return content
    with fitz.open(stream=response.content, filetype='pdf') as doc:
        page_size = doc.page_count
        log.info(f'当前页码----{page_size}')
        for page in doc.pages():
            content += page.get_text()
    return content
def secrchATT(type_id, xydm, year):
    """Fetch attachment rows for one company/year under the shared DB lock.

    Args:
        type_id: attachment type ('1' = annual report).
        xydm: social credit code (matched against item_id).
        year: report year.

    Returns:
        Tuple of matching rows from clb_sys_attachment (empty when none).
    """
    sel_sql = '''select * from clb_sys_attachment where item_id=%s and type_id=%s and year=%s'''
    # `with lock:` releases the lock even if execute() raises; the bare
    # acquire()/release() pair deadlocked the other threads on error.
    # A stale duplicate execute() passing only two parameters to this
    # three-placeholder query was removed.
    with lock:
        cursor_.execute(sel_sql, (xydm, type_id, year))
        selects = cursor_.fetchall()
    return selects
def selectShortName(xydm):
    """Look up one enterprise row by social credit code.

    Args:
        xydm: social credit code.

    Returns:
        The first matching row from sys_base_enterprise, or None.
    """
    sel_sql = "select * from sys_base_enterprise where social_credit_code = %s"
    # `with lock:` guarantees release on exception, unlike the original
    # bare acquire()/release() pair.
    with lock:
        cursor_.execute(sel_sql, xydm)
        selects = cursor_.fetchone()
    return selects
def main():
    """Pop one "xydm|att_id|year" task, rebuild the report text and send it to Kafka.

    NOTE(review): this body reads like a merged diff — several OLD/NEW
    statement pairs coexist and cannot both be live code. Each suspect
    pair is flagged inline; reconcile against version control before
    relying on this function.
    """
    redis_conn = redis.Redis(connection_pool=pool)
    info_ = redis_conn.lpop("NianBao:id")
    # info_ = redis_conn.lpop("NoIPO:info")
    # NOTE(review): hardcoded test value overwrites the popped task above —
    # presumably a debugging leftover; remove before production.
    info_ = '91310000132206289R|1725799077425945040|2022'
    if info_:
        pass
    else:
        log.info("++++已没有数据++++")
        return
    # NOTE(review): with the hardcoded str above, .decode() raises
    # AttributeError; the two lines that follow look like its replacement.
    info = info_.decode()
    # info = info_.decode()
    info = info_
    xydm = info.split('|')[0]
    att_id = info.split('|')[1]
    year = info.split('|')[2]
    if not xydm or not year:
        # NOTE(review): two different requeue keys back to back — one of
        # these pushes is likely stale; confirm which key the consumer reads.
        redis_conn.lpush('NianBao:info', info)
        redis_conn.lpush('info', info)
    else:
        # NOTE(review): duplicate lookup — the first call passes two args to
        # a three-placeholder query and would raise; presumably stale.
        selects = secrchATT(1, xydm)
        selects = secrchATT('1', xydm, year)
        if len(selects) > 1:
            # Ambiguous match: requeue the task instead of guessing.
            redis_conn.lpush('NianBao:info', info)
        elif len(selects) == 1:
            # NOTE(review): selects is a tuple of rows, so selects[1] raises
            # IndexError when there is exactly one row; the `select =
            # selects[0]` block below looks like the intended replacement.
            file_name = selects[1]
            origin = selects[18]
            create_time = selects[13]
            publishDate = selects[21]
            file_href = 'http://zzsn.luyuen.com' + str(selects[5])
            # results = selectShortName(xydm)
            # if results:
            # pass
            # else:
            # redis_conn.lpush('NoIPO:info', info)
            # return
            select = selects[0]
            # name = results[3]
            name = select[1]
            if name:
                # file_name = results[3] + ':' + year + '年年度报告'
                # Strip the file extension to use the bare name as title.
                file_name = name.split('.')[0]
            else:
                # No usable name on the attachment row — park for review.
                redis_conn.lpush('Noname:info', info)
                return
            log.info(f'-----------{file_name}-----------')
            origin = select[18]
            create_time = select[13]
            publishDate = select[21]
            if publishDate == '2023-12-31':
                # NOTE(review): hard-coded date rewrite — intent unclear,
                # confirm why 2023-12-31 must become 2023-08-31.
                publishDate = '2023-08-31'
            file_href = 'http://zzsn.luyuen.com' + str(select[5])
            content = getContent(file_href)
            lang = baseCore.detect_language(content)
            if lang == 'cn':
                # Presumably the index expects 'zh' rather than 'cn' —
                # TODO confirm against the ES mapping.
                lang = 'zh'
            dic_info = {
                'attachmentIds': att_id,
                'author': '',
                'content': content,
                'contentWithTag': '',
                'createDate': str(create_time),
                'deleteFlag': '0',
                'id': '',
                'keyWords': '',
                'lang': lang,
                'origin': origin,
                'publishDate': publishDate,
                'sid': '1684032033495392257',
                'sourceAddress': '',  # source URL (left empty here)
                'summary': '',
                'title': file_name,
                'type': 1,
                'socialCreditCode': xydm,
                'year': year
            }
            sendKafka(dic_info)
            time.sleep(1)
def run_threads(num_threads):
threads = []
......@@ -105,8 +176,9 @@ def run_threads(num_threads):
thread.join()
if __name__ == "__main__":
start = time.time()
num_threads = 5
run_threads(num_threads)
while True:
start = time.time()
num_threads = 5
run_threads(num_threads)
log.info(f'5线程 总耗时{time.time() - start}秒')
\ No newline at end of file
log.info(f'5线程 总耗时{time.time() - start}秒')
\ No newline at end of file
......@@ -162,11 +162,11 @@ class EsMethod(object):
'query': {
'bool': {
'should':[
# {'term':{'origin' : '雪球网'}},
{'term':{'type' : 1}},
# {'term':{'origin': '证监会'}},
{'term':{'type': 1}},
],
'must': [
{'match': {'title': '.pdf'}}
{'match': {'title': '.PDF'}}
]
}
},
......@@ -221,11 +221,11 @@ if __name__ == '__main__':
for mms in msglist:
id=mms['_source']['id']
title=mms['_source']['title']
utitle=title.replace('.pdf','')
utitle=title.replace('.PDF','')
print(f'id:{id}---title:{title}--utitle:{utitle}')
esMethod.updateaunn(esMethod.index_name,str(id),utitle)
# esMethod.delete(esMethod.index_name,str(id))
print('跟新成功!!')
# print('更新成功!!')
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论