Commit b931eea2  Author: 薛凌堃

12/13

Parent 1f595f59
@@ -74,7 +74,7 @@ def NewsEnterprise():
     print('=======')
     # Push the data into redis
     for item in gn_social_list:
-        r.rpush('NewsEnterprise:gnqy_socialCode', item)
+        r.rpush('NewsResend:newsInfo', item)
         # r.rpush('NewsEnterprise:gnqybc_socialCode', item)
     # for item in gw_social_list:
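This hunk redirects the domestic-company payloads from NewsEnterprise:gnqy_socialCode to NewsResend:newsInfo. A minimal consumer sketch for the new list, assuming the Redis connection settings that appear later in this commit; drain_news_resend and its handler argument are hypothetical stand-ins for the downstream processing:

import redis

def drain_news_resend(handler):
    # Assumption: same Redis instance/db as the ConnectionPool used below.
    r = redis.Redis(host='114.115.236.206', port=6379, password='clbzzsn', db=6)
    while True:
        item = r.lpop('NewsResend:newsInfo')
        if item is None:  # list drained
            break
        handler(item.decode())  # hand the payload to downstream processing

# e.g. drain_news_resend(print)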
@@ -126,30 +126,69 @@ def NoticeEnterprise_task():
 def NoticeDF():
     cnx, cursor = connectSql()
     # Fetch the US-listed enterprises
-    # # mg_query = "SELECT a.SocialCode From EnterpriseInfo a ,EnterpriseType b WHERE a.SocialCode = b.SocialCode AND b.type=2 AND a.Place=0 AND SecuritiesCode is not null AND SecuritiesCode not LIKE '%.%'"
-    # mg_query = "SELECT a.SocialCode From EnterpriseInfo a ,EnterpriseType b WHERE a.SocialCode = b.SocialCode AND b.type=6 AND a.Place=0 AND SecuritiesCode is not null AND SecuritiesCode not LIKE '%.%'"
-    # cursor.execute(mg_query)
-    # cnx.commit()
-    # mg_result = cursor.fetchall()
-    # mg_social_list = [item[0] for item in mg_result]
-    # print('=======')
-    # for item in mg_social_list:
-    #     if r.lrem('NoticeEnterprise:mgqy_socialCode_add', 0, item) == 0:
-    #         r.lpush('NoticeEnterprise:mgqy_socialCode_add', item)
-    #     else:
-    #         continue
-    # # r.rpush('NoticeEnterprise:mgqy_socialCode_add', item)
+    om_mg_query = "SELECT a.SocialCode From EnterpriseInfo a ,EnterpriseType b WHERE a.SocialCode = b.SocialCode AND b.type=6 AND a.Place=0 AND SecuritiesCode is not null AND SecuritiesCode not LIKE '%.%'"
+    cursor.execute(om_mg_query)
+    cnx.commit()
+    om_mg_result = cursor.fetchall()
+    om_mg_social_list = [item[0] for item in om_mg_result]
+    print('欧盟美股企业=======')
+    for item in om_mg_social_list:
+        if r.lrem('NoticeEnterprise:mgqy_socialCode_add', 0, item) == 0:
+            r.lpush('NoticeEnterprise:mgqy_socialCode_add', item)
+        else:
+            continue
+
+    fq_mg_query = "SELECT a.SocialCode From EnterpriseInfo a ,EnterpriseType b WHERE a.SocialCode = b.SocialCode AND b.type=2 AND a.Place=0 AND SecuritiesCode is not null AND SecuritiesCode not LIKE '%.%'"
+    cursor.execute(fq_mg_query)
+    cnx.commit()
+    fq_mg_result = cursor.fetchall()
+    fq_mg_social_list = [item[0] for item in fq_mg_result]
+    print('500强美股企业=======')
+    for item in fq_mg_social_list:
+        if r.lrem('NoticeEnterprise:mgqy_socialCode_add', 0, item) == 0:
+            r.lpush('NoticeEnterprise:mgqy_socialCode_add', item)
+        else:
+            continue
+
+    fbs_mg_query = "SELECT a.SocialCode From EnterpriseInfo a ,EnterpriseType b WHERE a.SocialCode = b.SocialCode AND b.type=3 AND a.Place=0 AND SecuritiesCode is not null AND SecuritiesCode not LIKE '%.%'"
+    cursor.execute(fbs_mg_query)
+    cnx.commit()
+    fbs_mg_result = cursor.fetchall()
+    fbs_mg_social_list = [item[0] for item in fbs_mg_result]
+    print('福布斯美股企业=======')
+    for item in fbs_mg_social_list:
+        if r.lrem('NoticeEnterprise:mgqy_socialCode_add', 0, item) == 0:
+            r.lpush('NoticeEnterprise:mgqy_socialCode_add', item)
+        else:
+            continue
+
     # Fetch the HK-listed enterprises
-    gg_query = "SELECT a.SocialCode From EnterpriseInfo a ,EnterpriseType b WHERE a.SocialCode = b.SocialCode AND b.type=6 And SecuritiesCode like '%.HK'"
-    cursor.execute(gg_query)
+    om_gg_query = "SELECT a.SocialCode From EnterpriseInfo a ,EnterpriseType b WHERE a.SocialCode = b.SocialCode AND b.type=6 And SecuritiesCode like '%.HK'"
+    cursor.execute(om_gg_query)
     cnx.commit()
-    gg_result = cursor.fetchall()
-    gg_social_list = [item[0] for item in gg_result]
-    print('=======')
-    for item in gg_social_list:
+    om_gg_result = cursor.fetchall()
+    om_gg_social_list = [item[0] for item in om_gg_result]
+    print('欧盟港股企业=======')
+    for item in om_gg_social_list:
         r.rpush('NoticeEnterprise:ggqy_socialCode_add', item)
+
+    fq_gg_query = "SELECT a.SocialCode From EnterpriseInfo a ,EnterpriseType b WHERE a.SocialCode = b.SocialCode AND b.type=2 And SecuritiesCode like '%.HK'"
+    cursor.execute(fq_gg_query)
+    cnx.commit()
+    fq_gg_result = cursor.fetchall()
+    fq_gg_social_list = [item[0] for item in fq_gg_result]
+    print('500强港股企业=======')
+    for item in fq_gg_social_list:
+        r.rpush('NoticeEnterprise:ggqy_socialCode_add', item)
+
+    fbs_gg_query = "SELECT a.SocialCode From EnterpriseInfo a ,EnterpriseType b WHERE a.SocialCode = b.SocialCode AND b.type=3 And SecuritiesCode like '%.HK'"
+    cursor.execute(fbs_gg_query)
+    cnx.commit()
+    fbs_gg_result = cursor.fetchall()
+    fbs_gg_social_list = [item[0] for item in fbs_gg_result]
+    print('福布斯港股企业=======')
+    for item in fbs_gg_social_list:
+        r.rpush('NoticeEnterprise:ggqy_socialCode_add', item)
     closeSql(cnx, cursor)
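The remove-then-push pair now repeats once per US-stock list above, and it has a subtlety: r.lrem(key, 0, item) removes all occurrences and returns the count removed, so a code that is already queued gets deleted as a side effect of the membership check and is never re-added, which is likely unintended. A helper sketch that always re-queues at the head (the function name push_unique is an assumption):

def push_unique(r, key, items):
    """Queue each item at the head of `key` exactly once: any copy already
    in the list is removed first, so reruns cannot create duplicates."""
    for item in items:
        r.lrem(key, 0, item)  # drop all existing occurrences (no-op if absent)
        r.lpush(key, item)    # re-queue at the head

# e.g. push_unique(r, 'NoticeEnterprise:mgqy_socialCode_add', om_mg_social_list)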
@@ -612,7 +651,7 @@ if __name__ == "__main__":
     # BaseInfoEnterprise()
     # BaseInfoEnterpriseAbroad()
     # NewsEnterprise_task()
-    # NewsEnterprise()
+    NewsEnterprise()
     # CorPerson()
     # china100()
     # global100()
@@ -630,6 +669,6 @@ if __name__ == "__main__":
     # NoticeEnterprise_task()
     # AnnualEnterprise_task()
     # FinanceFromEast()
-    ipo_code()
+    # ipo_code()
     log.info(f'====={basecore.getNowTime(1)}=====添加数据成功======耗时:{basecore.getTimeCost(start,time.time())}===')
@@ -23,10 +23,10 @@ es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn998
 index_name = 'researchreportdata'
 pool = redis.ConnectionPool(host="114.115.236.206", port=6379, password='clbzzsn', db=6)

-def searchATT(title):
-    sql = "select id from clb_sys_attachment where type_id=4 and name=%s "
+def searchATT():
+    sql = "select id from clb_sys_attachment where type_id=4 and create_time>'2023-12-08' "
     # lock.acquire()
-    cursor_.execute(sql, title+'.pdf')
+    cursor_.execute(sql)
     selects = cursor_.fetchone()
     # lock.release()
     return selects
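searchATT() now ignores its former title argument and bakes the cutoff date into the SQL string, yet still calls fetchone() for what has become a range query, so at most one id comes back. A parameterized sketch against the same module-level cursor_ (the function name and default value are assumptions):

def searchATT_since(cutoff='2023-12-08'):
    # Bind the cutoff as a query parameter and return every matching id.
    sql = "select id from clb_sys_attachment where type_id=4 and create_time > %s"
    cursor_.execute(sql, (cutoff,))
    return cursor_.fetchall()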
......
@@ -45,7 +45,7 @@ def sendKafka(dic_news,xydm):
     start_time = time.time()
     try:  # 114.116.116.241
         producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], max_request_size=1024 * 1024 * 20)
-        kafka_result = producer.send("researchReportTopic",
+        kafka_result = producer.send("researchReportYearTopic",
                                      json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
         print(kafka_result.get(timeout=10))
......
@@ -31,7 +31,7 @@ pathType = 'QYYearReport/'
 def sendKafka(dic_news):
     try:  # 114.116.116.241
         producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], max_request_size=1024*1024*20)
-        kafka_result = producer.send("researchReportTopic",
+        kafka_result = producer.send("researchReportYearTopic",
                                      json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
         print(kafka_result.get(timeout=10))
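The same topic rename (researchReportTopic to researchReportYearTopic) is made by hand in two files in this commit. A sketch of hoisting the broker list and topic into one shared module so the next switch is a single edit; the module name kafka_config is an assumption:

# kafka_config.py (hypothetical shared module)
BOOTSTRAP_SERVERS = ['114.115.159.144:9092']
YEAR_REPORT_TOPIC = 'researchReportYearTopic'
MAX_REQUEST_SIZE = 1024 * 1024 * 20

# In either script:
# from kafka_config import BOOTSTRAP_SERVERS, YEAR_REPORT_TOPIC, MAX_REQUEST_SIZE
# producer = KafkaProducer(bootstrap_servers=BOOTSTRAP_SERVERS,
#                          max_request_size=MAX_REQUEST_SIZE)
# producer.send(YEAR_REPORT_TOPIC, json.dumps(dic_news, ensure_ascii=False).encode('utf8'))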
@@ -93,74 +93,78 @@ def main():
     redis_conn = redis.Redis(connection_pool=pool)
     # info_ = redis_conn.lpop("NoIPO:info")
-    info_ = '91310000132206289R|1725799077425945040|2022'
-    if info_:
-        pass
-    else:
-        log.info("++++已没有数据++++")
-
-        return
-    # info = info_.decode()
-    info = info_
-    xydm = info.split('|')[0]
-    att_id = info.split('|')[1]
-    year = info.split('|')[2]
-    if not xydm or not year:
-        redis_conn.lpush('info', info)
-    else:
-        selects = secrchATT('1', xydm, year)
-        if len(selects) > 1:
-            redis_conn.lpush('NianBao:info', info)
-        elif len(selects) == 1:
-            # results = selectShortName(xydm)
-            # if results:
-            #     pass
-            # else:
-            #     redis_conn.lpush('NoIPO:info', info)
-            #     return
-            select = selects[0]
-            # name = results[3]
-            name = select[1]
-            if name:
-                # file_name = results[3] + ':' + year + '年年度报告'
-                file_name = name.split('.')[0]
-            else:
-                redis_conn.lpush('Noname:info', info)
-                return
-            log.info(f'-----------{file_name}-----------')
-            origin = select[18]
-            create_time = select[13]
-            publishDate = select[21]
-            if publishDate == '2023-12-31':
-                publishDate = '2023-08-31'
-            file_href = 'http://zzsn.luyuen.com' + str(select[5])
-            content = getContent(file_href)
-
-            lang = baseCore.detect_language(content)
-            if lang == 'cn':
-                lang = 'zh'
-            dic_info = {
-                'attachmentIds': att_id,
-                'author': '',
-                'content': content,
-                'contentWithTag': '',
-                'createDate': str(create_time),
-                'deleteFlag': '0',
-                'id': '',
-                'keyWords': '',
-                'lang': lang,
-                'origin': origin,
-                'publishDate': publishDate,
-                'sid': '1684032033495392257',
-                'sourceAddress': '',  # link to the original text
-                'summary': '',
-                'title': file_name,
-                'type': 1,
-                'socialCreditCode': xydm,
-                'year': year
-            }
-            sendKafka(dic_info)
-            time.sleep(1)
+    info_list = ['91130100236018805C|18703781588|2018', '915203002147892034|18703781589|2013',
+                 '913200007455797746|18703781592|2018', '91440500723817938W|18703781594|2019',
+                 '91340000704920454F|18703781596|2021']
+    for info_ in info_list:
+        if info_:
+            pass
+        else:
+            log.info("++++已没有数据++++")
+
+            return
+        # info = info_.decode()
+        info = info_
+        xydm = info.split('|')[0]
+        att_id = info.split('|')[1]
+        year = info.split('|')[2]
+        if not xydm or not year:
+            redis_conn.lpush('info', info)
+        else:
+            selects = secrchATT('1', xydm, year)
+            if len(selects) > 1:
+                redis_conn.lpush('NianBao:info', info)
+            elif len(selects) == 1:
+                # results = selectShortName(xydm)
+                # if results:
+                #     pass
+                # else:
+                #     redis_conn.lpush('NoIPO:info', info)
+                #     return
+                select = selects[0]
+                # name = results[3]
+                name = select[1]
+                if name:
+                    # file_name = results[3] + ':' + year + '年年度报告'
+                    file_name = name.split('.')[0]
+                else:
+                    redis_conn.lpush('Noname:info', info)
+                    return
+                log.info(f'-----------{file_name}-----------')
+                origin = select[18]
+                create_time = select[13]
+                publishDate = select[21]
+                if publishDate == '2023-12-31':
+                    publishDate = '2023-08-31'
+                file_href = 'http://zzsn.luyuen.com' + str(select[5])
+                content = getContent(file_href)
+
+                lang = baseCore.detect_language(content)
+                if lang == 'cn':
+                    lang = 'zh'
+                dic_info = {
+                    'attachmentIds': att_id,
+                    'author': '',
+                    'content': content,
+                    'contentWithTag': '',
+                    'createDate': str(create_time),
+                    'deleteFlag': '0',
+                    'id': '',
+                    'keyWords': '',
+                    'lang': lang,
+                    'origin': origin,
+                    'publishDate': publishDate,
+                    'sid': '1684032033495392257',
+                    'sourceAddress': '',  # link to the original text
+                    'summary': '',
+                    'title': file_name,
+                    'type': 1,
+                    'socialCreditCode': xydm,
+                    'year': year
+                }
+                sendKafka(dic_info)
+                time.sleep(1)

 def run_threads(num_threads):
     threads = []
@@ -178,7 +182,7 @@ def run_threads(num_threads):
 if __name__ == "__main__":
     while True:
         start = time.time()
-        num_threads = 5
+        num_threads = 1
         run_threads(num_threads)
         log.info(f'5线程 总耗时{time.time() - start}秒')
\ No newline at end of file
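Each work item above is a 'socialCreditCode|attachmentId|year' triple that the loop splits three times and then re-checks field by field. A small parsing sketch that validates once; the helper name is an assumption:

def parse_info(info):
    """Split one work item; return (xydm, att_id, year) or None if malformed."""
    parts = info.split('|')
    if len(parts) != 3 or not parts[0] or not parts[2]:
        return None
    return parts[0], parts[1], parts[2]

# e.g. parse_info('91130100236018805C|18703781588|2018')
# -> ('91130100236018805C', '18703781588', '2018')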
"""
将需要新增的企业入redis
"""
import json
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import urllib3
from base.BaseCore import BaseCore
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
baseCore = BaseCore()
chromedriver = "./chromedriver"
browser = webdriver.Chrome(chromedriver)
taskType = '上市信息/东方财富网'
log = baseCore.getLogger()
error_list = []
list_all_info = []
# 需要提供股票代码、企业信用代码
while True:
    com_code1 = baseCore.redicPullData('Ipo_newsAdd:comCode')
    if not com_code1:
        # Queue drained; stop crawling and fall through to the bulk upload below
        break
    start = time.time()
    # Codes starting with 0/2/3 trade in Shenzhen, 6/9 in Shanghai, 4/8 in Beijing
    jys_code = ''
    category = ''
    com_code = ''
    if com_code1[0] == '2' or com_code1[0] == '0' or com_code1[0] == '3':
        com_code = 'sz' + com_code1
    elif com_code1[0] == '9' or com_code1[0] == '6':
        com_code = 'sh' + com_code1
    elif com_code1[0] == '8' or com_code1[0] == '4':
        com_code = 'bj' + com_code1
    elif com_code1[0] == 'A':
        com_code = ''
    if not com_code:
        # 'A'-prefixed or unrecognized codes cannot be mapped to an exchange page; skip them
        log.info(f'======skip {com_code1}: no exchange prefix======')
        continue
    log.info(f'======开始采集{com_code}======')
    url = f'https://quote.eastmoney.com/{com_code}.html'
    url_1 = f'https://emweb.eastmoney.com/PC_HSF10/CompanySurvey/PageAjax?code={com_code}'
    url_2 = f'https://emweb.eastmoney.com/PC_HSF10/BusinessAnalysis/PageAjax?code={com_code}'
    # browser.get(url)
    # time.sleep(8)
    # page_source = browser.page_source
    # soup_t = BeautifulSoup(page_source, 'html.parser')
    # try:
    #     result = soup_t.find('div', class_='quote_quotenums').text
    #     # print(f'result:{result}')
    #     # if result == '未上市' or result == '已退市':
    #     if result == '未上市':
    #         continue
    #     if result == '已退市':
    #         tag = 0
    #     else:
    #         tag = 1
    # except Exception as e:
    #     error_list.append(com_code)
    #     log.info(f'={com_code}===解析上市状态失败=====')
    #     state = 0
    #     takeTime = baseCore.getTimeCost(start, time.time())
    #     baseCore.recordLog('', taskType, state, takeTime, '', f'{com_code}解析上市状态失败--e:{e}')
    #     print('error')
    requests.adapters.DEFAULT_RETRIES = 5
    json_1 = requests.get(url_1, verify=False).json()
    json_2 = requests.get(url_2, verify=False).json()
    # SECURITY_TYPE
    try:
        jys = json_1['jbzl'][0]['TRADE_MARKET']
    except Exception as e:
        log.info(f'====={com_code}=====解析交易所失败======')
        state = 0
        takeTime = baseCore.getTimeCost(start, time.time())
        baseCore.recordLog('', taskType, state, takeTime, '', f'{com_code}解析交易所失败--e:{e}')
        continue
    try:
        if "上海" in jys:
            jys_code = '2'
        if "深圳" in jys:
            jys_code = '3'
    except:
        jys = json_1['jbzl'][0]['SECURITY_TYPE']
        if "北京" in jys:
            jys_code = '1'
    short_name = json_1['jbzl'][0]['STR_NAMEA']
    zhengquan_type = json_1['jbzl'][0]['SECURITY_TYPE']
    # print(zhengquan_type)
    if 'A' in zhengquan_type:
        category = '1'
    if 'B' in zhengquan_type:
        category = '2'
    if '新三板' in zhengquan_type:
        category = '3'
    if 'H' in zhengquan_type:
        category = '4'
    id_code = json_1['jbzl'][0]['REG_NUM']
    dongcai = json_1['jbzl'][0]['EM2016']
    zhengjian = json_1['jbzl'][0]['INDUSTRYCSRC1']
    try:
        shangshishijian = json_1['fxxg'][0]['LISTING_DATE'][:10]
    except:
        shangshishijian = ''
    zhuyingfanwei = json_2['zyfw'][0]['BUSINESS_SCOPE']
    dic_cwsj = {
        "exchange": jys_code,
        "category": category,  # stock type (1 = A-share; 2 = B-share; 3 = NEEQ; 4 = H-share)
        'listed': '1',
        "listingDate": shangshishijian,
        "securitiesCode": com_code[2:],
        "securitiesShortName": short_name,
        "securitiesType": zhengquan_type,
        "socialCreditCode": id_code,
        "businessScope": zhuyingfanwei,
        "eastIndustry": dongcai,
        "csrcIndustry": zhengjian
    }
    list_all_info.append(dic_cwsj)
    log.info(f'======{com_code}====采集成功=====')

# Save the collected data to the database through the sync interface, 100 records
# per request. (The flat listing above lost its indentation; running the upload
# after the crawl loop finishes is an assumption.)
for num in range(0, len(list_all_info), 100):
    json_updata = json.dumps(list_all_info[num:num + 100])
    # print(json_updata)
    try:
        response = requests.post('http://114.115.236.206:8088/sync/enterpriseIpo', data=json_updata, timeout=300,
                                 verify=False)
    except Exception as e:
        print(e)
        continue
    print("{}:到:{}".format(num, num + 100))
    print(response.text)
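The exchange-prefix chain near the top of this script can also be table-driven, so unknown leading characters (including 'A', which currently yields an empty com_code) map to None and the caller skips them explicitly. A sketch; the helper and table names are assumptions:

PREFIX_BY_FIRST_CHAR = {
    '0': 'sz', '2': 'sz', '3': 'sz',  # Shenzhen
    '6': 'sh', '9': 'sh',             # Shanghai
    '4': 'bj', '8': 'bj',             # Beijing
}

def to_quote_code(com_code1):
    # One lookup instead of a four-branch if-chain; None means "skip this code".
    prefix = PREFIX_BY_FIRST_CHAR.get(com_code1[0])
    return prefix + com_code1 if prefix else None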
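The closing upload posts list_all_info in slices of 100. The same chunking as a reusable helper, reading the response only when the request succeeded; the helper name is an assumption:

import json
import requests

def post_in_batches(rows, url, batch_size=100, timeout=300):
    for start in range(0, len(rows), batch_size):
        batch = rows[start:start + batch_size]
        try:
            resp = requests.post(url, data=json.dumps(batch),
                                 timeout=timeout, verify=False)
        except requests.RequestException as e:
            print(f'{start}:{start + len(batch)} failed: {e}')
            continue
        print(f'{start}:{start + len(batch)} -> {resp.status_code}: {resp.text}')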
......
@@ -31,26 +31,50 @@ class EsMethod(object):
     def queryatt(self,index_name,pnum):
         body = {
-            "size":0,
-            "aggs":{
-                "duplicate_titles":{
-                    "terms":{
-                        "field":"sourceAddress.keyword",
-                        "min_doc_count":2,
-                        "size":1000
-                    },
-                    "aggs":{
-                        "duplicate_docs":{
-                            "top_hits":{
-                                "_source":{
-                                    "includes":["id","title","subjectId","sourceAddress","createDate"]
-                                },
-                                "size":10
-                            }
-                        }
-                    }
-                }
-            }
-        }
+            "query": {
+                "bool": {
+                    "must": [
+                        {
+                            "nested" : {
+                                "query" : {
+                                    "bool" : {
+                                        "must" : [
+                                            {
+                                                "match_phrase" : {
+                                                    "labels.relationId" : {
+                                                        "query" : "1677"
+                                                    }
+                                                }
+                                            }
+                                        ]
+                                    }
+                                },
+                                "path" : "labels"
+                            }
+                        }
+                    ]
+                }
+            },
+            "size":0,
+            "aggs":{
+                "duplicate_titles":{
+                    "terms":{
+                        "field":"title.keyword",
+                        "min_doc_count":2,
+                        "size":1000
+                    },
+                    "aggs":{
+                        "duplicate_docs":{
+                            "top_hits":{
+                                "_source":{
+                                    "includes":["id","title","subjectId","sourceAddress","createDate","labels.relationId","attachmentIds"]
+                                },
+                                "size":10
+                            }
+                        }
+                    }
+                }
+            }
+        }
        # filter_path = ['hits.aggregations.duplicate_titles',
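The rewritten query scopes deduplication to documents whose nested labels.relationId matches 1677, and groups by title.keyword instead of sourceAddress.keyword. Note that a terms aggregation returns at most `size` buckets (1000 here), so duplicate groups beyond that are silently missed. If exhaustive coverage matters, a composite aggregation can page through every title via after_key; a body sketch (composite sources cannot express min_doc_count, so the >= 2 filter moves client-side):

composite_body = {
    "size": 0,
    "aggs": {
        "dup": {
            "composite": {
                "size": 1000,
                "sources": [{"title": {"terms": {"field": "title.keyword"}}}]
            }
        }
    }
}
# Re-issue the search with {"after": <after_key>} inside "composite" until the
# response no longer returns an after_key, keeping buckets with doc_count >= 2.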
@@ -66,6 +90,12 @@ class EsMethod(object):
         # log.info(result)
         return result

+    def delete(self, index_name, id):
+        result = self.es.delete(index=index_name
+                                , doc_type="_doc"
+                                , id=id)
+        log.info('删除结果 %s' % result)
+

 def main(page, p, esMethod):
     result = esMethod.queryatt(index_name=esMethod.index_name, pnum=p)
@@ -75,18 +105,10 @@ def main(page, p, esMethod):
         log.info('++++已没有数据+++++')
         return
     documents = result["aggregations"]["duplicate_titles"]["buckets"]
-    for bucket in documents:
-        info_list = bucket["duplicate_docs"]["hits"]["hits"]
-        for info in info_list:
-            att_id_list = info['_source']['attachmentIds']
-            if len(att_id_list) == 0:
-                unique_document_ids = info["_id"]
-                log.info(f'==={unique_document_ids}===')
-    # # Delete the duplicated documents
-    # for doc_id in unique_document_ids:
-    #     esMethod.delete(index="policy", id=doc_id)
+    unique_document_ids = [bucket["duplicate_docs"]["hits"]["hits"][-1]["_id"] for bucket in documents]
+    # Delete the duplicated documents
+    for doc_id in unique_document_ids:
+        esMethod.delete(index_name="policy", id=doc_id)
......
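The rewritten main() deletes hits[-1] of each bucket, that is, one essentially arbitrary member per duplicate group per pass, since top_hits is relevance-ordered unless given an explicit sort. A sketch that instead keeps the newest copy and deletes the rest, assuming createDate compares correctly as a string (e.g. 'YYYY-MM-DD HH:MM:SS'); the helper name is an assumption:

def dedupe_bucket(esMethod, bucket, index_name='policy'):
    hits = bucket["duplicate_docs"]["hits"]["hits"]
    # Keep the most recently created document, delete every other duplicate.
    keep = max(hits, key=lambda h: h["_source"].get("createDate", ""))
    for hit in hits:
        if hit["_id"] != keep["_id"]:
            esMethod.delete(index_name, hit["_id"])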