Commit 1bb5b282 by 薛凌堃

10/27

Parent fb61875d
This source diff could not be displayed because it is too large.
@@ -58,7 +58,7 @@ if __name__ == '__main__':
         'Accept-Encoding': 'gzip, deflate, br',
         'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
     }
-    query = "select * from clb_sys_attachment where id= 383007"
+    query = "SELECT * FROM clb_sys_attachment WHERE type_id=1 AND source='证监会'"
     cursor_.execute(query)
     results = cursor_.fetchall()
     for result in results:
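The rewritten query switches from one hard-coded row id to filtering on type_id and source. If the driver is DB-API compatible (e.g. PyMySQL, which the cursor_/fetchall usage suggests but the diff does not confirm), the literals could instead be bound as parameters; a sketch:

    # Parameterized variant of the query above (sketch; %s placeholder style
    # assumes a PyMySQL-like DB-API driver).
    query = "SELECT * FROM clb_sys_attachment WHERE type_id=%s AND source=%s"
    cursor_.execute(query, (1, '证监会'))  # values are escaped by the driver
    results = cursor_.fetchall()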
@@ -74,9 +74,10 @@ if __name__ == '__main__':
             pass
         else:
             com_name = selects[1]
-        full_path = 'http://114.115.215.96/' + result[6]
+        full_path = 'http://zzsn.luyuen.com/' + result[19]
         year = result[9]
         create_time = result[13]
+        publish = str(result[21])
         content = ''
         for i in range(0, 3):
             try:
@@ -102,9 +103,9 @@ if __name__ == '__main__':
             'id': '',
             'keyWords': '',
             'lang': detect_language,
-            'origin': com_name + '企业官网',
+            'origin': '证监会',
             # 'origin': '雪球网',
-            'publishDate': str(year) + '-12-31',
+            'publishDate': publish,
             'sid': '1684032033495392257',
             'sourceAddress': '',  # link to the original article
             'summary': '',
......
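With this change the publish date comes straight from column 21 of clb_sys_attachment instead of being synthesized as year-12-31, so a NULL in that column now surfaces as the string 'None'. A minimal guard (hypothetical helper, not part of this commit):

    def safe_publish_date(row, year):
        # Fall back to the old year-based date when the publish column is empty.
        raw = row[21]
        return str(raw) if raw else f'{year}-12-31'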
import json
@@ -125,8 +125,8 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
             year = re.findall('\d{4}\s*年', name_pdf)[0].replace('年', '')
         except Exception as e:
             # pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[2].strip('\'')[:4]
-            year = int(pub_time) - 1
-            year = str(year)
+            year = int(pub_time[:4]) - 1
+            # year = str(year)
             # page_size = 0
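The old fallback crashed whenever pub_time was a full date string (int() rejects it); slicing the first four characters makes the "year before publication" guess work. A standalone illustration with a sample value:

    pub_time = '2023-06-30'       # sample value; the real one is scraped from the page
    # int(pub_time) - 1           would raise ValueError: invalid literal for int()
    year = int(pub_time[:4]) - 1  # 2022: the report year preceding publication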
@@ -322,7 +322,7 @@ if __name__ == '__main__':
         start_time = time.time()
         # fetch the next enterprise to process (获取企业信息)
         social_code = baseCore.redicPullData('AnnualEnterprise:gnqy_socialCode')
-        # social_code = '91100000100003962T'
+        # social_code = '91210800765420138L'
         if not social_code:
             time.sleep(20)
             continue
......
@@ -180,6 +180,7 @@ if __name__=='__main__':
             #retData, com_name, year, pdf_name, num, pub_time
             att_id = baseCore.tableUpdate(retData_f, cname, file_year, file_name, num, file_year + '-12-31', origin)
             if att_id:
+                detect_language = baseCore.detect_language(content)
                 dic_news = {
                     'attachmentIds': att_id,
                     'author': '',
@@ -189,7 +190,7 @@ if __name__=='__main__':
                     'deleteFlag': '0',
                     'id': '',
                     'keyWords': '',
-                    'lang': 'zh',
+                    'lang': detect_language,
                     'origin': origin,
                     'publishDate': file_year + '-12-31',
                     'sid': '1684032033495392257',
......
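baseCore.detect_language is used here but not shown in the diff; a minimal stand-in built on the langdetect package (an assumption about the approach, not BaseCore's actual code):

    from langdetect import detect
    from langdetect.lang_detect_exception import LangDetectException

    def detect_language(text: str) -> str:
        # Best-effort ISO 639-1 guess; default to 'zh' for empty or undetectable text.
        if not text or not text.strip():
            return 'zh'
        try:
            return detect(text)
        except LangDetectException:
            return 'zh'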
@@ -12,11 +12,14 @@
     pageSize: 10
 }
 """
+import json
 import time
 from urllib import parse
+import redis
 import requests
 from bs4 import BeautifulSoup
+from kafka import KafkaProducer
 from retry import retry
 from base.BaseCore import BaseCore
@@ -24,6 +27,41 @@ baseCore = BaseCore()
 log = baseCore.getLogger()
 cnx = baseCore.cnx
 cursor = baseCore.cursor
+r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=0)
+
+taskType = '企业负面新闻'
+
+
+def sendKafka(dic_news):
+    start_time = time.time()
+    try:  # 114.116.116.241
+        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], max_request_size=1024 * 1024 * 20)
+        kafka_result = producer.send("crawlerInfo",
+                                     json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
+        print(kafka_result.get(timeout=10))
+        dic_result = {
+            'success': 'true',
+            'message': '操作成功',
+            'code': '200',
+        }
+        log.info(dic_result)
+        # delivery succeeded; write it to the task log
+        state = 1
+        takeTime = baseCore.getTimeCost(start_time, time.time())
+        return True
+    except Exception as e:
+        dic_result = {
+            'success': 'false',
+            'message': '操作失败',
+            'code': '204',
+            'e': e
+        }
+        state = 0
+        takeTime = baseCore.getTimeCost(start_time, time.time())
+        baseCore.recordLog(social_code, taskType, state, takeTime, dic_news['title'], 'Kafka操作失败')
+        log.info(dic_result)
+        return False
 @retry(tries=3,delay=1)
 def getRequest(url,headers):
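sendKafka builds a new KafkaProducer per call and never closes it, which costs a broker handshake each time and can leak sockets. A leaner variant reusing one module-level producer (a sketch with the same broker address; not part of the commit):

    import json
    from kafka import KafkaProducer

    producer = KafkaProducer(
        bootstrap_servers=['114.115.159.144:9092'],
        max_request_size=20 * 1024 * 1024,
        value_serializer=lambda v: json.dumps(v, ensure_ascii=False).encode('utf8'),
    )

    def send_kafka(dic_news, topic='crawlerInfo', timeout=10):
        # True once the broker acknowledges the record, False on any failure.
        try:
            producer.send(topic, dic_news).get(timeout=timeout)
            return True
        except Exception:
            return False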
@@ -51,6 +89,11 @@ def dishonesty(headers,com_name,social_code):
     if json_data['status'] == 1:
         pass
     total_size = json_data['data']['totalSize']
+    if total_size > 0:
+        pass
+    else:
+        log.info(f'该企业{com_name}无严重失信信息')
+        return url, list_dishonesty  # keep the (url, list) shape the caller unpacks
     for page in range(1, total_size+1):
         param_page = {
             'tableName': 'credit_zgf_fr_sxbzxr',
@@ -102,7 +145,9 @@ def dishonesty(headers,com_name,social_code):
                 '数据来源': dataSource
             }
             list_dishonesty.append(dic_dishonesty)
-    return list_dishonesty
+    # r.sadd('dishonesty::' + social_code, )
+    return url, list_dishonesty

 # administrative penalties (行政处罚)
 def punish(headers,com_name,social_code):
     list_punish = []
@@ -179,7 +224,7 @@ def punish(headers,com_name,social_code):
                 '数据来源单位统一社会信用代码': cf_sjlydm
             }
             list_punish.append(dic_punish)
-    return list_punish
+    return url, list_punish

 # operating anomalies (经营异常)
 def abnormal(headers,com_name,social_code):
@@ -204,8 +249,9 @@ def abnormal(headers,com_name,social_code):
     if total_size > 0:
         pass
     else:
-        log.info()
-    for page in total_size:
+        log.info(f'该企业{com_name}无经营异常信息')
+        return url, list_abhormal  # match the (url, list) shape returned below
+    for page in range(1, total_size+1):
         param_page = {
             'tableName': 'credit_xyzx_fr_xzcf_new',
             'searchState': '1',
'数据来源':dataSource '数据来源':dataSource
} }
list_abhormal.append(dic_abnormal) list_abhormal.append(dic_abnormal)
return list_abhormal return url,list_abhormal
def dic_data(com_name,listData,type,detailurl):
dic_news = {
'title':com_name + type,
'structuredData':listData,
'ynStructure':1,
'content': '',
'contentHtml': '',
'source': '信用中国',
'publishtime': '',
'detailurl': detailurl,
}
return dic_news
 if __name__=='__main__':
@@ -259,11 +317,20 @@ if __name__=='__main__':
     }
     com_name = '石家庄交投集团工程服务有限责任公司'
     social_code = '91130100MA7EK14C8L'
-    # list_dishonesty = dishonesty(headers,com_name,social_code)
-    # print(list_dishonesty)
-    list_punish = punish(headers,com_name,social_code)
-    print(list_punish)
-    # abnormal(headers,com_name,social_code)
+    url_dishonesty, list_dishonesty = dishonesty(headers, com_name, social_code)
+    dic_dishonesty = dic_data(com_name, list_dishonesty, '严重违法失信信息', url_dishonesty)
+    sendKafka(dic_dishonesty)
+
+    url_punish, list_punish = punish(headers, com_name, social_code)
+    dic_punish = dic_data(com_name, list_punish, '行政处罚信息', url_punish)
+    # print(dic_punish)
+    sendKafka(dic_punish)
+
+    url_abnormal, list_abnormal = abnormal(headers, com_name, social_code)
+    dic_abnormal = dic_data(com_name, list_abnormal, '经营异常信息', url_abnormal)
+    # print(dic_abnormal)
+    sendKafka(dic_abnormal)
     # report download link (报告链接)
     # url_report = f'https://public.creditchina.gov.cn/credit-check/pdf/clickDownload?companyName={com_name}&entityType=1&uuid=&tyshxydm={social_code}'
     # report_json = getRequest(url_report, headers)
@@ -273,3 +340,4 @@ if __name__=='__main__':
......
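The three new __main__ stanzas share one fetch → wrap → send shape; purely as a sketch (not in the commit), a table-driven loop over the functions defined in this file would collapse them:

    for fetch, label in [(dishonesty, '严重违法失信信息'),
                         (punish, '行政处罚信息'),
                         (abnormal, '经营异常信息')]:
        detail_url, records = fetch(headers, com_name, social_code)
        sendKafka(dic_data(com_name, records, label, detail_url))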