雪球网年报

13609e3d · 薛凌堃 · d30620e6 · 13609e3d
--- a/comData/annualReport/雪球网-年报.py
+++ b/comData/annualReport/雪球网-年报.py
 # -*- coding: utf-8 -*-
@@ -35,8 +35,10 @@ chromedriver = r'D:/cmd100/chromedriver.exe'
 browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
 log = baseCore.getLogger()
 requests.adapters.DEFAULT_RETRIES = 3
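+# Database "11" handles (presumably the MySQL host ending in .11 that the removed pymysql.connect call used; now supplied by baseCore -- confirm)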
-cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
+cnx = baseCore.cnx_
+cursor = baseCore.cursor_
+# Database "144" handles (presumably the host ending in .144 -- confirm against baseCore)
 cnx_ = baseCore.cnx
 cursor_ = baseCore.cursor
@@ -159,94 +161,92 @@ def spider_annual_report(dict_info,num):
        # name_pdf = f"{com_name}：{year}年年报.pdf".replace('*', '')
        # name_pdf = pdf_name_a + '.pdf'
-        with cnx.cursor() as cursor:
+        if '年' in year:
-            if '年' in year:
+            year = year.split('年')[0]
-                year = year.split('年')[0]
+        else:
-            else:
+            pass
+        sel_sql = '''select item_id,year from clb_sys_attachment where item_id = %s and year = %s and type_id="1" '''
+        cursor.execute(sel_sql, (social_code, int(year)))
+        selects = cursor.fetchone()
+        if selects:
+            log.info(f'com_name:{com_name}、{year}已存在')
+            continue
+        else:
+            #上传文件至obs服务器
+            retData = baseCore.uptoOBS(pdf_url,name_pdf,1,social_code,pathType,taskType,start_time,'XueLingKun')
+            if retData['state']:
                pass
-            sel_sql = '''select item_id,year from clb_sys_attachment where item_id = %s and year = %s and type_id="1" '''
-            cursor.execute(sel_sql, (social_code, int(year)))
-            selects = cursor.fetchone()
-            if selects:
-                log.info(f'com_name:{com_name}、{year}已存在')
-                continue
            else:
-                #上传文件至obs服务器
+                log.info(f'====pdf解析失败====')
-                retData = baseCore.uptoOBS(pdf_url,name_pdf,1,social_code,pathType,taskType,start_time,'XueLingKun')
+                continue
-                if retData['state']:
+            num = num + 1
-                    pass
+            try:
-                else:
+                origin = '雪球网'
-                    log.info(f'====pdf解析失败====')
+                att_id = baseCore.tableUpdate(retData,com_name,year,name_pdf,num,pub_time,origin)
-                    continue
+                content = retData['content']
-                num = num + 1
+                state = 1
-                try:
+                takeTime = baseCore.getTimeCost(start_time, time.time())
-                    origin = '雪球网'
+                baseCore.recordLog(social_code, taskType, state, takeTime, year_url, '成功')
-                    att_id = baseCore.tableUpdate(retData,com_name,year,name_pdf,num,pub_time,origin)
+            except Exception as e:
-                    content = retData['content']
+                exception = '数据库传输失败'
-                    state = 1
+                state = 0
-                    takeTime = baseCore.getTimeCost(start_time, time.time())
+                takeTime = baseCore.getTimeCost(start_time, time.time())
-                    baseCore.recordLog(social_code, taskType, state, takeTime, year_url, '成功')
+                baseCore.recordLog(social_code, taskType, state, takeTime, year_url, f'{exception} - --{e}')
-                except Exception as e:
+                return False
-                    exception = '数据库传输失败'
+            #发送数据到kafka
-                    state = 0
+            lang = baseCore.detect_language(content)
-                    takeTime = baseCore.getTimeCost(start_time, time.time())
+            if lang == 'cn':
-                    baseCore.recordLog(social_code, taskType, state, takeTime, year_url, f'{exception} - --{e}')
+                lang = 'zh'
-                    return False
+            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-                #发送数据到kafka
+            dic_news = {
-                lang = baseCore.detect_language(content)
+                'attachmentIds': att_id,
-                if lang == 'cn':
+                'author': '',
-                    lang = 'zh'
+                'content': content,
-                time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+                'contentWithTag': '',
-                dic_news = {
+                'createDate': time_now,
-                    'attachmentIds': att_id,
+                'deleteFlag': '0',
-                    'author': '',
+                'id': '',
-                    'content': content,
+                'keyWords': '',
-                    'contentWithTag': '',
+                'lang': lang,
-                    'createDate': time_now,
+                'origin': origin,
-                    'deleteFlag': '0',
+                'publishDate': datetime_string,
-                    'id': '',
+                'sid': '1684032033495392257',
-                    'keyWords': '',
+                'sourceAddress': year_url,  # 原文链接
-                    'lang': lang,
+                'summary': '',
-                    'origin': origin,
+                'title': name_pdf.replace(',pdf', ''),
-                    'publishDate': datetime_string,
+                'type': 1,
-                    'sid': '1684032033495392257',
+                'socialCreditCode': social_code,
-                    'sourceAddress': year_url,  # 原文链接
+                'year': year
-                    'summary': '',
+            }
-                    'title': name_pdf.replace(',pdf', ''),
+            # 将相应字段通过kafka传输保存
-                    'type': 1,
+            try:
-                    'socialCreditCode': social_code,
+                producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'],max_request_size=1024*1024*20)
-                    'year': year
+                kafka_result = producer.send("researchReportTopic",
-                }
+                                             json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
-                # 将相应字段通过kafka传输保存
-                try:
-                    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'],max_request_size=1024*1024*20)
-                    kafka_result = producer.send("researchReportTopic",
-                                                 json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
-                    print(kafka_result.get(timeout=10))
+                print(kafka_result.get(timeout=10))
-                    dic_result = {
+                dic_result = {
-                        'success': 'ture',
+                    'success': 'ture',
-                        'message': '操作成功',
+                    'message': '操作成功',
-                        'code': '200',
+                    'code': '200',
-                    }
+                }
-                    log.info(dic_result)
+                log.info(dic_result)
-                    # return True
+                # return True
-                except Exception as e:
+            except Exception as e:
-                    dic_result = {
+                dic_result = {
-                        'success': 'false',
+                    'success': 'false',
-                        'message': '操作失败',
+                    'message': '操作失败',
-                        'code': '204',
+                    'code': '204',
-                        'e': e
+                    'e': e
-                    }
+                }
-                    state = 0
+                state = 0
-                    takeTime = baseCore.getTimeCost(start_time, time.time())
+                takeTime = baseCore.getTimeCost(start_time, time.time())
-                    baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, 'Kafka操作失败')
+                baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, 'Kafka操作失败')
-                    log.info(dic_result)
+                log.info(dic_result)
-                    return False
+                return False
-                # num = num + 1
+            time.sleep(2)
-                time.sleep(2)
            # browser.quit()
    return True