企业年报入库

ecb8098f · 薛凌堃 · 605e48d4 · ecb8098f
--- a/comData/annualReport1014/report.py
+++ b/comData/annualReport1014/report.py
@@ -29,7 +29,7 @@ type_id = 1
 create_by = 'XueLingKun'
 taskType = '企业年报'
 #付俊雪的需要改为巨潮资讯网1_福布斯2000_PDF_60_付
-file_path = 'D:\\BaiduNetdiskDownload\\1_福布斯2000_PDF_60_付'
+file_path = 'D:\\年报\\失败'
 log.info(f'=============当前pid为{baseCore.getPID()}==============')
 def sendKafka(dic_news):
@@ -91,6 +91,7 @@ def uptoOBS(retData, pathType, taskType, start_time,file_name,pdf_path):
               'full_path': full_path,
               'category': category, 'file_size': file_size, 'status': status, 'create_by': create_by,
               'create_time': create_time, 'page_size': page_size, 'content': content}
    try:
        result = getOBSres(pathType, file_name, pdf_path)
    except:
@@ -128,7 +129,9 @@ if __name__=='__main__':
        file_rank = int(file.split('-')[0])
        file_year = file.split('-')[1]
+        if file_year== '2023':
+            print(pdf_path)
+            continue
        #file_rank 对应上企业信用代码
        selectsql = f"select * from rankandcode where id = {file_rank}"
        cursor.execute(selectsql)
@@ -137,8 +140,9 @@ if __name__=='__main__':
        social_code = data[1]
        ename = data[2]
        cname = data[3]
-        file_name = cname + ':' + file_year + '年年度报告' + '.pdf'
+        file_name = ename + ':' + file_year + '年年度报告' + '.pdf'
        content = ''
+        origin = ename + '官网'
        #解析文件页数和内容
        log.info(f"-----------正在处理{file_name}--------------")
        with open(pdf_path, 'rb') as file:
@@ -153,7 +157,7 @@ if __name__=='__main__':
                        content += page.get_text()
                    # print(content)
            except Exception as e:
-                log.info(f'文件已损坏:{cname}')
+                log.info(f'文件已损坏:{ename}')
                continue
        #解析文件大小
        file_size = os.path.getsize(pdf_path)
@@ -180,12 +184,12 @@ if __name__=='__main__':
                        'id': '',
                        'keyWords': '',
                        'lang': 'zh',
-                        'origin': '巨潮资讯网',
+                        'origin': origin,
                        'publishDate': file_year + '-12-31',
                        'sid': '1684032033495392257',
                        'sourceAddress': '',  # 原文链接
                        'summary': '',
-                        'title': file_name,
+                        'title': file_name.replace('.pdf',''),
                        'type': 1,
                        'socialCreditCode': social_code,
                        'year': file_year
@@ -198,6 +202,7 @@ if __name__=='__main__':
                        # 删除插入的数据 400表示发送数据失败
                        baseCore.deliteATT(att_id)
                        log.info(f'已删除插入附件表的数据---{file_name}-----{social_code}')
+                else:
+                    log.info(f'-----年报已存在--{social_code}--{file_name}-----')
        except Exception as e:
            log.info(f'error------{e}')
\ No newline at end of file