提交 7117a3a5 作者: 李志超

合并分支 'dev' 到 'master'

5月份增加后台生成方式,并简化程序

查看合并请求 !5
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="Flask">
<option name="enabled" value="true" />
</component>
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">
<option name="format" value="PLAIN" />
<option name="myDocStringFormat" value="Plain" />
</component>
<component name="TemplatesService">
<option name="TEMPLATE_CONFIGURATION" value="Jinja2" />
</component>
</module>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="PublishConfigData" remoteFilesAllowedToDisappearOnAutoupload="false">
<serverData>
<paths name="root@114.115.130.239:22">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.115.141.81:22">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.115.141.81:22 password">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.116.9.59:22">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.116.90.53:22">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.116.90.53:22 password">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.116.90.53:22 password (1)">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="zzsn@192.168.1.149:22">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="zzsn@192.168.1.149:22 password">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
</serverData>
</component>
</project>
\ No newline at end of file
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="Eslint" enabled="true" level="WARNING" enabled_by_default="true" />
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredPackages">
<value>
<list size="33">
<item index="0" class="java.lang.String" itemvalue="pandas" />
<item index="1" class="java.lang.String" itemvalue="tqdm" />
<item index="2" class="java.lang.String" itemvalue="transformers" />
<item index="3" class="java.lang.String" itemvalue="sentencepiece" />
<item index="4" class="java.lang.String" itemvalue="keras" />
<item index="5" class="java.lang.String" itemvalue="gevent" />
<item index="6" class="java.lang.String" itemvalue="torch" />
<item index="7" class="java.lang.String" itemvalue="numpy" />
<item index="8" class="java.lang.String" itemvalue="Flask" />
<item index="9" class="java.lang.String" itemvalue="thulac" />
<item index="10" class="java.lang.String" itemvalue="beautifulsoup4" />
<item index="11" class="java.lang.String" itemvalue="fdfs_client" />
<item index="12" class="java.lang.String" itemvalue="pymysql" />
<item index="13" class="java.lang.String" itemvalue="selenium" />
<item index="14" class="java.lang.String" itemvalue="matplotlib" />
<item index="15" class="java.lang.String" itemvalue="pyecharts" />
<item index="16" class="java.lang.String" itemvalue="requests" />
<item index="17" class="java.lang.String" itemvalue="docx" />
<item index="18" class="java.lang.String" itemvalue="flask_sqlalchemy" />
<item index="19" class="java.lang.String" itemvalue="scikit_learn" />
<item index="20" class="java.lang.String" itemvalue="gensim" />
<item index="21" class="java.lang.String" itemvalue="sentence_transformers" />
<item index="22" class="java.lang.String" itemvalue="elasticsearch" />
<item index="23" class="java.lang.String" itemvalue="nltk" />
<item index="24" class="java.lang.String" itemvalue="symspellpy" />
<item index="25" class="java.lang.String" itemvalue="wordcloud" />
<item index="26" class="java.lang.String" itemvalue="concurrent_log_handler" />
<item index="27" class="java.lang.String" itemvalue="setuptools" />
<item index="28" class="java.lang.String" itemvalue="gunicorn" />
<item index="29" class="java.lang.String" itemvalue="jieba" />
<item index="30" class="java.lang.String" itemvalue="flask" />
<item index="31" class="java.lang.String" itemvalue="flask_cors" />
<item index="32" class="java.lang.String" itemvalue="paddle" />
</list>
</value>
</option>
</inspection_tool>
</profile>
</component>
\ No newline at end of file
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/SCBG-PYTHON.iml" filepath="$PROJECT_DIR$/.idea/SCBG-PYTHON.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>
\ No newline at end of file
......@@ -28,6 +28,7 @@ RUN sed -i s@/archive.ubuntu.com/@/repo.huaweicloud.com/@g /etc/apt/sources.list
&& dpkg -i /opt/SCBG-PYTHON/google-chrome-stable_current_amd64.deb \
&& apt-get install -f \
&& rm /opt/SCBG-PYTHON/google-chrome-stable_current_amd64.deb \
# && ln -sf /usr/local/bin/python /usr/bin/python \
&& /usr/local/bin/python -m pip install --upgrade pip -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn \
&& pip install -r /opt/SCBG-PYTHON/requirements.txt -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com --no-cache-dir --default-timeout=10000 \
&& apt-get remove -y build-essential && apt-get clean \
......@@ -35,6 +36,12 @@ RUN sed -i s@/archive.ubuntu.com/@/repo.huaweicloud.com/@g /etc/apt/sources.list
&& chmod u+x /opt/SCBG-PYTHON/start.sh
ENV HOME=/home/user
#ENV LANG=en_US.utf8
#ENV LC_ALL=en_US.utf8
#ENV PATH="/usr/local/bin/python:${PATH}"
EXPOSE 4000
WORKDIR '/opt/SCBG-PYTHON'
......@@ -42,6 +49,6 @@ WORKDIR '/opt/SCBG-PYTHON'
# CMD ["./start.sh"]
# c——告诉shell 运行后续命令, 此处是执行shell脚本,并将输出重定向到指定文件中
CMD ["sh", "-c", "/bin/bash start.sh | tee /opt/SCBG-PYTHON/start.log"]
CMD ["sh", "-c", "/bin/bash start.sh | tee /opt/SCBG-PYTHON/log/start.log"]
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2023/3/20 11:43
# @Author : bruxellse_li
# @File : app_run.py
# @Project : 从word中提取指定表格
import time
import datetime
import os
from docx import Document
from flask import Flask, send_file, jsonify
from flask import request, Response
import requests
from extract_table import get_choose_table, get_other_table, get_other1_table
from extract_table import get_choose_table, get_other_table, get_other1_table, read_document, replace_document
from extract_factor import get_text_from_docx, Extract, get_cover_content_from_docx, Other_Extract
import json, re
from utils.log import logger
......@@ -15,25 +19,34 @@ import subprocess
from pathlib import Path
import traceback
from fdfs_client.client import *
from generate.gen_user_report_auto_generated import main_process
from generate.platform_generated import pl_process
# from generate.gen_user_report_auto_generated import new_main_process
from TextRewriting import get_list_result
from sentence_split import qx_correct, ner_correct
from utils.database_mysql import DatabaseMySQL
from multiy_area_copy_content import send_regenerate_report
import queue
# Queue基本FIFO队列 先进先出 FIFO即First in First Out,先进先出
# maxsize设置队列中,数据上限,小于或等于0则不限制,容器中大于这个数则阻塞,直到队列中的数据被消掉
q = queue.Queue(maxsize=0)
# 关闭多余连接
s = requests.session()
s.keep_alive = False
# 定义数据库链接基础信息
database_config = {
'host': '114.115.205.50',
# 'host': '114.116.44.11',
'host': '114.116.44.11',
'port': 3306,
'user': 'root',
# 'password': 'f7s0&7qqtK',
'password': 'yotop@123456',
'password': 'f7s0&7qqtK',
'database': 'clb_project'
}
dbm = DatabaseMySQL(database_config=database_config)
temp_url = "http://114.115.215.96/"
# temp_url = "http://114.115.215.96/"
temp_url = "http://dfs/"
"""
测试地址:http://114.115.185.13:9988/datapull/aiReport/report/callbackStatus
正式地址:http://192.168.1.70:9988/datapull/aiReport/report/callbackStatus
"""
# callback_url = "http://114.115.185.13:9988/datapull/aiReport/report/callbackStatus"
callback_url = "http://192.168.1.70:9988/datapull/aiReport/report/callbackStatus"
app = Flask(__name__)
UPLOAD_FOLDER = r'data/' # 上传路径
......@@ -60,38 +73,71 @@ def doc2docx(doc_path, docx_path):
# 使用LibreOffice将doc文件转换为docx文件
subprocess.call(['libreoffice', '--headless', '--convert-to', 'docx', doc_path, '--outdir', os.path.dirname(docx_path)])
# 将转换后的docx文件重命名为目标文件名
os.rename(os.path.splitext(doc_path)[0] + '.docx', docx_path)
# os.rename(os.path.splitext(doc_path)[0] + '.docx', docx_path) # zhangtao 2023-05-12
@app.route("/", methods=["GET"])
def hello_world():
    """Liveness endpoint: log a greeting and return a constant body."""
    greeting = "Hello World"
    app.logger.info('Hello World!')
    return greeting
@app.route('/subject_consumer', methods=['GET', 'POST'])
def subject_consumer():
    """Hand one queued report-generation job to the background consumer.

    Pops the oldest config dict from the module-level FIFO queue ``q`` and
    returns it as JSON together with the remaining queue depth.
    """
    if q.empty():
        # Nothing pending for the worker to pick up.
        return jsonify(message='队列为空!', queue_left_number=0)
    config_info = q.get()
    logger.info(config_info)
    return jsonify(message='当前队列数量:' + str(q.qsize()),
                   queue_left_number=str(q.qsize()),
                   data=config_info)
@app.route('/queue_size', methods=['GET', 'POST'])
def queue_size():
    """Report how many report-generation jobs are still waiting in ``q``."""
    pending = q.qsize()
    return jsonify(queue_left_number=pending)
@app.route('/extract_special_table', methods=['POST'])
def extract_special_table():
# table_params ——['资产负债表', '收入费用表(1)', '收入费用表(2)']
data = request.get_json()
file_request = data["path"]
filename = file_request.split("/")[-1]
if ".doc" in file_request:
r = requests.get(file_request, stream=True)
with open(os.path.join(UPLOAD_FOLDER, filename), "wb") as f:
template_request = data["path"]
logger.info(template_request)
# filename = file_request.split("/")[-1]
template_filename = os.path.split(template_request)[-1]
# 文件先下载再判断是否转换
if str(template_filename).endswith(".doc"):
r = requests.get(template_request, stream=True)
my_timestamp = str(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
save_path4doc = os.path.join(UPLOAD_FOLDER, '{}_{}'.format(my_timestamp, template_filename))
with open(save_path4doc, "wb") as f:
for chunk in r.iter_content(chunk_size=512):
f.write(chunk)
doc_path = os.path.join(UPLOAD_FOLDER, filename)
# 获取文件路径前缀
new_path = os.path.splitext(doc_path)[0] + '.docx'
# 将doc转换为docx
doc2docx(doc_path, new_path)
# temp_template_path = os.path.join(UPLOAD_FOLDER, template_filename)
# # 获取文件路径前缀
# template_path = os.path.splitext(temp_template_path)[0] + '.docx'
# # 将doc转换为docx
save_path4docx = '{}.docx'.format(os.path.splitext(save_path4doc)[0])
doc2docx(save_path4doc, save_path4docx)
template_path = save_path4docx
document = Document(new_path)
elif ".docx" in file_request:
r = requests.get(file_request, stream=True)
with open(os.path.join(UPLOAD_FOLDER, filename), "wb") as f:
elif ".docx" in template_request:
r = requests.get(template_request, stream=True)
my_timestamp = str(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
save_path4docx = os.path.join(UPLOAD_FOLDER, '{}_{}'.format(my_timestamp, template_filename))
with open(save_path4docx, "wb") as f:
for chunk in r.iter_content(chunk_size=512):
f.write(chunk)
document = Document(os.path.join(UPLOAD_FOLDER, filename))
# template_path = os.path.join(UPLOAD_FOLDER, template_filename)
template_path = save_path4docx
else:
return "上传文件格式有误,当前仅支持doc 和 docx 格式,请选择正确文件重新上传!"
document = Document(template_path)
table_names = ["以名义金额计量的资产名称、数量等情况,以及以名义金额计量理由的说明"]
data_result = get_other_table(document, table_names)
if data_result["以名义金额计量的资产名称、数量等情况,以及以名义金额计量理由的说明"]:
......@@ -100,7 +146,7 @@ def extract_special_table():
temp_table_result = get_other_table(document, ["本单位无以名义金额计量的资产"])
del data_result["以名义金额计量的资产名称、数量等情况,以及以名义金额计量理由的说明"]
data_result.update(temp_table_result)
os.remove(os.path.join(UPLOAD_FOLDER, filename))
# os.remove(os.path.join(UPLOAD_FOLDER, filename))
return Response(json.dumps(data_result, ensure_ascii=False), content_type='application/json')
......@@ -109,6 +155,7 @@ def extract_special_table1():
# table_params ——['资产负债表', '收入费用表(1)', '收入费用表(2)']
data = request.get_json()
file_request = data["path"]
logger.info(file_request)
filename = file_request.split("/")[-1]
if ".doc" in file_request:
r = requests.get(file_request, stream=True)
......@@ -135,7 +182,7 @@ def extract_special_table1():
table_names = ['货币资金明细信息如下']
data_result = get_other1_table(document, table_names)
os.remove(os.path.join(UPLOAD_FOLDER, filename))
# os.remove(os.path.join(UPLOAD_FOLDER, filename))
return Response(json.dumps(data_result, ensure_ascii=False), content_type='application/json')
......@@ -144,6 +191,7 @@ def extract_table():
# table_params ——['资产负债表', '收入费用表(1)', '收入费用表(2)']
data = request.get_json()
file_request = data["path"]
logger.info(file_request)
filename = file_request.split("/")[-1]
if ".doc" in file_request:
r = requests.get(file_request, stream=True)
......@@ -152,7 +200,7 @@ def extract_table():
f.write(chunk)
doc_path = os.path.join(UPLOAD_FOLDER, filename)
print(doc_path)
# print(doc_path)
# 获取文件路径前缀
new_path = os.path.splitext(doc_path)[0] + '.docx'
# 将doc转换为docx
......@@ -229,7 +277,7 @@ def extract_table():
continue
else:
temp_1["上级项目"] = temp_dict_1["temp_key"]
os.remove(os.path.join(UPLOAD_FOLDER, filename))
# os.remove(os.path.join(UPLOAD_FOLDER, filename))
return Response(json.dumps(data, ensure_ascii=False), content_type='application/json')
......@@ -238,6 +286,7 @@ def extract_table():
def extract_factor():
data = request.get_json()
file_request = data["path"]
logger.info(file_request)
filename = file_request.split("/")[-1]
if ".doc" in file_request:
r = requests.get(file_request, stream=True)
......@@ -262,7 +311,7 @@ def extract_factor():
return "上传文件格式有误,当前仅支持doc 和 docx 格式,请选择正确文件重新上传!"
data = Extract().extract_result(document)
os.remove(os.path.join(UPLOAD_FOLDER, filename))
# os.remove(os.path.join(UPLOAD_FOLDER, filename))
return Response(json.dumps(data, ensure_ascii=False), content_type='application/json')
......@@ -275,67 +324,121 @@ def generate_report():
data_object: 待填充数据
:return:
"""
data = request.get_json()
template_request = data["template_path"]
doc_request = data["document_path"]
report_name = data["report_name"] + ".docx"
template_url = data["template_path"] # 模板 下载链接
half_document_url = data["document_path"] # 半成品 下载链接
logger.info(template_url)
logger.info(half_document_url)
report_id = data["report_id"]
template_id = data["template_id"]
report_name = data["report_name"]
data_object = data["object"]["data_object"]
tables_dict = data["object"]["tables_dict"]
current_filename = time.strftime('%Y_%m_%d-%H_%M_%S') + ".docx"
save_path = UPLOAD_FOLDER + "/" + current_filename
# 先判断是否是docx 格式
template_filename = template_request.split("/")[-1]
# 文件先下载再判断是否转换
if ".doc" in template_request:
r = requests.get(template_request, stream=True)
with open(os.path.join(UPLOAD_FOLDER, template_filename), "wb") as f:
for chunk in r.iter_content(chunk_size=512):
f.write(chunk)
temp_template_path = os.path.join(UPLOAD_FOLDER, template_filename)
# 获取文件路径前缀
template_path = os.path.splitext(temp_template_path)[0] + '.docx'
# 将doc转换为docx
doc2docx(temp_template_path, template_path)
elif ".docx" in template_request:
r = requests.get(template_request, stream=True)
with open(os.path.join(UPLOAD_FOLDER, template_filename), "wb") as f:
for chunk in r.iter_content(chunk_size=512):
f.write(chunk)
template_path = os.path.join(UPLOAD_FOLDER, template_filename)
else:
return "上传文件格式有误,当前仅支持doc 和 docx 格式,请选择正确文件重新上传!"
doc_filename = doc_request.split("/")[-1]
if ".doc" in doc_request:
r1 = requests.get(doc_request, stream=True)
with open(os.path.join(UPLOAD_FOLDER, doc_filename), "wb") as f1:
for chunk in r1.iter_content(chunk_size=512):
f1.write(chunk)
temp_doc_path = os.path.join(UPLOAD_FOLDER, doc_filename)
# 获取文件路径前缀
doc_path = os.path.splitext(temp_doc_path)[0] + '.docx'
# 将doc转换为docx
doc2docx(temp_doc_path, doc_path)
half_work_path = doc_path
elif ".docx" in doc_request:
r1 = requests.get(doc_request, stream=True)
with open(os.path.join(UPLOAD_FOLDER, doc_filename), "wb") as f1:
for chunk in r1.iter_content(chunk_size=512):
f1.write(chunk)
half_work_path = os.path.join(UPLOAD_FOLDER, doc_filename)
else:
return "上传文件格式有误,当前仅支持doc 和 docx 格式,请选择正确文件重新上传!"
try:
# 先判断是否是docx 格式
# template_filename = template_request.split("/")[-1]
template_filename = os.path.split(template_url)[-1] # todo: url 切分是否对???
# 文件先下载再判断是否转换
if str(template_filename).endswith(".doc"):
r = requests.get(template_url, stream=True)
my_timestamp = str(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
save_path4doc = os.path.join(UPLOAD_FOLDER, '{}_{}'.format(my_timestamp, template_filename))
with open(save_path4doc, "wb") as f:
for chunk in r.iter_content(chunk_size=512):
f.write(chunk)
save_path4docx = '{}.docx'.format(os.path.splitext(save_path4doc)[0])
doc2docx(save_path4doc, save_path4docx)
template_path = save_path4docx # 本地路径
elif str(template_filename).endswith(".docx"):
r = requests.get(template_url, stream=True)
my_timestamp = str(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
save_path4docx = os.path.join(UPLOAD_FOLDER, '{}_{}'.format(my_timestamp, template_filename))
with open(save_path4docx, "wb") as f:
for chunk in r.iter_content(chunk_size=512):
f.write(chunk)
# template_path = os.path.join(UPLOAD_FOLDER, template_filename)
template_path = save_path4docx
else:
return "上传文件格式有误,当前仅支持doc 和 docx 格式,请选择正确文件重新上传!"
# doc_filename = doc_request.split("/")[-1]
doc_filename = os.path.split(half_document_url)[-1] # todo: url 切分是否对???
if str(doc_filename).endswith(".doc"):
r1 = requests.get(half_document_url, stream=True)
my_timestamp2 = str(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
save_path4doc2 = os.path.join(UPLOAD_FOLDER, '{}_{}'.format(my_timestamp2, doc_filename))
with open(save_path4doc2, "wb") as f1:
for chunk in r1.iter_content(chunk_size=512):
f1.write(chunk)
# temp_doc_path = os.path.join(UPLOAD_FOLDER, doc_filename)
# # 获取文件路径前缀
# doc_path = os.path.splitext(temp_doc_path)[0] + '.docx'
save_path4docx2 = '{}.docx'.format(os.path.splitext(save_path4doc2)[0])
# 将doc转换为docx
doc2docx(save_path4doc2, save_path4docx2)
# half_work_path = doc_path
half_document_path = save_path4docx2
elif str(doc_filename).endswith(".docx"):
r1 = requests.get(half_document_url, stream=True)
my_timestamp2 = str(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
half_document_path = os.path.join(UPLOAD_FOLDER, '{}_{}'.format(my_timestamp2, doc_filename))
with open(half_document_path, "wb") as f1:
for chunk in r1.iter_content(chunk_size=512):
f1.write(chunk)
else:
return "上传文件格式有误,当前仅支持doc 和 docx 格式,请选择正确文件重新上传!"
# todo: 调整为
output_report_path = os.path.join(UPLOAD_FOLDER,
'{}_{}.docx'.format(str(datetime.datetime.now().strftime('%Y%m%d%H%M%S')),
report_name))
# todo: 先将任务放入后台进行处理
config_info = {
"report_id": report_id,
"template_id": template_id,
"half_document_path": half_document_path,
"tables_dict": tables_dict,
"template_path": template_path,
"data_object": data_object,
"output_report_path": output_report_path,
"report_name": report_name
}
q.put(config_info)
logger.info(config_info)
main_process(half_work_path, tables_dict, template_path, report_name, data_object, save_path)
os.remove(os.path.join(UPLOAD_FOLDER, template_filename))
os.remove(os.path.join(UPLOAD_FOLDER, doc_filename))
send_path = os.path.join(UPLOAD_FOLDER, report_name)
# "data/财务报告.docx"
ret_upload = client.upload_by_filename(send_path)
return ret_upload["Remote file_id"]
except Exception as e:
dict_result = {
"status": "1", # 处理状态 (0 成功, 1 失败)
"result": "生成失败!+ {}".format(str(e)),
"id": report_id, # 报告id
"templeteId": template_id, # 报告模板
"filePath": ""
}
# todo: 调用java的状态更新接口返回异常的结果
payload = json.dumps(dict_result)
headers = {
'Content-Type': 'application/json'
}
"""
测试地址:http://114.115.185.13:9988/datapull/aiReport/report/callbackStatus
正式地址:http://192.168.1.70:9988/datapull/aiReport/report/callbackStatus
"""
# callback_url = "http://114.115.185.13:9988/datapull/aiReport/report/callbackStatus"
r1 = requests.post(url=callback_url,
headers=headers, data=payload)
r1_json = json.loads(r1.text)
logger.info(r1_json)
return dict_result
return "Success"
@app.route('/gx_app', methods=['POST'])
......@@ -394,98 +497,6 @@ def qx_error():
'correctContent': content}})
@app.route('/platform_report', methods=['POST'])
def platform_report():
"""
task_id: 任务id
:return:
"""
data = request.get_json()
task_id = data["task_id"].strip()
# todo: 基于任务id来获取数据信息
# dataset_sql = '''SELECT ds.id,ds.param_value,te.file_path FROM clb_report_task t inner join clb_report_template te on t.template_id = te.id
# inner join clb_report_data_set ds on te.data_set_id = ds.id
# where t.id = {};'''.format(task_id)
dataset_sql = """SELECT ds.id,te.file_path FROM clb_report_task t inner join clb_report_template te on t.template_id = te.id
inner join clb_report_data_set ds on te.data_set_id = ds.id
where t.id = {};""".format(task_id)
dbm = DatabaseMySQL(database_config=database_config)
dataset_result = dbm.query(sql=dataset_sql)[0]
dataset_id, temp_path = dataset_result["id"], dataset_result["file_path"]
# dataset_id, param_value, temp_path = dataset_result["id"], dataset_result["param_value"], dataset_result[
# "file_path"]
# print(type(param_value))
# param_value = json.loads(param_value)
# todo: 再基于数据集id 获取数据集地址,参数,返回数据类型,数据对象
# data_source_sql = """select ds.url,ds.params,ds.type,ds.data_name from clb_report_data_source ds inner join clb_report_data_set_source_map m on ds.id = m.data_source_id
# where m.data_set_id = {};""".format(dataset_id)
data_source_sql = """select ds.url,m.param_value,ds.type,m.return_object from clb_report_data_source ds inner join clb_report_data_set_source_map m on ds.id = m.data_source_id
where m.data_set_id = {};""".format(dataset_id)
datasource_result_list = dbm.query(sql=data_source_sql)
# 关闭数据库连接
# dbm.close()
# todo: 第一步:基于模板路径和模板url 获取下载模板链接
template_request = temp_url + "/" + temp_path
# 先判断是否是docx 格式
template_filename = template_request.split("/")[-1]
# print(template_filename)
# print(template_request)
# 文件先下载再判断是否转换
if ".doc" in template_request:
r = requests.get(template_request, stream=True)
with open(os.path.join(UPLOAD_FOLDER, template_filename), "wb") as f:
for chunk in r.iter_content(chunk_size=512):
f.write(chunk)
temp_template_path = os.path.join(UPLOAD_FOLDER, template_filename)
# 获取文件路径前缀
template_path = os.path.splitext(temp_template_path)[0] + '.docx'
# 将doc转换为docx
doc2docx(temp_template_path, template_path)
elif ".docx" in template_request:
r = requests.get(template_request, stream=True)
with open(os.path.join(UPLOAD_FOLDER, template_filename), "wb") as f:
for chunk in r.iter_content(chunk_size=512):
f.write(chunk)
template_path = os.path.join(UPLOAD_FOLDER, template_filename)
else:
return "上传文件格式有误,当前仅支持doc 和 docx 格式,请选择正确文件重新上传!"
# todo: 第二步:基于数据源信息获取数据对象
dict_data = dict()
for datasource_result in datasource_result_list:
dataset_url = datasource_result["url"]
params = datasource_result["param_value"]
# 4月23号调整
dict_param = json.loads(params)
# {"id": 2, "name": "nihao"}
connect_list = []
for key, value in dict_param.items():
connect_param = str(key) + "=" + str(value)
connect_list.append(connect_param)
# params_list = params.split(",")
# connect_list = []
# for temp_param in params_list:
# connect_param = temp_param + "=" + str(param_value[temp_param])
# connect_list.append(connect_param)
request_str = "&".join(connect_list)
dataset_request = dataset_url + "?" + request_str
str_dataset_info = requests.get(dataset_request, stream=True)
logger.info(str_dataset_info.content)
dataset_info = json.loads(str_dataset_info.content)
data_name = datasource_result["return_object"]
dict_data[data_name] = dataset_info["result"]
logger.info(dict_data)
# 定义平台临时文件名
report_name = "平台模板样例报告.docx"
pl_process(template_path, dict_data, report_name)
os.remove(os.path.join(UPLOAD_FOLDER, template_filename))
send_path = os.path.join(UPLOAD_FOLDER, report_name)
# "data/财务报告.docx"
ret_upload = client.upload_by_filename(send_path)
return ret_upload["Remote file_id"]
@app.route('/extract_cover_factor', methods=['POST'])
def extract_cover_factor():
data = request.get_json()
......@@ -524,10 +535,104 @@ def extract_cover_factor():
other_extract = Other_Extract()
other_data = other_extract.extract_other_result(other_contents)
other_data["reportTitle"] = cover_text
os.remove(os.path.join(UPLOAD_FOLDER, filename))
# os.remove(os.path.join(UPLOAD_FOLDER, filename))
return Response(json.dumps(other_data, ensure_ascii=False), content_type='application/json')
@app.route('/generate_final_report', methods=['POST'])
def generate_final_report():
    """Second-pass report generation: merge edited data back into a report.

    Expected JSON body (names from the code below):
      template_path  -- download URL of the template (.doc or .docx)
      document_path  -- download URL of the half-finished document, i.e. the
                        report produced by the first generation pass
      report_id      -- id forwarded to send_regenerate_report
      report_name    -- used to build the output file name
      object.data_object -- data to fill into the template

    Returns the FastDFS "Remote file_id" of the uploaded result, or a
    Chinese error string when the file extension is neither .doc nor .docx.

    NOTE(review): unlike generate_report above, there is no try/except here,
    so a failed download/convert surfaces as a 500 — confirm this is intended.
    """
    data = request.get_json()
    template_url = data["template_path"]  # template download link (2nd pass)
    half_document_url = data["document_path"]  # half-finished doc (= 1st-pass report) link
    logger.info(template_url)
    logger.info(half_document_url)
    report_id = data["report_id"]
    report_name = data["report_name"]
    data_object = data["object"]["data_object"]
    # --- handle the template -------------------------------------------------
    # Decide by extension whether a .doc -> .docx conversion is needed.
    template_filename = template_url.split("/")[-1]
    # Download first, then convert if necessary.
    if str(template_filename).endswith('.doc'):
        r = requests.get(template_url, stream=True)
        # Timestamp prefix avoids clashes between concurrent requests.
        my_timestamp = str(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
        save_path4doc = os.path.join(UPLOAD_FOLDER, '{}_{}'.format(my_timestamp, template_filename))
        with open(save_path4doc, "wb") as f:
            for chunk in r.iter_content(chunk_size=512):
                f.write(chunk)
        save_path4docx = '{}.docx'.format(os.path.splitext(save_path4doc)[0])
        doc2docx(save_path4doc, save_path4docx)
        template_path = save_path4docx  # local path
    elif str(template_filename).endswith(".docx"):
        r = requests.get(template_url, stream=True)
        my_timestamp = str(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
        save_path4docx = os.path.join(UPLOAD_FOLDER, '{}_{}'.format(my_timestamp, template_filename))
        with open(save_path4docx, "wb") as f:
            for chunk in r.iter_content(chunk_size=512):
                f.write(chunk)
        template_path = save_path4docx
    else:
        return "上传文件格式有误,当前仅支持doc 和 docx 格式,请选择正确文件重新上传!"
    # --- handle the half-finished document ----------------------------------
    half_document_filename = os.path.split(half_document_url)[-1]  # todo: url 切分是否对???
    if str(half_document_filename).endswith(".doc"):
        r1 = requests.get(half_document_url, stream=True)
        my_timestamp2 = str(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
        save_path4doc2 = os.path.join(UPLOAD_FOLDER, '{}_{}'.format(my_timestamp2, half_document_filename))
        with open(save_path4doc2, "wb") as f1:
            for chunk in r1.iter_content(chunk_size=512):
                f1.write(chunk)
        save_path4docx2 = '{}.docx'.format(os.path.splitext(save_path4doc2)[0])
        # convert .doc to .docx via doc2docx (LibreOffice headless)
        doc2docx(save_path4doc2, save_path4docx2)
        half_document_path = save_path4docx2
    elif str(half_document_filename).endswith(".docx"):
        r1 = requests.get(half_document_url, stream=True)
        my_timestamp2 = str(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
        half_document_path = os.path.join(UPLOAD_FOLDER, '{}_{}'.format(my_timestamp2, half_document_filename))
        with open(half_document_path, "wb") as f1:
            for chunk in r1.iter_content(chunk_size=512):
                f1.write(chunk)
    else:
        return "上传文件格式有误,当前仅支持doc 和 docx 格式,请选择正确文件重新上传!"
    # --- merge and upload ----------------------------------------------------
    half_document = Document(half_document_path)
    template_document = Document(template_path)
    output_report_path = os.path.join(UPLOAD_FOLDER,
                                      '{}_{}.docx'.format(str(datetime.datetime.now().strftime('%Y%m%d%H%M%S')),
                                                          report_name))
    report_processed_path = send_regenerate_report(half_document=half_document,
                                                   template_document=template_document,
                                                   output_report_path=output_report_path,
                                                   data_object=data_object,
                                                   report_id=report_id)
    # # todo: 在第一个报告中已经处理好,此处无需继续处理
    # document = Document(temp_send_path)
    # read_document(document, "\u00A0", "")
    # final_report_name = data["report_name"] + str(1001) + ".docx"
    # send_path = os.path.join(UPLOAD_FOLDER, final_report_name)
    # document.save(send_path)
    # "data/财务报告.docx"
    # Upload the finished report to FastDFS and hand back its remote id.
    ret_upload = client.upload_by_filename(report_processed_path)
    return ret_upload["Remote file_id"]
if __name__ == '__main__':
app.config['JSON_AS_ASCII'] = False
app.config['JSONIFY_MIMETYPE'] = "application/json;charset=utf-8"
......
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : 文章内容检查.py
# @Time : 2022/12/9 16:05
# @Author : bruxelles_li
# @Software: PyCharm
from bs4 import BeautifulSoup
import re
# Character class matching "non-content" characters: digits, ASCII letters,
# CJK/fullwidth punctuation, Chinese numerals 一..十, quotes and dashes.
# Used by is_punctuation() to decide whether a string carries any real text.
punctuation = re.compile(r'[\n0-9a-zA-Z、!?。。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、'
                         r'〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.!"#$%&\'()*+,\-'
                         r'./:;<=>?@\[\]\\^_`{|}~一二三四五六七八九十《]')
# punctuation = re.compile(r'[0-9]')
def is_punctuation(text):
    """Return True when `text` contains no real content.

    True when stripping every character matched by the module-level
    `punctuation` pattern (digits, ASCII letters, CJK punctuation, Chinese
    numerals, ...) leaves an empty string; empty input therefore also
    returns True, matching the original behavior. Otherwise False.
    """
    # `punctuation` is already a compiled re.Pattern; wrapping it in another
    # re.compile() call (as the original did) is a redundant no-op, so call
    # .sub() on it directly.
    return not punctuation.sub('', text)
# Result post-processing: strip HTML and boilerplate from article content.
def clean_html_tag(content):
    """Extract readable prose from an HTML fragment.

    Pipeline: strip tags with BeautifulSoup, then run an ordered chain of
    regex substitutions that remove decorative symbols, "follow us" banners,
    photo captions, entity residue and boilerplate lines, and finally
    re-join surviving paragraphs. The substitution order is significant —
    do not reorder these steps.
    """
    text = content
    # 'lxml' backend; only the text nodes are kept.
    bs = BeautifulSoup(text, 'lxml')
    temp = []
    match_content = bs.text.strip()
    # First pass: remove stray decorative symbols / fullwidth brackets.
    pattern = re.compile(
        '[#$*$<=>@●▍[\]△▲^_`■▋{|}~⦅⦆ф「」\u3000〈〉《》「」『』【】※〔〕〖〗〘〙〚〛〜〰〾〿\*〈〉]')
    match_content0 = pattern.sub('', match_content)
    # Drop "click to follow us" style promo snippets and arrow decorations.
    match_content1 = re.sub(r"(阅读提示|点击 上方文字 关注我们 |点击 上方文字 关注我们|点击蓝字丨关注我们|点击蓝字 关注我们|- THE END - |◀——|-)", "", match_content0)
    # Remove photo-caption parentheticals like (图片:...).
    match_content2 = re.sub(r"(?=\(图片[::]).+(?<=\))", "", match_content1)
    # Collapse leftover &mdash entity residue.
    match_content3 = re.sub(r"&mdash&mdash", "&mdash", match_content2)
    match_content4 = re.sub(r"&mdash", "&", match_content3)
    # NOTE(review): the next pattern is a literal run of space-like chars as
    # pasted from the source — presumably non-breaking spaces; confirm.
    match_content5 = re.sub(r"       ", "", match_content4)
    # Remove "(... 图)" figure references.
    match_content6 = re.sub(r"(?=\().*(?<=图\))", "", match_content5)
    # Normalize ASCII closing quote after 。 to the fullwidth ”.
    match_content7 = re.sub(r'。"', "。”", match_content6)
    match_content8 = re.sub(r"(。;|。,)", "。", match_content7)
    # Strip escaped-tab residue and stray backslashes.
    match_content9 = re.sub(r"(\\t|\\)", "", match_content8)
    list_content = match_content9.split('\n')
    temp_content = []
    for text in list_content:
        # Lines of <= 2 chars carry no content.
        if len(text) <= 2:
            continue
        else:
            text = text.strip()
            if text.endswith("。") or text.endswith("“") or text.endswith(".") or text.endswith('”'):
                # Sentence-terminated line: keep as-is.
                text = text
            else:
                # Mark unterminated lines with a double tab for later merging.
                text = text + "\t" + "\t"
            # Drop photographer credits ("...记者...摄").
            text = re.sub(r".*(?<=记者).*(?<=摄)", "", text)
            temp_content.append(text)
    # print(temp_content)
    str_content = "\t".join(temp_content)
    # Triple tab (unterminated line boundary) becomes an em-dash join.
    a = re.sub('\t\t\t', '——', str_content)
    a0 = re.sub('\t\t', '', a)
    a1 = re.sub(r":——", ":", a0)
    a2 = re.sub(r"。)", ")", a1)
    a3 = re.sub(r"(。”|!”)", "”", a2)
    b = re.sub("\t", "\n", a3).strip()
    c = b.split('\n')
    # print(len(c))
    for d in c:
        # Strip literal &nbsp; entity residue from line edges.
        e = d.strip('&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;')
        # Remove sentences containing boilerplate keywords (disclaimers,
        # image extensions, editor credits, ...), up to the next 。.
        f = re.sub("(微信|如需转载|免责声明|公告|jpeg|jpg|png|声明:|附件:|责任单位:|编辑:).*?(?<=。)", '', str(e))
        # Drop lines containing URLs / e-mail domains.
        g = re.sub(".*(?=\.com|www\.).*", "", f)
        # print(g)
        # Lines of <= 20 chars are treated as non-content and skipped.
        if len(g) <= 20:
            continue
        else:
            temp.append(g)
    h = "\t".join(temp)
    # Merge a line starting with a connective (而/但/对于/...) into the
    # previous one.
    j = re.sub("\t(?=而|但|对于|此外|因此|与此同时|这种|基于此|但是|然而)", "", h)
    new_content = re.sub("\t", "\n", j)
    new_content_list = new_content.split("\n")
    final_content_list = []
    for k in new_content_list:
        k = " " + k
        # Drop paragraphs with javascript/html residue.
        l = re.sub(".*(function。|html|background|javascript|image).*", '', k)
        if l:
            final_content_list.append(l.strip("——"))
    # With >= 10 paragraphs keep blank-line separation, otherwise join flat.
    final_content = "\n\n".join(final_content_list) if len(final_content_list) >= 10 else "".join(final_content_list)
    return final_content
if __name__ == "__main__":
    # Ad-hoc smoke test: sample article text with decorative arrows and a
    # photo caption that clean_html_tag is expected to strip away.
    text = """工业和信息化部◀——◀——◀——◀——◀—— 人力资源社会保障部 生态环境部 商务部 市场监管总局
持续健全市场化运营体制机制,守好安全生产底线红线,推进绿色低碳科技研发应用,为实现碳达峰碳中和目标贡献力量。把坚持党的领导加强党的建设融入公司治理,凝聚各方面工作合力,努力开创公司改革发展新局面。(图片:孚能科技将绿色发展融入企业成长,并带动产业链协同提昇晉昇,提拔)
    """
    print(clean_html_tag(content=text))
......@@ -21,8 +21,8 @@ from docx.table import _Cell, Table, _Row
from docx.text.paragraph import Paragraph
from docx.shared import Pt
# 定义待复制内容的匹配模式
start_pattern = re.compile(r'(?<=[0-9][\..]会计报表重要项目的明细信息及说明)$')
end_pattern = re.compile(r'(?<=[0-9][\..]需要说明的其他事项)$|(?<=[0-9][\..]需要说明的其他事项。)(略)$')
start_pattern = re.compile(r'(?<=[0-9][\..]会计报表重要项目的明细信息及说明)$|(?<=[0-9][\..]会计报表重要项目的明细信息及说明。)$')
end_pattern = re.compile(r'(?<=[0-9][\..]需要说明的其他事项)$|(?<=[0-9][\..]需要说明的其他事项。)(略)$|(?<=[0-9][\..]需要说明的其他事项[。\.])$')
def iter_block_items(parent):
......@@ -129,10 +129,92 @@ def copy_content_main(doc_path: str, temp_path: str):
return None
# todo: 先复制内容到模板中,保存更新后的模板
# todo: first copy the content into the template, then save the updated template
def new_copy_content_main(doc_document, template_document):
    """Copy the section between start_pattern and end_pattern from
    `doc_document` into `template_document`.

    The span of paragraphs/tables found between the two module-level
    marker patterns is first staged in a scratch Document, then inserted
    into `template_document` right after the paragraph that matches
    start_pattern there.

    Args:
        doc_document: source python-docx Document (the half-finished report).
        template_document: target python-docx Document (the template).

    Returns:
        The mutated `template_document`.

    NOTE(review): if start_pattern never matches in the template,
    `start_index` stays None and the `source_doc.paragraphs[start_index]`
    lookup raises TypeError — confirm callers guarantee a match.
    """
    doc = doc_document
    # Scratch document used to stage the extracted content.
    new_doc = Document()
    start_found = False
    end_found = False
    # Walk paragraphs and tables of the source body in document order.
    for element in doc.element.body.xpath("w:p | w:tbl"):
        if isinstance(element, CT_P):
            para = Paragraph(element, doc)
            start_results = re.findall(start_pattern, para.text)
            if start_results:
                # The marker paragraph itself is not copied.
                start_found = True
                continue
        if isinstance(element, CT_P):
            para = Paragraph(element, doc)
            end_results = re.findall(end_pattern, para.text)
            if end_results:
                # Stop copying once the end marker is reached.
                end_found = True
                break
        # Copy a text paragraph (text only — run formatting is dropped).
        if start_found and not end_found and isinstance(element, CT_P):
            para = Paragraph(element, doc)
            new_doc.add_paragraph(para.text)
        # Copy a table by deep-copying its XML element.
        if start_found and not end_found and isinstance(element, CT_Tbl):
            table = Table(element, doc)
            new_table = deepcopy(table._element)
            # Add an empty anchor paragraph to the scratch document.
            new_doc.add_paragraph('')
            # Grab that new paragraph...
            new_paragraph = new_doc.paragraphs[-1]
            # ...and splice the table XML in just before it.
            new_paragraph._element.addprevious(new_table)
    # Drop paragraphs that contain only whitespace from the scratch doc.
    for para in new_doc.paragraphs:
        # Whitespace-only (spaces/newlines) counts as blank.
        if re.match(r'^\s*$', para.text):
            # Remove the blank paragraph's XML node.
            new_doc._element.body.remove(para._element)
    # Locate the insertion point (start_pattern match) in the template.
    # source_doc = Document(temp_path)
    source_doc = template_document
    start_index = None
    for index, para in enumerate(source_doc.paragraphs):
        start_result = re.findall(start_pattern, para.text)
        if start_result:
            start_index = index
            break
    target_paragraph = source_doc.paragraphs[start_index]
    # Iterate the staged content in reverse so that inserting each element
    # at target+1 reproduces the original order in the template.
    for element in reversed(new_doc.element.body):
        # Paragraphs are re-created so their style can be adjusted.
        if isinstance(element, CT_P):
            para = Paragraph(element, doc)
            # Re-add the text with explicit font/size/indent styling.
            new_para = source_doc.add_paragraph(para.text, style='Normal')
            font = new_para.runs[0].font
            font.name = "宋体"
            font.size = Pt(12)
            new_para.paragraph_format.space_before = Pt(12)
            new_para.paragraph_format.first_line_indent = Pt(25)
            source_doc.element.body.insert(source_doc.element.body.index(target_paragraph._element) + 1,
                                           new_para._element)
        # Tables are moved in as raw XML elements, also right after the anchor.
        elif isinstance(element, CT_Tbl):
            source_doc.element.body.insert(source_doc.element.body.index(target_paragraph._element) + 1, element)
    # source_doc.save(temp_path)
    return source_doc
if __name__ == '__main__':
    # Manual smoke test for copy_content_main.
    # The original assigned doc_path/temp_path twice in a row; the first
    # assignment of each pair was dead (immediately overwritten), so only
    # the effective values are kept here.
    doc_path = "data/2022年度德阳市旌阳区人民法院(1).docx"
    # doc_path = 'data/特殊教育学校(1).docx'
    temp_path = "data/财务报告模板(2).doc"
    copy_content_main(doc_path, temp_path)
    # docx_file = r'wKjIbGQeSb6AUq1aAAgAABcLaMw312.docx'
    # doc = Document(docx_file)
......
......@@ -110,8 +110,8 @@ def new_document():
return para._p
def generate_report(table_names_data, save_path, template_path, tables_dict):
document = Document(template_path)
def generate_report(table_names_data, template_document, tables_dict):
document = template_document
pattern = re.compile(r'(?<={{).*?(?=}})')
# block 块对象主要包括标题、段落、图片、表、列表
# run 内联对象为块对象的组成部分,块对象的所有内容都包含在内联对象中,一个块对象由一个或多个内联对象组成。修改字体、字号、文字颜色需要用到run
......@@ -139,7 +139,41 @@ def generate_report(table_names_data, save_path, template_path, tables_dict):
p = block._element
p.getparent().remove(p)
block._p = block._element = None
document.save(save_path)
# document.save(save_path)
return document
def new_generate_report(table_names_data, template_document, tables_dict):
    """Replace `{{tableN}}` placeholder paragraphs in the template with
    pre-built table rows, then strip numbered "续表" (continued-table)
    markers.

    Args:
        table_names_data: mapping of human table name -> iterable of ready
            XML row/table elements to splice in (assumed; TODO confirm the
            element type matches what `addnext` expects).
        template_document: python-docx Document to mutate in place.
        tables_dict: mapping of placeholder name (e.g. "table1") -> key
            into `table_names_data`.

    Returns:
        The mutated `template_document`.
    """
    document = template_document
    # Matches the placeholder text between {{ and }}.
    pattern = re.compile(r'(?<={{).*?(?=}})')
    # block objects: headings, paragraphs, images, tables, lists.
    # run objects are the inline pieces of a block; font tweaks need runs.
    # for block in iter_block_items(document):
    for block in document.paragraphs:
        if isinstance(block, Paragraph):
            match = pattern.findall(block.text)
            if match and "table" in match[0]:
                table_name = match[0]
                for _ in table_names_data[tables_dict[table_name]]:
                    # white_row = new_document()
                    # XML-level insertion: addnext makes each item the
                    # immediately-following sibling of the placeholder.
                    # block._p.addnext(white_row)
                    block._p.addnext(_)
                # Remove the placeholder paragraph itself.
                p = block._element
                p.getparent().remove(p)
                block._p = block._element = None
    # Clear numbered "续表" markers left in the template.
    pattern_clear = re.compile(r'(?<=续表)[0-9]')
    for block in iter_block_items(document):
        if isinstance(block, Paragraph):
            match = pattern_clear.findall(block.text)
            if match:
                p = block._element
                p.getparent().remove(p)
                block._p = block._element = None
    # document.save(save_path)
    return document
if __name__ == '__main__':
......@@ -147,7 +181,6 @@ if __name__ == '__main__':
start_time = datetime.datetime.now()
# 参数:tables_dict、docx_file、save_path、template_path
tables_dict = {
"table13": "以名义金额计量的资产名称、数量等情况,以及以名义金额计量理由的说明",
"table5": "收入费用表(2)",
"table4": "收入费用表(1)",
"table3": "资产负债表续表2",
......@@ -168,5 +201,5 @@ if __name__ == '__main__':
document = Document(docx_file)
data_result = get_choose_table(document, list(tables_dict.values()))
print(data_result)
generate_report(data_result, save_path=r'data/报告文件.docx', template_path=r'data/new_财务报告模板.docx', tables_dict=tables_dict)
# generate_report(data_result, save_path=r'data/报告文件.docx', template_path=r'data/new_财务报告模板.docx', tables_dict=tables_dict)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : 资源检测程序.py
# @Time : 2022/9/30 10:39
# @Author : bruxelles_li
# @Software: PyCharm
import logging
import os, time, re, subprocess
# 获取CPU负载信息
# Read CPU load information.
def get_cpu():
    """Return the cumulative CPU busy ratio parsed from /proc/stat.

    Despite the `last_*` bookkeeping in the original, those locals were
    always zero at the point of use, so the value is the since-boot busy
    ratio (user+nice+system over user+nice+system+idle), not an interval
    rate; this rewrite keeps that behavior without the dead variables.

    Returns:
        float: busy ratio in [0, 1]; the int 0 when no busy time was read.
    """
    # `with` guarantees the file is closed even if readline() raises
    # (the original's explicit close() was skipped on that path).
    with open("/proc/stat", "r") as f:
        line = ""
        while not "cpu " in line:
            line = f.readline()
    # The aggregate line is "cpu  user nice system idle ..." — the double
    # space makes spl[1] empty, so fields start at index 2.
    spl = line.split(" ")
    worktime = int(spl[2]) + int(spl[3]) + int(spl[4])
    idletime = int(spl[5])
    # Mirrors the original's zero short-circuit (it returned 0 whenever
    # worktime was 0) and also avoids a 0/0 division.
    if worktime == 0:
        return 0
    return float(worktime) / (idletime + worktime)
# 获取内存负载信息
# Read memory load information.
def get_mem_usage_percent():
    """Parse /proc/meminfo and return (physical_percent, virtual_percent).

    Returns:
        tuple | None: physical-memory usage percent and swap usage percent
        (0 when there is no swap), or None when /proc/meminfo cannot be
        read or reports no total memory.
    """
    # Defaults guard against a truncated /proc/meminfo: the original left
    # these names unbound (NameError outside its try) if a field was missing.
    mem_total = mem_free = mem_buffer = mem_cache = 0
    vmem_total = vmem_free = 0
    try:
        # `with` closes the file even on error; the original's close()
        # sat inside the try and was skipped whenever a line raised.
        with open('/proc/meminfo', 'r') as f:
            for line in f:
                if line.startswith('MemTotal:'):
                    mem_total = int(line.split()[1])
                elif line.startswith('MemFree:'):
                    mem_free = int(line.split()[1])
                elif line.startswith('Buffers:'):
                    mem_buffer = int(line.split()[1])
                elif line.startswith('Cached:'):
                    mem_cache = int(line.split()[1])
                elif line.startswith('SwapTotal:'):
                    vmem_total = int(line.split()[1])
                elif line.startswith('SwapFree:'):
                    vmem_free = int(line.split()[1])
                else:
                    continue
    except OSError:
        # Same contract as the original's bare except: unreadable -> None.
        return None
    if mem_total <= 0:
        # Empty/odd file: avoid the zero-division error usage_percent raises.
        return None
    physical_percent = usage_percent(mem_total - (mem_free + mem_buffer + mem_cache), mem_total)
    virtual_percent = 0
    if vmem_total > 0:
        virtual_percent = usage_percent((vmem_total - vmem_free), vmem_total)
    return physical_percent, virtual_percent
def usage_percent(use, total):
    """Express `use` as a percentage of `total`.

    Raises:
        Exception: when `total` is zero (matches the original contract).
    """
    if total == 0:
        raise Exception("ERROR - zero division error")
    return float(use) / total * 100
# 获取磁盘根目录占用信息
# Read root-filesystem usage.
def disk_info():
    """Return the root filesystem's usage as an integer-percent string."""
    vfs = os.statvfs('/')  # swap the path here to inspect another mount
    total_bytes = vfs.f_frsize * vfs.f_blocks
    free_bytes = vfs.f_frsize * vfs.f_bfree
    used_pct = int((total_bytes - free_bytes) * 100.0 / total_bytes)
    return str(used_pct)
# 获取内存占用信息
# Read physical-memory usage.
def mem_info():
    """Return physical-memory usage as an integer-percent string."""
    physical_pct = get_mem_usage_percent()[0]
    return str(int(physical_pct))
# 获取CPU占用信息
# Read CPU usage.
def cpu_info():
    """Return CPU usage as an integer-percent string."""
    return str(int(get_cpu() * 100))
# 获取系统占用信息
# Read system load information.
def sys_info():
    """Return the number of load-average samples reported by the OS.

    NOTE(review): os.getloadavg() returns the (1, 5, 15 minute) triple, so
    this is the constant 3 on Linux — callers comparing the result to a
    load threshold probably wanted the values themselves; confirm intent.
    """
    samples = os.getloadavg()
    return len(samples)
# 获取计算机当前时间
# Read the host's current time.
def time_info():
    """Return a labelled string with the host's current local time."""
    stamp = time.strftime('%Y-%m-%d %H:%M:%S')
    return "主机的当前时间:%s" % stamp
# 获取计算机主机名称
# Read this machine's hostname.
def hostname_info():
    """Return a labelled string with the machine's hostname."""
    name = os.popen("hostname").read().strip()
    return "你的主机名是: %s" % name
# 获取IP地址信息
# Read the IP address bound to interface ens192 (empty when absent).
def ip_info():
    """Return the CIDR address of interface ens192, or '' if not present."""
    cmd = "ip a| grep ens192 | grep inet | awk '{print $2}'"
    return os.popen(cmd).read().strip()
# 获取根的占用信息
# Report usage of the centos-root volume from `df -h`.
def disk_info_root():
    """Scan `df -h` output for /dev/mapper/centos-root.

    Returns:
        str: a tab-joined header row, plus the (renamed) data row appended
        when usage exceeds 60%; empty string when the volume is absent.

    Fixes over the original: df output is bytes, so the str-vs-bytes
    membership test never matched (and `content` was then unbound at
    return); the percentage is parsed with int() instead of eval() on
    shell output.
    """
    content = ""
    child = subprocess.Popen(["df", "-h"], stdout=subprocess.PIPE)
    out = child.stdout.readlines()
    for item in out:
        # Decode before comparing with str tokens; df emits bytes here.
        line = item.decode("utf-8", errors="replace").strip().split()
        # Only the centos root volume is of interest.
        if '/dev/mapper/centos-root' in line:
            title = [u'-文件系统-', u'--容量-', u'-已用-', u'-可用-', u'-已用-', u'-挂载点--']
            content = "\t".join(title)
            # line[4] looks like "63%"; strip the sign and parse safely.
            if int(line[4].rstrip('%')) > 60:
                line[0] = 'centos-root'
                content += '\r\n' + '\t'.join(line)
    return content
# 测试程序
# if __name__ == "__main__":
# disk_information = disk_info()
# disk_usage = [int(s) for s in re.findall(r'\b\d+\b', disk_information)]
# infomation = [hostname_info(), time_info(), disk_information]
# print(disk_usage)
# # 如果磁盘占用高于60%就发邮件告警
# if disk_usage[0] > 60:
# print("当前磁盘占用率已超过60%,建议清除磁盘内存!")
#
# # print(hostname_info())
# # print(time_info())
# # print(ip_info())
# print(sys_info())
# print(cpu_info())
# print(mem_info())
# print(disk_info())
......@@ -3,7 +3,7 @@
# @Author : ctt
# @File : 文本内容提取
# @Project : untitled1
import re
import re, os
from docx import Document
import pandas as pd
......@@ -40,7 +40,7 @@ class Extract:
# {“主要职能”:””, “机构情况”:””, “人员情况”:””, “当年取得的主要事业成效”}
def __init__(self):
# self.main_functions = re.compile(r'(?<=[0-9][\..]主要职能[。\n])(.|\n)*?(?=[0-9][\..]机构情况[。\n])')
self.main_functions = re.compile(r'(?<=[0-9][\..]主要职能[。\n])(.|\n)*?(?=([一二三四五六七八九十])当年取得的主要事业成效[。\n])')
self.main_functions = re.compile(r'(?<=([一二三四五六七八九十])基本情况[。\n])(.|\n)*?(?=([一二三四五六七八九十])当年取得的主要事业成效[。\n])')
# self.institutional_situation = re.compile(r'(?<=[0-9][\..]机构情况[。\n])(.|\n)*?(?=[0-9][\..]人员情况[。\n])')
# self.personnel_situation = re.compile(r'(?<=[0-9][\..]人员情况[。\n])(.|\n)*?(?=([一二三四五六七八九十])当年取得的主要事业成效[。\n])')
self.business_results = re.compile(r'(?<=([一二三四五六七八九十])当年取得的主要事业成效[。\n])(.|\n)*?(?=[一二三四五六七八九十]、收入支出预算执行情况分析)')
......@@ -74,12 +74,16 @@ def get_text_from_docx(filepath):
contents = []
for paragraph in document.paragraphs:
if '<w:numPr>' in paragraph._element.xml:
contents.append('1.'+paragraph.text)
# print(paragraph.text)
contents.append('1.'+paragraph.text.replace("\xa0", ""))
contents.append('\n')
else:
contents.append(paragraph.text)
# print(paragraph.text)
contents.append(paragraph.text.replace("\xa0", ""))
contents.append('\n')
return ''.join(contents)
str_contents = ''.join(contents)
# return ''.join(contents)
return str_contents
def get_cover_content_from_docx(filepath):
......@@ -116,26 +120,10 @@ def get_cover_content_from_docx(filepath):
if __name__ == '__main__':
new_path = "data/2022年度安岳县元坝镇人民政府部门决算分析报告(1).docx"
document = get_text_from_docx(new_path)
data = Extract().extract_result(document)
print(data)
# fifth_area_pattern = re.compile(r'(?<=[0-9][\..]会计报表重要项目的明细信息及说明[。\n])(.|\n)*?(?=[0-9][\..]需要说明的其他事项[。\n])')
# filepath = "wKjIbGQeSb6AUq1aAAgAABcLaMw312.docx"
# document = Document(filepath)
# documents = get_text_from_docx(filepath)
#
# area_group = fifth_area_pattern.search(documents)
# if area_group:
# area_text = area_group.group().strip("1.").strip()
# else:
# area_text = ""
#
# print(area_text)
# filepath = "data/wKjIbGRUpsaATABTAA5_0ejDDaQ144.docx"
# cover_contents, other_contents = get_cover_content_from_docx(filepath)
# cover_pattern = re.compile(r"([0-9]{0,4}).*(?=(财务报告))")
#
# # print(content)
# #
# cover_group = cover_pattern.search(cover_contents)
# if cover_group:
# cover_text = cover_group.group().strip()
......@@ -147,13 +135,13 @@ if __name__ == '__main__':
# other_data["reportTitle"] = cover_text
# print(other_data)
extract = Extract()
document = get_text_from_docx("data/2022年度安岳县元坝镇人民政府部门决算分析报告(1).docx")
data = Extract().extract_result(document)
print(data)
# path = r'D:\四川报告\相关代码\四川报告之文本内容提取\data'
# extract = Extract()
# # path = r'D:\四川报告\相关代码\四川报告之文本内容提取\data'
# path = "data/temp.docx"
# result = extract.extract_result(path)
# print(result)
# for file in os.listdir(path):
# if file[-4:] == 'docx':
# filepath = os.path.join(path, file)
......
......@@ -12,6 +12,7 @@ from docx.oxml.text.paragraph import CT_P
from docx.oxml.table import CT_Tbl
from docx.table import _Cell, Table, _Row
from docx.text.paragraph import Paragraph
from docx.shared import Pt
def iter_block_items(parent):
......@@ -161,60 +162,137 @@ def get_other1_table(document, table_names: list):
return table_names_data
def read_document(document, old, new):
    """Replace `old` with `new` inside every table cell of `document`.

    Descends tables -> rows -> cells -> paragraphs -> runs and rewrites
    each run while the enclosing cell still contains the target text.
    Note: a match that spans run boundaries is not replaced.

    Returns:
        The mutated document.
    """
    for tbl in document.tables:
        for tbl_row in tbl.rows:
            for tbl_cell in tbl_row.cells:
                # A cell behaves like a paragraph container at this point.
                for paragraph in tbl_cell.paragraphs:
                    for run in paragraph.runs:
                        # Re-check per run: earlier rewrites may already
                        # have removed the target from the cell text.
                        if old in tbl_cell.text:
                            run.text = run.text.replace(old, new)
    return document
def replace_document(document):
    """Split '&&&'-delimited paragraphs into separate styled paragraphs.

    Two marker paragraphs are handled: one identified by containing
    "主要职能" together with either "机构情况" or "&&&" (part one), and any
    other paragraph containing "&&&" (part two). Each is split on "&&&",
    the pieces are inserted as new paragraphs (宋体 12pt, first-line
    indent, fixed line spacing) before the marker, and the marker
    paragraph is then emptied and removed. Finally any paragraph still
    holding the literal "{{info.amountDescription}}" placeholder is
    removed.

    Returns:
        The mutated python-docx document.
    """
    # Collect the two '&&&'-joined source paragraphs.
    temp_list_0, temp_list_1 = [], []
    for para in document.paragraphs:
        # Part one: the "主要职能" marker paragraph.
        if ("主要职能" in para.text and "机构情况" in para.text) or ("主要职能" in para.text and "&&&" in para.text):
            temp_list_0 = para.text.split("&&&")
        elif "&&&" in para.text:
            temp_list_1 = para.text.split("&&&")
    print(temp_list_0)
    print(temp_list_1)
    # Insert each piece of part one before its marker paragraph.
    for temp in temp_list_0:
        if temp:
            for i, p in enumerate(document.paragraphs):
                if ("主要职能" in p.text and "机构情况" in p.text) or ("主要职能" in p.text and "&&&" in p.text):
                    target_para = document.paragraphs[i]
                    new_para = target_para.insert_paragraph_before(temp)
                    # todo: apply paragraph styling
                    font = new_para.runs[0].font
                    font.name = "宋体"
                    font.size = Pt(12)
                    # new_para.paragraph_format.space_before = Pt(12)
                    new_para.paragraph_format.first_line_indent = Pt(25)
                    new_para.paragraph_format.line_spacing = Pt(23.4)
    # Blank out and remove the original part-one marker paragraph.
    for p in document.paragraphs:
        if ("主要职能" in p.text and "机构情况" in p.text) or ("主要职能" in p.text and "&&&" in p.text):
            p.text = p.text.replace(p.text, "")
            document._element.body.remove(p._element)
    # Insert each piece of part two before its marker paragraph.
    for temp in temp_list_1:
        if temp:
            for i, p in enumerate(document.paragraphs):
                if "&&&" in p.text:
                    target_para = document.paragraphs[i]
                    new_para = target_para.insert_paragraph_before(temp)
                    # todo: apply paragraph styling
                    font = new_para.runs[0].font
                    font.name = "宋体"
                    font.size = Pt(12)
                    # new_para.paragraph_format.space_before = Pt(12)
                    new_para.paragraph_format.first_line_indent = Pt(25)
                    new_para.paragraph_format.line_spacing = Pt(23.4)
    # Blank out and remove the original part-two marker paragraph.
    for p in document.paragraphs:
        if "&&&" in p.text:
            p.text = p.text.replace(p.text, "")
            document._element.body.remove(p._element)
    # Remove the leftover placeholder paragraph, if present.
    for para in document.paragraphs:
        if "{{info.amountDescription}}" in para.text:
            para.text = para.text.replace(para.text, "")
            document._element.body.remove(para._element)
    # document.save("data/test.docx")
    return document
if __name__ == '__main__':
docx_file = r'data/3月23测试半成品.docx'
docx_file = r'data/20230512204902_5月10号财务报告测试.docx'
document = Document(docx_file)
table_names = ['货币资金明细信息如下']
print(get_other1_table(document, table_names))
# import datetime
# start_time = datetime.datetime.now()
# docx_file = r'data/四川报告模板.docx'
# document = Document(docx_file)
# data = get_choose_table(document, ['资产负债表', '收入费用表(1)', '收入费用表(2)'])
# # 处理资产负债表
# temp_list = data["资产负债表"]
# temp_dict = {}
replace_document(document)
# 循环遍历所有段落
# temp_list_0, temp_list_1 = [], []
# for para in document.paragraphs:
# # 使用replace()函数替换垂直制表符
# if "主要职能&&&" in para.text:
# temp_list_0 = para.text.split("&&&")
# elif "&&&" in para.text:
# temp_list_1 = para.text.split("&&&")
#
# for temp in temp_list:
# temp_text = re.sub(":", ":", temp["项目"])
# if temp_text.endswith(":"):
# temp_dict.update({"temp_key": temp_text})
# continue
# else:
# temp["上级项目"] = temp_dict["temp_key"].strip(":")
# for temp in temp_list_0:
# if temp:
# for i, p in enumerate(document.paragraphs):
# if "主要职能&&&" in p.text:
# target_para = document.paragraphs[i]
# new_para = target_para.insert_paragraph_before(temp)
# # todo: 添加段落样式
# font = new_para.runs[0].font
# font.name = "宋体"
# font.size = Pt(12)
# # new_para.paragraph_format.space_before = Pt(12)
# new_para.paragraph_format.first_line_indent = Pt(25)
# new_para.paragraph_format.line_spacing = Pt(23.4)
#
# for p in document.paragraphs:
# if "主要职能&&&" in p.text:
# p.text = p.text.replace(p.text, "")
# document._element.body.remove(p._element)
#
# # 处理收入费用表(1)
# temp_list_0 = data["收入费用表(1)"]
# temp_dict_0 = {"temp_key": "收入合计"}
# # updata_list = ["收入合计", "本年盈余"]
# for temp_0 in temp_list_0:
# if temp_0["项目"].strip() == "收入合计":
# temp_dict_0.update({"temp_key": "本年盈余"})
# else:
# if temp_0["项目"].strip() == "本年盈余":
# continue
# else:
# temp_0["上级项目"] = temp_dict_0["temp_key"]
# for temp in temp_list_1:
# if temp:
# for i, p in enumerate(document.paragraphs):
# if "&&&" in p.text:
# target_para = document.paragraphs[i]
# new_para = target_para.insert_paragraph_before(temp)
# # todo: 添加段落样式
# font = new_para.runs[0].font
# font.name = "宋体"
# font.size = Pt(12)
# # new_para.paragraph_format.space_before = Pt(12)
# new_para.paragraph_format.first_line_indent = Pt(25)
# new_para.paragraph_format.line_spacing = Pt(23.4)
#
# # 处理收入费用表(2)
# temp_list_1 = data["收入费用表(2)"]
# temp_dict_1 = {"temp_key": "收入合计"}
# # updata_list = ["收入合计", "本年盈余"]
# for temp_1 in temp_list_1:
# if temp_1["项目"].strip() == "收入合计":
# temp_dict_1.update({"temp_key": "本年盈余"})
# else:
# if temp_1["项目"].strip() == "本年盈余":
# continue
# else:
# temp_1["上级项目"] = temp_dict_1["temp_key"]
# print(data)
# end_time = datetime.datetime.now()
# print(start_time)
# print(end_time)
# print("耗时: {}秒".format(end_time - start_time))
# for p in document.paragraphs:
# if "&&&" in p.text:
# p.text = p.text.replace(p.text, "")
# document._element.body.remove(p._element)
#
# for para in document.paragraphs:
# if "{{info.amountDescription}}" in para.text:
# para.text = para.text.replace(para.text, "")
# document._element.body.remove(para._element)
document.save("data/test.docx")
......
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -11,14 +11,14 @@ from pyecharts.charts import Pie, Bar, Line, Grid
from pyecharts.faker import Faker
from pyecharts.render import make_snapshot # 导入输出图片工具
from snapshot_selenium import snapshot # 使用snapshot-selenium 渲染图片
from pyecharts.globals import CurrentConfig, ThemeType
from pyecharts.globals import CurrentConfig
from pathlib import Path
# import time
from unittest import mock
from base.config.base_config import root_dir
from utils.tools import timeit
# root_dir = '..'
import threading
# 解决linux 下图片生成失败问题
......@@ -32,16 +32,6 @@ def get_chrome_driver():
return webdriver.Chrome(options=options)
# def timeit(f):
# def timed(*args, **kw):
# ts = time.time()
# print('......begin {0:8s}......'.format(f.__name__))
# result = f(*args, **kw)
# te = time.time()
# print('......finish {0:8s}, took:{1:.4f} sec......'.format(f.__name__, te - ts))
# return result
#
# return timed
"""
关于: [图片生成的中文字体样式渲染问题]
......@@ -57,40 +47,40 @@ CurrentConfig.ONLINE_HOST = 'http://39.105.62.235:8000/assets/'
pic_echarts_dir = os.path.join(root_dir, 'generate/echarts')
Path(pic_echarts_dir).mkdir(parents=True, exist_ok=True)
lock = threading.RLock()
@timeit
def pic_echarts_pie(keys: list, values: list, title: str or None) -> str:
pic_echarts_path = os.path.join(pic_echarts_dir, 'echarts_pie.png')
pie = (
Pie().add(
series_name='',
data_pair=[list(z) for z in zip(keys, values)],
center=['45%', '50%'],
is_clockwise=False
).set_global_opts(
title_opts=opts.TitleOpts(title=title),
legend_opts=opts.LegendOpts(type_='scroll', pos_left='80%', orient='vertical', textstyle_opts=opts.TextStyleOpts(font_size=20))
).set_series_opts(
# label_opts=opts.LabelOpts(formatter='{b}: {c}({d}%)')
label_opts=opts.LabelOpts(formatter="{b}: {d}%", font_size=20)
def pic_echarts_pie(keys: list, values: list, title: str or None, pic_echarts_path: str) -> str:
# pic_echarts_path = os.path.join(pic_echarts_dir, 'echarts_pie.png')
# pic_echarts_path = os.path.join(pic_echarts_dir, temp_file_name)
with lock:
pie = (
Pie().add(
series_name='',
data_pair=[list(z) for z in zip(keys, values)],
center=['45%', '50%'],
is_clockwise=False
).set_global_opts(
title_opts=opts.TitleOpts(title=title),
legend_opts=opts.LegendOpts(type_='scroll', pos_left='80%', orient='vertical', textstyle_opts=opts.TextStyleOpts(font_size=20))
).set_series_opts(
label_opts=opts.LabelOpts(formatter="{b}: {d}%", font_size=18)
)
)
# # 设置标签字体大小
# .set_series_opts(label_opts=opts.LabelOpts(font_size=22))
)
with mock.patch('snapshot_selenium.snapshot.get_chrome_driver', get_chrome_driver):
make_snapshot(snapshot, pie.render(), pic_echarts_path)
# print("当前处理的数据集key{},和value{}".format(keys, values))
with lock:
with mock.patch('snapshot_selenium.snapshot.get_chrome_driver', get_chrome_driver):
make_snapshot(snapshot, pie.render(), pic_echarts_path)
# make_snapshot(snapshot, pie.render(), pic_echarts_path)
return pic_echarts_path
@timeit
def pic_echarts_bar(
keys: list, dict_values: dict, title=None,
x_name=None, y_name=None
keys: list, dict_values: dict, temp_file_name: str, title=None,
x_name=None, y_name=None,
) -> str:
pic_echarts_path = os.path.join(pic_echarts_dir, 'echarts_bar.png')
pic_echarts_path = os.path.join(pic_echarts_dir, temp_file_name)
bar = (
Bar().add_xaxis(
xaxis_data=keys
......@@ -113,10 +103,10 @@ def pic_echarts_bar(
@timeit
def pic_echarts_line(
keys: list, dict_values: dict, title=None,
keys: list, dict_values: dict, temp_file_name: str, title=None,
x_name=None, y_name=None
) -> str:
pic_echarts_path = os.path.join(pic_echarts_dir, 'echarts_line.png')
pic_echarts_path = os.path.join(pic_echarts_dir, temp_file_name)
line = (
Line().add_xaxis(
xaxis_data=keys
......@@ -193,6 +183,7 @@ def pic_echarts_line_test() -> None:
@timeit
def pic_echarts_bar_line(
temp_file_name: str,
keys=['2016年报', '2017年报', '2018年报', '2019年报', '2020年报', '2021年报'],
dict_bar_values={
'总资产': [1905.11, 1998.17, 2009.65, 2031.37, 1950.35, 1988.65],
......@@ -205,7 +196,7 @@ def pic_echarts_bar_line(
title='资产负债表(CNY)',
x_name='年度', y_name_left='金额/(亿元)', y_name_right='负债率/(%)'
) -> str:
pic_echarts_path = os.path.join(pic_echarts_dir, 'echarts_bar_line.png')
pic_echarts_path = os.path.join(pic_echarts_dir, temp_file_name)
bar = (
Bar().add_xaxis(
xaxis_data=keys
......@@ -342,7 +333,8 @@ def pic_echarts_bar_line_test() -> str:
if __name__ == '__main__':
pic_echarts_pie(keys=Faker.choose(), values=Faker.values(), title='Echarts Pie 标题1')
# pic_echarts_pie(keys=Faker.choose(), values=Faker.values(), title='Echarts Pie 标题1')
# pic_echarts_bar(
# keys=Faker.choose(),
# dict_values={
......@@ -363,4 +355,4 @@ if __name__ == '__main__':
# y_name='Y轴名称'
# )
# pic_echarts_line_test()
# pic_echarts_bar_line()
pic_echarts_bar_line()
......@@ -11,7 +11,7 @@ from flask import request
from flask import Flask, send_file
# from transform_doc_to_docx import doc2docx, closesoft
import subprocess
from generate.gen_user_report_auto_generated import main_process
from generate.gen_user_report_auto_generated import new_main_process
UPLOAD_FOLDER = r'data' # 上传路径
Path(UPLOAD_FOLDER).mkdir(parents=True, exist_ok=True)
abs_path = os.path.split(os.path.realpath(__file__))[0]
......@@ -64,7 +64,7 @@ def generate_report(template_path, document_path, report_name, object):
half_work_path = os.path.join(UPLOAD_FOLDER, template_filename)
else:
return "上传文件格式有误,当前仅支持doc 和 docx 格式,请选择正确文件重新上传!"
main_process(half_work_path, tables_dict, template_path, report_name, data_object, save_path)
# main_process(half_work_path, tables_dict, template_path, report_name, data_object, save_path)
# send_path = os.path.join(UPLOAD_FOLDER, report_name)
# return send_file(send_path, as_attachment=True)
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File : main_server.py
# @Time : 2023/5/15 15:39
# @Author : bruxelles_li
# @Software: PyCharm
import logging
import requests
import threading
import sys
import json
from pathlib import Path
sys.path.append('../')
# 关闭多余连接
s = requests.session()
s.keep_alive = False
from detector_source import sys_info, cpu_info, mem_info
# 文件上传服务器定义
from fdfs_client.client import *
tracker_conf = get_tracker_conf("data/fdfs_client.conf")
client = Fdfs_client(tracker_conf)
formatter = logging.Formatter("%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %(message)s")
# Create a module logger and set its level.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# Console handler: INFO and above go to the terminal.
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
# File handlers: errors and info are written to separate files under ./log.
# NOTE(review): `os` is not imported here directly — presumably it arrives
# via `from fdfs_client.client import *` above; confirm.
_tmp_path = os.path.dirname(os.path.abspath(__file__))
_tmp_path = os.path.join(_tmp_path, 'log')
Path(_tmp_path).mkdir(parents=True, exist_ok=True)
fh = logging.FileHandler(os.path.join(_tmp_path, "main_server_error.log"))
fh1 = logging.FileHandler(os.path.join(_tmp_path, "main_server_info.log"))
fh.setLevel(level=logging.ERROR)
fh1.setLevel(level=logging.INFO)
fh.setFormatter(formatter)
fh1.setFormatter(formatter)
# Emit records to the console and to both files.
logger.addHandler(ch)
logger.addHandler(fh)
logger.addHandler(fh1)
# TODO: list that tracks worker threads spawned by system_start().
all_thread = []
"""
测试地址:http://114.115.185.13:9988/datapull/aiReport/report/callbackStatus
正式地址:http://192.168.1.70:9988/datapull/aiReport/report/callbackStatus
"""
# callback_url = "http://114.115.185.13:9988/datapull/aiReport/report/callbackStatus"
callback_url = "http://192.168.1.70:9988/datapull/aiReport/report/callbackStatus"
# todo: merge paragraphs and de-duplicate sentences
def main_process(half_document_path, tables_dict, template_path, data_object, report_id,
                 output_report_path, report_name, template_id):
    """Generate a report from a half-finished document plus a template,
    upload the result via FastDFS, and POST a status callback.

    On any exception a failure payload (status "1") is sent to the same
    callback URL instead.

    Args:
        half_document_path: path to the half-finished .docx.
        tables_dict: placeholder-name -> table-name mapping.
        template_path: path to the report template .docx.
        data_object: substitution data for the template.
        report_id: report identifier echoed back in the callback.
        output_report_path: intermediate output path for new_main_process.
        report_name: human-readable report name used in the saved filename.
        template_id: template identifier echoed back in the callback.

    Returns:
        dict: the callback payload that was sent (success or failure).

    NOTE(review): `datetime`, `os` and `time` are not imported in this
    file's visible import block — presumably they come in via
    `from fdfs_client.client import *`; confirm.
    """
    # Feed the half-finished document and the template to the generator,
    # producing the path of the updated document.
    try:
        from generate.gen_user_report_auto_generated import new_main_process
        from docx import Document
        half_document = Document(half_document_path)
        template_document = Document(template_path)
        report_processed_path = new_main_process(half_document=half_document,
                                                 tables_dict=tables_dict,
                                                 template_document=template_document,
                                                 data_object=data_object,
                                                 report_id=report_id,
                                                 output_report_path=output_report_path)
        # todo: strip non-breaking spaces from table cells
        report_processed = Document(report_processed_path)
        from extract_table import read_document, replace_document
        report_processed = read_document(report_processed, "\u00A0", "")
        # todo: blank out missing-value placeholders
        report_processed = replace_document(report_processed)
        final_report_name = '{}_{}.docx'.format(str(datetime.datetime.now().strftime('%Y%m%d%H%M%S')), report_name)
        UPLOAD_FOLDER = r'data/'  # upload directory
        Path(UPLOAD_FOLDER).mkdir(parents=True, exist_ok=True)
        send_path = os.path.join(UPLOAD_FOLDER, final_report_name)
        report_processed.save(send_path)
        # Upload the finished file to FastDFS and extract its remote id.
        ret_upload = client.upload_by_filename(send_path)
        logger.info(ret_upload["Remote file_id"])
        # b'group1/M00/00/0A/wKjIlGRjcHiAVnTuAAEo5wnGJLQ89.docx'
        str_ret_upload = str(ret_upload["Remote file_id"])
        filePath = str_ret_upload.strip('b').replace("'", "").strip()
        logger.info(filePath)
        dict_result = {
            "status": "0",  # processing status ("0" success, "1" failure)
            "result": "处理成功",
            "id": report_id,  # report id
            "templeteId": template_id,  # report template
            "filePath": filePath
        }
        logger.info(dict_result)
        # todo: notify the Java status-update endpoint of the result
        payload = json.dumps(dict_result)
        headers = {
            'Content-Type': 'application/json'
        }
        r1 = requests.post(url=callback_url,
                           headers=headers, data=payload)
        logger.info(r1.text)
    except Exception as e:
        dict_result = {
            "status": "1",  # processing status (0 success, 1 failure)
            "result": "生成失败!+ {}".format(str(e)),
            "id": report_id,  # report id
            "templeteId": template_id,  # report template
            "filePath": ""
        }
        # todo: notify the Java status-update endpoint of the failure
        payload = json.dumps(dict_result)
        headers = {
            'Content-Type': 'application/json'
        }
        r1 = requests.post(url=callback_url,
                           headers=headers, data=payload)
        logger.info(r1.text)
    return dict_result
def env_eval():
    """Decide whether system resources allow launching another job.

    Reads the load sample count, CPU usage and physical-memory usage from
    the detector_source helpers.

    Returns:
        bool: False when any threshold is exceeded, True otherwise.
    """
    # todo: gather resource info (disk, load, CPU, physical memory)
    # disk_usage = disk_info()
    sys_usage = sys_info()
    cpu_usage = cpu_info()
    men_usage = mem_info()
    # cpu_info()/mem_info() return numeric strings; the original compared
    # them lexicographically (e.g. "100" > "95" is False, so 100% CPU
    # passed the check). Convert to int before applying the thresholds.
    if sys_usage > 20 or int(cpu_usage) > 95 or int(men_usage) > 95:
        return False
    # Resources are available.
    return True
def system_start():
    """Poll the local task queue forever and spawn a worker thread per job.

    Every iteration asks http://localhost:4000/queue_size for the number of
    queued jobs; when the queue is empty it sleeps 6s, otherwise it
    consumes each job via /subject_consumer, waits until env_eval() says
    resources are available, and starts a daemon thread running
    main_process with the job's configuration. Never returns.
    """
    while True:
        headers = {
            'Content-Type': 'application/json'
        }
        r1 = requests.post(url='http://localhost:4000/queue_size', headers=headers)
        r1_json = json.loads(r1.text)
        queue_left_number = r1_json['queue_left_number']
        logger.info("当前队列任务总数:" + str(queue_left_number))
        if queue_left_number == 0:
            time.sleep(6)
        else:
            # todo: the queue is non-empty — process each queued report
            for i in range(queue_left_number):
                r2 = requests.post(url='http://localhost:4000/subject_consumer', headers=headers)
                r2_json = json.loads(r2.text)
                config_info = r2_json['data']
                logger.info(config_info)
                report_id = config_info["report_id"]
                template_id = config_info["template_id"]
                half_document_path = config_info["half_document_path"]
                tables_dict = config_info["tables_dict"]
                template_path = config_info["template_path"]
                data_object = config_info["data_object"]
                output_report_path = config_info["output_report_path"]
                report_name = config_info["report_name"]
                logger.info('##########报告生成服务###############')
                t = threading.Thread(target=main_process, args=(half_document_path, tables_dict, template_path,
                                                                data_object, report_id, output_report_path,
                                                                report_name, template_id),
                                     daemon=True)
                # Block until resources free up before launching the worker.
                while True:
                    if env_eval():
                        break
                    else:
                        time.sleep(6)
                # Launch the worker thread.
                t.start()
                all_thread.append(t)
def system_resume():
    """Restore the service to a clean state by draining the local queue.

    Empties the queue at http://localhost:4000 so jobs are not started
    twice after a restart, logging progress along the way.

    :return: None
    """
    headers = {
        'Content-Type': 'application/json'
    }
    # Drain any leftover jobs so the service does not launch duplicates.
    r1 = requests.post(url='http://localhost:4000/queue_size', headers=headers)
    r1_json = r1.json()
    logger.info('当前队列数量:%d' % r1_json['queue_left_number'])
    if r1_json['queue_left_number'] > 0:
        logger.info('正在消费队列,直到队列为空!')
        while True:
            r2 = requests.post(url='http://localhost:4000/subject_consumer', headers=headers)
            r2_json = r2.json()
            if r2_json['queue_left_number'] == 0:
                logger.info('队列消费完毕!可放心进行数据去重入库服务 ...')
                break
    else:
        logger.info('队列为空!可放心进行数据去重入库服务 ...')
def start_up_check():
    """Pre-start check: verify the queue server is reachable.

    Posts to http://localhost:4000/queue_size; on a ConnectionError the
    process logs a warning and exits with code 123, otherwise it logs
    success and returns.

    :return: None
    """
    while True:
        try:
            headers = {
                'Content-Type': 'application/json'
            }
            # Response body is unused; only reachability matters here.
            r0 = requests.post(url='http://localhost:4000/queue_size', headers=headers)
            # todo: if the request succeeded the server is up
            server_started = True
        except requests.exceptions.ConnectionError as e:
            server_started = False
            logger.error("Error: ConnectionError" + str(e))
            logger.warning('服务未启动,请先启动server! 程序已退出。')
            exit(123)
            # logger.info('server正在尝试自启 ...')
            # time.sleep(3)
        if server_started:
            logger.info("server启动成功!报告生成服务已启动...")
            break
if __name__ == '__main__':
    # Start the background processing service: check the queue server,
    # drain any stale jobs, then enter the polling loop.
    start_up_check()
    logger.info('报告生成服务恢复中 ...')
    system_resume()
    time.sleep(30)
    logger.info('报告生成服务恢复完成!')
    logger.info('报告生成服务运行中 ...')
    system_start()
# -*- coding: utf-8 -*-
# @Time : 2023/3/7 17:29
# @Author : ctt
# @File : copy_table
# @Project : 表格复制
from copy import deepcopy
from docx import Document
"""
prep_p = p.insert_paragraph_before("段落前插入内容)
document.add_page_break() # 插入分页符
"""
import re
from docx import Document
from docx.document import Document as _Document
from docx.oxml.text.paragraph import CT_P
from docx.oxml.table import CT_Tbl
from docx.table import _Cell, Table, _Row
from docx.text.paragraph import Paragraph
from docx.shared import Pt
from copy_table import get_choose_table, generate_report
from generate.gen_user_report_auto_generated import new_GeneralUserReportAutoGenerated
import os, datetime
# Source document and report template used by the __main__ smoke test below —
# presumably local test fixtures; TODO confirm the data/ directory layout.
doc_path = "data/2022年度德阳市旌阳区人民法院(1).docx"
temp_path = "data/财务报告模板(5).docx"
# Template placeholder id -> caption of the table to extract from the source doc.
tables_dict = {
    "table5": "收入费用表(2)",
    "table4": "收入费用表(1)",
    "table3": "资产负债表续表2",
    "table2": "资产负债表续表1",
    "table1": "资产负债表",
}
# Sample report payload (financial indicators, ratio breakdowns and unit info)
# used for local testing — NOTE(review): appears to be fixture data; the real
# payload presumably arrives from the caller of send_regenerate_report. Verify.
data_object = {
    "finance": {
        "publicInfrastructureRatio": "",
        "affordabelHouseNewRatio": "",
        "assetLiabilityRatio": "24.31%",
        "assetLiabilityRatioRemark": "说明本单位财务风险低",
        "beInDebt": "264.31",
        "beInDebtChangeRatio": "24.88%",
        "beInDebtChangeRatioRemark": "主要原因是本年的其他应付款的减少",
        "cashRatio": "248.35%",
        "cashRatioRemark": "说明本单位利用现金和现金等价物偿还短期债务的能力强",
        "composition": "流动资产占100.00%",
        "currentAssetsCompose": "货币资金",
        "currentLiabilitiesCompose": "其他应付款",
        "currentRatio": "248.35%",
        "currentRatioRemark": "说明本单位流动资产偿还短期债务的能力强",
        "debtComparisonRatio": "减少87.56万元,减少24.88%",
        "debtComposition": "流动负债占100.00%",
        "fixedAssetsDepreciationRatio": "77.75%",
        "fixedAssetsDepreciationRatioRemark": "说明本单位固定资产持续服务能力较强",
        "netAssets": "823.10",
        "nonCurrentAssetsCompose": "固定资产原值、固定资产净值、无形资产原价和无形资产净值",
        "otherRemark": "资产总额较上年增减变动幅度超过20%主要原因是货币资金减少265.35万元;负债总额较上年增减变动幅度超过20%主要原因是其他应付款减少87.56万元",
        "revenueExpensesRatio": "100.00%",
        "revenueExpensesRatioRemark": "小于",
        "surplus": "",
        "totalAssets": "1087.40",
        "totalAssetsChangeRatio": "20.53%",
        "totalAssetsChangeRatioRemark": "主要原因是本年的货币资金的减少",
        "totalAssetsComparison": "减少280.84万元,减少20.53%",
        "totalExpenses": "1890.01",
        "totalExpensesChangeRatio": "0.20%",
        "totalExpensesChangeRatioRemark": "主要原因是本年的业务活动费用的减少",
        "totalExpensesComparison": "减少3.77万元,减少0.20%",
        "totalExpensesCompose": "业务活动费用",
        "totalRevenue": "1890.01",
        "totalRevenueChangeRatio": "15.49%",
        "totalRevenueChangeRatioRemark": "主要原因是本年的财政拨款收入的减少",
        "totalRevenueComparison": "减少346.51万元,减少15.49%",
        "totalRevenueCompose": "财政拨款收入和其他收入",
        "totalRevenueComposeDetail": "财政拨款收入占比99.88%、其他收入占比0.12%",
        "unitAssetComposition": "流动资产",
        "unitDebtComposition": "流动负债"
    },
    "收入占比": [
        {
            "beforeDataValue": "1229.05",
            "dataValue": "1887.83",
            "indexName": "财政拨款收入",
            "subtractValue": "658.78"
        },
        {
            "beforeDataValue": "5.42",
            "dataValue": "2.19",
            "indexName": "其他收入",
            "subtractValue": "3.23"
        }
    ],
    "流动负债占比": [
        {
            "beforeDataValue": "351.87",
            "dataValue": "264.31",
            "indexName": "其他应付款",
            "subtractValue": "87.56"
        }
    ],
    "流动资产占比": [
        {
            "beforeDataValue": "921.76",
            "dataValue": "656.41",
            "indexName": "货币资金",
            "subtractValue": "265.35"
        }
    ],
    "负债占比": [
        {
            "beforeDataValue": "351.87",
            "dataValue": "264.31",
            "indexName": "流动负债合计"
        }
    ],
    "费用占比": [
        {
            "beforeDataValue": "1893.72",
            "dataValue": "1890.01",
            "indexName": "业务活动费用",
            "subtractValue": "3.71"
        }
    ],
    "资产占比": [
        {
            "beforeDataValue": "921.76",
            "dataValue": "656.41",
            "indexName": "流动资产合计"
        },
        {
            "beforeDataValue": "446.48",
            "dataValue": "431.00",
            "indexName": "非流动资产合计"
        }
    ],
    "非流动资产占比": [
        {
            "beforeDataValue": "554.31",
            "dataValue": "554.31",
            "indexName": "固定资产原值",
            "subtractValue": "0.00"
        },
        {
            "beforeDataValue": "446.48",
            "dataValue": "431.00",
            "indexName": "固定资产净值",
            "subtractValue": "15.48"
        },
        {
            "beforeDataValue": "0.00",
            "dataValue": "0.00",
            "indexName": "无形资产原价",
            "subtractValue": "0.00"
        },
        {
            "beforeDataValue": "0.00",
            "dataValue": "0.00",
            "indexName": "无形资产净值",
            "subtractValue": "0.00"
        }
    ],
    "info": {
        "internalControl": "2021年,本单位加强学习国家和省关于内部控制的文件。建立健全了单位层面的内部控制体系和制度,健全了预算、收支、采购、建设、资产和合同的内控流程和制度,把内部控制落实在业务流程中,实现了不相容岗位相互分离、形成相互制约、相互监督的工作机制;实现了内部授权审批控制。",
        "unitName": "安岳县元坝镇人民政府",
        "unitCall": "本部门",
        "mainFunctions": "无资料数据",
        "year": "2021",
        "amountDescription": "本单位无以名义金额计量的资产。",
        "unitBudgetLevel": "二级预算单位",
        "institutionalSituation": "无资料数据",
        "performanceManagement": "2021年,本单位按照绩效管理要求对照设定预算绩效目标、绩效指标的成本指标、产出指标、效益指标、满意度指标等具体内容,开展项目绩效目标申报、运行监控和自评工作。通过预算绩效管理对工作中存在的薄弱环节作出针对性查漏补缺和持续完善。",
        "LastYear": "2020",
        "personnelSituation": "无资料数据",
        "unitType": "行政单位",
        "budgetManagement": "2021年,本单位严格按照《预算法》、《会计法》、《政府会计制度》和上级的文件建立健全财务制度;严格执行财经纪律和各项财务制度;强化预算管理,加强对银行存款和现金的管理;单位对年终决算高度重视,组织专人负责编制决算报告,对决算数据进行了严格审核,认真分析并应用到下年的预算工作。",
        "assetManagement": "2021年,本单位资产实行分类管理,建立健全了资产内部管理制度;单位加强对实物资产和无形资产的管理,明确相关部门和岗位的职责权限,强化对配置、使用和处置等关键环节的管控;明确资产使用和保管责任人,落实资产使用人在资产管理中的责任。",
        "pppProject": "本单位无PPP项目。",
        "careerAchievements": "无资料数据"
    }
}
# Regex patterns delimiting the content regions to copy. Each pattern uses a
# lookbehind on a section heading so the match itself is zero-width at the end
# of the heading line.
# NOTE(review): the character class [\..] contains only a literal '.' twice —
# if a full-width '。' was also intended, it should read [.。]; confirm with
# the original authors before changing.
start_pattern1 = re.compile(r'(?<=导 言)$')
end_pattern1 = re.compile(r'(?<=[0-9][\..]资产负债情况)$|(?<=[0-9][\..]资产负债情况。)$')
start_pattern2 = re.compile(r'(?<=[0-9][\..]资产负债情况)$|(?<=[0-9][\..]资产负债情况。)$')
end_pattern2 = re.compile(r'(?<=[0-9][\..]收入费用情况)$|(?<=[0-9][\..]收入费用情况。)$')
start_pattern3 = re.compile(r'(?<=[0-9][\..]收入费用情况)$|(?<=[0-9][\..]收入费用情况。)$')
end_pattern3 = re.compile(r'(?<=[一二三四五六七八九十][\.、]政府部门财务报表)$')
# start_pattern4 = re.compile(r'^(?<=资产负债表)$')
# end_pattern4 = re.compile(r'(?<=表 2-1)$')
# start_pattern5 = re.compile(r'^(?<=收入费用表(1))$')
# end_pattern5 = re.compile(r'(?<=表 2-2)$')
# start_pattern6 = re.compile(r'^(?<=收入费用表(2))$')
# end_pattern6 = re.compile(r'(?<=([一二三四五六七八九十])政府部门会计报表附注)$')
start_pattern8 = re.compile(r'(?<=[0-9][\..]会计报表编制基础)$')
end_pattern8 = re.compile(r'(?<=[0-9][\..]遵循相关制度规定的声明)$')
# From here on, each region's start anchor is the previous region's end anchor.
start_pattern9 = end_pattern8
end_pattern9 = re.compile(r'(?<=[0-9][\..]合并范围)$')
start_pattern10 = end_pattern9
end_pattern10 = re.compile(r'(?<=[0-9][\..]重要会计政策与会计估计变更情况)$')
start_pattern11 = end_pattern10
end_pattern11 = re.compile(r'(?<=[0-9][\..]会计报表重要项目的明细信息及说明)$')
start_pattern12 = end_pattern11
end_pattern12 = re.compile(r'(?<=[0-9][\..]需要说明的其他事项)$|(?<=[0-9][\..]需要说明的其他事项。)(略)$')
start_pattern13 = end_pattern12
end_pattern13 = re.compile(r'(?<=[一二三四五六七八九十][\.、]政府部门财务分析)$')
#
start_pattern14 = re.compile(r'(?<=[0-9][\..]基本情况)$')
end_pattern14 = re.compile(r'(?<=[0-9][\..]本年取得的主要事业成效)$')
start_pattern15 = end_pattern14
end_pattern15 = re.compile(r'(?<=([一二三四五六七八九十])政府部门财务状况分析)$')
#
start_pattern16 = end_pattern15
end_pattern16 = re.compile(r'(?<=[0-9][\..]单位资产构成及变化情况)$')
#
start_pattern17 = end_pattern16
# Chart captions (图一/图四/图六) act as terminators; these are not anchored to $.
end_pattern17 = re.compile(r'(?<=图一:资产占比)')
# end_pattern17 = re.compile(r'(?<={{ {"chart\.type": "pie", "chart\.dataset": "资产占比")')
# end_pattern17 = re.compile(r'(?<=[0-9][\..]单位负债构成及变化情况)$')
#
start_pattern18 = re.compile(r'(?<=[0-9][\..]单位负债构成及变化情况)$')
# end_pattern18 = re.compile(r'(?<=3.主要指标分析)$')
end_pattern18 = re.compile(r'(?<=图四:负债占比)')
#
start_pattern19 = re.compile(r'(?<=3.主要指标分析)$')
end_pattern19 = re.compile(r'(?<=([一二三四五六七八九十])政府部门运行情况分析)$')
#
start_pattern20 = end_pattern19
end_pattern20 = re.compile(r'(?<=[0-9][\..]单位收入费用构成及变化情况)$')
#
start_pattern21 = end_pattern20
# end_pattern21 = re.compile(r'(?<=2.主要指标分析)$')
end_pattern21 = re.compile(r'(?<=图六:收入占比)')
#
start_pattern22 = re.compile(r'(?<=2.主要指标分析)$')
end_pattern22 = re.compile(r'(?<=([一二三四五六七八九十])政府部门财务管理情况)$')
#
start_pattern23 = re.compile(r'(?<=[0-9][\..]预算管理情况)$')
end_pattern23 = re.compile(r'(?<=[0-9][\..]绩效管理情况)$')
start_pattern24 = end_pattern23
end_pattern24 = re.compile(r'(?<=[0-9][\..]内部控制情况)$')
start_pattern25 = end_pattern24
end_pattern25 = re.compile(r'(?<=[0-9][\..]资产管理情况)$')
start_pattern26 = end_pattern25
# Catch-all terminator for the final section.
end_pattern26 = re.compile(r'.*[\s]]')
# Ordered (start_pattern, end_pattern) pairs consumed by copy_content_main;
# each pair brackets one section of the source report to copy into the template.
new_regions = [
    (start_pattern1, end_pattern1),
    (start_pattern2, end_pattern2),
    (start_pattern3, end_pattern3),
    # (start_pattern4, end_pattern4),
    # (start_pattern5, end_pattern5),
    # (start_pattern6, end_pattern6),
    # (start_pattern7, end_pattern7),
    (start_pattern8, end_pattern8),
    (start_pattern9, end_pattern9),
    (start_pattern10, end_pattern10),
    (start_pattern11, end_pattern11),
    (start_pattern12, end_pattern12),
    (start_pattern13, end_pattern13),
    (start_pattern14, end_pattern14),
    (start_pattern15, end_pattern15),
    (start_pattern16, end_pattern16),
    (start_pattern17, end_pattern17),
    (start_pattern18, end_pattern18),
    (start_pattern19, end_pattern19),
    (start_pattern20, end_pattern20),
    (start_pattern21, end_pattern21),
    (start_pattern22, end_pattern22),
    (start_pattern23, end_pattern23),
    (start_pattern24, end_pattern24),
    (start_pattern25, end_pattern25),
    (start_pattern26, end_pattern26),
]
# Plain-text section boundaries — NOTE(review): apparently an earlier,
# literal-string version of the regex pairs in new_regions above; nothing in
# the visible code reads this list, so it may be dead. Confirm before removal.
regions = [
    {'start': '导 言', 'end': '1.资产负债情况'},
    {'start': '1.资产负债情况。', 'end': '2.收入费用情况。'},
    {'start': '2.收入费用情况。', 'end': '一、政府部门财务报表'},
    {'start': '资产负债表', 'end': '表 2-1'},
    {'start': '收入费用表(1)', 'end': '表 2-2'},
    {'start': '收入费用表(2)', 'end': '(二)政府部门会计报表附注'},
    {'start': '1.会计报表编制基础', 'end': '2.遵循相关制度规定的声明'},
    {'start': '2.遵循相关制度规定的声明', 'end': '3.合并范围'},
    {'start': '3.合并范围', 'end': '4.重要会计政策与会计估计变更情况'},
    {'start': '4.重要会计政策与会计估计变更情况', 'end': '5.会计报表重要项目的明细信息及说明'},
    {'start': '5.会计报表重要项目的明细信息及说明', 'end': '6.需要说明的其他事项'},
    {'start': '6.需要说明的其他事项', 'end': '二、政府部门财务分析'},
    {'start': '1.基本情况', 'end': '2.本年取得的主要事业成效'},
    {'start': '2.本年取得的主要事业成效', 'end': '(二)政府部门财务状况分析'},
    {'start': '(二)政府部门财务状况分析', 'end': '1.单位资产构成及变化情况'},
    {'start': '1.单位资产构成及变化情况', 'end': '2.单位负债构成及变化情况'},
    {'start': '2.单位负债构成及变化情况', 'end': '3.主要指标分析'},
    {'start': '3.主要指标分析', 'end': '(三)政府部门运行情况分析'},
    {'start': '(三)政府部门运行情况分析', 'end': '1.单位收入费用构成及变化情况'},
    {'start': '1.单位收入费用构成及变化情况', 'end': '2.主要指标分析'},
    {'start': '2.主要指标分析', 'end': '(四)政府部门财务管理情况'},
    {'start': '1.预算管理情况', 'end': '2.绩效管理情况'},
    {'start': '2.绩效管理情况', 'end': '3.内部控制情况'},
    {'start': '3.内部控制情况', 'end': '4.资产管理情况'},
    # {'start': '4.资产管理情况', 'end': source_doc.Content.End},
]
def iter_block_items(parent):
    """
    Yield each paragraph and table child of *parent* in document order.

    Every yielded value is either a Paragraph or a Table instance. *parent*
    is most commonly a main Document object, but a _Cell or _Row — which can
    themselves contain paragraphs and tables — works as well.
    """
    if isinstance(parent, _Document):
        container = parent.element.body
    elif isinstance(parent, _Cell):
        container = parent._tc
    elif isinstance(parent, _Row):
        container = parent._tr
    else:
        raise ValueError("something's not right")
    # Walk direct children only; nested tables are reached by recursing into cells.
    for node in container.iterchildren():
        if isinstance(node, CT_P):
            yield Paragraph(node, parent)
        elif isinstance(node, CT_Tbl):
            yield Table(node, parent)
# todo: copy the matched content into the template first, then return the updated template
def copy_content_main(doc_path: str, temp_path: str, start_pattern, end_pattern):
    """Copy the paragraphs/tables between start_pattern and end_pattern from the
    document at *doc_path* into the template at *temp_path*, inserting them
    right after the template paragraph that matches start_pattern.

    :param doc_path: path of the source .docx to copy from
    :param temp_path: path of the template .docx to insert into
    :param start_pattern: compiled regex marking the region start (heading line)
    :param end_pattern: compiled regex marking the region end (heading line)
    :return: the modified template Document (not saved to disk here)
    """
    doc = Document(doc_path)
    # Scratch document that accumulates the copied region.
    new_doc = Document()
    start_found = False
    end_found = False
    # Walk top-level paragraphs and tables of the source body in document order.
    for element in doc.element.body.xpath("w:p | w:tbl"):
        if isinstance(element, CT_P):
            para = Paragraph(element, doc)
            start_results = re.findall(start_pattern, para.text)
            if start_results:
                # Heading itself is excluded from the copy.
                start_found = True
                continue
        if isinstance(element, CT_P):
            para = Paragraph(element, doc)
            end_results = re.findall(end_pattern, para.text)
            if end_results:
                end_found = True
                break
        # Copy a text paragraph (text only — run formatting is not preserved).
        if start_found and not end_found and isinstance(element, CT_P):
            para = Paragraph(element, doc)
            new_doc.add_paragraph(para.text)
        # Copy a table by deep-copying its XML element.
        if start_found and not end_found and isinstance(element, CT_Tbl):
            table = Table(element, doc)
            new_table = deepcopy(table._element)
            # Add an empty paragraph to anchor the table insertion point.
            new_doc.add_paragraph('')
            new_paragraph = new_doc.paragraphs[-1]
            # Insert the copied table just before the anchor paragraph.
            new_paragraph._element.addprevious(new_table)
    # Strip blank paragraphs (whitespace-only) from the scratch document.
    for para in new_doc.paragraphs:
        if re.match(r'^\s*$', para.text):
            new_doc._element.body.remove(para._element)
    # Locate the insertion point in the template: first paragraph matching start_pattern.
    source_doc = Document(temp_path)
    start_index = None
    for index, para in enumerate(source_doc.paragraphs):
        start_result = re.findall(start_pattern, para.text)
        if start_result:
            start_index = index
            break
    # NOTE(review): if start_pattern never matches, start_index stays None and
    # the next line raises TypeError — confirm the template always contains it.
    target_paragraph = source_doc.paragraphs[start_index]
    # Iterate in reverse so repeated "insert after target" keeps original order.
    for element in reversed(new_doc.element.body):
        if isinstance(element, CT_P):
            # NOTE(review): `doc` is passed as parent although the element
            # belongs to new_doc; harmless here because only .text is read.
            para = Paragraph(element, doc)
            # Re-style the text: SimSun 12pt, first-line indent, fixed line spacing.
            new_para = source_doc.add_paragraph(para.text, style='Normal')
            # NOTE(review): runs[0] assumes non-empty text — blanks were removed
            # above, but verify no empty paragraphs can slip through.
            font = new_para.runs[0].font
            font.name = "宋体"
            font.size = Pt(12)
            new_para.paragraph_format.first_line_indent = Pt(25)
            new_para.paragraph_format.line_spacing = Pt(23.4)
            source_doc.element.body.insert(source_doc.element.body.index(target_paragraph._element) + 1,
                                           new_para._element)
        # Tables are moved verbatim after the target paragraph.
        elif isinstance(element, CT_Tbl):
            source_doc.element.body.insert(source_doc.element.body.index(target_paragraph._element) + 1, element)
    # Caller decides whether/where to save the modified template.
    return source_doc
def send_regenerate_report(half_document, template_document, output_report_path, data_object, report_id):
    """Regenerate a full report: copy narrative sections, special tables, and
    cover/chart data into the template, then write the final report.

    :param half_document: source document with the extracted tables
    :param template_document: template document to fill
    :param output_report_path: destination path for the finished report
    :param data_object: financial indicators / unit info payload
    :param report_id: id used to name the generated project
    :return: output_report_path
    """
    # todo: step1 — copy the basic narrative content and tables.
    # NOTE(review): this loop uses the module-level doc_path/temp_path globals
    # and discards copy_content_main's return value; presumably the
    # half_document/template_document parameters were intended — verify.
    for start_pattern, end_pattern in new_regions:
        copy_content_main(doc_path, temp_path, start_pattern, end_pattern)
    # todo: step2 — extract the special tables and copy them into the template;
    # this does not overwrite the template in place.
    tables_dict = {
        "table5": "收入费用表(2)",
        "table4": "收入费用表(1)",
        "table3": "资产负债表续表2",
        "table2": "资产负债表续表1",
        "table1": "资产负债表",
    }
    data_result = get_choose_table(half_document, list(tables_dict.values()))
    template_document = generate_report(table_names_data=data_result,
                                        template_document=template_document,
                                        tables_dict=tables_dict)
    # todo: step3 — fill cover info and chart data into the template.
    project_name = str(report_id) + "final_四川报告"
    gurag = new_GeneralUserReportAutoGenerated(
        project_name=project_name,
        template_document=template_document,
        output_report_path=output_report_path,
        start_time=datetime.datetime.now(), end_time=datetime.datetime.now()
    )
    gurag.process(data_result=data_object,
                  report_id=report_id)
    return output_report_path
if __name__ == '__main__':
    # Manual smoke-test pipeline, currently disabled. Kept for reference.
    # send_regenerate_report(half_document, template_document, output_report_path, data_object, report_id)
    # for start_pattern, end_pattern in new_regions:
    #     copy_content_main(doc_path, temp_path, start_pattern, end_pattern)
    #
    # document = Document(doc_path)
    # data_result = get_choose_table(document, list(tables_dict.values()))
    # save_path = "data/temp_module.docx"
    # generate_report(data_result, save_path=save_path, template_path=temp_path,
    #                 tables_dict=tables_dict)
    #
    # # todo: step 4 — fill the basic data into the template
    # project_name = "四川报告"
    # report_name = "5月7号财务报告测试.docx"
    # NOTE(review): this rebinds data_object with a literal identical to the
    # module-level one; the assignment is currently unused because the rest of
    # the pipeline below is commented out.
    data_object = {
        "finance": {
            "publicInfrastructureRatio": "",
            "affordabelHouseNewRatio": "",
            "assetLiabilityRatio": "24.31%",
            "assetLiabilityRatioRemark": "说明本单位财务风险低",
            "beInDebt": "264.31",
            "beInDebtChangeRatio": "24.88%",
            "beInDebtChangeRatioRemark": "主要原因是本年的其他应付款的减少",
            "cashRatio": "248.35%",
            "cashRatioRemark": "说明本单位利用现金和现金等价物偿还短期债务的能力强",
            "composition": "流动资产占100.00%",
            "currentAssetsCompose": "货币资金",
            "currentLiabilitiesCompose": "其他应付款",
            "currentRatio": "248.35%",
            "currentRatioRemark": "说明本单位流动资产偿还短期债务的能力强",
            "debtComparisonRatio": "减少87.56万元,减少24.88%",
            "debtComposition": "流动负债占100.00%",
            "fixedAssetsDepreciationRatio": "77.75%",
            "fixedAssetsDepreciationRatioRemark": "说明本单位固定资产持续服务能力较强",
            "netAssets": "823.10",
            "nonCurrentAssetsCompose": "固定资产原值、固定资产净值、无形资产原价和无形资产净值",
            "otherRemark": "资产总额较上年增减变动幅度超过20%主要原因是货币资金减少265.35万元;负债总额较上年增减变动幅度超过20%主要原因是其他应付款减少87.56万元",
            "revenueExpensesRatio": "100.00%",
            "revenueExpensesRatioRemark": "小于",
            "surplus": "",
            "totalAssets": "1087.40",
            "totalAssetsChangeRatio": "20.53%",
            "totalAssetsChangeRatioRemark": "主要原因是本年的货币资金的减少",
            "totalAssetsComparison": "减少280.84万元,减少20.53%",
            "totalExpenses": "1890.01",
            "totalExpensesChangeRatio": "0.20%",
            "totalExpensesChangeRatioRemark": "主要原因是本年的业务活动费用的减少",
            "totalExpensesComparison": "减少3.77万元,减少0.20%",
            "totalExpensesCompose": "业务活动费用",
            "totalRevenue": "1890.01",
            "totalRevenueChangeRatio": "15.49%",
            "totalRevenueChangeRatioRemark": "主要原因是本年的财政拨款收入的减少",
            "totalRevenueComparison": "减少346.51万元,减少15.49%",
            "totalRevenueCompose": "财政拨款收入和其他收入",
            "totalRevenueComposeDetail": "财政拨款收入占比99.88%、其他收入占比0.12%",
            "unitAssetComposition": "流动资产",
            "unitDebtComposition": "流动负债"
        },
        "收入占比": [
            {
                "beforeDataValue": "1229.05",
                "dataValue": "1887.83",
                "indexName": "财政拨款收入",
                "subtractValue": "658.78"
            },
            {
                "beforeDataValue": "5.42",
                "dataValue": "2.19",
                "indexName": "其他收入",
                "subtractValue": "3.23"
            }
        ],
        "流动负债占比": [
            {
                "beforeDataValue": "351.87",
                "dataValue": "264.31",
                "indexName": "其他应付款",
                "subtractValue": "87.56"
            }
        ],
        "流动资产占比": [
            {
                "beforeDataValue": "921.76",
                "dataValue": "656.41",
                "indexName": "货币资金",
                "subtractValue": "265.35"
            }
        ],
        "负债占比": [
            {
                "beforeDataValue": "351.87",
                "dataValue": "264.31",
                "indexName": "流动负债合计"
            }
        ],
        "费用占比": [
            {
                "beforeDataValue": "1893.72",
                "dataValue": "1890.01",
                "indexName": "业务活动费用",
                "subtractValue": "3.71"
            }
        ],
        "资产占比": [
            {
                "beforeDataValue": "921.76",
                "dataValue": "656.41",
                "indexName": "流动资产合计"
            },
            {
                "beforeDataValue": "446.48",
                "dataValue": "431.00",
                "indexName": "非流动资产合计"
            }
        ],
        "非流动资产占比": [
            {
                "beforeDataValue": "554.31",
                "dataValue": "554.31",
                "indexName": "固定资产原值",
                "subtractValue": "0.00"
            },
            {
                "beforeDataValue": "446.48",
                "dataValue": "431.00",
                "indexName": "固定资产净值",
                "subtractValue": "15.48"
            },
            {
                "beforeDataValue": "0.00",
                "dataValue": "0.00",
                "indexName": "无形资产原价",
                "subtractValue": "0.00"
            },
            {
                "beforeDataValue": "0.00",
                "dataValue": "0.00",
                "indexName": "无形资产净值",
                "subtractValue": "0.00"
            }
        ],
        "info": {
            "internalControl": "2021年,本单位加强学习国家和省关于内部控制的文件。建立健全了单位层面的内部控制体系和制度,健全了预算、收支、采购、建设、资产和合同的内控流程和制度,把内部控制落实在业务流程中,实现了不相容岗位相互分离、形成相互制约、相互监督的工作机制;实现了内部授权审批控制。",
            "unitName": "安岳县元坝镇人民政府",
            "unitCall": "本部门",
            "mainFunctions": "无资料数据",
            "year": "2021",
            "amountDescription": "本单位无以名义金额计量的资产。",
            "unitBudgetLevel": "二级预算单位",
            "institutionalSituation": "无资料数据",
            "performanceManagement": "2021年,本单位按照绩效管理要求对照设定预算绩效目标、绩效指标的成本指标、产出指标、效益指标、满意度指标等具体内容,开展项目绩效目标申报、运行监控和自评工作。通过预算绩效管理对工作中存在的薄弱环节作出针对性查漏补缺和持续完善。",
            "LastYear": "2020",
            "personnelSituation": "无资料数据",
            "unitType": "行政单位",
            "budgetManagement": "2021年,本单位严格按照《预算法》、《会计法》、《政府会计制度》和上级的文件建立健全财务制度;严格执行财经纪律和各项财务制度;强化预算管理,加强对银行存款和现金的管理;单位对年终决算高度重视,组织专人负责编制决算报告,对决算数据进行了严格审核,认真分析并应用到下年的预算工作。",
            "assetManagement": "2021年,本单位资产实行分类管理,建立健全了资产内部管理制度;单位加强对实物资产和无形资产的管理,明确相关部门和岗位的职责权限,强化对配置、使用和处置等关键环节的管控;明确资产使用和保管责任人,落实资产使用人在资产管理中的责任。",
            "pppProject": "本单位无PPP项目。",
            "careerAchievements": "无资料数据"
        }
    }
    # template_dir, template_name = os.path.split(save_path)
    #
    # gurag = new_GeneralUserReportAutoGenerated(
    #     project_name=project_name,
    #     input_template_path=save_path,
    #     output_report_path=os.path.join(template_dir, report_name),
    #     start_time=datetime.datetime.now(), end_time=datetime.datetime.now()
    # )
    # gurag.process(data_object)
    #
......@@ -16,8 +16,8 @@ timeout = 300 # 超时 -> 目前为迎合ZZSN_NLP平
# worker_class = 'gevent' # 使用gevent模式,还可以使用sync 模式,默认的是sync模式
# workers = multiprocessing.cpu_count() # 进程数 12
workers = 3 # 低资源 13G 服务器负载过大可调整此处为 1
threads = 50 # 指定每个进程开启的线程数
workers = 1 # 低资源 13G 服务器负载过大可调整此处为 1
threads = 10 # 指定每个进程开启的线程数
loglevel = 'error' # 日志级别,这个日志级别指的是错误日志的级别,而访问日志的级别无法设置
access_log_format = '%(t)s %(p)s %(h)s "%(r)s" %(s)s %(L)s %(b)s %(f)s" "%(a)s"' # 设置gunicorn访问日志格式,错误日志无法设置
......
......@@ -2,14 +2,20 @@
# description: auto_run
# 四川报告生成监控
# 检测脚本是否在运行,若已经在运行,则等待一段时间后再次检查,若未启动则进行启动
function start_interface() {
# echo "114.115.185.13 dfs" >>/etc/hosts
#echo "192.168.200.148 dfs" >>/etc/hosts # 测试环境
echo "192.168.1.75 dfs" >>/etc/hosts # 正式环境
start_interface() {
INTERFACE_IS_STRAT=`ps -ef | grep scbg_app_config.py | grep -v grep | wc -l`
if [ $INTERFACE_IS_STRAT -eq 4 ] ; then
usleep
if [ $INTERFACE_IS_STRAT -ne 0 ] ; then
sleep 30m
else
echo "=========Service Will Start=========="
# cd /data/lzc/scbg-python/SCBG-PYTHON && nohup gunicorn -c scbg_app_config.py app_run:app 2>&1 &
cd /opt/SCBG-PYTHON && exec nohup gunicorn -c scbg_app_config.py app_run:app 2>&1 &
cd /opt/SCBG-PYTHON
exec nohup gunicorn -c scbg_app_config.py app_run:app 2>&1 &
sleep 5m
exec nohup python -u main_server.py >main_server.log 2>&1 &
echo "=========Service Start Completed!========"
fi
......@@ -22,4 +28,4 @@ do
echo "PYTHON SERVICE is running..."
start_interface
sleep 30m
done
\ No newline at end of file
done
......@@ -42,4 +42,4 @@ def doc2docx(path):
if __name__ == '__main__':
closesoft()
doc2docx(r'D:\四川报告\相关代码\从word中提取指定表格\data\特殊教育学校(1).doc')
doc2docx(r'D:/四川报告/相关代码/从word中提取指定表格/data/四川省化工作质量安全检测研究院2022年度部门决算分析报告(1).doc')
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论