提交 0298d2cf 作者: bruxellse_li

5月份增加后台生成方式,并简化程序

上级 34cbd322
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="Flask">
<option name="enabled" value="true" />
</component>
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">
<option name="format" value="PLAIN" />
<option name="myDocStringFormat" value="Plain" />
</component>
<component name="TemplatesService">
<option name="TEMPLATE_CONFIGURATION" value="Jinja2" />
</component>
</module>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="PublishConfigData" remoteFilesAllowedToDisappearOnAutoupload="false">
<serverData>
<paths name="root@114.115.130.239:22">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.115.141.81:22">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.115.141.81:22 password">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.116.9.59:22">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.116.90.53:22">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.116.90.53:22 password">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.116.90.53:22 password (1)">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="zzsn@192.168.1.149:22">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="zzsn@192.168.1.149:22 password">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
</serverData>
</component>
</project>
\ No newline at end of file
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="Eslint" enabled="true" level="WARNING" enabled_by_default="true" />
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredPackages">
<value>
<list size="33">
<item index="0" class="java.lang.String" itemvalue="pandas" />
<item index="1" class="java.lang.String" itemvalue="tqdm" />
<item index="2" class="java.lang.String" itemvalue="transformers" />
<item index="3" class="java.lang.String" itemvalue="sentencepiece" />
<item index="4" class="java.lang.String" itemvalue="keras" />
<item index="5" class="java.lang.String" itemvalue="gevent" />
<item index="6" class="java.lang.String" itemvalue="torch" />
<item index="7" class="java.lang.String" itemvalue="numpy" />
<item index="8" class="java.lang.String" itemvalue="Flask" />
<item index="9" class="java.lang.String" itemvalue="thulac" />
<item index="10" class="java.lang.String" itemvalue="beautifulsoup4" />
<item index="11" class="java.lang.String" itemvalue="fdfs_client" />
<item index="12" class="java.lang.String" itemvalue="pymysql" />
<item index="13" class="java.lang.String" itemvalue="selenium" />
<item index="14" class="java.lang.String" itemvalue="matplotlib" />
<item index="15" class="java.lang.String" itemvalue="pyecharts" />
<item index="16" class="java.lang.String" itemvalue="requests" />
<item index="17" class="java.lang.String" itemvalue="docx" />
<item index="18" class="java.lang.String" itemvalue="flask_sqlalchemy" />
<item index="19" class="java.lang.String" itemvalue="scikit_learn" />
<item index="20" class="java.lang.String" itemvalue="gensim" />
<item index="21" class="java.lang.String" itemvalue="sentence_transformers" />
<item index="22" class="java.lang.String" itemvalue="elasticsearch" />
<item index="23" class="java.lang.String" itemvalue="nltk" />
<item index="24" class="java.lang.String" itemvalue="symspellpy" />
<item index="25" class="java.lang.String" itemvalue="wordcloud" />
<item index="26" class="java.lang.String" itemvalue="concurrent_log_handler" />
<item index="27" class="java.lang.String" itemvalue="setuptools" />
<item index="28" class="java.lang.String" itemvalue="gunicorn" />
<item index="29" class="java.lang.String" itemvalue="jieba" />
<item index="30" class="java.lang.String" itemvalue="flask" />
<item index="31" class="java.lang.String" itemvalue="flak_cors" />
<item index="32" class="java.lang.String" itemvalue="paddle" />
</list>
</value>
</option>
</inspection_tool>
</profile>
</component>
\ No newline at end of file
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/SCBG-PYTHON.iml" filepath="$PROJECT_DIR$/.idea/SCBG-PYTHON.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>
\ No newline at end of file
...@@ -27,6 +27,7 @@ RUN sed -i s@/archive.ubuntu.com/@/repo.huaweicloud.com/@g /etc/apt/sources.list ...@@ -27,6 +27,7 @@ RUN sed -i s@/archive.ubuntu.com/@/repo.huaweicloud.com/@g /etc/apt/sources.list
&& dpkg -i /opt/SCBG-PYTHON/google-chrome-stable_current_amd64.deb \ && dpkg -i /opt/SCBG-PYTHON/google-chrome-stable_current_amd64.deb \
&& apt-get install -f \ && apt-get install -f \
&& rm /opt/SCBG-PYTHON/google-chrome-stable_current_amd64.deb \ && rm /opt/SCBG-PYTHON/google-chrome-stable_current_amd64.deb \
# && ln -sf /usr/local/bin/python /usr/bin/python \
&& /usr/local/bin/python -m pip install --upgrade pip -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn \ && /usr/local/bin/python -m pip install --upgrade pip -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn \
&& pip install -r /opt/SCBG-PYTHON/requirements.txt -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com --no-cache-dir --default-timeout=10000 \ && pip install -r /opt/SCBG-PYTHON/requirements.txt -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com --no-cache-dir --default-timeout=10000 \
&& apt-get remove -y build-essential && apt-get clean \ && apt-get remove -y build-essential && apt-get clean \
...@@ -34,6 +35,12 @@ RUN sed -i s@/archive.ubuntu.com/@/repo.huaweicloud.com/@g /etc/apt/sources.list ...@@ -34,6 +35,12 @@ RUN sed -i s@/archive.ubuntu.com/@/repo.huaweicloud.com/@g /etc/apt/sources.list
&& chmod u+x /opt/SCBG-PYTHON/start.sh && chmod u+x /opt/SCBG-PYTHON/start.sh
ENV HOME=/home/user
#ENV LANG=en_US.utf8
#ENV LC_ALL=en_US.utf8
#ENV PATH="/usr/local/bin/python:${PATH}"
EXPOSE 4000 EXPOSE 4000
WORKDIR '/opt/SCBG-PYTHON' WORKDIR '/opt/SCBG-PYTHON'
...@@ -41,6 +48,6 @@ WORKDIR '/opt/SCBG-PYTHON' ...@@ -41,6 +48,6 @@ WORKDIR '/opt/SCBG-PYTHON'
# CMD ["./start.sh"] # CMD ["./start.sh"]
# c——告诉shell 运行后续命令, 此处是执行shell脚本,并将输出重定向到指定文件中 # c——告诉shell 运行后续命令, 此处是执行shell脚本,并将输出重定向到指定文件中
CMD ["sh", "-c", "/bin/bash start.sh | tee /opt/SCBG-PYTHON/start.log"] CMD ["sh", "-c", "/bin/bash start.sh | tee /opt/SCBG-PYTHON/log/start.log"]
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : 文章内容检查.py
# @Time : 2022/12/9 16:05
# @Author : bruxelles_li
# @Software: PyCharm
from bs4 import BeautifulSoup
import re
punctuation = re.compile(r'[\n0-9a-zA-Z、!?。。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、'
r'〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.!"#$%&\'()*+,\-'
r'./:;<=>?@\[\]\\^_`{|}~一二三四五六七八九十《]')
# punctuation = re.compile(r'[0-9]')
def is_punctuation(text):
    """Tell whether ``text`` consists only of characters matched by ``punctuation``.

    Every match of the module-level ``punctuation`` pattern is deleted from
    ``text``.

    :param text: string to inspect.
    :return: True when nothing is left after the deletion (an empty string
        therefore counts as punctuation-only), False otherwise.
    """
    leftover = re.compile(punctuation).sub('', text)
    return not leftover
# 程序返回处理函数
def clean_html_tag(content):
text = content
bs = BeautifulSoup(text, 'lxml')
temp = []
match_content = bs.text.strip()
# 初步清洗文中的空白符,杂乱符号
pattern = re.compile(
'[#$*$<=>@●▍[\]△▲^_`■▋{|}~⦅⦆ф「」\u3000〈〉《》「」『』【】※〔〕〖〗〘〙〚〛〜〰〾〿\*〈〉]')
match_content0 = pattern.sub('', match_content)
match_content1 = re.sub(r"(阅读提示|点击 上方文字 关注我们 |点击 上方文字 关注我们|点击蓝字丨关注我们|点击蓝字 关注我们|- THE END - |◀——|-)", "", match_content0)
match_content2 = re.sub(r"(?=\(图片[::]).+(?<=\))", "", match_content1)
match_content3 = re.sub(r"&mdash&mdash", "&mdash", match_content2)
match_content4 = re.sub(r"&mdash", "&", match_content3)
match_content5 = re.sub(r"       ", "", match_content4)
match_content6 = re.sub(r"(?=\().*(?<=图\))", "", match_content5)
match_content7 = re.sub(r'。"', "。”", match_content6)
match_content8 = re.sub(r"(。;|。,)", "。", match_content7)
match_content9 = re.sub(r"(\\t|\\)", "", match_content8)
list_content = match_content9.split('\n')
temp_content = []
for text in list_content:
if len(text) <= 2:
continue
else:
text = text.strip()
if text.endswith("。") or text.endswith("“") or text.endswith(".") or text.endswith('”'):
text = text
else:
text = text + "\t" + "\t"
text = re.sub(r".*(?<=记者).*(?<=摄)", "", text)
temp_content.append(text)
# print(temp_content)
str_content = "\t".join(temp_content)
a = re.sub('\t\t\t', '——', str_content)
a0 = re.sub('\t\t', '', a)
a1 = re.sub(r":——", ":", a0)
a2 = re.sub(r"。)", ")", a1)
a3 = re.sub(r"(。”|!”)", "”", a2)
b = re.sub("\t", "\n", a3).strip()
c = b.split('\n')
# print(len(c))
for d in c:
e = d.strip('&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;')
# 去除中间包含指定字符所在的句子
f = re.sub("(微信|如需转载|免责声明|公告|jpeg|jpg|png|声明:|附件:|责任单位:|编辑:).*?(?<=。)", '', str(e))
# 删除邮箱所在行
g = re.sub(".*(?=\.com|www\.).*", "", f)
# print(g)
if len(g) <= 20:
continue
else:
temp.append(g)
h = "\t".join(temp)
j = re.sub("\t(?=而|但|对于|此外|因此|与此同时|这种|基于此|但是|然而)", "", h)
new_content = re.sub("\t", "\n", j)
new_content_list = new_content.split("\n")
final_content_list = []
for k in new_content_list:
k = " " + k
# 先去除中间包含javascript、html所在的段落内容
l = re.sub(".*(function。|html|background|javascript|image).*", '', k)
if l:
final_content_list.append(l.strip("——"))
final_content = "\n\n".join(final_content_list) if len(final_content_list) >= 10 else "".join(final_content_list)
return final_content
if __name__ == "__main__":
text = """工业和信息化部◀——◀——◀——◀——◀—— 人力资源社会保障部 生态环境部 商务部 市场监管总局
持续健全市场化运营体制机制,守好安全生产底线红线,推进绿色低碳科技研发应用,为实现碳达峰碳中和目标贡献力量。把坚持党的领导加强党的建设融入公司治理,凝聚各方面工作合力,努力开创公司改革发展新局面。(图片:孚能科技将绿色发展融入企业成长,并带动产业链协同提昇晉昇,提拔)
"""
print(clean_html_tag(content=text))
...@@ -21,8 +21,8 @@ from docx.table import _Cell, Table, _Row ...@@ -21,8 +21,8 @@ from docx.table import _Cell, Table, _Row
from docx.text.paragraph import Paragraph from docx.text.paragraph import Paragraph
from docx.shared import Pt from docx.shared import Pt
# 定义待复制内容的匹配模式 # 定义待复制内容的匹配模式
start_pattern = re.compile(r'(?<=[0-9][\..]会计报表重要项目的明细信息及说明)$') start_pattern = re.compile(r'(?<=[0-9][\..]会计报表重要项目的明细信息及说明)$|(?<=[0-9][\..]会计报表重要项目的明细信息及说明。)$')
end_pattern = re.compile(r'(?<=[0-9][\..]需要说明的其他事项)$|(?<=[0-9][\..]需要说明的其他事项。)(略)$') end_pattern = re.compile(r'(?<=[0-9][\..]需要说明的其他事项)$|(?<=[0-9][\..]需要说明的其他事项。)(略)$|(?<=[0-9][\..]需要说明的其他事项[。\.])$')
def iter_block_items(parent): def iter_block_items(parent):
...@@ -129,10 +129,92 @@ def copy_content_main(doc_path: str, temp_path: str): ...@@ -129,10 +129,92 @@ def copy_content_main(doc_path: str, temp_path: str):
return None return None
# todo: 先复制内容到模板中,保存更新后的模板
def new_copy_content_main(doc_document, template_document):
    """Copy the section between the start and end headings of a report into a template.

    The body of ``doc_document`` is scanned in order. Everything after the
    first paragraph matching ``start_pattern`` and before the first paragraph
    matching ``end_pattern`` is staged in a temporary document: paragraphs as
    plain text, tables as deep copies. Blank staged paragraphs are dropped,
    and the staged content is then inserted into ``template_document``
    directly after the template's own ``start_pattern`` paragraph.

    :param doc_document: opened source document to copy from (not modified).
    :param template_document: opened template document; modified in place.
    :return: ``template_document`` with the copied content inserted.
    :raises ValueError: if the template contains no paragraph matching
        ``start_pattern`` (previously this surfaced as an opaque ``TypeError``
        from indexing the paragraph list with ``None``).
    """
    doc = doc_document
    # Temporary document that stages the copied paragraphs and tables.
    new_doc = Document()
    start_found = False
    for element in doc.element.body.xpath("w:p | w:tbl"):
        if isinstance(element, CT_P):
            para_text = Paragraph(element, doc).text
            if re.findall(start_pattern, para_text):
                # The start heading itself is not copied.
                start_found = True
                continue
            if re.findall(end_pattern, para_text):
                # The end heading stops the scan, whether or not the start was seen.
                break
            if start_found:
                new_doc.add_paragraph(para_text)
        elif start_found and isinstance(element, CT_Tbl):
            new_table = deepcopy(element)
            # The staging body needs an anchor paragraph; the table is placed
            # just before it. The empty anchor is removed again below.
            placeholder = new_doc.add_paragraph('')
            placeholder._element.addprevious(new_table)

    # Drop staged paragraphs that hold nothing but whitespace.
    for para in new_doc.paragraphs:
        if re.match(r'^\s*$', para.text):
            new_doc._element.body.remove(para._element)

    # Locate the insertion point in the template.
    source_doc = template_document
    target_paragraph = None
    for para in source_doc.paragraphs:
        if re.findall(start_pattern, para.text):
            target_paragraph = para
            break
    if target_paragraph is None:
        raise ValueError("template document has no paragraph matching start_pattern")

    body = source_doc.element.body
    anchor = target_paragraph._element
    # Walk the staged elements back to front, always inserting right after the
    # anchor, so they end up in their original order. The snapshot list keeps
    # the walk stable while tables are moved out of the staging body.
    for element in reversed(list(new_doc.element.body)):
        if isinstance(element, CT_P):
            # The paragraph is re-created in the template so it can be
            # restyled instead of carrying over the staging style.
            para = Paragraph(element, doc)
            new_para = source_doc.add_paragraph(para.text, style='Normal')
            if new_para.runs:
                font = new_para.runs[0].font
                font.name = "宋体"
                font.size = Pt(12)
            new_para.paragraph_format.space_before = Pt(12)
            new_para.paragraph_format.first_line_indent = Pt(25)
            # add_paragraph appended at the end; move it to just after the anchor.
            body.insert(body.index(anchor) + 1, new_para._element)
        elif isinstance(element, CT_Tbl):
            body.insert(body.index(anchor) + 1, element)
    return source_doc
if __name__ == '__main__': if __name__ == '__main__':
doc_path = "data/3月23测试半成品.docx" doc_path = "data/2022年度德阳市旌阳区人民法院(1).docx"
# doc_path = 'data/特殊教育学校(1).docx' # doc_path = 'data/特殊教育学校(1).docx'
temp_path = "data/new_财务报告模板.docx" temp_path = "data/财务报告模板(2).doc"
copy_content_main(doc_path, temp_path) copy_content_main(doc_path, temp_path)
# docx_file = r'wKjIbGQeSb6AUq1aAAgAABcLaMw312.docx' # docx_file = r'wKjIbGQeSb6AUq1aAAgAABcLaMw312.docx'
# doc = Document(docx_file) # doc = Document(docx_file)
......
...@@ -110,8 +110,8 @@ def new_document(): ...@@ -110,8 +110,8 @@ def new_document():
return para._p return para._p
def generate_report(table_names_data, save_path, template_path, tables_dict): def generate_report(table_names_data, template_document, tables_dict):
document = Document(template_path) document = template_document
pattern = re.compile(r'(?<={{).*?(?=}})') pattern = re.compile(r'(?<={{).*?(?=}})')
# block 块对象主要包括标题、段落、图片、表、列表 # block 块对象主要包括标题、段落、图片、表、列表
# run 内联对象为块对象的组成部分,块对象的所有内容都包含在内联对象中,一个块对象由一个或多个内联对象组成。修改字体、字号、文字颜色需要用到run # run 内联对象为块对象的组成部分,块对象的所有内容都包含在内联对象中,一个块对象由一个或多个内联对象组成。修改字体、字号、文字颜色需要用到run
...@@ -139,7 +139,41 @@ def generate_report(table_names_data, save_path, template_path, tables_dict): ...@@ -139,7 +139,41 @@ def generate_report(table_names_data, save_path, template_path, tables_dict):
p = block._element p = block._element
p.getparent().remove(p) p.getparent().remove(p)
block._p = block._element = None block._p = block._element = None
document.save(save_path) # document.save(save_path)
return document
def new_generate_report(table_names_data, template_document, tables_dict):
    """Replace ``{{table…}}`` placeholder paragraphs in a template with table elements.

    For every top-level paragraph whose first ``{{…}}`` placeholder contains
    the word ``table``, the elements stored under
    ``table_names_data[tables_dict[placeholder]]`` are inserted directly after
    the paragraph and the placeholder paragraph is removed. Afterwards every
    paragraph containing ``续表`` followed by a digit is removed as well.

    :param table_names_data: mapping from table title to an iterable of
        elements to insert.
    :param template_document: opened template document; modified in place.
    :param tables_dict: mapping from placeholder name to table title.
    :return: the modified ``template_document``.
    """
    document = template_document
    placeholder_re = re.compile(r'(?<={{).*?(?=}})')
    for paragraph in document.paragraphs:
        if not isinstance(paragraph, Paragraph):
            continue
        names = placeholder_re.findall(paragraph.text)
        if not names or "table" not in names[0]:
            continue
        table_name = names[0]
        # Each element is attached as the immediate next sibling of the
        # placeholder, so a later element lands in front of an earlier one.
        # NOTE(review): callers presumably supply the elements in reverse order - confirm.
        for table_element in table_names_data[tables_dict[table_name]]:
            paragraph._p.addnext(table_element)
        node = paragraph._element
        node.getparent().remove(node)
        paragraph._p = paragraph._element = None

    # Remove the continuation-table captions that the template defines.
    continuation_re = re.compile(r'(?<=续表)[0-9]')
    for block in iter_block_items(document):
        if isinstance(block, Paragraph) and continuation_re.findall(block.text):
            node = block._element
            node.getparent().remove(node)
            block._p = block._element = None
    return document
if __name__ == '__main__': if __name__ == '__main__':
...@@ -147,7 +181,6 @@ if __name__ == '__main__': ...@@ -147,7 +181,6 @@ if __name__ == '__main__':
start_time = datetime.datetime.now() start_time = datetime.datetime.now()
# 参数:tables_dict、docx_file、save_path、template_path # 参数:tables_dict、docx_file、save_path、template_path
tables_dict = { tables_dict = {
"table13": "以名义金额计量的资产名称、数量等情况,以及以名义金额计量理由的说明",
"table5": "收入费用表(2)", "table5": "收入费用表(2)",
"table4": "收入费用表(1)", "table4": "收入费用表(1)",
"table3": "资产负债表续表2", "table3": "资产负债表续表2",
...@@ -168,5 +201,5 @@ if __name__ == '__main__': ...@@ -168,5 +201,5 @@ if __name__ == '__main__':
document = Document(docx_file) document = Document(docx_file)
data_result = get_choose_table(document, list(tables_dict.values())) data_result = get_choose_table(document, list(tables_dict.values()))
print(data_result) print(data_result)
generate_report(data_result, save_path=r'data/报告文件.docx', template_path=r'data/new_财务报告模板.docx', tables_dict=tables_dict) # generate_report(data_result, save_path=r'data/报告文件.docx', template_path=r'data/new_财务报告模板.docx', tables_dict=tables_dict)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : 资源检测程序.py
# @Time : 2022/9/30 10:39
# @Author : bruxelles_li
# @Software: PyCharm
import logging
import os, time, re, subprocess
# 获取CPU负载信息
def get_cpu():
    """Return the share of CPU time spent working, read from ``/proc/stat``.

    The first line containing ``"cpu "`` is parsed. Its first three counters
    are summed as work time and the fourth is taken as idle time (per proc(5)
    these are the user, nice, system and idle counters). Only one sample is
    read, so the result is the ratio of those counters as they stand, not a
    rate measured over an interval.

    :return: ``work / (work + idle)`` as a float, or ``0`` when the work
        counter is zero.
    :raises RuntimeError: if ``/proc/stat`` has no aggregate ``cpu`` line
        (the previous implementation looped forever in that case).
    :raises OSError: if ``/proc/stat`` cannot be opened.
    """
    with open("/proc/stat", "r") as f:
        line = f.readline()
        # readline() returns "" at end of file; stop there instead of spinning.
        while line and "cpu " not in line:
            line = f.readline()
    if not line:
        raise RuntimeError("no aggregate 'cpu ' line found in /proc/stat")

    # split() tolerates the variable spacing after the "cpu" label.
    fields = line.split()
    worktime = int(fields[1]) + int(fields[2]) + int(fields[3])
    idletime = int(fields[4])
    total = worktime + idletime
    if worktime == 0 or total == 0:
        return 0
    return float(worktime) / total
# 获取内存负载信息
def get_mem_usage_percent():
    """Compute physical and swap memory usage percentages from ``/proc/meminfo``.

    Physical usage is ``MemTotal - (MemFree + Buffers + Cached)`` relative to
    ``MemTotal``; swap usage is ``SwapTotal - SwapFree`` relative to
    ``SwapTotal``. ``Buffers``, ``Cached`` and the swap fields count as 0
    when the file does not list them.

    :return: ``(physical_percent, virtual_percent)``; ``virtual_percent`` is
        ``0`` when no swap is configured. ``None`` when the file cannot be
        read or parsed, or when ``MemTotal`` / ``MemFree`` are missing.
    :raises Exception: propagated from ``usage_percent`` if ``MemTotal`` is 0.
    """
    wanted = ('MemTotal', 'MemFree', 'Buffers', 'Cached', 'SwapTotal', 'SwapFree')
    fields = {}
    try:
        with open('/proc/meminfo', 'r') as f:
            for line in f:
                # Exact key match, so e.g. "SwapCached" is not taken for "Cached".
                key, sep, rest = line.partition(':')
                if sep and key in wanted:
                    fields[key] = int(rest.split()[0])
    except (OSError, ValueError, IndexError):
        return None

    if 'MemTotal' not in fields or 'MemFree' not in fields:
        return None

    mem_total = fields['MemTotal']
    mem_available = fields['MemFree'] + fields.get('Buffers', 0) + fields.get('Cached', 0)
    physical_percent = usage_percent(mem_total - mem_available, mem_total)

    virtual_percent = 0
    vmem_total = fields.get('SwapTotal', 0)
    if vmem_total > 0:
        virtual_percent = usage_percent(vmem_total - fields.get('SwapFree', 0), vmem_total)
    return physical_percent, virtual_percent
def usage_percent(use, total):
    """Express ``use`` as a percentage of ``total``.

    :param use: amount used; converted with ``float()``.
    :param total: capacity that ``use`` is measured against.
    :return: ``float(use) / total * 100``.
    :raises Exception: with the message ``"ERROR - zero division error"``
        when the division raises ``ZeroDivisionError``.
    """
    try:
        ratio = float(use) / total
    except ZeroDivisionError:
        raise Exception("ERROR - zero division error")
    else:
        return ratio * 100
# 获取磁盘根目录占用信息
def disk_info(path='/'):
    """Return the used-space percentage of the filesystem holding ``path``.

    Sizes come from ``os.statvfs``: total is ``f_frsize * f_blocks`` and free
    is ``f_frsize * f_bfree``.

    :param path: any path on the filesystem to inspect; defaults to the root
        directory, which is what the function always inspected before.
    :return: the usage percentage truncated to an integer, as a string
        (``"0"`` for a filesystem that reports a total size of zero).
    :raises OSError: if ``path`` cannot be inspected.
    """
    stats = os.statvfs(path)
    total_disk_space = stats.f_frsize * stats.f_blocks
    free_disk_space = stats.f_frsize * stats.f_bfree
    if total_disk_space == 0:
        # Pseudo filesystems can report zero blocks; avoid dividing by zero.
        return "0"
    disk_usage = (total_disk_space - free_disk_space) * 100.0 / total_disk_space
    return str(int(disk_usage))
# 获取内存占用信息
def mem_info():
    """Return the physical memory usage percentage as a string.

    Takes the first element of the pair returned by
    ``get_mem_usage_percent()`` and truncates it to an integer.

    :return: the integer percentage rendered with ``str()``.
    """
    physical_percent = get_mem_usage_percent()[0]
    return str(int(physical_percent))
# 获取CPU占用信息
def cpu_info():
    """Return the CPU usage percentage as a string.

    The ratio returned by ``get_cpu()`` is scaled by 100 and truncated to an
    integer.

    :return: the integer percentage rendered with ``str()``.
    """
    percent = get_cpu() * 100
    return str(int(percent))
# 获取系统占用信息
def sys_info():
    """Return how many load-average figures ``os.getloadavg()`` reports.

    NOTE(review): only the number of figures is returned, not the load values
    themselves - confirm this is what callers expect.

    :return: length of the tuple returned by ``os.getloadavg()``.
    """
    return len(os.getloadavg())
# 获取计算机当前时间
def time_info():
    """Return a label followed by the current local time.

    :return: the label text followed by the time formatted as
        ``%Y-%m-%d %H:%M:%S``.
    """
    return "主机的当前时间:%s" % time.strftime('%Y-%m-%d %H:%M:%S')
# 获取计算机主机名称
def hostname_info():
    """Return a label followed by the output of the ``hostname`` command.

    :return: the label text followed by the command output with surrounding
        whitespace stripped.
    """
    with_whitespace = os.popen("hostname").read()
    return "你的主机名是: %s" % with_whitespace.strip()
# 获取IP地址信息
def ip_info():
    """Return the address field(s) reported for interface ``ens192``.

    Runs a shell pipeline that filters ``ip a`` output down to the ``inet``
    lines mentioning ``ens192`` and prints their second column.

    :return: the pipeline output with surrounding whitespace stripped (empty
        when nothing matches).
    """
    pipeline = "ip a| grep ens192 | grep inet | awk '{print $2}'"
    return os.popen(pipeline).read().strip()
# 获取根的占用信息
def disk_info_root():
    """Report usage of the ``/dev/mapper/centos-root`` filesystem from ``df -h``.

    The ``df`` output is read as text. With a bytes pipe, as before, the
    string comparison below could never match on Python 3 and the function
    failed at ``return`` with an unbound ``content``. The usage column is
    parsed with ``int`` instead of ``eval`` so that command output is never
    executed as code.

    NOTE(review): the original indentation is not recoverable from this view;
    this version appends the data row only when usage exceeds 60% - confirm.

    :return: a tab-separated header line, followed by ``\\r\\n`` and the
        tab-separated ``df`` row (device shown as ``centos-root``) when usage
        is above 60%. An empty string when no usable ``centos-root`` row is
        present.
    """
    child = subprocess.Popen(["df", "-h"], stdout=subprocess.PIPE, universal_newlines=True)
    out, _ = child.communicate()
    title = [u'-文件系统-', u'--容量-', u'-已用-', u'-可用-', u'-已用-', u'-挂载点--']
    for item in out.splitlines():
        line = item.strip().split()
        # Only the CentOS root volume is of interest here.
        if '/dev/mapper/centos-root' not in line:
            continue
        try:
            # Fifth column looks like "80%"; drop the trailing percent sign.
            used_percent = int(line[4][0:-1])
        except (IndexError, ValueError):
            continue
        content = "\t".join(title)
        if used_percent > 60:
            line[0] = 'centos-root'
            content += '\r\n' + '\t'.join(line)
        return content
    return ""
# 测试程序
# if __name__ == "__main__":
# disk_information = disk_info()
# disk_usage = [int(s) for s in re.findall(r'\b\d+\b', disk_information)]
# infomation = [hostname_info(), time_info(), disk_information]
# print(disk_usage)
# # 如果磁盘占用高于60%就发邮件告警
# if disk_usage[0] > 60:
# print("当前磁盘占用率已超过60%,建议清除磁盘内存!")
#
# # print(hostname_info())
# # print(time_info())
# # print(ip_info())
# print(sys_info())
# print(cpu_info())
# print(mem_info())
# print(disk_info())
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
# @Author : ctt # @Author : ctt
# @File : 文本内容提取 # @File : 文本内容提取
# @Project : untitled1 # @Project : untitled1
import re import re, os
from docx import Document from docx import Document
import pandas as pd import pandas as pd
...@@ -40,7 +40,7 @@ class Extract: ...@@ -40,7 +40,7 @@ class Extract:
# {“主要职能”:””, “机构情况”:””, “人员情况”:””, “当年取得的主要事业成效”} # {“主要职能”:””, “机构情况”:””, “人员情况”:””, “当年取得的主要事业成效”}
def __init__(self): def __init__(self):
# self.main_functions = re.compile(r'(?<=[0-9][\..]主要职能[。\n])(.|\n)*?(?=[0-9][\..]机构情况[。\n])') # self.main_functions = re.compile(r'(?<=[0-9][\..]主要职能[。\n])(.|\n)*?(?=[0-9][\..]机构情况[。\n])')
self.main_functions = re.compile(r'(?<=[0-9][\..]主要职能[。\n])(.|\n)*?(?=([一二三四五六七八九十])当年取得的主要事业成效[。\n])') self.main_functions = re.compile(r'(?<=([一二三四五六七八九十])基本情况[。\n])(.|\n)*?(?=([一二三四五六七八九十])当年取得的主要事业成效[。\n])')
# self.institutional_situation = re.compile(r'(?<=[0-9][\..]机构情况[。\n])(.|\n)*?(?=[0-9][\..]人员情况[。\n])') # self.institutional_situation = re.compile(r'(?<=[0-9][\..]机构情况[。\n])(.|\n)*?(?=[0-9][\..]人员情况[。\n])')
# self.personnel_situation = re.compile(r'(?<=[0-9][\..]人员情况[。\n])(.|\n)*?(?=([一二三四五六七八九十])当年取得的主要事业成效[。\n])') # self.personnel_situation = re.compile(r'(?<=[0-9][\..]人员情况[。\n])(.|\n)*?(?=([一二三四五六七八九十])当年取得的主要事业成效[。\n])')
self.business_results = re.compile(r'(?<=([一二三四五六七八九十])当年取得的主要事业成效[。\n])(.|\n)*?(?=[一二三四五六七八九十]、收入支出预算执行情况分析)') self.business_results = re.compile(r'(?<=([一二三四五六七八九十])当年取得的主要事业成效[。\n])(.|\n)*?(?=[一二三四五六七八九十]、收入支出预算执行情况分析)')
...@@ -74,12 +74,16 @@ def get_text_from_docx(filepath): ...@@ -74,12 +74,16 @@ def get_text_from_docx(filepath):
contents = [] contents = []
for paragraph in document.paragraphs: for paragraph in document.paragraphs:
if '<w:numPr>' in paragraph._element.xml: if '<w:numPr>' in paragraph._element.xml:
contents.append('1.'+paragraph.text) # print(paragraph.text)
contents.append('1.'+paragraph.text.replace("\xa0", ""))
contents.append('\n') contents.append('\n')
else: else:
contents.append(paragraph.text) # print(paragraph.text)
contents.append(paragraph.text.replace("\xa0", ""))
contents.append('\n') contents.append('\n')
return ''.join(contents) str_contents = ''.join(contents)
# return ''.join(contents)
return str_contents
def get_cover_content_from_docx(filepath): def get_cover_content_from_docx(filepath):
...@@ -116,26 +120,10 @@ def get_cover_content_from_docx(filepath): ...@@ -116,26 +120,10 @@ def get_cover_content_from_docx(filepath):
if __name__ == '__main__': if __name__ == '__main__':
new_path = "data/2022年度安岳县元坝镇人民政府部门决算分析报告(1).docx" # filepath = "data/wKjIbGRUpsaATABTAA5_0ejDDaQ144.docx"
document = get_text_from_docx(new_path)
data = Extract().extract_result(document)
print(data)
# fifth_area_pattern = re.compile(r'(?<=[0-9][\..]会计报表重要项目的明细信息及说明[。\n])(.|\n)*?(?=[0-9][\..]需要说明的其他事项[。\n])')
# filepath = "wKjIbGQeSb6AUq1aAAgAABcLaMw312.docx"
# document = Document(filepath)
# documents = get_text_from_docx(filepath)
#
# area_group = fifth_area_pattern.search(documents)
# if area_group:
# area_text = area_group.group().strip("1.").strip()
# else:
# area_text = ""
#
# print(area_text)
# cover_contents, other_contents = get_cover_content_from_docx(filepath) # cover_contents, other_contents = get_cover_content_from_docx(filepath)
# cover_pattern = re.compile(r"([0-9]{0,4}).*(?=(财务报告))") # cover_pattern = re.compile(r"([0-9]{0,4}).*(?=(财务报告))")
# # #
# # print(content)
# cover_group = cover_pattern.search(cover_contents) # cover_group = cover_pattern.search(cover_contents)
# if cover_group: # if cover_group:
# cover_text = cover_group.group().strip() # cover_text = cover_group.group().strip()
...@@ -147,13 +135,13 @@ if __name__ == '__main__': ...@@ -147,13 +135,13 @@ if __name__ == '__main__':
# other_data["reportTitle"] = cover_text # other_data["reportTitle"] = cover_text
# print(other_data) # print(other_data)
extract = Extract()
document = get_text_from_docx("data/2022年度安岳县元坝镇人民政府部门决算分析报告(1).docx")
data = Extract().extract_result(document)
print(data)
# path = r'D:\四川报告\相关代码\四川报告之文本内容提取\data'
# extract = Extract()
# # path = r'D:\四川报告\相关代码\四川报告之文本内容提取\data'
# path = "data/temp.docx"
# result = extract.extract_result(path)
# print(result)
# for file in os.listdir(path): # for file in os.listdir(path):
# if file[-4:] == 'docx': # if file[-4:] == 'docx':
# filepath = os.path.join(path, file) # filepath = os.path.join(path, file)
......
...@@ -12,6 +12,7 @@ from docx.oxml.text.paragraph import CT_P ...@@ -12,6 +12,7 @@ from docx.oxml.text.paragraph import CT_P
from docx.oxml.table import CT_Tbl from docx.oxml.table import CT_Tbl
from docx.table import _Cell, Table, _Row from docx.table import _Cell, Table, _Row
from docx.text.paragraph import Paragraph from docx.text.paragraph import Paragraph
from docx.shared import Pt
def iter_block_items(parent): def iter_block_items(parent):
...@@ -161,60 +162,137 @@ def get_other1_table(document, table_names: list): ...@@ -161,60 +162,137 @@ def get_other1_table(document, table_names: list):
return table_names_data return table_names_data
def read_document(document, old, new):
    """Replace ``old`` with ``new`` in the runs of every table cell that contains ``old``.

    The membership test is made against the whole cell text, while the
    replacement is applied run by run. Every run of a matching cell is
    therefore reassigned, and an occurrence of ``old`` that is split across
    two runs is not replaced.

    :param document: object exposing ``tables`` -> ``rows`` -> ``cells`` ->
        ``paragraphs`` -> ``runs``; modified in place.
    :param old: text to search for.
    :param new: replacement text.
    :return: the same ``document`` object.
    """
    for table in document.tables:
        cells = (cell for row in table.rows for cell in row.cells)
        for cell in cells:
            # A cell's paragraphs are handled like ordinary paragraphs.
            runs = (run for paragraph in cell.paragraphs for run in paragraph.runs)
            for run in runs:
                if old in cell.text:
                    run.text = run.text.replace(old, new)
    return document
def _is_function_marker(text):
    """Match the combined paragraph: contains 主要职能 plus either 机构情况 or the ``&&&`` separator."""
    return "主要职能" in text and ("机构情况" in text or "&&&" in text)


def _style_inserted_paragraph(new_para):
    """Apply the body style (宋体, 12 pt, first-line indent, fixed line spacing) to a new paragraph."""
    font = new_para.runs[0].font
    font.name = "宋体"
    font.size = Pt(12)
    new_para.paragraph_format.first_line_indent = Pt(25)
    new_para.paragraph_format.line_spacing = Pt(23.4)


def _expand_marker_paragraphs(document, parts, is_marker):
    """Insert each non-empty part before every marker paragraph, then delete the marker paragraphs."""
    for part in parts:
        if not part:
            continue
        # Insert before the matched paragraph object itself. Looking it up
        # again by index, as the code used to, hits the wrong paragraph once
        # an earlier insertion has shifted the list.
        for target_para in document.paragraphs:
            if is_marker(target_para.text):
                _style_inserted_paragraph(target_para.insert_paragraph_before(part))
    # Blank and remove the original combined paragraphs.
    for p in document.paragraphs:
        if is_marker(p.text):
            p.text = ""
            document._element.body.remove(p._element)


def replace_document(document):
    """Split ``&&&``-joined paragraphs into separate styled paragraphs.

    Two kinds of combined paragraph are handled, in this order:

    1. paragraphs containing ``主要职能`` together with ``机构情况`` or
       ``&&&``;
    2. any remaining paragraph containing ``&&&``.

    For each kind, the text of the last matching paragraph is split on
    ``&&&``. Every non-empty piece is inserted as a new styled paragraph
    before each matching paragraph, and the matching paragraphs are then
    removed. Finally, paragraphs containing ``{{info.amountDescription}}``
    are removed.

    :param document: opened document; modified in place.
    :return: the same ``document`` object.
    """
    temp_list_0, temp_list_1 = [], []
    for para in document.paragraphs:
        text = para.text
        if _is_function_marker(text):
            temp_list_0 = text.split("&&&")
        elif "&&&" in text:
            temp_list_1 = text.split("&&&")
    print(temp_list_0)
    print(temp_list_1)

    # First section: the combined "main functions" paragraph.
    _expand_marker_paragraphs(document, temp_list_0, _is_function_marker)
    # Second section: whatever "&&&" paragraphs remain.
    _expand_marker_paragraphs(document, temp_list_1, lambda text: "&&&" in text)

    # Drop the unused amount-description placeholder.
    for para in document.paragraphs:
        if "{{info.amountDescription}}" in para.text:
            para.text = ""
            document._element.body.remove(para._element)
    return document
if __name__ == '__main__': if __name__ == '__main__':
docx_file = r'data/3月23测试半成品.docx'
docx_file = r'data/20230512204902_5月10号财务报告测试.docx'
document = Document(docx_file) document = Document(docx_file)
table_names = ['货币资金明细信息如下'] replace_document(document)
print(get_other1_table(document, table_names)) # 循环遍历所有段落
# temp_list_0, temp_list_1 = [], []
# import datetime # for para in document.paragraphs:
# start_time = datetime.datetime.now() # # 使用replace()函数替换垂直制表符
# docx_file = r'data/四川报告模板.docx' # if "主要职能&&&" in para.text:
# document = Document(docx_file) # temp_list_0 = para.text.split("&&&")
# data = get_choose_table(document, ['资产负债表', '收入费用表(1)', '收入费用表(2)']) # elif "&&&" in para.text:
# # 处理资产负债表 # temp_list_1 = para.text.split("&&&")
# temp_list = data["资产负债表"]
# temp_dict = {}
# #
# for temp in temp_list: # for temp in temp_list_0:
# temp_text = re.sub(":", ":", temp["项目"]) # if temp:
# if temp_text.endswith(":"): # for i, p in enumerate(document.paragraphs):
# temp_dict.update({"temp_key": temp_text}) # if "主要职能&&&" in p.text:
# continue # target_para = document.paragraphs[i]
# else: # new_para = target_para.insert_paragraph_before(temp)
# temp["上级项目"] = temp_dict["temp_key"].strip(":") # # todo: 添加段落样式
# font = new_para.runs[0].font
# font.name = "宋体"
# font.size = Pt(12)
# # new_para.paragraph_format.space_before = Pt(12)
# new_para.paragraph_format.first_line_indent = Pt(25)
# new_para.paragraph_format.line_spacing = Pt(23.4)
# #
# for p in document.paragraphs:
# if "主要职能&&&" in p.text:
# p.text = p.text.replace(p.text, "")
# document._element.body.remove(p._element)
# #
# # 处理收入费用表(1) # for temp in temp_list_1:
# temp_list_0 = data["收入费用表(1)"] # if temp:
# temp_dict_0 = {"temp_key": "收入合计"} # for i, p in enumerate(document.paragraphs):
# # updata_list = ["收入合计", "本年盈余"] # if "&&&" in p.text:
# for temp_0 in temp_list_0: # target_para = document.paragraphs[i]
# if temp_0["项目"].strip() == "收入合计": # new_para = target_para.insert_paragraph_before(temp)
# temp_dict_0.update({"temp_key": "本年盈余"}) # # todo: 添加段落样式
# else: # font = new_para.runs[0].font
# if temp_0["项目"].strip() == "本年盈余": # font.name = "宋体"
# continue # font.size = Pt(12)
# else: # # new_para.paragraph_format.space_before = Pt(12)
# temp_0["上级项目"] = temp_dict_0["temp_key"] # new_para.paragraph_format.first_line_indent = Pt(25)
# new_para.paragraph_format.line_spacing = Pt(23.4)
# #
# # 处理收入费用表(2) # for p in document.paragraphs:
# temp_list_1 = data["收入费用表(2)"] # if "&&&" in p.text:
# temp_dict_1 = {"temp_key": "收入合计"} # p.text = p.text.replace(p.text, "")
# # updata_list = ["收入合计", "本年盈余"] # document._element.body.remove(p._element)
# for temp_1 in temp_list_1: #
# if temp_1["项目"].strip() == "收入合计": # for para in document.paragraphs:
# temp_dict_1.update({"temp_key": "本年盈余"}) # if "{{info.amountDescription}}" in para.text:
# else: # para.text = para.text.replace(para.text, "")
# if temp_1["项目"].strip() == "本年盈余": # document._element.body.remove(para._element)
# continue
# else: document.save("data/test.docx")
# temp_1["上级项目"] = temp_dict_1["temp_key"]
# print(data)
# end_time = datetime.datetime.now()
# print(start_time)
# print(end_time)
# print("耗时: {}秒".format(end_time - start_time))
......
...@@ -11,14 +11,14 @@ from pyecharts.charts import Pie, Bar, Line, Grid ...@@ -11,14 +11,14 @@ from pyecharts.charts import Pie, Bar, Line, Grid
from pyecharts.faker import Faker from pyecharts.faker import Faker
from pyecharts.render import make_snapshot # 导入输出图片工具 from pyecharts.render import make_snapshot # 导入输出图片工具
from snapshot_selenium import snapshot # 使用snapshot-selenium 渲染图片 from snapshot_selenium import snapshot # 使用snapshot-selenium 渲染图片
from pyecharts.globals import CurrentConfig, ThemeType from pyecharts.globals import CurrentConfig
from pathlib import Path from pathlib import Path
# import time # import time
from unittest import mock from unittest import mock
from base.config.base_config import root_dir from base.config.base_config import root_dir
from utils.tools import timeit from utils.tools import timeit
# root_dir = '..' import threading
# 解决linux 下图片生成失败问题 # 解决linux 下图片生成失败问题
...@@ -32,16 +32,6 @@ def get_chrome_driver(): ...@@ -32,16 +32,6 @@ def get_chrome_driver():
return webdriver.Chrome(options=options) return webdriver.Chrome(options=options)
# def timeit(f):
# def timed(*args, **kw):
# ts = time.time()
# print('......begin {0:8s}......'.format(f.__name__))
# result = f(*args, **kw)
# te = time.time()
# print('......finish {0:8s}, took:{1:.4f} sec......'.format(f.__name__, te - ts))
# return result
#
# return timed
""" """
关于: [图片生成的中文字体样式渲染问题] 关于: [图片生成的中文字体样式渲染问题]
...@@ -57,10 +47,13 @@ CurrentConfig.ONLINE_HOST = 'http://39.105.62.235:8000/assets/' ...@@ -57,10 +47,13 @@ CurrentConfig.ONLINE_HOST = 'http://39.105.62.235:8000/assets/'
pic_echarts_dir = os.path.join(root_dir, 'generate/echarts') pic_echarts_dir = os.path.join(root_dir, 'generate/echarts')
Path(pic_echarts_dir).mkdir(parents=True, exist_ok=True) Path(pic_echarts_dir).mkdir(parents=True, exist_ok=True)
lock = threading.RLock()
@timeit @timeit
def pic_echarts_pie(keys: list, values: list, title: str or None) -> str: def pic_echarts_pie(keys: list, values: list, title: str or None, pic_echarts_path: str) -> str:
pic_echarts_path = os.path.join(pic_echarts_dir, 'echarts_pie.png') # pic_echarts_path = os.path.join(pic_echarts_dir, 'echarts_pie.png')
# pic_echarts_path = os.path.join(pic_echarts_dir, temp_file_name)
with lock:
pie = ( pie = (
Pie().add( Pie().add(
series_name='', series_name='',
...@@ -71,26 +64,23 @@ def pic_echarts_pie(keys: list, values: list, title: str or None) -> str: ...@@ -71,26 +64,23 @@ def pic_echarts_pie(keys: list, values: list, title: str or None) -> str:
title_opts=opts.TitleOpts(title=title), title_opts=opts.TitleOpts(title=title),
legend_opts=opts.LegendOpts(type_='scroll', pos_left='80%', orient='vertical', textstyle_opts=opts.TextStyleOpts(font_size=20)) legend_opts=opts.LegendOpts(type_='scroll', pos_left='80%', orient='vertical', textstyle_opts=opts.TextStyleOpts(font_size=20))
).set_series_opts( ).set_series_opts(
# label_opts=opts.LabelOpts(formatter='{b}: {c}({d}%)') label_opts=opts.LabelOpts(formatter="{b}: {d}%", font_size=18)
label_opts=opts.LabelOpts(formatter="{b}: {d}%", font_size=20)
) )
# # 设置标签字体大小
# .set_series_opts(label_opts=opts.LabelOpts(font_size=22))
) )
# print("当前处理的数据集key{},和value{}".format(keys, values))
with lock:
with mock.patch('snapshot_selenium.snapshot.get_chrome_driver', get_chrome_driver): with mock.patch('snapshot_selenium.snapshot.get_chrome_driver', get_chrome_driver):
make_snapshot(snapshot, pie.render(), pic_echarts_path) make_snapshot(snapshot, pie.render(), pic_echarts_path)
# make_snapshot(snapshot, pie.render(), pic_echarts_path) # make_snapshot(snapshot, pie.render(), pic_echarts_path)
return pic_echarts_path return pic_echarts_path
@timeit @timeit
def pic_echarts_bar( def pic_echarts_bar(
keys: list, dict_values: dict, title=None, keys: list, dict_values: dict, temp_file_name: str, title=None,
x_name=None, y_name=None x_name=None, y_name=None,
) -> str: ) -> str:
pic_echarts_path = os.path.join(pic_echarts_dir, 'echarts_bar.png') pic_echarts_path = os.path.join(pic_echarts_dir, temp_file_name)
bar = ( bar = (
Bar().add_xaxis( Bar().add_xaxis(
xaxis_data=keys xaxis_data=keys
...@@ -113,10 +103,10 @@ def pic_echarts_bar( ...@@ -113,10 +103,10 @@ def pic_echarts_bar(
@timeit @timeit
def pic_echarts_line( def pic_echarts_line(
keys: list, dict_values: dict, title=None, keys: list, dict_values: dict, temp_file_name: str, title=None,
x_name=None, y_name=None x_name=None, y_name=None
) -> str: ) -> str:
pic_echarts_path = os.path.join(pic_echarts_dir, 'echarts_line.png') pic_echarts_path = os.path.join(pic_echarts_dir, temp_file_name)
line = ( line = (
Line().add_xaxis( Line().add_xaxis(
xaxis_data=keys xaxis_data=keys
...@@ -193,6 +183,7 @@ def pic_echarts_line_test() -> None: ...@@ -193,6 +183,7 @@ def pic_echarts_line_test() -> None:
@timeit @timeit
def pic_echarts_bar_line( def pic_echarts_bar_line(
temp_file_name: str,
keys=['2016年报', '2017年报', '2018年报', '2019年报', '2020年报', '2021年报'], keys=['2016年报', '2017年报', '2018年报', '2019年报', '2020年报', '2021年报'],
dict_bar_values={ dict_bar_values={
'总资产': [1905.11, 1998.17, 2009.65, 2031.37, 1950.35, 1988.65], '总资产': [1905.11, 1998.17, 2009.65, 2031.37, 1950.35, 1988.65],
...@@ -205,7 +196,7 @@ def pic_echarts_bar_line( ...@@ -205,7 +196,7 @@ def pic_echarts_bar_line(
title='资产负债表(CNY)', title='资产负债表(CNY)',
x_name='年度', y_name_left='金额/(亿元)', y_name_right='负债率/(%)' x_name='年度', y_name_left='金额/(亿元)', y_name_right='负债率/(%)'
) -> str: ) -> str:
pic_echarts_path = os.path.join(pic_echarts_dir, 'echarts_bar_line.png') pic_echarts_path = os.path.join(pic_echarts_dir, temp_file_name)
bar = ( bar = (
Bar().add_xaxis( Bar().add_xaxis(
xaxis_data=keys xaxis_data=keys
...@@ -342,7 +333,8 @@ def pic_echarts_bar_line_test() -> str: ...@@ -342,7 +333,8 @@ def pic_echarts_bar_line_test() -> str:
if __name__ == '__main__': if __name__ == '__main__':
pic_echarts_pie(keys=Faker.choose(), values=Faker.values(), title='Echarts Pie 标题1')
# pic_echarts_pie(keys=Faker.choose(), values=Faker.values(), title='Echarts Pie 标题1')
# pic_echarts_bar( # pic_echarts_bar(
# keys=Faker.choose(), # keys=Faker.choose(),
# dict_values={ # dict_values={
...@@ -363,4 +355,4 @@ if __name__ == '__main__': ...@@ -363,4 +355,4 @@ if __name__ == '__main__':
# y_name='Y轴名称' # y_name='Y轴名称'
# ) # )
# pic_echarts_line_test() # pic_echarts_line_test()
# pic_echarts_bar_line() pic_echarts_bar_line()
...@@ -11,7 +11,7 @@ from flask import request ...@@ -11,7 +11,7 @@ from flask import request
from flask import Flask, send_file from flask import Flask, send_file
# from transform_doc_to_docx import doc2docx, closesoft # from transform_doc_to_docx import doc2docx, closesoft
import subprocess import subprocess
from generate.gen_user_report_auto_generated import main_process from generate.gen_user_report_auto_generated import new_main_process
UPLOAD_FOLDER = r'data' # 上传路径 UPLOAD_FOLDER = r'data' # 上传路径
Path(UPLOAD_FOLDER).mkdir(parents=True, exist_ok=True) Path(UPLOAD_FOLDER).mkdir(parents=True, exist_ok=True)
abs_path = os.path.split(os.path.realpath(__file__))[0] abs_path = os.path.split(os.path.realpath(__file__))[0]
...@@ -64,7 +64,7 @@ def generate_report(template_path, document_path, report_name, object): ...@@ -64,7 +64,7 @@ def generate_report(template_path, document_path, report_name, object):
half_work_path = os.path.join(UPLOAD_FOLDER, template_filename) half_work_path = os.path.join(UPLOAD_FOLDER, template_filename)
else: else:
return "上传文件格式有误,当前仅支持doc 和 docx 格式,请选择正确文件重新上传!" return "上传文件格式有误,当前仅支持doc 和 docx 格式,请选择正确文件重新上传!"
main_process(half_work_path, tables_dict, template_path, report_name, data_object, save_path) # main_process(half_work_path, tables_dict, template_path, report_name, data_object, save_path)
# send_path = os.path.join(UPLOAD_FOLDER, report_name) # send_path = os.path.join(UPLOAD_FOLDER, report_name)
# return send_file(send_path, as_attachment=True) # return send_file(send_path, as_attachment=True)
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File : main_server.py
# @Time : 2023/5/15 15:39
# @Author : bruxelles_li
# @Software: PyCharm
import logging
import requests
import threading
import sys
import json
from pathlib import Path
sys.path.append('../')
# Close surplus connections.
# NOTE(review): `s` is not referenced by any function visible in this file (they call
# `requests.post` directly), so this session setting may have no effect — confirm.
s = requests.session()
s.keep_alive = False
from detector_source import sys_info, cpu_info, mem_info
# File-upload server (FastDFS) client definition.
# NOTE(review): `os`, `time` and `datetime` are used in this file but are not imported
# explicitly in the visible lines; presumably they arrive through this star import — confirm.
from fdfs_client.client import *
tracker_conf = get_tracker_conf("data/fdfs_client.conf")
client = Fdfs_client(tracker_conf)
formatter = logging.Formatter("%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %(message)s")
# Create a logger and set its level.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# Create a handler that writes log records to the console, and set its level.
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
# Create file handlers under ./log next to this file: one receives ERROR and above,
# the other receives INFO and above.
_tmp_path = os.path.dirname(os.path.abspath(__file__))
_tmp_path = os.path.join(_tmp_path, 'log')
Path(_tmp_path).mkdir(parents=True, exist_ok=True)
fh = logging.FileHandler(os.path.join(_tmp_path, "main_server_error.log"))
fh1 = logging.FileHandler(os.path.join(_tmp_path, "main_server_info.log"))
fh.setLevel(level=logging.ERROR)
fh1.setLevel(level=logging.INFO)
fh.setFormatter(formatter)
fh1.setFormatter(formatter)
# Send log records to both the console and the files.
logger.addHandler(ch)
logger.addHandler(fh)
logger.addHandler(fh1)
# List holding the worker threads started by system_start().
all_thread = []
"""
测试地址:http://114.115.185.13:9988/datapull/aiReport/report/callbackStatus
正式地址:http://192.168.1.70:9988/datapull/aiReport/report/callbackStatus
"""
# callback_url = "http://114.115.185.13:9988/datapull/aiReport/report/callbackStatus"
callback_url = "http://192.168.1.70:9988/datapull/aiReport/report/callbackStatus"
# todo: 合并段落和句子去重处理
def _post_status_callback(dict_result):
    """Send a status payload to the Java status-update endpoint and log its reply."""
    payload = json.dumps(dict_result)
    headers = {
        'Content-Type': 'application/json'
    }
    r1 = requests.post(url=callback_url,
                       headers=headers, data=payload)
    logger.info(r1.text)


def main_process(half_document_path, tables_dict, template_path, data_object, report_id,
                 output_report_path, report_name, template_id):
    """Generate one report, upload it, and report the outcome to the callback URL.

    The half-finished document and the template are loaded with python-docx and handed
    to ``new_main_process``; the resulting document is post-processed, saved under
    ``data/`` with a timestamped name, uploaded through the FastDFS ``client``, and the
    result is POSTed to ``callback_url``.

    :param half_document_path: path of the half-finished .docx document
    :param tables_dict: table data forwarded unchanged to ``new_main_process``
    :param template_path: path of the .docx template
    :param data_object: data forwarded unchanged to ``new_main_process``
    :param report_id: report id, echoed back in the callback payload as ``id``
    :param output_report_path: output path forwarded to ``new_main_process``
    :param report_name: name appended to the timestamp to build the saved file name
    :param template_id: template id, echoed back in the callback payload as ``templeteId``
    :return: the status dict that was sent (``status`` "0" on success, "1" on failure)
    """
    try:
        # Imported lazily so that an import failure is reported through the callback too.
        from generate.gen_user_report_auto_generated import new_main_process
        from docx import Document
        half_document = Document(half_document_path)
        template_document = Document(template_path)
        report_processed_path = new_main_process(half_document=half_document,
                                                 tables_dict=tables_dict,
                                                 template_document=template_document,
                                                 data_object=data_object,
                                                 report_id=report_id,
                                                 output_report_path=output_report_path)
        report_processed = Document(report_processed_path)
        from extract_table import read_document, replace_document
        # Presumably removes non-breaking spaces (shown as circles in tables) — confirm
        # against extract_table.read_document.
        report_processed = read_document(report_processed, "\u00A0", "")
        # Presumably blanks out missing-value placeholders — confirm against
        # extract_table.replace_document.
        report_processed = replace_document(report_processed)
        final_report_name = '{}_{}.docx'.format(str(datetime.datetime.now().strftime('%Y%m%d%H%M%S')), report_name)
        UPLOAD_FOLDER = r'data/'  # upload staging directory
        Path(UPLOAD_FOLDER).mkdir(parents=True, exist_ok=True)
        send_path = os.path.join(UPLOAD_FOLDER, final_report_name)
        report_processed.save(send_path)
        ret_upload = client.upload_by_filename(send_path)
        remote_file_id = ret_upload["Remote file_id"]
        logger.info(remote_file_id)
        # The id has been observed as bytes, e.g.
        # b'group1/M00/00/0A/wKjIlGRjcHiAVnTuAAEo5wnGJLQ89.docx'.
        # Decode it instead of trimming the repr, which would also eat a leading or
        # trailing "b" and any quote characters of a plain-str id.
        if isinstance(remote_file_id, bytes):
            filePath = remote_file_id.decode("utf-8").strip()
        else:
            filePath = str(remote_file_id).strip()
        logger.info(filePath)
        dict_result = {
            "status": "0",  # processing status ("0" success, "1" failure)
            "result": "处理成功",
            "id": report_id,  # report id
            "templeteId": template_id,  # report template
            "filePath": filePath
        }
        logger.info(dict_result)
        # Tell the Java side the report is ready. A failure here falls through to the
        # except branch below, exactly as before.
        _post_status_callback(dict_result)
    except Exception as e:
        # Keep the full traceback in the log files; the payload only carries str(e).
        logger.exception("report generation failed, report_id=%s", report_id)
        dict_result = {
            "status": "1",  # processing status ("0" success, "1" failure)
            "result": "生成失败!+ {}".format(str(e)),
            "id": report_id,  # report id
            "templeteId": template_id,  # report template
            "filePath": ""
        }
        try:
            _post_status_callback(dict_result)
        except requests.exceptions.RequestException:
            # The callback endpoint itself is unreachable: record it rather than letting
            # the worker thread die with an error that never reaches the log files.
            logger.exception("status callback failed, report_id=%s", report_id)
    return dict_result
def _usage_percent(reading):
    """Convert a usage reading (a number, or a numeric string with optional '%') to float."""
    if isinstance(reading, (int, float)):
        return float(reading)
    return float(str(reading).strip().rstrip('%'))


def env_eval():
    """Report whether the host currently has enough free resources for another job.

    Reads system, CPU and memory usage from ``sys_info``, ``cpu_info`` and ``mem_info``.
    CPU and memory readings are compared numerically against 95; the previous
    string comparison (``cpu_usage > str(95)``) was lexicographic, so a reading such as
    "100.0" sorted below "95" and was treated as acceptable.

    NOTE(review): assumes cpu_info()/mem_info() return a percentage as a number or a
    numeric string (optionally ending in '%') — confirm against detector_source.

    :return: False when any reading exceeds its threshold, True otherwise
    """
    # disk_usage = disk_info()
    sys_usage = sys_info()
    cpu_usage = _usage_percent(cpu_info())
    men_usage = _usage_percent(mem_info())
    # Not enough resources: any one reading over its limit is sufficient.
    if sys_usage > 20 or cpu_usage > 95 or men_usage > 95:
        return False
    # Enough resources.
    return True
def system_start():
    """Poll the local task queue forever and run each queued report job in a thread.

    Each round asks ``/queue_size`` for the number of pending tasks. When the queue is
    empty the loop sleeps 6 seconds; otherwise it pulls that many tasks from
    ``/subject_consumer`` and starts one daemon thread per task running
    ``main_process``. A thread is only started once ``env_eval()`` reports that
    resources are available. This function never returns.
    """
    # The same headers are used for every request, so build them once.
    headers = {
        'Content-Type': 'application/json'
    }
    while True:
        r1 = requests.post(url='http://localhost:4000/queue_size', headers=headers)
        r1_json = json.loads(r1.text)
        queue_left_number = r1_json['queue_left_number']
        logger.info("当前队列任务总数:" + str(queue_left_number))
        if queue_left_number == 0:
            time.sleep(6)
            continue
        # The queue is not empty: process the reports that were pending at poll time.
        for i in range(queue_left_number):
            r2 = requests.post(url='http://localhost:4000/subject_consumer', headers=headers)
            r2_json = json.loads(r2.text)
            config_info = r2_json['data']
            logger.info(config_info)
            report_id = config_info["report_id"]
            template_id = config_info["template_id"]
            half_document_path = config_info["half_document_path"]
            tables_dict = config_info["tables_dict"]
            template_path = config_info["template_path"]
            data_object = config_info["data_object"]
            output_report_path = config_info["output_report_path"]
            report_name = config_info["report_name"]
            logger.info('##########报告生成服务###############')
            t = threading.Thread(target=main_process, args=(half_document_path, tables_dict, template_path,
                                                            data_object, report_id, output_report_path,
                                                            report_name, template_id),
                                 daemon=True)
            # Hold the job back until the host has spare capacity.
            while not env_eval():
                time.sleep(6)
            # Forget workers that have already finished; otherwise this list grows for
            # as long as the service runs. Mutated in place so the module-level name
            # keeps pointing at the same list.
            all_thread[:] = [worker for worker in all_thread if worker.is_alive()]
            # Start the worker.
            t.start()
            all_thread.append(t)
def system_resume():
    """
    Drain whatever is left in the local task queue before the service starts.

    Queries ``/queue_size``; if tasks are pending, keeps calling ``/subject_consumer``
    until it reports ``queue_left_number == 0``. The consumed tasks are discarded,
    which avoids re-running work after a restart.
    :return: None
    """
    json_headers = {
        'Content-Type': 'application/json'
    }
    # Empty the queue held by the running server so a restart does not repeat work.
    size_reply = requests.post(url='http://localhost:4000/queue_size', headers=json_headers).json()
    pending = size_reply['queue_left_number']
    logger.info('当前队列数量:%d' % pending)

    if not (pending > 0):
        logger.info('队列为空!可放心进行数据去重入库服务 ...')
        return

    logger.info('正在消费队列,直到队列为空!')
    drained = False
    while not drained:
        consume_reply = requests.post(url='http://localhost:4000/subject_consumer', headers=json_headers).json()
        drained = consume_reply['queue_left_number'] == 0
    logger.info('队列消费完毕!可放心进行数据去重入库服务 ...')
def start_up_check():
    """
    Pre-start check: verify that the local queue server is reachable.

    Sends one request to ``/queue_size``. If the connection is refused the error is
    logged and the process exits with status 123; otherwise a success message is logged
    and the function returns.

    The previous ``while True`` loop could only ever run once (it either broke out on
    success or exited the process on failure), so it has been removed along with the
    unused response variable and status flag. ``sys.exit`` replaces the ``exit`` helper,
    which is installed by the ``site`` module for interactive use and is not guaranteed
    to exist in every run mode.
    :return: None
    """
    headers = {
        'Content-Type': 'application/json'
    }
    try:
        # Any completed HTTP exchange means the server is up; the reply is not inspected.
        requests.post(url='http://localhost:4000/queue_size', headers=headers)
    except requests.exceptions.ConnectionError as e:
        logger.error("Error: ConnectionError" + str(e))
        logger.warning('服务未启动,请先启动server! 程序已退出。')
        sys.exit(123)
    logger.info("server启动成功!报告生成服务已启动...")
if __name__ == '__main__':
    # Start the background processing service.
    # First make sure the queue server is reachable (exits the process otherwise).
    start_up_check()
    logger.info('报告生成服务恢复中 ...')
    # Drain any tasks left in the queue from a previous run.
    system_resume()
    # Pause 30 seconds before entering the polling loop.
    time.sleep(30)
    logger.info('报告生成服务恢复完成!')
    logger.info('报告生成服务运行中 ...')
    # Enter the polling loop; this call does not return.
    system_start()
...@@ -16,8 +16,8 @@ timeout = 300 # 超时 -> 目前为迎合ZZSN_NLP平 ...@@ -16,8 +16,8 @@ timeout = 300 # 超时 -> 目前为迎合ZZSN_NLP平
# worker_class = 'gevent' # 使用gevent模式,还可以使用sync 模式,默认的是sync模式 # worker_class = 'gevent' # 使用gevent模式,还可以使用sync 模式,默认的是sync模式
# workers = multiprocessing.cpu_count() # 进程数 12 # workers = multiprocessing.cpu_count() # 进程数 12
workers = 3 # 低资源 13G 服务器负载过大可调整此处为 1 workers = 1 # 低资源 13G 服务器负载过大可调整此处为 1
threads = 50 # 指定每个进程开启的线程数 threads = 10 # 指定每个进程开启的线程数
loglevel = 'error' # 日志级别,这个日志级别指的是错误日志的级别,而访问日志的级别无法设置 loglevel = 'error' # 日志级别,这个日志级别指的是错误日志的级别,而访问日志的级别无法设置
access_log_format = '%(t)s %(p)s %(h)s "%(r)s" %(s)s %(L)s %(b)s %(f)s" "%(a)s"' # 设置gunicorn访问日志格式,错误日志无法设置 access_log_format = '%(t)s %(p)s %(h)s "%(r)s" %(s)s %(L)s %(b)s %(f)s" "%(a)s"' # 设置gunicorn访问日志格式,错误日志无法设置
......
...@@ -2,14 +2,20 @@ ...@@ -2,14 +2,20 @@
# description: auto_run # description: auto_run
# 四川报告生成监控 # 四川报告生成监控
# 检测脚本是否在运行,若已经在运行,则等待一段时间后再次检查,若未启动则进行启动 # 检测脚本是否在运行,若已经在运行,则等待一段时间后再次检查,若未启动则进行启动
function start_interface() { # echo "114.115.185.13 dfs" >>/etc/hosts
#echo "192.168.200.148 dfs" >>/etc/hosts # 测试环境
echo "192.168.1.75 dfs" >>/etc/hosts # 正式环境
start_interface() {
INTERFACE_IS_STRAT=`ps -ef | grep scbg_app_config.py | grep -v grep | wc -l` INTERFACE_IS_STRAT=`ps -ef | grep scbg_app_config.py | grep -v grep | wc -l`
if [ $INTERFACE_IS_STRAT -eq 4 ] ; then if [ $INTERFACE_IS_STRAT -ne 0 ] ; then
usleep sleep 30m
else else
echo "=========Service Will Start==========" echo "=========Service Will Start=========="
# cd /data/lzc/scbg-python/SCBG-PYTHON && nohup gunicorn -c scbg_app_config.py app_run:app 2>&1 & # cd /data/lzc/scbg-python/SCBG-PYTHON && nohup gunicorn -c scbg_app_config.py app_run:app 2>&1 &
cd /opt/SCBG-PYTHON && exec nohup gunicorn -c scbg_app_config.py app_run:app 2>&1 & cd /opt/SCBG-PYTHON
exec nohup gunicorn -c scbg_app_config.py app_run:app 2>&1 &
sleep 5m
exec nohup python -u main_server.py >main_server.log 2>&1 &
echo "=========Service Start Completed!========" echo "=========Service Start Completed!========"
fi fi
......
...@@ -42,4 +42,4 @@ def doc2docx(path): ...@@ -42,4 +42,4 @@ def doc2docx(path):
if __name__ == '__main__': if __name__ == '__main__':
closesoft() closesoft()
doc2docx(r'D:\四川报告\相关代码\从word中提取指定表格\data\特殊教育学校(1).doc') doc2docx(r'D:/四川报告/相关代码/从word中提取指定表格/data/四川省化工作质量安全检测研究院2022年度部门决算分析报告(1).doc')
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论