提交 959035c6 作者: chenshiqiang

add filter typeId

上级 a4afd31e
......@@ -183,6 +183,19 @@
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-thymeleaf</artifactId>
</dependency>
<!--word-html处理工具-->
<dependency>
<groupId>com.aspose</groupId>
<artifactId>aspose-words</artifactId>
<version>15.12.0</version>
<scope>system</scope>
<systemPath>${basedir}/lib/aspose-words-15.12.0-jdk16.jar</systemPath>
</dependency>
<dependency>
<groupId>org.fusesource.hawtbuf</groupId>
<artifactId>hawtbuf</artifactId>
<version>1.11</version>
</dependency>
</dependencies>
......
......@@ -37,6 +37,8 @@ import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.*;
import java.util.stream.Collectors;
......@@ -90,10 +92,11 @@ class KnowledgeServiceImpl implements IKnowledgeService {
List<String> contentStringList = new ArrayList<>();
String html = null;
try {
html = DocUtil.docParseHtml(filesStorage + knowledge.getFiles().get(0).getFilePath());
File file = new File(filesStorage + knowledge.getFiles().get(0).getFilePath());
html =DocUtil.convertDocStream2Html(new FileInputStream(file));
String htmlWithTable = html.replace("<p>", "");
contentStringList = Arrays.asList(htmlWithTable.split("</p>"));
} catch (IOException e) {
} catch (Exception e) {
e.printStackTrace();
}
List<String> messageContentList = contentStringList.stream()
......
package com.zzsn.knowbase.util;
import cn.hutool.core.io.IoUtil;
import com.aspose.words.*;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.SystemUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.core.io.ClassPathResource;
import org.springframework.core.io.Resource;
import org.springframework.stereotype.Component;
import org.zwobble.mammoth.DocumentConverter;
import org.zwobble.mammoth.Result;
import org.fusesource.hawtbuf.DataByteArrayOutputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* 文档工具
......@@ -18,6 +31,236 @@ import java.util.Set;
@Component
public class DocUtil {
// SLF4J logger shared by the static helpers in this class.
private static final Logger logger = LoggerFactory.getLogger(DocUtil.class);
// Font directory handed to FontSettings.setFontsFolder by the HTML converters when running on Linux.
private static final String fontsPath = "/usr/share/fonts";
// static{
// System.setProperty("java.io.tmpdir", "F:\\temp_file\\temp");
// }
/**
 * Escapes the five XML special characters in {@code str}, then passes the
 * result through {@code filterUnicode} and {@code filterASCII}.
 *
 * @param str text to sanitize; may be null or blank
 * @return the sanitized text, or an empty string when {@code str} is blank
 */
public static String getValueAfterReplaceSpecialWord(String str) {
    if (StringUtils.isBlank(str)) {
        return "";
    }
    // '&' is escaped first so the entities produced below are not escaped a second time.
    String escaped = str.replace("&", "&amp;");
    escaped = escaped.replace("<", "&lt;");
    escaped = escaped.replace(">", "&gt;");
    escaped = escaped.replace("\"", "&quot;");
    escaped = escaped.replace("'", "&apos;");
    String withoutFilteredUnicode = filterUnicode(escaped);
    return filterASCII(withoutFilteredUnicode);
}
/**
 * Replaces every ASCII control character (code below 0x20, or 0x7F) in
 * {@code source} with a space.
 *
 * @param source text to clean; may be null
 * @return the cleaned text, or an empty string when {@code source} is null or empty
 */
public static String filterASCII(String source) {
    if (source == null || source.isEmpty()) {
        return "";
    }
    StringBuilder cleaned = new StringBuilder(source.length());
    for (int idx = 0; idx < source.length(); idx++) {
        char current = source.charAt(idx);
        boolean isControl = current < 0x20 || current == 0x7F;
        cleaned.append(isControl ? ' ' : current);
    }
    return cleaned.toString();
}
/**
 * Loads the Aspose license from the {@code license.xml} classpath resource
 * and applies it.
 *
 * @return {@code true} when the license was applied, {@code false} when
 *         reading or applying it threw an exception
 */
public static boolean getLicense() {
    // try-with-resources: the license stream is closed on success and on failure.
    try (InputStream is = new ClassPathResource("license.xml").getInputStream()) {
        License aposeLic = new License();
        aposeLic.setLicense(is);
        return true;
    } catch (Exception e) {
        // Best effort: callers learn about the failure through the return value.
        e.printStackTrace();
        return false;
    }
}
/**
 * Converts a Word document stream into HTML text.
 *
 * <p>{@code inputStream} is always closed before this method returns,
 * including when the document cannot be loaded.
 *
 * @param inputStream Word document content; closed by this method
 * @return the generated HTML decoded as UTF-8, or an empty string when
 *         saving the document as HTML fails
 * @throws Exception if the document cannot be loaded from {@code inputStream}
 */
public static String convertDocStream2Html(InputStream inputStream) throws Exception {
    ByteArrayOutputStream htmlStream = new ByteArrayOutputStream();
    try {
        getLicense();
        Document doc = new Document(inputStream);

        HtmlSaveOptions saveOptions = new HtmlSaveOptions(SaveFormat.HTML);
        // Headers and footers are left out of the HTML output.
        saveOptions.setExportHeadersFootersMode(ExportHeadersFootersMode.NONE);
        // Styles are written inline on the elements.
        saveOptions.setCssStyleSheetType(CssStyleSheetType.INLINE);
        // Images are embedded as Base64 so the result is a single self-contained string.
        saveOptions.setExportImagesAsBase64(true);
        saveOptions.setExportPageSetup(true);
        saveOptions.setExportTextInputFormFieldAsText(true);
        // High-quality rendering (the original comment notes this is the slower algorithm).
        saveOptions.setUseHighQualityRendering(true);
        // Table, row and cell widths are exported in relative units only.
        saveOptions.setTableWidthOutputMode(HtmlElementSizeOutputMode.RELATIVE_ONLY);
        saveOptions.setExportTocPageNumbers(true);
        saveOptions.setAllowNegativeIndent(true);
        saveOptions.setScaleImageToShapeSize(true);

        try {
            doc.save(htmlStream, saveOptions);
            return new String(htmlStream.toByteArray(), "UTF-8");
        } catch (Exception e) {
            // Best effort, as before: a failed save yields an empty result instead of an exception.
            e.printStackTrace();
            return "";
        }
    } finally {
        // Runs for load failures too, so the caller's stream never leaks.
        IoUtil.close(htmlStream);
        IoUtil.close(inputStream);
    }
}
/**
 * Renders HTML content into a DOCX document.
 *
 * @param content HTML to insert; wrapped by {@code convertDocHtml2Page}
 *                unless {@code isPage} is true
 * @param isPage  {@code true} when {@code content} should be inserted as-is;
 *                {@code null} is treated as {@code false}
 * @return exactly the bytes written for the DOCX document
 * @throws Exception if building or saving the document fails
 */
public static byte[] convertDocHtml2Doc(String content, Boolean isPage) throws Exception {
    getLicense();
    if (SystemUtils.IS_OS_LINUX) {
        logger.info("doc set font folder");
        FontSettings.setFontsFolder(fontsPath, false);
    }
    Document doc = new Document();
    DocumentBuilder builder = new DocumentBuilder(doc);
    // Boolean.TRUE.equals avoids a NullPointerException from unboxing a null flag.
    if (!Boolean.TRUE.equals(isPage)) {
        content = convertDocHtml2Page(content);
    }
    builder.insertHtml(content);
    // toByteArray() returns only the bytes written; the previous
    // DataByteArrayOutputStream.getData() exposed the whole backing buffer,
    // which can carry unused trailing bytes.
    ByteArrayOutputStream docStream = new ByteArrayOutputStream();
    doc.save(docStream, SaveFormat.DOCX);
    return docStream.toByteArray();
}
/**
 * Renders HTML content and saves the resulting document in PNG format.
 *
 * @param content HTML to insert; wrapped by {@code convertDocHtml2Page}
 *                unless {@code isPage} is true
 * @param isPage  {@code true} when {@code content} should be inserted as-is;
 *                {@code null} is treated as {@code false}
 * @return exactly the bytes written for the PNG output
 * @throws Exception if building or saving the document fails
 */
public static byte[] convertDocHtml2Png(String content, Boolean isPage) throws Exception {
    getLicense();
    if (SystemUtils.IS_OS_LINUX) {
        logger.info("cover png set font folder");
        FontSettings.setFontsFolder(fontsPath, false);
    }
    Document doc = new Document();
    DocumentBuilder builder = new DocumentBuilder(doc);
    // Boolean.TRUE.equals avoids a NullPointerException from unboxing a null flag.
    if (!Boolean.TRUE.equals(isPage)) {
        content = convertDocHtml2Page(content);
    }
    builder.insertHtml(content);
    // toByteArray() returns only the bytes written; the previous
    // DataByteArrayOutputStream.getData() exposed the whole backing buffer,
    // which can carry unused trailing bytes.
    ByteArrayOutputStream pngStream = new ByteArrayOutputStream();
    doc.save(pngStream, SaveFormat.PNG);
    return pngStream.toByteArray();
}
/**
 * Renders HTML content into a PDF document.
 *
 * @param content HTML to insert; wrapped by {@code convertDocHtml2Page}
 *                unless {@code isPage} is true
 * @param isPage  {@code true} when {@code content} should be inserted as-is
 * @return exactly the bytes written for the PDF document
 * @throws Exception if building or saving the document fails
 */
public static byte[] convertDocHtml2Pdf(String content, boolean isPage) throws Exception {
    getLicense();
    if (SystemUtils.IS_OS_LINUX) {
        logger.info("pdf set font folder");
        FontSettings.setFontsFolder(fontsPath, false);
    }
    Document doc = new Document();
    DocumentBuilder builder = new DocumentBuilder(doc);
    if (!isPage) {
        content = convertDocHtml2Page(content);
    }
    builder.insertHtml(content);
    // toByteArray() returns only the bytes written; the previous
    // DataByteArrayOutputStream.getData() exposed the whole backing buffer,
    // which can carry unused trailing bytes.
    ByteArrayOutputStream pdfStream = new ByteArrayOutputStream();
    doc.save(pdfStream, SaveFormat.PDF);
    return pdfStream.toByteArray();
}
/**
 * Wraps a rich-text HTML fragment in a complete HTML page: a head with
 * UTF-8 meta tags, a fixed title and a default body font stack, followed by
 * the fragment as the body.
 *
 * @param content HTML fragment to wrap; may be null or blank
 * @return the full page markup, or an empty string when {@code content} is blank
 */
public static String convertDocHtml2Page(String content) {
    if (StringUtils.isBlank(content)) {
        return "";
    }
    // Default fonts: SimSun, FangSong, Microsoft YaHei.
    final String pageHead = "<html><head>"
            + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />"
            + "<meta http-equiv=\"Content-Style-Type\" content=\"text/css\" />"
            + "<title>数智化报告</title>"
            + "<style type=\"text/css\">"
            + "body { font-family:SimSun, FangSong, Microsoft YaHei; }"
            + "</style>"
            + "</head><body>";
    final String pageTail = "</body></html>";
    return pageHead + content + pageTail;
}
/**
 * Escapes the five XML special characters in {@code str}, then passes the
 * result through {@code filterUnicode} and {@code filterASCIINotEnter}, so
 * line-break characters are kept.
 *
 * @param str text to sanitize; may be null or empty
 * @return the sanitized text, or an empty string when {@code str} is null or empty
 */
public static String getValueAfterReplaceSpecialWordNotEnter(String str) {
    if (org.apache.commons.lang3.StringUtils.isEmpty(str)) {
        return "";
    }
    // '&' is escaped first so the entities produced below are not escaped a second time.
    String escaped = str.replace("&", "&amp;");
    escaped = escaped.replace("<", "&lt;");
    escaped = escaped.replace(">", "&gt;");
    escaped = escaped.replace("\"", "&quot;");
    escaped = escaped.replace("'", "&apos;");
    String withoutFilteredUnicode = filterUnicode(escaped);
    return filterASCIINotEnter(withoutFilteredUnicode);
}
/**
 * Replaces ASCII control characters (code below 0x20, or 0x7F) with a space,
 * leaving line feed (0x0A) and carriage return (0x0D) untouched.
 *
 * @param source text to clean; may be null
 * @return the cleaned text, or an empty string when {@code source} is blank
 */
public static String filterASCIINotEnter(String source) {
    if (org.apache.commons.lang3.StringUtils.isBlank(source)) {
        return "";
    }
    StringBuilder cleaned = new StringBuilder(source.length());
    for (char current : source.toCharArray()) {
        boolean isLineBreak = current == 0x0A || current == 0x0D;
        boolean isControl = current < 0x20 || current == 0x7F;
        // Line breaks are control characters too, but they are deliberately preserved.
        cleaned.append(isControl && !isLineBreak ? ' ' : current);
    }
    return cleaned.toString();
}
/** Characters removed by {@link #filterUnicode(String)}; compiled once instead of on every call. */
private static final Pattern FILTERED_UNICODE_PATTERN = Pattern.compile("([\\u007f-\\u009f]|\\u00ad|[\\u0483-\\u0489]|[\\u0559-\\u055a]|\\u058a|[\\u0591-\\u05bd]|\\u05bf|[\\u05c1-\\u05c2]|[\\u05c4-\\u05c7]|[\\u0606-\\u060a]|[\\u063b-\\u063f]|\\u0674|[\\u06e5-\\u06e6]|\\u070f|[\\u076e-\\u077f]|\\u0a51|\\u0a75|\\u0b44|[\\u0b62-\\u0b63]|[\\u0c62-\\u0c63]|[\\u0ce2-\\u0ce3]|[\\u0d62-\\u0d63]|\\u135f|[\\u200b-\\u200f]|[\\u2028-\\u202e]|\\u2044|\\u2071|[\\uf701-\\uf70e]|[\\uf710-\\uf71a]|\\ufb1e|[\\ufc5e-\\ufc62]|\\ufeff|\\ufffc)");

/**
 * Removes every character matched by the filter pattern from {@code source}.
 *
 * @param source text to clean; may be null
 * @return {@code source} without the filtered characters, or an empty string
 *         when {@code source} is null
 */
public static String filterUnicode(String source) {
    if (source == null) {
        // Same convention as filterASCII: a null input yields an empty string instead of an NPE.
        return "";
    }
    Matcher matcher = FILTERED_UNICODE_PATTERN.matcher(source);
    return matcher.replaceAll("");
}
/**
* 文档解析
......
<License>
<Data>
<Products>
<Product>Aspose.Total for Java</Product>
<Product>Aspose.Words for Java</Product>
</Products>
<EditionType>Enterprise</EditionType>
<SubscriptionExpiry>20991231</SubscriptionExpiry>
<LicenseExpiry>20991231</LicenseExpiry>
<SerialNumber>8bfe198c-7f0c-4ef8-8ff0-acc3237bf0d7</SerialNumber>
</Data>
<Signature>sNLLKGMUdF0r8O1kKilWAGdgfs2BvJb/2Xp8p5iuDVfZXmhppo+d0Ran1P9TKdjV4ABwAgKXxJ3jcQTqE/2IRfqwnPf8itN8aFZlV3TJPYeD3yWE7IT55Gz6EijUpC7aKeoohTb4w2fpox58wWoF3SNp6sK6jDfiAUGEHYJ9pjU=</Signature>
</License>
\ No newline at end of file
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论