package com.zzsn.knowbase.util;

import com.alibaba.fastjson.JSONObject;
import com.baomidou.mybatisplus.core.toolkit.IdWorker;
import com.obs.services.model.PutObjectResult;
import com.zzsn.knowbase.constant.Constants;
import com.zzsn.knowbase.constant.DirEnum;
import com.zzsn.knowbase.entity.AiReportScienceFile;
import com.zzsn.knowbase.entity.AiReportScienceFileMaterial;
import com.zzsn.knowbase.vo.DocEntity;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.apache.poi.xwpf.usermodel.*;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Entities;
import org.jsoup.select.Elements;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
import org.springframework.beans.BeansException;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.multipart.MultipartFile;

import javax.annotation.Resource;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


/**
 * @Description: 报告素材工具类
 * @Version: V1.0
 */
@Slf4j
public class ReportUtil {
//    @Autowired
//    private static ObsUtil obsUtil;
//    @Resource
//    private static StreamBridge streamBridge;

    /**
     * Turns a flat, ordered list of HTML fragments into a list of outline nodes.
     *
     * <p>Each returned map holds the keys {@code id} (generated id string), {@code text}
     * (the original fragment), {@code level} (heading level 1-9, or 100 for anything that
     * is not a heading), {@code parent} (id of the closest preceding heading with a smaller
     * level, or {@code "0"} when there is none), {@code contentType} and {@code sort}
     * (1-based position among the emitted nodes).
     *
     * <p>Blank fragments are treated as plain content (level 100); previously they caused a
     * {@link NullPointerException} because {@code getLevel} returns {@code null} for them.
     * {@code null} entries carry no content and are skipped.
     *
     * @param originList HTML fragments in document order; may be {@code null}
     * @return one node map per non-null fragment, in input order; never {@code null}
     */
    public static List<Map<String, Object>> getList(List<String> originList) {
        if (originList == null) {
            return new ArrayList<>();
        }

        // Headings that are still "open" above the fragment being processed, innermost on top.
        Stack<Map<String, Object>> stack = new Stack<>();

        // Initial placeholder entry; it is popped as soon as the first fragment is processed
        // because every real level is smaller than 1000.
        Map<String, Object> placeholder = new HashMap<>();
        placeholder.put("id", null);
        placeholder.put("text", null);
        placeholder.put("level", 1000);
        stack.push(placeholder);

        List<Map<String, Object>> list = new ArrayList<>(originList.size());
        int sort = 1;
        for (String text : originList) {
            if (text == null) {
                continue;
            }
            Integer level = getLevel(text);

            Map<String, Object> map = new HashMap<>();
            map.put("id", IdWorker.getIdStr());
            map.put("text", text);
            // getLevel yields null for blank text; treat such fragments as non-heading content.
            map.put("level", level == null ? 100 : level);
            getParent(map, stack);
            if (canPush(map, stack)) {
                stack.push(map);
            }
            map.put("contentType", getContentType(text));
            map.put("sort", sort++);
            list.add(map);
        }

        stack.clear();
        return list;
    }

    /**
     * Derives the content type of an HTML fragment from the tag it starts with.
     *
     * <p>The check is a plain prefix match performed in a fixed order (p, h1-h9, img, table),
     * so any tag whose name merely begins with {@code p} (for example {@code <pre>}) is
     * reported as {@code "p"}.
     *
     * @param text HTML fragment; must not be {@code null}
     * @return {@code "p"}, {@code "h1"} ... {@code "h9"}, {@code "img"}, {@code "table"},
     *         or an empty string when the fragment starts with none of these tags
     */
    private static String getContentType(String text){
        // Order matters: the first prefix that matches wins.
        String[] knownTags = {
                "p", "h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "h9", "img", "table"
        };
        for (String tagName : knownTags) {
            if (text.startsWith("<" + tagName)) {
                return tagName;
            }
        }
        return "";
    }
    /**
     * Maps a content type name to its Chinese display label.
     *
     * <p>{@code "p"} and every unrecognised value give the "content" label; any other value
     * containing the letter {@code h} gives the "heading" label; {@code "img"} and
     * {@code "table"} give the "image" and "table" labels.
     *
     * @param text content type name; must not be {@code null}
     * @return the display label for the given type
     */
    private static String getTextType(String text){
        if (!text.equals("p")) {
            // The heading test is a substring check and deliberately precedes img/table.
            if (text.contains("h")) {
                return "标题";
            }
            if (text.equals("img")) {
                return "图片";
            }
            if (text.equals("table")) {
                return "表格";
            }
        }
        return "内容";
    }
    /**
     * Maps a numeric file type code to its Chinese display label.
     *
     * <p>Codes: 0 = journal, 1 = doctoral dissertation, 2 = master's thesis, 3 = book,
     * 4 = research report. Unknown codes and {@code null} fall back to the journal label
     * (a {@code null} code previously failed with a {@link NullPointerException} on unboxing).
     *
     * @param type file type code; may be {@code null}
     * @return the display label for the code, never {@code null}
     */
    private static String getFileType(Integer type){
        if (type == null) {
            return "期刊";
        }
        switch (type) {
            case 1:
                return "博士论文";
            case 2:
                return "硕士论文";
            case 3:
                return "图书";
            case 4:
                return "研报";
            default:
                // Covers code 0 as well as every unrecognised code.
                return "期刊";
        }
    }

    /**
     * Decides whether a node may be pushed onto the stack of open headings.
     *
     * <p>Anything may be pushed onto an empty stack. Otherwise the node must be a heading
     * (its text starts with {@code <h}) whose level is strictly greater than the level of
     * the entry currently on top.
     *
     * @param source node map carrying the {@code text} and {@code level} keys
     * @param stack  stack of currently open nodes
     * @return {@code true} when the node should be pushed
     */
    private static boolean canPush(Map<String, Object> source, Stack<Map<String, Object>> stack){
        if (stack.isEmpty()) {
            return true;
        }

        String fragment = (String) source.get("text");
        boolean isHeading = fragment.startsWith("<h");
        if (!isHeading) {
            return false;
        }

        int topLevel = (int) stack.peek().get("level");
        int candidateLevel = (int) source.get("level");
        return candidateLevel > topLevel;
    }

    /**
     * Assigns the {@code parent} key of a node based on the stack of open headings.
     *
     * <p>Entries whose level is greater than or equal to the node's level are popped from
     * the stack. The node's parent is then the id of the entry left on top, or {@code "0"}
     * when the stack has been emptied (or was empty to begin with).
     *
     * <p>Fixes the original empty-stack path, which set {@code parent} to {@code "0"} but
     * then fell through to {@code stack.peek()} and threw {@code EmptyStackException}.
     *
     * @param source node map; must contain a non-null integer {@code level} unless the
     *               stack is empty
     * @param stack  stack of currently open nodes; entries that cannot be an ancestor of
     *               {@code source} are removed from it
     */
    private static void getParent(Map<String, Object> source, Stack<Map<String, Object>> stack){
        if (stack.isEmpty()) {
            source.put("parent", "0");
            return;
        }

        int sourceLevel = (int) source.get("level");

        // Unwind until the top entry is a strict ancestor (smaller level) or nothing is left.
        while (!stack.isEmpty() && sourceLevel <= (int) stack.peek().get("level")) {
            stack.pop();
        }

        String parentId = stack.isEmpty() ? "0" : (String) stack.peek().get("id");
        source.put("parent", parentId);
    }


    /**
     * Determines the outline level of an HTML fragment from its leading heading tag.
     *
     * <p>A fragment starting with {@code <h1} ... {@code <h9} (with or without attributes)
     * yields 1 ... 9. Every other non-blank fragment yields 100, which marks non-heading
     * content.
     *
     * <p>Fixes the original switch, which lacked the {@code "<h6"} case and therefore
     * reported 100 for an {@code h6} tag carrying attributes.
     *
     * @param text HTML fragment; may be {@code null}
     * @return 1-9 for headings, 100 for other content, or {@code null} when the text is blank
     */
    private static Integer getLevel(String text) {
        if (StringUtils.isBlank(text)) {
            return null;
        }
        // The level is the single digit that directly follows "<h".
        if (text.length() >= 3 && text.startsWith("<h")) {
            char digit = text.charAt(2);
            if (digit >= '1' && digit <= '9') {
                return digit - '0';
            }
        }
        return 100;
    }
    /**
     * Parses an HTML string, promotes paragraphs that look like numbered section titles to
     * heading elements, and returns the resulting elements in selection order.
     *
     * <p>Elements whose markup contains {@code <img} or {@code <table} are returned untouched.
     * Other elements with empty text are dropped. For the rest, a short text (fewer than 50
     * characters) that starts with a Chinese numeral, a bracket or a digit is passed to
     * {@link #getTilteNum(String)} to obtain a numbering-pattern key. The first time a key is
     * seen it is assigned the next heading level (1, 2, 3, ...); later titles with the same
     * key reuse that level. The element is then renamed to the matching {@code h<level>} tag.
     *
     * @param text HTML source
     * @return the selected elements, with detected titles converted to heading tags
     */
    public static Elements getDirectory(String text) {
        // Parse the HTML string
        Document doc = Jsoup.parse(text);
        // Select paragraphs without images, images, tables and existing h1-h4 headings
        Elements elements = doc.select("p:not(:has(img)),p > img,img,table,h1,h2,h3,h4");
        Elements elements1 = new Elements();
        // NOTE(review): hh and startflag are never read in this method.
        String hh = "";
        // Collect the results and strip redundant spaces

        // Detect the document's outline titles
        int startflag=0;
        // Highest heading level handed out so far
        int maxlever = 0;
        // Numbering-pattern key (from getTilteNum) -> heading level assigned to that pattern
        Map<String, String> titleLev = new HashMap<String, String>();
        for (Element element : elements) {
            if(element.toString().contains("<img") || element.toString().contains("<table")){
                elements1.add(element);
            }else{
                if(!element.text().equals("")){
                    // parentText is rewritten below; parentText2 keeps the untouched text
                    String parentText = element.text().replaceAll("<p>","").replaceAll("</p>","");
                    String parentText2 = element.text().replaceAll("<p>","").replaceAll("</p>","");
/*                    if (parentText.contains("关键词") && startflag == 0) {
                        startflag = 1;
                    }*/
                    // calculateEnglishRatio is defined outside this view; presumably it is true
                    // for predominantly English text, which is then left as is - TODO confirm
                    if(!calculateEnglishRatio(parentText)){
                        // Check whether the text looks like an outline title
                        if((parentText.matches("[一二三四五六七八九十]+.*") ||
                                parentText.startsWith("(") ||
                                parentText.startsWith("（") ||
                                parentText.matches("\\d+.*") ) &&
                                parentText.length()<50 &&
                                !parentText.startsWith("中图分类号")

                        ){
                            String tKey = getTilteNum(parentText);
                            // NOTE(review): a title-like text without a usable key is skipped
                            // here and therefore never added to the result - confirm the
                            // content loss is intended.
                            if (null==tKey || tKey.trim().length()==0) {
                                continue;
                            }

                            // Look up (or assign) the heading level for this numbering pattern
                            String lever = titleLev.get(tKey);
                            if (null!=lever ) {
                                parentText = "<h"+lever+">"+parentText+"</h"+lever+">";
                            } else if (titleLev.size()==0) {
                                maxlever++;
                                titleLev.put(tKey,String.valueOf(maxlever));
                                parentText = "<h1>"+parentText+"</h1>";
                            } else {
                                maxlever++;
                                titleLev.put(tKey,String.valueOf(maxlever));
                                parentText = "<h"+maxlever+">"+parentText+"</h"+maxlever+">";

                            }
                            parentText = parentText.replaceAll(" ","").replaceAll("&nbsp;","");

                        }
                        if(parentText.startsWith("<h")){
                            // NOTE(review): substring(1,3) keeps only one digit, so a level of
                            // 10 or more would be turned into "h1".
                            String tag = parentText.substring(1,3);
                            element.tagName(tag);
                            // The element keeps its original text; only the tag name changes
                            element.html(parentText2);
                        }
                    }
                    elements1.add(element);
                }
            }
        }
        return elements1;
    }


    /**
     * Reduces a title to a key describing its numbering pattern.
     *
     * <ul>
     *   <li>Titles starting with digits: the leading run of digits and dots with every digit
     *       replaced by {@code 1} (for example {@code "2.3 Foo"} gives {@code "1.1"}).</li>
     *   <li>Titles starting with a Chinese numeral and containing {@code 、}, a space or a
     *       dot: {@code "一"}.</li>
     *   <li>Titles starting with a half- or full-width opening bracket followed by digits or
     *       a Chinese numeral: {@code "(1)"} or {@code "(一)"}.</li>
     * </ul>
     * An empty string is returned when the title consists of digits only, when its first
     * number is greater than 20, or when no pattern applies.
     *
     * @param title title text; must not be {@code null}
     * @return the numbering-pattern key, or an empty string
     */
    public static String getTilteNum(String title) {
        final String none = "";
        // A bare number carries no title text.
        if (Pattern.matches("\\d+", title)) {
            return none;
        }
        Pattern numberThenOther = Pattern.compile("\\d+\\D");

        if (title.matches("\\d+.*")) {
            if (isFirstTitleNumberAboveLimit(numberThenOther.matcher(title))) {
                return none;
            }
            // Take the leading run of digits and dots and normalise every digit to 1.
            int end = 0;
            while (end < title.length()
                    && (Character.isDigit(title.charAt(end)) || title.charAt(end) == '.')) {
                end++;
            }
            return title.substring(0, end).replaceAll("\\d", "1");
        }

        if (title.matches("[一二三四五六七八九十]+.*")) {
            boolean hasSeparator = title.contains("、") || title.contains(" ") || title.contains(".");
            return hasSeparator ? "一" : none;
        }

        if (title.startsWith("(") || title.startsWith("（")) {
            String inner = title.substring(1);
            if (inner.matches("\\d+.*")) {
                return isFirstTitleNumberAboveLimit(numberThenOther.matcher(inner)) ? none : "(1)";
            }
            if (inner.matches("[一二三四五六七八九十]+.*")) {
                return "(一)";
            }
        }
        return none;
    }

    /**
     * Reports whether the first number found by the matcher (at most its first five digits)
     * is greater than 20.
     */
    private static boolean isFirstTitleNumberAboveLimit(Matcher matcher) {
        if (!matcher.find()) {
            return false;
        }
        String hit = matcher.group();
        // Drop the trailing non-digit character captured by the pattern.
        String digits = hit.substring(0, hit.length() - 1);
        if (digits.length() > 5) {
            digits = digits.substring(0, 5);
        }
        return Integer.valueOf(digits) > 20;
    }
    /**
     * Wraps the heading paragraphs of a Word document in level markers and collects their
     * titles.
     *
     * <p>For every paragraph that {@code getTitleLvl} recognises as a title and whose text
     * (spaces removed) matches one of the numbering patterns below, the paragraph's runs are
     * replaced by a single run containing
     * {@code __space__<level>1 + text + __space__<level>2}, where {@code <level>} is
     * {@code one}, {@code two} or {@code three}, and the title is appended to
     * {@code directoryList}.
     *
     * <p>The three-level pattern ({@code 1.2.3}) is now tested before the two-level pattern
     * ({@code 1.2}); in the original order it could never match.
     *
     * @param doc           document whose paragraphs are modified in place
     * @param directoryList receives the text of every marked heading, in document order
     */
    public static void setNoDirectory(XWPFDocument doc, List<String> directoryList) {
        // Chinese numeral in brackets; half- and full-width brackets may be mixed.
        Pattern bracketedNumeral = Pattern.compile("[(（][一二三四五六七八九十]+[)）]");

        for (XWPFParagraph paragraph : doc.getParagraphs()) {
            String text = paragraph.getText().trim().replaceAll(" ","");

            if (StringUtils.isEmpty(getTitleLvl(doc, paragraph))) {
                continue;
            }
            DocEntity docEntity = getDocEntity(paragraph, text, "0");
            if (docEntity == null || StringUtils.isEmpty(docEntity.getName())) {
                continue;
            }

            String level;
            if (text.matches("[一二三四五六七八九十]+、.*")
                    || Constants.SCIENCE_FILE.contains(text)
                    || text.matches("^\\d+\\.[^\\d].*")
                    || text.matches("^\\d[\\u4e00-\\u9fa5]+$")) {
                level = "one";
            } else if (text.matches("^\\d+\\.\\d+\\.\\d+.*$")) {
                // Must precede the two-level test, which also matches "1.2.3".
                level = "three";
            } else if (text.matches("\\d+\\.\\d+.*") || bracketedNumeral.matcher(text).find()) {
                level = "two";
            } else {
                continue;
            }

            // Drop the existing runs and replace them with the marked-up heading text.
            paragraph.getCTP().setRArray(new CTR[]{});
            paragraph.createRun().setText("__space__" + level + "1" + text + "__space__" + level + "2");
            directoryList.add(docEntity.getName());
        }
    }

    /**
     * Renders the paragraphs of a Word document as HTML, text only.
     *
     * <p>Empty paragraphs are skipped. A paragraph recognised as a title by
     * {@code getTitleLvl}, at most 50 characters long and matching a numbering pattern is
     * emitted as {@code <h1>}, {@code <h2>} or {@code <h3>}; everything else goes through
     * {@link #getHtml(String, String)} and becomes a {@code <p>} (or is dropped when it is a
     * bare number).
     *
     * <p>The three-level pattern ({@code 1.2.3}) is now tested before the two-level pattern
     * ({@code 1.2}); in the original order {@code <h3>} could never be produced.
     *
     * <p>NOTE(review): as in the original code, the runs of every paragraph emitted as a
     * heading are removed from {@code doc}. Confirm callers rely on this side effect.
     *
     * @param doc source document
     * @return the generated HTML, one element per line
     */
    public static String getwordHtmlOnlyText(XWPFDocument doc) {
        // Chinese numeral in brackets; half- and full-width brackets may be mixed.
        Pattern bracketedNumeral = Pattern.compile("[(（][一二三四五六七八九十]+[)）]");
        StringBuilder html = new StringBuilder();

        for (XWPFParagraph paragraph : doc.getParagraphs()) {
            String text = paragraph.getText().trim();
            if (StringUtils.isEmpty(text)) {
                continue;
            }

            String headingTag = null;
            if (StringUtils.isNotEmpty(getTitleLvl(doc, paragraph))) {
                DocEntity docEntity = getDocEntity(paragraph, text, "0");
                boolean headingCandidate = docEntity != null
                        && StringUtils.isNotEmpty(docEntity.getName())
                        && text.length() <= 50;
                if (headingCandidate) {
                    if (text.matches("[一二三四五六七八九十]+、.*")
                            || Constants.SCIENCE_FILE.contains(text)
                            || text.matches("^\\d+\\.[^\\d].*")
                            || text.matches("^\\d[\\u4e00-\\u9fa5]+$")) {
                        headingTag = "h1";
                    } else if (text.matches("^\\d+\\.\\d+\\.\\d+.*$")) {
                        // Must precede the two-level test, which also matches "1.2.3".
                        headingTag = "h3";
                    } else if (text.matches("\\d+\\.\\d+.*") || bracketedNumeral.matcher(text).find()) {
                        headingTag = "h2";
                    }
                }
            }

            if (headingTag == null) {
                // getHtml returns its first argument plus the rendered fragment (if any).
                html.append(getHtml("", text));
            } else {
                paragraph.getCTP().setRArray(new CTR[]{});
                html.append('<').append(headingTag).append('>')
                        .append(text)
                        .append("</").append(headingTag).append(">\n");
            }
        }
        return html.toString();
    }


    /**
     * Renders the body of a Word document (paragraphs, inline images and tables) as HTML.
     *
     * <p>For each paragraph, embedded pictures of at least 80x80 pixels are emitted first as
     * {@code <img>} elements with a base64 data URI. The paragraph text follows: a paragraph
     * recognised as a title by {@code getTitleLvl}, at most 50 characters long and matching a
     * numbering pattern becomes {@code <h1>}, {@code <h2>} or {@code <h3>}; other text goes
     * through {@link #getHtml(String, String)}. Tables are converted with
     * {@code convertTableToHtml}.
     *
     * <p>Changes from the original:
     * <ul>
     *   <li>Pictures without data, or that {@link ImageIO} cannot decode (it returns
     *       {@code null}), are skipped instead of causing a {@link NullPointerException}.</li>
     *   <li>The three-level pattern ({@code 1.2.3}) is tested before the two-level pattern,
     *       so {@code <h3>} is reachable.</li>
     *   <li>Base64 encoding uses {@link Base64} rather than
     *       {@code javax.xml.bind.DatatypeConverter}, which is absent from JDK 11+.</li>
     * </ul>
     *
     * <p>NOTE(review): as in the original code, the runs of every paragraph emitted as a
     * heading are removed from {@code doc}. Confirm callers rely on this side effect.
     *
     * @param doc source document
     * @return the generated HTML, one element per line
     * @throws IOException if reading an embedded image fails
     */
    public static String getwordHtml(XWPFDocument doc) throws IOException {
        // Chinese numeral in brackets; half- and full-width brackets may be mixed.
        Pattern bracketedNumeral = Pattern.compile("[(（][一二三四五六七八九十]+[)）]");
        StringBuilder html = new StringBuilder();

        for (IBodyElement element : doc.getBodyElements()) {
            if (element instanceof XWPFParagraph) {
                XWPFParagraph paragraph = (XWPFParagraph) element;

                // Inline images of the paragraph are written before its text.
                for (XWPFRun run : paragraph.getRuns()) {
                    for (XWPFPicture picture : run.getEmbeddedPictures()) {
                        XWPFPictureData pictureData = picture.getPictureData();
                        if (pictureData == null) {
                            continue;
                        }
                        byte[] imageData = pictureData.getData();

                        BufferedImage image = ImageIO.read(new ByteArrayInputStream(imageData));
                        // null means no registered reader understands the format (e.g. EMF/WMF).
                        if (image == null) {
                            continue;
                        }
                        // Small pictures (icons, bullets, formula fragments) are ignored.
                        if (image.getWidth() < 80 || image.getHeight() < 80) {
                            continue;
                        }

                        String fileName = pictureData.getFileName();
                        // The file extension doubles as the image subtype of the data URI.
                        String imageType = fileName.substring(fileName.lastIndexOf(".") + 1);
                        String base64Image = Base64.getEncoder().encodeToString(imageData);
                        html.append("<img src=\"data:image/").append(imageType)
                                .append(";base64,").append(base64Image)
                                .append("\" alt=\"").append(fileName).append("\"/>\n");
                    }
                }

                String text = paragraph.getText().trim();
                if (StringUtils.isEmpty(text)) {
                    continue;
                }

                String headingTag = null;
                if (StringUtils.isNotEmpty(getTitleLvl(doc, paragraph))) {
                    DocEntity docEntity = getDocEntity(paragraph, text, "0");
                    boolean headingCandidate = docEntity != null
                            && StringUtils.isNotEmpty(docEntity.getName())
                            && text.length() <= 50;
                    if (headingCandidate) {
                        if (text.matches("[一二三四五六七八九十]+、.*")
                                || Constants.SCIENCE_FILE.contains(text)
                                || text.matches("^\\d+\\.[^\\d].*")
                                || text.matches("^\\d[\\u4e00-\\u9fa5]+$")) {
                            headingTag = "h1";
                        } else if (text.matches("^\\d+\\.\\d+\\.\\d+.*$")) {
                            // Must precede the two-level test, which also matches "1.2.3".
                            headingTag = "h3";
                        } else if (text.matches("\\d+\\.\\d+.*") || bracketedNumeral.matcher(text).find()) {
                            headingTag = "h2";
                        }
                    }
                }

                if (headingTag == null) {
                    // getHtml returns its first argument plus the rendered fragment (if any).
                    html.append(getHtml("", text));
                } else {
                    paragraph.getCTP().setRArray(new CTR[]{});
                    html.append('<').append(headingTag).append('>')
                            .append(text)
                            .append("</").append(headingTag).append(">\n");
                }
            } else if (element instanceof XWPFTable) {
                html.append(convertTableToHtml((XWPFTable) element)).append("\n");
            }
        }
        return html.toString();
    }

    /**
     * Appends a paragraph to an HTML string.
     *
     * <p>Text consisting of digits only is treated as a page number and ignored. When the
     * text contains at least one CJK unified ideograph, all spaces are removed from it
     * before it is wrapped in {@code <p>...</p>} followed by a line break.
     *
     * @param resulthtml HTML accumulated so far
     * @param text       paragraph text; must not be {@code null}
     * @return {@code resulthtml} with the paragraph appended, or unchanged for a bare number
     */
    public static String getHtml(String resulthtml,String text){
        // Digits only: most likely a page number, so nothing is emitted.
        if (Pattern.matches("\\d+", text)) {
            return resulthtml;
        }

        // Look for the first Chinese character, if any.
        boolean containsHanzi = false;
        int index = 0;
        while (!containsHanzi && index < text.length()) {
            containsHanzi = Character.UnicodeBlock.of(text.charAt(index))
                    == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS;
            index++;
        }

        String paragraph = containsHanzi ? text.replaceAll(" ","") : text;
        return resulthtml + "<p>" + paragraph + "</p>\n";
    }

    /**
     * Converts a Word table into an HTML {@code <table>} string.
     *
     * <p>Every cell becomes a {@code <td>} holding the cell's text. {@code colspan} is taken
     * from the cell's grid span (1 when none is set); {@code rowspan} is written as 0 when
     * the cell has a vertical-merge property and as 1 otherwise.
     *
     * @param table table to convert
     * @return HTML markup of the table
     */
    private static String convertTableToHtml(XWPFTable table) {
        Document htmlDoc = new Document("");
        htmlDoc.outputSettings().escapeMode(Entities.EscapeMode.xhtml);

        Element tableNode = htmlDoc.createElement("table");
        htmlDoc.appendChild(tableNode);

        for (XWPFTableRow row : table.getRows()) {
            Element rowNode = htmlDoc.createElement("tr");
            tableNode.appendChild(rowNode);

            for (XWPFTableCell cell : row.getTableCells()) {
                Element cellNode = htmlDoc.createElement("td");
                rowNode.appendChild(cellNode);
                cellNode.text(cell.getText());

                // Span information lives in the optional cell properties.
                boolean hasProperties = cell.getCTTc().getTcPr() != null;

                int colspan = hasProperties && cell.getCTTc().getTcPr().getGridSpan() != null
                        ? cell.getCTTc().getTcPr().getGridSpan().getVal().intValue()
                        : 1;

                int rowspan;
                if (hasProperties && cell.getCTTc().getTcPr().getVMerge() != null) {
                    rowspan = 0;
                } else {
                    rowspan = 1;
                }

                cellNode.attr("colspan", String.valueOf(colspan));
                cellNode.attr("rowspan", String.valueOf(rowspan));
            }
        }

        return htmlDoc.html();
    }

    /**
     * Determines the title level of a paragraph.
     *
     * <p>Sources are tried in this order, and the first that yields a value wins:
     * <ol>
     *   <li>the outline level set on the paragraph itself;</li>
     *   <li>the outline level of the paragraph's style;</li>
     *   <li>the outline level of the style that style is based on;</li>
     *   <li>the paragraph's style id (returned as is);</li>
     *   <li>the numbering pattern of the text (spaces removed): {@code "1"}, {@code "2"} or
     *       {@code "3"}.</li>
     * </ol>
     *
     * <p>The three-level pattern ({@code 1.2.3}) is now tested before the two-level pattern
     * ({@code 1.2}); in the original order {@code "3"} could never be returned.
     *
     * @param doc  document owning the paragraph, used to resolve styles
     * @param para paragraph to inspect
     * @return the detected level or style id, or an empty string when nothing applies
     */
    private static String getTitleLvl(XWPFDocument doc, XWPFParagraph para) {
        String text = para.getText().trim().replaceAll(" ","");
        try {
            // Outline level set directly on the paragraph.
            if (para.getCTP().getPPr().getOutlineLvl() != null) {
                return String.valueOf(para.getCTP().getPPr().getOutlineLvl().getVal());
            }
        } catch (Exception ignored) {
            // Best effort: a failure anywhere in the chain (for example a missing properties
            // node) only means this source yields no level.
        }
        try {
            // Outline level defined by the paragraph's style.
            if (doc.getStyles().getStyle(para.getStyle()).getCTStyle().getPPr().getOutlineLvl() != null) {
                return String.valueOf(doc.getStyles().getStyle(para.getStyle()).getCTStyle().getPPr().getOutlineLvl().getVal());
            }
        } catch (Exception ignored) {
            // Best effort, see above.
        }
        try {
            // Outline level defined by the style the paragraph's style is based on.
            String baseStyleName = doc.getStyles().getStyle(para.getStyle()).getCTStyle().getBasedOn().getVal();
            if (doc.getStyles().getStyle(baseStyleName).getCTStyle().getPPr().getOutlineLvl() != null) {
                return String.valueOf(doc.getStyles().getStyle(baseStyleName).getCTStyle().getPPr().getOutlineLvl().getVal());
            }
        } catch (Exception ignored) {
            // Best effort, see above.
        }
        try {
            if (para.getStyleID() != null) {
                return para.getStyleID();
            }
        } catch (Exception ignored) {
            // Best effort, see above.
        }

        // Fall back to the numbering pattern of the text itself.
        if (StringUtils.isEmpty(text)) {
            return "";
        }
        // Chinese numeral in brackets; half- and full-width brackets may be mixed.
        Pattern bracketedNumeral = Pattern.compile("[(（][一二三四五六七八九十]+[)）]");
        if (text.matches("[一二三四五六七八九十]+、.*")
                || Constants.SCIENCE_FILE.contains(text)
                || text.matches("^\\d+\\.[^\\d].*")
                || text.matches("^\\d[\\u4e00-\\u9fa5]+$")) {
            return "1";
        }
        if (text.matches("^\\d+\\.\\d+\\.\\d+.*$")) {
            // Must precede the two-level test, which also matches "1.2.3".
            return "3";
        }
        if (bracketedNumeral.matcher(text).find() || text.matches("\\d+\\.\\d+.*")) {
            return "2";
        }
        return "";
    }

    /**
     * Builds a {@code DocEntity} for a paragraph.
     *
     * <p>The entity receives the given name and a random 32-character id (a UUID without
     * dashes). Its parent id is the paragraph's left indentation rendered as a string, or
     * {@code parnetId} when no paragraph is supplied.
     *
     * @param p        paragraph the entity is created for; may be {@code null}
     * @param name     entity name
     * @param parnetId parent id used when {@code p} is {@code null}
     * @return the new entity, never {@code null}
     */
    private static DocEntity getDocEntity(XWPFParagraph p, String name, String parnetId) {
        String parentId = (p == null) ? parnetId : String.valueOf(p.getIndentationLeft());

        DocEntity entity = new DocEntity();
        entity.setName(name);
        entity.setId(UUID.randomUUID().toString().replace("-", ""));
        entity.setParentId(parentId);
        return entity;
    }


    /**
     * Extracts the text of every {@code <p>} element of an HTML string.
     *
     * <p>Literal {@code &nbsp;} sequences and all spaces are removed from each paragraph's
     * text. The leftover debugging {@code System.out.println} of the original has been
     * dropped so that the method no longer writes every paragraph to standard output.
     *
     * @param htmlString HTML source
     * @return the cleaned text of each paragraph, in document order
     */
    public static List<String> extractPText(String htmlString) {
        List<String> pTextList = new ArrayList<>();
        // Parse the HTML and visit all paragraph elements.
        for (Element pTag : Jsoup.parse(htmlString).select("p")) {
            pTextList.add(pTag.text().replaceAll("&nbsp;", "").replaceAll(" ", ""));
        }
        return pTextList;
    }


    /**
     * Placeholder for storing an uploaded science report file and its derived artifacts.
     *
     * <p>The method currently does nothing: every statement in its body is commented out, so
     * none of the parameters are read and no exception is thrown. The disabled code uploads
     * the original file, a PDF preview and a PNG cover to object storage and records the
     * resulting paths on {@code reportTemplate}.
     *
     * @param reportTemplate report record that the disabled code would update
     * @param extension      file extension of the uploaded file
     * @param file           uploaded file
     * @throws Exception declared for the disabled upload/conversion logic; not thrown now
     */
    public static void formatFile(AiReportScienceFile reportTemplate, String extension, MultipartFile file) throws Exception {
//        //TODO file upload
//        obsUtil = GetBeanUtil.getApplicationContext().getBean(ObsUtil.class);
//        // file path
//        byte[] bytes = file.getBytes();
//        PutObjectResult putObjectResult = obsUtil.uploadFile(DirEnum.SCIENCE_FILE.getPath() + UUID.randomUUID() + "." + extension, bytes);
//        reportTemplate.setFilePathObs(Constants.OBS_FILE_PATH_URL_PREFIX_NOS + putObjectResult.getObjectKey());
//



//        // preview path (pdf)
//        String content = reportTemplate.getContent();
//        byte[] pdfBytes = DocUtil.convertDocHtml2Pdf(content, false);
//        PutObjectResult putObjectResult1 = obsUtil.uploadFile(DirEnum.SCIENCE_FILE.getPath() + UUID.randomUUID() + ".pdf", pdfBytes);
//        reportTemplate.setPreviewObs(Constants.OBS_FILE_PATH_URL_PREFIX_NOS + putObjectResult1.getObjectKey());
//        // cover path (png)
//        byte[] pngBytes = DocUtil.convertDocHtml2Png(content, false);
//        PutObjectResult putObjectResult2 = obsUtil.uploadFile(DirEnum.SCIENCE_FILE.getPath() + UUID.randomUUID() + ".png", pngBytes);
//        reportTemplate.setCoverObs(Constants.OBS_FILE_PATH_URL_PREFIX_NOS + putObjectResult2.getObjectKey());
    }

    /**
     * Splits an HTML document into an ordered list of cleaned HTML fragments.
     *
     * <p>Two passes are made over the elements selected by
     * {@code p:not(:has(img)),p > img,img,table,h1,h2,h3,h4}:
     * <ol>
     *   <li><b>Filter.</b> Elements whose markup contains {@code <img} or {@code <table} are kept
     *       untouched. Any other element is kept only when its text is non-empty, is not purely
     *       numeric ({@link #chunshuzi(String)}) and contains none of
     *       {@code http://www.cnki.net}, {@code (c)} or {@code china}. A kept element whose text is
     *       not English-dominant ({@link #calculateEnglishRatio(String)}) has its ASCII spaces
     *       removed.</li>
     *   <li><b>Merge.</b> Paragraph-like elements get their attributes replaced by a fixed
     *       {@code style}. Everything before the first heading-like element is emitted as is. After
     *       it, a non-empty, non-reference ({@link #cankaowenxian(String)}), non-English paragraph
     *       that looks unfinished (does not end with {@code 。}, does not end in a digit, does not
     *       start with {@code 关键词} or {@code 目录}, and does not contain {@code 参考文献}) is
     *       buffered instead of emitted; the next paragraph that does not look unfinished receives
     *       the buffered text as a prefix. All other elements are emitted unchanged and do not
     *       flush the buffer.</li>
     * </ol>
     *
     * <p>Text still buffered when the input ends is emitted as a final paragraph (built from the
     * last buffered element) rather than being discarded.
     *
     * @param content HTML markup to process
     * @return the serialised HTML of each emitted element, in emission order
     * @throws Exception declared by the signature; the visible code throws no checked exception
     */
    public static List<String> beautifyContent(String content) throws Exception {
        Document doc = Jsoup.parse(content);
        Elements selected = doc.select("p:not(:has(img)),p > img,img,table,h1,h2,h3,h4");

        // Pass 1: keep images/tables verbatim, drop noise, strip spaces from non-English text.
        Elements kept = new Elements();
        for (Element element : selected) {
            String markup = element.toString();
            if (markup.contains("<img") || markup.contains("<table")) {
                kept.add(element);
                continue;
            }
            String plain = element.text();
            if (plain.isEmpty() || chunshuzi(plain) || plain.contains("http://www.cnki.net")
                    || plain.contains("(c)") || plain.contains("china")) {
                continue;
            }
            if (!calculateEnglishRatio(plain)) {
                element.text(plain.replace(" ", "").replace("&nbsp;", ""));
            }
            kept.add(element);
        }

        // Pass 2: restyle paragraphs and stitch unfinished fragments onto the next full paragraph.
        List<String> contentList = new ArrayList<>();
        boolean beforeFirstHeading = true;
        StringBuilder pending = new StringBuilder();
        Element lastFragment = null;
        Iterator<Element> iterator = kept.iterator();
        while (iterator.hasNext()) {
            Element element = iterator.next();
            String markup = element.toString();
            if (markup.contains("<p") && markup.contains("</p>") && !markup.contains("<table")) {
                element.clearAttributes();
                element.attr("style", "font-size:12pt;text-indent:2em");
                markup = element.toString();
            }
            if (markup.contains("<h") && markup.contains("</h")) {
                beforeFirstHeading = false;
            }

            // The paragraph test is repeated on the restyled markup on purpose: clearing the
            // attributes can change what the serialised string contains.
            String plain = null;
            boolean mergeCandidate = false;
            if (!beforeFirstHeading
                    && markup.contains("<p") && markup.contains("</p>") && !markup.contains("<table")) {
                plain = element.text();
                mergeCandidate = !plain.isEmpty()
                        && !cankaowenxian(plain.replace(" ", ""))
                        && !calculateEnglishRatio(plain);
            }
            if (!mergeCandidate) {
                contentList.add(markup);
                continue;
            }

            boolean unfinished = !plain.endsWith("。")
                    && !isDigit(plain)
                    && !plain.startsWith("关键词")
                    && !plain.startsWith("目录")
                    && !plain.contains("参考文献");
            if (unfinished) {
                pending.append(plain);
                lastFragment = element;
                // NOTE(review): kept from the original; on Jsoup versions where Elements
                // iterators also detach nodes from the DOM this affects ancestor markup - confirm.
                iterator.remove();
                continue;
            }
            if (pending.length() > 0) {
                // The element already carries only the fixed style attribute at this point.
                pending.append(plain);
                element.text(pending.toString());
                pending.setLength(0);
            }
            contentList.add(element.toString());
        }

        // Previously any text still buffered here was silently lost.
        if (pending.length() > 0) {
            lastFragment.text(pending.toString());
            contentList.add(lastFragment.toString());
        }
        return contentList;
    }




    /**
     * Variant of {@link #beautifyContent(String)} with a different noise filter and with buffered
     * fragments flushed before any element that cannot absorb them.
     *
     * <p>Two passes are made over the elements selected by
     * {@code p:not(:has(img)),p > img,img,table,h1,h2,h3,h4}:
     * <ol>
     *   <li><b>Filter.</b> Elements whose markup contains {@code <img} or {@code <table} are kept
     *       untouched. Any other element is kept only when its text is non-empty, is not purely
     *       numeric ({@link #chunshuzi(String)}) and does not start with {@code (c)}, {@code (C)},
     *       {@code 作者简介} or {@code 收稿日期}. A kept element whose text is not English-dominant
     *       ({@link #calculateEnglishRatio(String)}) has its ASCII spaces removed.</li>
     *   <li><b>Merge.</b> Paragraph-like elements get their attributes replaced by a fixed
     *       {@code style}. After the first heading-like element, a non-empty, non-reference
     *       ({@link #cankaowenxian(String)}), non-English paragraph that looks unfinished (does not
     *       end with {@code 。}, {@code 页)} or {@code 页）}, does not end in a digit, does not start
     *       with {@code 关键词} or {@code 目录}, and does not contain {@code 参考文献}) is buffered.
     *       The next such paragraph that does not look unfinished receives the buffered text as a
     *       prefix. Any other element first flushes the buffer as its own styled {@code <p>} and
     *       is then emitted unchanged.</li>
     * </ol>
     *
     * <p>Text still buffered when the input ends is flushed the same way instead of being
     * discarded.
     *
     * @param text HTML markup to process
     * @return the serialised HTML of each emitted element or flushed paragraph, in emission order
     * @throws Exception declared by the signature; the visible code throws no checked exception
     */
    public static List<String> beautifyContent2(String text) throws Exception {
        // NOTE(review): flushed text is concatenated into this tag without HTML escaping, exactly
        // as the original did - confirm whether consumers expect escaped output.
        final String styledParagraphOpen = "<p style=\"font-size:12pt;text-indent:2em\">";

        Document doc = Jsoup.parse(text);
        Elements selected = doc.select("p:not(:has(img)),p > img,img,table,h1,h2,h3,h4");

        // Pass 1: keep images/tables verbatim, drop noise, strip spaces from non-English text.
        Elements kept = new Elements();
        for (Element element : selected) {
            String markup = element.toString();
            if (markup.contains("<img") || markup.contains("<table")) {
                kept.add(element);
                continue;
            }
            String plain = element.text();
            if (plain.isEmpty() || chunshuzi(plain)
                    || plain.startsWith("(c)") || plain.startsWith("(C)")
                    || plain.startsWith("作者简介") || plain.startsWith("收稿日期")) {
                continue;
            }
            if (!calculateEnglishRatio(plain)) {
                element.text(plain.replace(" ", "").replace("&nbsp;", ""));
            }
            kept.add(element);
        }

        // Pass 2: restyle paragraphs and stitch unfinished fragments onto the next full paragraph.
        List<String> contentList = new ArrayList<>();
        boolean beforeFirstHeading = true;
        StringBuilder pending = new StringBuilder();
        Iterator<Element> iterator = kept.iterator();
        while (iterator.hasNext()) {
            Element element = iterator.next();
            String markup = element.toString();
            if (markup.contains("<p") && markup.contains("</p>") && !markup.contains("<table")) {
                element.clearAttributes();
                element.attr("style", "font-size:12pt;text-indent:2em");
                markup = element.toString();
            }
            if (markup.contains("<h") && markup.contains("</h")) {
                beforeFirstHeading = false;
            }

            // The paragraph test is repeated on the restyled markup on purpose: clearing the
            // attributes can change what the serialised string contains.
            String plain = null;
            boolean mergeCandidate = false;
            if (!beforeFirstHeading
                    && markup.contains("<p") && markup.contains("</p>") && !markup.contains("<table")) {
                plain = element.text();
                mergeCandidate = !plain.isEmpty()
                        && !cankaowenxian(plain.replace(" ", ""))
                        && !calculateEnglishRatio(plain);
            }
            if (!mergeCandidate) {
                // Buffered fragments go out first so they keep their position in the document.
                if (pending.length() > 0) {
                    contentList.add(styledParagraphOpen + pending + "</p>");
                    pending.setLength(0);
                }
                contentList.add(markup);
                continue;
            }

            boolean unfinished = !plain.endsWith("。")
                    && !isDigit(plain)
                    && !plain.startsWith("关键词")
                    && !plain.startsWith("目录")
                    && !plain.contains("参考文献")
                    && !plain.endsWith("页)")
                    && !plain.endsWith("页）");
            if (unfinished) {
                pending.append(plain);
                // NOTE(review): kept from the original; on Jsoup versions where Elements
                // iterators also detach nodes from the DOM this affects ancestor markup - confirm.
                iterator.remove();
                continue;
            }
            if (pending.length() > 0) {
                // The element already carries only the fixed style attribute at this point.
                pending.append(plain);
                element.text(pending.toString());
                pending.setLength(0);
            }
            contentList.add(element.toString());
        }

        // Previously any text still buffered here was silently lost.
        if (pending.length() > 0) {
            contentList.add(styledParagraphOpen + pending + "</p>");
        }
        return contentList;
    }


    /**
     * Reports whether more than half of the characters in a string are English letters.
     *
     * <p>A character counts as English when {@link #isEnglish(char)} accepts it. For an empty
     * string the ratio is {@code 0.0 / 0}, i.e. NaN, so the result is {@code false}.
     *
     * @param str text to inspect; must not be {@code null}
     * @return {@code true} when the share of English letters is strictly greater than 0.5
     */
    public static boolean calculateEnglishRatio(String str) {
        long letters = str.chars()
                .filter(codeUnit -> isEnglish((char) codeUnit))
                .count();
        double share = (double) letters / str.length();
        return share > 0.5;
    }
    /**
     * Reports whether a character is an ASCII letter.
     *
     * @param c character to test
     * @return {@code true} for {@code a}-{@code z} and {@code A}-{@code Z}, {@code false} otherwise
     */
    public static boolean isEnglish(char c) {
        boolean lowercase = 'a' <= c && c <= 'z';
        boolean uppercase = 'A' <= c && c <= 'Z';
        return lowercase || uppercase;
    }

    /**
     * Reports whether the last character of a string is a digit.
     *
     * <p>The check uses {@link Character#isDigit(char)}, so any character that method classifies
     * as a digit counts, not only ASCII {@code 0}-{@code 9}.
     *
     * @param str text to inspect; may be {@code null} or empty
     * @return {@code true} when {@code str} is non-empty and its final character is a digit;
     *         {@code false} for {@code null} or an empty string
     */
    public static boolean isDigit(String str) {
        // Guard first: charAt(length - 1) throws on an empty string.
        if (str == null || str.isEmpty()) {
            return false;
        }
        return Character.isDigit(str.charAt(str.length() - 1));
    }

    /**
     * Manual smoke check: prints whether a sample bibliography line that starts with a number in
     * full-width brackets is recognised by {@link #cankaowenxian(String)}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        final String sample = "［1］张维迎.所有制、治理结构及委托—代理关系：兼评崔之元和周其仁的一些观点［J］.经济研究,1996(9):3-15,53.";
        boolean recognised = cankaowenxian(sample);
        System.out.println(recognised);
    }
    /**
     * Matches a reference-list marker at the very start of a string: one or more ASCII digits
     * wrapped either in ASCII square brackets or in full-width square brackets. Compiled once
     * because the check runs for every paragraph.
     */
    private static final Pattern REFERENCE_ENTRY_PREFIX =
            Pattern.compile("^(?:\\[[0-9]+\\]|［[0-9]+］)");

    /**
     * Reports whether a string starts with a bracketed reference number such as {@code [12]}
     * or its full-width-bracket equivalent.
     *
     * <p>The opening and closing brackets must be of the same kind, and only ASCII digits are
     * accepted between them. Nothing is trimmed, so leading whitespace prevents a match.
     *
     * @param str text to inspect; must not be {@code null}
     * @return {@code true} when {@code str} begins with a bracketed number
     */
    public static boolean cankaowenxian(String str) {
        return REFERENCE_ENTRY_PREFIX.matcher(str).find();
    }
    /**
     * Matches a string made up solely of digits (with the usual {@code $} allowance for one
     * trailing line terminator). Compiled once because the check runs for every element.
     */
    private static final Pattern DIGITS_ONLY_LINE = Pattern.compile("^\\d+$");

    /**
     * Reports whether a string consists only of digits once middle dots ({@code ·}) and ASCII
     * spaces have been removed.
     *
     * <p>A string that is empty after stripping is not considered numeric.
     *
     * @param str text to inspect; must not be {@code null}
     * @return {@code true} when the stripped text is a non-empty run of digits
     */
    public static boolean chunshuzi(String str) {
        // Both separators are plain literals, so no regex replacement is needed.
        String stripped = str.replace("·", "").replace(" ", "");
        return DIGITS_ONLY_LINE.matcher(stripped).find();
    }


    /*
    public static void sendKafka(List<AiReportScienceFileMaterial> list, String fileId, Integer fileType, String origin, String status){
        streamBridge = GetBeanUtil.getApplicationContext().getBean(StreamBridge.class);
        log.debug("期刊论文素材推送kafka开始======");
        for(AiReportScienceFileMaterial obj:list){
            if(StringUtils.isNotEmpty(obj.getContent()) && obj.getContent().contains("<p") && obj.getContent().contains("</p>")
            && countChineseCharacters(Utility.TransferHTML2Text(Utility.RemoveUselessHTMLTag(obj.getContent())))<36){
                continue;
            }
            //推送kafka
            JSONObject jo = new JSONObject();
            jo.put("origin", origin);
            jo.put("fileId", fileId);
            jo.put("status", status);
            jo.put("fileType", getFileType(fileType));
            jo.put("textId", obj.getId());
            jo.put("text", Utility.TransferHTML2Text(Utility.RemoveUselessHTMLTag(obj.getContent())));
            jo.put("textType", getTextType(obj.getContentType()));
            streamBridge.send("science_file", jo);
            log.debug("推送成功，段落id："+obj.getId());
        }
    }

    public static void sendKafka2(List<AiReportScienceFileMaterial> list, String fileId, Integer fileType,String origin,String status){
        streamBridge = GetBeanUtil.getApplicationContext().getBean(StreamBridge.class);
        log.debug("期刊论文素材推送kafka开始======");
        for(AiReportScienceFileMaterial obj:list){
            if(StringUtils.isNotEmpty(obj.getContent()) && obj.getContent().contains("<p") && obj.getContent().contains("</p>")
                    && countChineseCharacters(Utility.TransferHTML2Text(Utility.RemoveUselessHTMLTag(obj.getContent())))<36){
                continue;
            }
            //推送kafka
            JSONObject jo = new JSONObject();
            jo.put("origin", origin);
            jo.put("fileId", fileId);
            jo.put("status", status);
            jo.put("fileType", getFileType(fileType));
            jo.put("textId", obj.getId());
            jo.put("text", Utility.TransferHTML2Text(Utility.RemoveUselessHTMLTag(obj.getContent())));
            jo.put("textType", getTextType(obj.getContentType()));
            streamBridge.send("science_file_1219", jo);
            log.debug("推送成功，段落id："+obj.getId());
        }
    }
*/



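    /*
     * Disabled: pushes the textIds that have to be deleted after review.
     * Describes the commented-out sendKafka(List<String>) method that follows.
     */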
    /*
    public static void sendKafka(List<String> contentIds){
        try {
            streamBridge = GetBeanUtil.getApplicationContext().getBean(StreamBridge.class);
            log.debug("期刊论文素材审核删除段落开始推送======");
            for (String contentId : contentIds) {
                if (StringUtils.isNotBlank(contentId)) {
                    //推送kafka
                    JSONObject jo = new JSONObject();
                    jo.put("textId", contentId);
                    streamBridge.send("delete_science_material", jo);
                    log.debug("推送成功，段落id：{}", contentId);
                }
            }
        } catch (BeansException e) {
            log.error("推送需删除的数据失败：{}", e.getMessage());
        }
    }
*/

    /**
     * Counts the characters of a string that fall in the range U+4E00 to U+9FA5 inclusive.
     *
     * <p>Characters outside that range (Latin letters, digits, punctuation including CJK
     * punctuation, surrogate halves) are not counted. A plain range comparison is used so that
     * no regular expression has to be compiled on each call.
     *
     * @param str text to inspect; must not be {@code null}
     * @return the number of characters in the range
     */
    public static int countChineseCharacters(String str) {
        int count = 0;
        for (int i = 0; i < str.length(); i++) {
            char ch = str.charAt(i);
            if (ch >= '\u4e00' && ch <= '\u9fa5') {
                count++;
            }
        }
        return count;
    }
}
