提交 45522ebd 作者: ZhangJingKun

素材管理期刊论文模块代码

上级
HELP.md
target/
!.mvn/wrapper/maven-wrapper.jar
!**/src/main/**/target/
!**/src/test/**/target/
### STS ###
.apt_generated
.classpath
.factorypath
.project
.settings
.springBeans
.sts4-cache
### IntelliJ IDEA ###
.idea
*.iws
*.iml
*.ipr
### NetBeans ###
/nbproject/private/
/nbbuild/
/dist/
/nbdist/
/.nb-gradle/
build/
!**/src/main/**/build/
!**/src/test/**/build/
### VS Code ###
.vscode/
distributionUrl=https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.9.5/apache-maven-3.9.5-bin.zip
wrapperUrl=https://repo.maven.apache.org/maven2/org/apache/maven/wrapper/maven-wrapper/3.2.0/maven-wrapper-3.2.0.jar
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.3.5.RELEASE</version>
<relativePath/> <!-- lookup parent from repository -->
</parent>
<groupId>com.zzsn</groupId>
<artifactId>know-base</artifactId>
<version>0.0.1-SNAPSHOT</version>
<name>know-base</name>
<description>Demo project for Spring Boot</description>
<properties>
<java.version>1.8</java.version>
</properties>
<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
</dependency>
<!--mybatis plus-->
<dependency>
<groupId>com.baomidou</groupId>
<artifactId>mybatis-plus-boot-starter</artifactId>
<version>3.4.1</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>8.0.21</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.12.0</version>
</dependency>
<!-- json-->
<dependency>
<groupId>com.alibaba.fastjson2</groupId>
<artifactId>fastjson2</artifactId>
<version>2.0.25</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.83</version>
</dependency>
<!-- hutool工具类-->
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-all</artifactId>
<version>5.3.8</version>
</dependency>
<!-- jsoup html解析 -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.13.1</version>
</dependency>
<!-- AutoPoi Excel工具类-->
<dependency>
<groupId>org.jeecgframework</groupId>
<artifactId>autopoi-web</artifactId>
<version>1.2.5</version>
<exclusions>
<exclusion>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
</exclusion>
</exclusions>
</dependency>
<!-- httpclient: the version is deliberately omitted so that it is inherited from the
     Spring Boot parent's dependency management and stays aligned with the managed
     httpcore/httpmime artifacts declared next to it. An explicit 4.5.2 pin here
     overrode the managed version with an older release. -->
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpcore</artifactId>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpmime</artifactId>
</dependency>
<!-- redis-->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-redis</artifactId>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-pool2</artifactId>
</dependency>
<!-- 华为obs-->
<!-- https://mvnrepository.com/artifact/com.huaweicloud/esdk-obs-java-bundle -->
<dependency>
<groupId>com.huaweicloud</groupId>
<artifactId>esdk-obs-java-bundle</artifactId>
<version>3.22.12</version>
</dependency>
<dependency>
<groupId>com.github.tobato</groupId>
<artifactId>fastdfs-client</artifactId>
<version>1.26.1-RELEASE</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
</plugin>
</plugins>
</build>
</project>
package com.zzsn.knowbase;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
@SpringBootApplication
public class KnowBaseApplication {
public static void main(String[] args) {
SpringApplication.run(KnowBaseApplication.class, args);
}
}
package com.zzsn.knowbase.constant;
/**
 * Shared constant values (status flags, log/operation types, cache key prefixes, upload
 * settings, unique-index names, permission types, ...).
 *
 * <p>Every field of an interface is implicitly {@code public static final}, so the
 * modifiers are not repeated on the individual declarations.
 */
public interface CommonConstant {

    // ---------------------------------------------------------------- status / delete flags

    /** Normal status. */
    Integer STATUS_NORMAL = 0;

    /** Disabled status. */
    Integer STATUS_DISABLE = -1;

    /** Delete flag: deleted. */
    Integer DEL_FLAG_1 = 1;

    /** Delete flag: not deleted. */
    Integer DEL_FLAG_0 = 0;

    // ---------------------------------------------------------------- log types

    /** System log type: login. */
    int LOG_TYPE_1 = 1;

    /** System log type: operation. */
    int LOG_TYPE_2 = 2;

    /** Operation log type: query. */
    int OPERATE_TYPE_1 = 1;

    /** Operation log type: add. */
    int OPERATE_TYPE_2 = 2;

    /** Operation log type: update. */
    int OPERATE_TYPE_3 = 3;

    /** Operation log type: delete. */
    int OPERATE_TYPE_4 = 4;

    /** Operation log type: import. */
    int OPERATE_TYPE_5 = 5;

    /** Operation log type: export. */
    int OPERATE_TYPE_6 = 6;

    // ---------------------------------------------------------------- response codes

    /** {@code 500 Server Error} (HTTP/1.0 - RFC 1945). */
    Integer SC_INTERNAL_SERVER_ERROR_500 = 500;

    /** {@code 200 OK} (HTTP/1.0 - RFC 1945). */
    Integer SC_OK_200 = 200;

    /** Access permission check failed: 510. */
    Integer SC_JEECG_NO_AUTHZ = 510;

    // ---------------------------------------------------------------- login / token

    /** Cache key prefix for the logged-in user's Shiro permission cache. */
    String PREFIX_USER_SHIRO_CACHE = "shiro:cache:org.jeecg.config.shiro.ShiroRealm.authorizationCache:";

    /** Cache key prefix for the logged-in user's token. */
    String PREFIX_USER_TOKEN = "prefix_user_token_";

    /** Token cache time: 3600 seconds, i.e. one hour. */
    int TOKEN_EXPIRE_TIME = 3600;

    /** Login attempt interval. */
    long LOGIN_ERROR_MAX_TIME = 10 * 60 * 1000;

    // ---------------------------------------------------------------- menu types

    /** 0: top-level menu. */
    Integer MENU_TYPE_0 = 0;

    /** 1: sub menu. */
    Integer MENU_TYPE_1 = 1;

    /** 2: button permission. */
    Integer MENU_TYPE_2 = 2;

    // ---------------------------------------------------------------- announcements

    /** Announcement target type: specific users. */
    String MSG_TYPE_UESR = "USER";

    /** Announcement target type: all users. */
    String MSG_TYPE_ALL = "ALL";

    /** Publish status: not published. */
    String NO_SEND = "0";

    /** Publish status: published. */
    String HAS_SEND = "1";

    /** Publish status: revoked. */
    String HAS_CANCLE = "2";

    /** Read status: read. */
    String HAS_READ_FLAG = "1";

    /** Read status: unread. */
    String NO_READ_FLAG = "0";

    /** Priority: low. */
    String PRIORITY_L = "L";

    /** Priority: medium. */
    String PRIORITY_M = "M";

    /** Priority: high. */
    String PRIORITY_H = "H";

    // ---------------------------------------------------------------- SMS templates

    /** SMS template type: login template. */
    String SMS_TPL_TYPE_0 = "0";

    /** SMS template type: registration template. */
    String SMS_TPL_TYPE_1 = "1";

    /** SMS template type: forgot-password template. */
    String SMS_TPL_TYPE_2 = "2";

    // ---------------------------------------------------------------- misc. flags

    /** Status: invalid. */
    String STATUS_0 = "0";

    /** Status: valid. */
    String STATUS_1 = "1";

    /** Sync with the workflow engine: sync. */
    Integer ACT_SYNC_1 = 1;

    /** Sync with the workflow engine: do not sync. */
    Integer ACT_SYNC_0 = 0;

    /** Message category: announcement. */
    String MSG_CATEGORY_1 = "1";

    /** Message category: system message. */
    String MSG_CATEGORY_2 = "2";

    /** Menu data-permission configured: no. */
    Integer RULE_FLAG_0 = 0;

    /** Menu data-permission configured: yes. */
    Integer RULE_FLAG_1 = 1;

    /** User frozen state: normal (unfrozen). */
    Integer USER_UNFREEZE = 1;

    /** User frozen state: frozen. */
    Integer USER_FREEZE = 2;

    /** Suffix of the translated dictionary text field. */
    String DICT_TEXT_SUFFIX = "_dictText";

    // ---------------------------------------------------------------- form designer

    /** Form designer: main table type. */
    Integer DESIGN_FORM_TYPE_MAIN = 1;

    /** Form designer: sub table type. */
    Integer DESIGN_FORM_TYPE_SUB = 2;

    /** Form designer: URL authorisation passed. */
    Integer DESIGN_FORM_URL_STATUS_PASSED = 1;

    /** Form designer: URL authorisation not passed. */
    Integer DESIGN_FORM_URL_STATUS_NOT_PASSED = 2;

    /** Form designer: "add" flag. */
    String DESIGN_FORM_URL_TYPE_ADD = "add";

    /** Form designer: "edit" flag. */
    String DESIGN_FORM_URL_TYPE_EDIT = "edit";

    /** Form designer: "detail" flag. */
    String DESIGN_FORM_URL_TYPE_DETAIL = "detail";

    /** Form designer: "reuse data" flag. */
    String DESIGN_FORM_URL_TYPE_REUSE = "reuse";

    /** Form designer: "view" flag (marked as deprecated in the original comment). */
    String DESIGN_FORM_URL_TYPE_VIEW = "view";

    /** Online parameter value: yes. */
    String ONLINE_PARAM_VAL_IS_TURE = "Y";

    /** Online parameter value: no. */
    String ONLINE_PARAM_VAL_IS_FALSE = "N";

    // ---------------------------------------------------------------- file upload

    /** Upload type: local. */
    String UPLOAD_TYPE_LOCAL = "local";

    /** Upload type: Minio. */
    String UPLOAD_TYPE_MINIO = "minio";

    /** Upload type: Aliyun OSS. */
    String UPLOAD_TYPE_OSS = "alioss";

    /** Custom bucket name for document uploads. */
    String UPLOAD_CUSTOM_BUCKET = "eoafile";

    /** Custom path for document uploads. */
    String UPLOAD_CUSTOM_PATH = "eoafile";

    /** Number of days an external file link stays valid. */
    Integer UPLOAD_EFFECTIVE_DAYS = 1;

    // ---------------------------------------------------------------- employee identity

    /** Employee identity: ordinary employee. */
    Integer USER_IDENTITY_1 = 1;

    /** Employee identity: superior. */
    Integer USER_IDENTITY_2 = 2;

    // ---------------------------------------------------------------- unique index names

    /** sys_user table: unique index on username. */
    String SQL_INDEX_UNIQ_SYS_USER_USERNAME = "uniq_sys_user_username";

    /** sys_user table: unique index on work_no. */
    String SQL_INDEX_UNIQ_SYS_USER_WORK_NO = "uniq_sys_user_work_no";

    /** sys_user table: unique index on phone. */
    String SQL_INDEX_UNIQ_SYS_USER_PHONE = "uniq_sys_user_phone";

    /** sys_user table: unique index on email. */
    String SQL_INDEX_UNIQ_SYS_USER_EMAIL = "uniq_sys_user_email";

    /** sys_quartz_job table: unique index on job_class_name. */
    String SQL_INDEX_UNIQ_JOB_CLASS_NAME = "uniq_job_class_name";

    /** sys_position table: unique index on code. */
    String SQL_INDEX_UNIQ_CODE = "uniq_code";

    /** sys_role table: unique index on code. */
    String SQL_INDEX_UNIQ_SYS_ROLE_CODE = "uniq_sys_role_role_code";

    /** sys_depart table: unique index on code. */
    String SQL_INDEX_UNIQ_DEPART_ORG_CODE = "uniq_depart_org_code";

    // ---------------------------------------------------------------- online chat

    /** Online chat: whether a group is the default group. */
    String IM_DEFAULT_GROUP = "1";

    /** Online chat: storage path of image files. */
    String IM_UPLOAD_CUSTOM_PATH = "imfile";

    /** Online chat: user status. */
    String IM_STATUS_ONLINE = "online";

    /** Online chat: socket message type. */
    String IM_SOCKET_TYPE = "chatMessage";

    /** Online chat: whether friends are added by default (1 yes, 0 no). */
    String IM_DEFAULT_ADD_FRIEND = "1";

    /** Online chat: cache prefix for a user's friends. */
    String IM_PREFIX_USER_FRIEND_CACHE = "sys:cache:im:im_prefix_user_friend_";

    /** Repeated login attempts: user cache prefix. */
    String LOGIN_PREFIX_USERNAME = "sys:cache:username_";

    // ---------------------------------------------------------------- attendance / official docs

    /** Attendance make-up business status: approved. */
    String SIGN_PATCH_BIZ_STATUS_1 = "1";

    /** Attendance make-up business status: rejected. */
    String SIGN_PATCH_BIZ_STATUS_2 = "2";

    /** Custom upload path for official documents. */
    String UPLOAD_CUSTOM_PATH_OFFICIAL = "officialdoc";

    /** Custom download path for official documents. */
    String DOWNLOAD_CUSTOM_PATH_OFFICIAL = "officaldown";

    /** WPS stored value category: 1 = code (document number). */
    String WPS_TYPE_1 = "1";

    /** WPS stored value category: 2 = text (WPS template or official-document template). */
    String WPS_TYPE_2 = "2";

    // ---------------------------------------------------------------- headers / config keys

    /** Name of the access token header. */
    String X_ACCESS_TOKEN = "X-Access-Token";

    /** Multi-tenant request header. */
    String TENANT_ID = "tenant-id";

    /** Configuration property read by micro-services: server address. */
    String CLOUD_SERVER_KEY = "spring.cloud.nacos.discovery.server-addr";

    /**
     * Third-party login: password verification and user creation both require an
     * operation code so that they cannot be invoked maliciously.
     */
    String THIRD_LOGIN_CODE = "third_login_code";

    // ---------------------------------------------------------------- user categories

    /** User category: super user, holds every data permission. */
    Integer SUPER_USER = 1;

    /** Id of the "failed login count" entry in the dictionary table. */
    String LOGIN_ERR_TIME_DICT_ITEMT_ID = "1694682487099117570";

    /** Dictionary item id (password modification time, judging by the name). */
    String PWD_MODIFY_TIME_DICT_ITEMT_ID = "1694682745728290818";

    /** Dictionary item id (password modification alarm, judging by the name). */
    String PWD_MODIFY_ALARM_DICT_ITEMT_ID = "1694683114650882049";

    /** User category: ordinary user. */
    Integer COMMON_USER = 2;

    /** User category: administrator. */
    Integer ADMIN_USER = 3;

    // ---------------------------------------------------------------- data permission types

    /** Data permission type: project. */
    String PERMISSION_PROJECT = "project";

    /** Data permission type: subject. */
    String PERMISSION_SUBJECT = "subject";

    /** Data permission type: group. */
    String PERMISSION_GROUP = "group";

    /** Data permission type: keyword. */
    String PERMISSION_KEYOORD = "keyword";

    /** Data permission type: channel. */
    String PERMISSION_COLUM = "channel";
}
package com.zzsn.knowbase.constant;
import java.util.HashMap;
import java.util.Map;
/**
 * Application-wide constants: Redis key prefixes, Elasticsearch index names, fixed
 * record ids, file locations and a few lookup maps populated at class-load time.
 */
public class Constants {

    // ------------------------------------------------------------ Redis cache key constants
    public static final String FINANCE = "SAVE_ES:";
    public static final String RESEARCH = "RESEARCH:";

    // ------------------------------------------------------------ Elasticsearch indexes
    /** Index used by the old data synchronisation. */
    public static final String ES_DATA_NEW_INDEX = "newclbdatabase";
    /** Evaluation centre ES index. */
    public static final String GZJG_ES_INDEX = "gzjg";
    /** New collection store (22.04.23). */
    public static final String ES_BASE_DATA = "basedata_2023";
    /** New leader-speech index (22.08.24). */
    public static final String LEADER_SPEECH_BASE_DATA = "leaderspeech_new";
    /** New subject store (22.04.24). */
    public static final String ES_SUBJECT_DATA = "subjectdata";
    /** Old index of the audit integration (deprecated). */
    public static final String ES_SUBJECT_DEV_DATA = "subjectdatabase_dev";
    /** Index that stores the processed subject news. */
    public static final String ES_DATA_FOR_SUBJECT = "subjectdatabase_2023";
    /** Index of the primary/secondary relations of subject content. */
    public static final String SUB_PRIMARY_SECONDARY_MAP = "subprimarysecondarymap";
    public static final String ES_DATA_FOR_SUBJECT_YJZX = "yjzx_subjectdatabase_2023";
    /** Research report index. */
    public static final String RESEARCH_REPORT_DATA = "researchreportdata_2023";
    /** Policies and regulations. */
    public static final String POLICY_DATA = "policy";
    /** Material library index. */
    public static final String ES_DATA_FOR_AIREPORTMATERIAL = "ai_report_material";
    /** Newly created ES index. */
    public static final String ES_DATA_FOR_COLLECTION = "collectiondatabase_dev";
    /** Regulations library. */
    public static final String ES_DATA_FOR_FAGUI = "knowledgebasefg";

    /** File upload path. */
    public static final String FILE_PATH_PREFIX = "static/uploadFiles";
    /** Verification service occupancy record. */
    public static final String VERIFY_SERVICE = "VERIFY_SERVICE:";
    /** Subject collected data. */
    public static final String SUBJECT_DATA = "SUBJECT_DATA:COLLECT:";
    /** Synchronisation of policies and regulations into the database. */
    public static final String POLICY_INFO_DB_SYN = "POLICY:DB:";
    /** Upload location of python files (integration contact: Li Meng). */
    public static final String PYTHON_FILE_PATH = "D:/FTP";

    // ------------------------------------------------------------ crawled page types
    public static final String TYPE_HTML = "HTML";
    public static final String TYPE_EXCEL = "EXCEL";
    public static final String TYPE_WORD = "WORD";
    public static final String TYPE_PPT = "PPT";
    public static final String TYPE_PDF = "PDF";
    public static final String TYPE_IMG = "IMG";
    public static final String TYPE_FILE = "FILE";

    public static final String DEFAULT_LANG = "cn";

    // ------------------------------------------------------------ fixed record ids
    public static final String YDYL_ARTICLE_SUBJECT_ID = "1653954370072371202";
    public static final String LABEL_TYPE_ARTICLE_TYPE_ID = "1651433081113829378";
    public static final String LABEL_TYPE_PROJECT_TYPE_ID = "1651428096456155138";
    public static final String LABEL_TYPE_BUSINESS_TYPE_ID = "1651428583591010305";
    public static final String LABEL_TYPE_RISK_TYPE_ID = "1651428400094404610";
    /** Subject analysis: keyword type id. */
    public static final String KEYWORDS_TYPE_ID = "1545323639679737857";
    /** Subject analysis: subject type id. */
    public static final String SUBJECT_TYPE_ID = "1545666456444723202";
    /** Project id. */
    public static final String PROJECT_ID = "1545303889230684162";
    /** Id of the corresponding subject. */
    public static final String POLICY_SUBJECT_ID = "1660551578301423617";
    /** Subject analysis: ids of the bound search engines. */
    public static final String[] SEARCH_ENGINES_ID = new String[]{
            "1539145903551361026", "1539146006513135617", "1539146115049140225", "1539146210310172674",
            "1539146330581839873", "1539146509640871938", "1539146774620221441", "1539146892010401794"};

    /** Cache prefix for information-source site URLs. */
    public static final String SITE_NAME_KEY = "SITE_NAME_KEY:";
    /**
     * Maximum summary length, judging by the name. NOTE(review): the original comment was a
     * copy of the SITE_NAME_KEY one; confirm the unit with the callers.
     */
    public static final int SUMMARY_MAX_LENGTH = 260;

    // ------------------------------------------------------------ task time markers
    /** Time marker of new-word extraction from subject data. */
    public static final String SUBJECT_NEW_KEYWORDS_TIME = "SUBJECT_LAST_NEW_KEYWORDS_TIME:";
    /** Time marker of hot-word extraction from subject data. */
    public static final String SUBJECT_HOT_KEYWORDS_TIME = "SUBJECT_LAST_HOT_KEYWORDS_TIME:";
    /** Time marker of the full-data task. */
    public static final String COMBINE_FULL_DATA_TASK_TIME = "COMBINE_FULL_DATA_TASK_TIME:";
    /** Time marker of the triplet storage task. */
    public static final String TRIPLET_TO_NEO_TASK_TIME = "TRIPLET_TO_NEO_TASK_TIME:";

    /** Front end: Redis cache key of the hot-word set. */
    public static final String HOT_WORD_KEY = "FRONT::HOT_WORD";
    /** Front end: Redis cache key of the hot-word trend. */
    public static final String HOT_WORD_TREND_KEY = "FRONT::HOT_WORD_TREND";

    // ------------------------------------------------------------ local file system
    public static final String FSP = System.getProperty("file.separator");
    public static final String USER_HOME = System.getProperty("user.home");
    public static final String APPLICATION_DATA_DIR = USER_HOME + FSP + "mrasdata" + FSP;

    /** Subject collected data. */
    public static final String SEND_TO_MACHINE_DATA = "SUBJECT_DATA:SYNtOMACHINE:";

    // Report task groups: report push tasks (email and WeChat Work) and report generation tasks.
    public static final String PUSH_EMAIL_GROUP = "PUSH_EMAIL";
    public static final String PUSH_WECHAT_GROUP = "PUSH_WECHAT";
    public static final String CREATE_GROUP = "CREATE";

    // ------------------------------------------------------------ file browse URLs
    /** fastdfs: leading part of the file browse URL. */
    public static final String FILE_PATH_URL_PREFIX = "http://114.115.215.96/";
    /** obs: leading part of the file browse URL. */
    public static final String OBS_FILE_PATH_URL_PREFIX = "https://zzsn.luyuen.com/";
    public static final String OBS_FILE_PATH_URL_PREFIX_NOS = "http://zzsn.luyuen.com/";
    public static final String OBS_FILE_PATH_URL_HOST = "https://zzsn.obs.cn-north-1.myhuaweicloud.com:443/";

    // ------------------------------------------------------------ synchronisation markers
    /** Largest id of the intelligence synchronisation. */
    public static final String SPECIAL_INFORMATION_SYN = "SPECIAL_DATA_MAX_ID";
    /** Intelligence language information synchronisation. */
    public static final String SPECIAL_LANGUAGE_SYN = "SPECIAL_LANGUAGE_MAX_ID";
    /** Largest id of the project information synchronisation. */
    public static final String PROJECT_INFO_SYN = "PROJECT_INFO_MAX_ID";
    /** Time marker of graph information extraction from subject data. */
    public static final String ARTICLE_TAR_TIME = "ARTICLE_TAR_TIME:";
    /** Largest id of the policy/regulation information synchronisation. */
    public static final String POLICY_INFO_SYN = "POLICY_INFO_MAX_ID";

    // ------------------------------------------------------------ lookup maps
    /** All data indexes (keys are index names, values are empty strings). */
    public static Map<String, String> ALL_INDEX = buildAllIndex();
    /** Policy labels: column id mapped to policy label id. */
    public static Map<String, String> POLICY_LABELS = buildPolicyLabels();
    public static final String POLICY_LABELSLIST = "1704428137399386114,1704429336999690242,1704428299907694593,1704427938249637890,1714102959951728642";
    public static final String SCIENCE_FILE = "摘要,Abstract,参考文献,致谢,结论,参考文献:,主要参考文献";
    /** Industry mapping. */
    public static final Map<String, String> industryMap = buildIndustryMap();

    // ------------------------------------------------------------ model log types
    public static final String MODEL_LOG_TYPE_ADD = "创建";
    public static final String MODEL_LOG_TYPE_SET_CONF = "设置配置信息";
    public static final String MODEL_LOG_TYPE_UPLOAD_YULIAO = "语料上传";
    public static final String MODEL_LOG_TYPE_TRAIN = "模型训练";
    public static final String MODEL_LOG_TYPE_TEST = "模型测试";
    public static final String MODEL_LOG_TYPE_PUBLISH = "模型发布";

    /** Key of the model parameter dictionary table. */
    public static final String MODEL_PARAMTER_DICT_ID = "1658713610162970626";

    /** Research centre: research reports. */
    public static final String SUBJECT_REPORT_KEYWORDS = "SUBJECT_REPORT_KEYWORDS:";
    /** Research centre: time marker of research report processing. */
    public static final String SUBJECT_REPORT_TIME = "SUBJECT_REPORT_TIME:";
    /** Tender index. */
    public static final String TENDER = "tender";

    /** Builds the mutable industry mapping exposed as {@link #industryMap}. */
    private static Map<String, String> buildIndustryMap() {
        Map<String, String> mapping = new HashMap<String, String>();
        mapping.put("25634569604562944", "1493167501570842625");
        mapping.put("25634563304718336", "1493168560984924161");
        mapping.put("25634564323934208", "1493168791227047938");
        mapping.put("25634562709127168", "1493168919421755394");
        mapping.put("25634567461273600", "1493169086308917250");
        mapping.put("1479035335737290754", "1493481192505847810");
        mapping.put("25634566303645696", "1631520486049972226");
        mapping.put("1479035618374660098", "1631521136611717121");
        mapping.put("1479035799887360001", "1631521522051469314");
        mapping.put("1479035948546076674", "1631521910020435970");
        mapping.put("1479036104142172162", "1631522617045897217");
        mapping.put("1479036223038107649", "1631523006419935233");
        mapping.put("1479036475275161601", "1631523443135049729");
        mapping.put("25634570133045248", "1631523835390464002");
        mapping.put("1479036714153357314", "1631524152807002113");
        mapping.put("1479036863562854402", "1631524581930536962");
        mapping.put("1479036996182552578", "1631524634367725569");
        mapping.put("1479037230665117697", "1631525020126228482");
        mapping.put("1479037374659768321", "1631526287150006274");
        mapping.put("1479037472202502145", "1631526704474808321");
        return mapping;
    }

    /** Builds the mutable index-name map exposed as {@link #ALL_INDEX}. */
    private static Map<String, String> buildAllIndex() {
        Map<String, String> indexes = new HashMap<>();
        indexes.put("subjectdatabase_2023", "");    // new platform subject store
        indexes.put("articlebase", "");             // research centre news store
        indexes.put("knowledgebasezx", "");         // research news
        indexes.put("knowledgebasefg", "");         // policies and regulations
        indexes.put("knowledgebaseqy", "");         // enterprise news
        indexes.put("knowledgebasezk", "");         // think-tank news
        indexes.put("knowledgebasejh", "");         // leader speeches
        indexes.put("researchreportdata_2023", ""); // new platform research reports
        indexes.put("policy", "");                  // policies and regulations
        return indexes;
    }

    /** Builds the mutable column-id to label-id map exposed as {@link #POLICY_LABELS}. */
    private static Map<String, String> buildPolicyLabels() {
        Map<String, String> labels = new HashMap<>();
        labels.put("1704428137399386114", "1642"); // column id - policy label id (SASAC of the State Council)
        labels.put("1704429336999690242", "1666"); // column id - policy label id (local SASAC)
        labels.put("1704428299907694593", "1699"); // column id - policy label id (State Council ministries / other departments)
        labels.put("1704427938249637890", "1766"); // column id - policy label id (State Council documents)
        return labels;
    }
}
package com.zzsn.knowbase.constant;
import java.util.Arrays;
import java.util.List;
/**
 * Enumerates the known storage directories. Each constant carries a path, a
 * human-readable description and a type code, and can be looked up by any of the three.
 *
 * Author: EDY
 * Date: 2023/10/9
 */
public enum DirEnum {
    QYYearReport("QYYearReport/", "企业年报", "1"),
    QYQuarterReport("QYQuarterReport/", "企业季报", "2"),
    QYMonthReport("QYMonthReport/", "企业月报", "3"),
    QYResearchReport("QYResearchReport/", "企业研报", "4"),
    HYResearchReport("HYResearchReport/", "行业研报", "5"),
    SubjectAtt("SubjectAtt/", "专题上的附件", "6"),
    PolicyDocuments("PolicyDocuments/", "政策文件附件", "7"),
    QYNotice("QYNotice/", "企业公告", "8"),
    GZNotice3("GZNotice-3/", "股转公告 新三版", "9"),
    GPSH3("GPSH-3/", "挂牌审核 新三版", "10"),
    ZLJGCS3("ZLJGCS-3/", "自律监管措施 新三版", "11"),
    WXH3("WXH-3/", "问询函 新三版", "12"),
    JLCF3("JLCF-3/", "纪律处分 新三版", "13"),
    MXYL("MXYL/", "模型语料", "14"),
    SubjectUp("SubjectUp/", "专题上传", "15"),
    KLBImage("KLBImage/", "克虏宝企业log图片", "16"),
    REPORT("report/", "生成的报告", "17"),
    REPORT_TEMPLATE("report/template/", "生成的报告模板", "18"),
    SCIENCE_FILE("report/reportScience/", "智能写作_期刊/论文/图书/研报", "19"),
    ;

    /** Path name. */
    private final String path;
    /** Path description. */
    private final String des;
    /** Type code of the directory. */
    private final String type;

    DirEnum(String path, String des, String type) {
        this.path = path;
        this.des = des;
        this.type = type;
    }

    /**
     * @return every constant, in declaration order, as a fixed-size list
     */
    public static List<DirEnum> getAll() {
        return Arrays.asList(values());
    }

    /**
     * Finds the constant whose path equals the argument.
     *
     * @param path path to look for
     * @return the matching constant
     * @throws IllegalArgumentException if no constant has that path
     */
    public static DirEnum getByPath(String path) {
        return Arrays.stream(values())
                .filter(candidate -> candidate.getPath().equals(path))
                .findFirst()
                .orElseThrow(() -> new IllegalArgumentException("No enum constant with value: " + path));
    }

    /**
     * Finds the constant whose description equals the argument.
     *
     * @param des description to look for
     * @return the matching constant
     * @throws IllegalArgumentException if no constant has that description
     */
    public static DirEnum getByDes(String des) {
        return Arrays.stream(values())
                .filter(candidate -> candidate.getDes().equals(des))
                .findFirst()
                .orElseThrow(() -> new IllegalArgumentException("No enum constant with value: " + des));
    }

    /**
     * Finds the constant whose type code equals the argument.
     *
     * @param type type code to look for
     * @return the matching constant
     * @throws IllegalArgumentException if no constant has that type code
     */
    public static DirEnum getByType(String type) {
        return Arrays.stream(values())
                .filter(candidate -> candidate.getType().equals(type))
                .findFirst()
                .orElseThrow(() -> new IllegalArgumentException("No enum constant with value: " + type));
    }

    public String getPath() {
        return path;
    }

    public String getDes() {
        return des;
    }

    public String getType() {
        return type;
    }
}
package com.zzsn.knowbase.controller;
import cn.hutool.core.collection.CollectionUtil;
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.io.IORuntimeException;
import cn.hutool.http.HtmlUtil;
import com.alibaba.fastjson.JSONObject;
import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
import com.baomidou.mybatisplus.core.metadata.IPage;
import com.zzsn.knowbase.entity.AiReportScienceFile;
import com.zzsn.knowbase.entity.AiReportScienceFileMaterial;
import com.zzsn.knowbase.service.IAiReportScienceFileMaterialService;
import com.zzsn.knowbase.service.IAiReportScienceFileService;
import com.zzsn.knowbase.vo.AiReportScienceFileVo;
import com.zzsn.knowbase.vo.Result;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.util.MultiValueMap;
import org.springframework.web.bind.annotation.*;
import org.springframework.web.multipart.MultipartFile;
import org.springframework.web.multipart.MultipartHttpServletRequest;
import javax.annotation.Resource;
import javax.servlet.http.HttpServletRequest;
import java.io.File;
import java.io.InputStream;
import java.math.BigDecimal;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
* @Version 1.0
* @Author: ZhangJingKun
* @Date: 2023/12/25 10:33
* @Content:
*/
@RestController
@Slf4j
@RequestMapping("/knowbase/manage")
public class KnowBaseManageController {
// Service used by this controller to query and update AiReportScienceFile records.
@Resource
private IAiReportScienceFileService aiReportScienceFileService;
// Service used by this controller to list AiReportScienceFileMaterial rows and to split file content into materials.
@Resource
private IAiReportScienceFileMaterialService aiReportScienceFileMaterialService;
// Base URL of the online preview service; queryInfoById appends the Base64-encoded file path to it.
// NOTE(review): the host is hard-coded here; presumably it should come from configuration.
private String kkfileUrl = "http://114.116.43.184:8012/onlinePreview?officePreviewType=pdf&tifPreviewType=jpg&url=";// displays as PDF by default
/**
 * Test endpoint: logs "hello" and answers with a fixed greeting.
 *
 * @return the literal greeting text
 */
@RequestMapping("hello")
public String hello() {
    final String greeting = "Hello!";
    log.info("hello");
    return greeting;
}
/**
 * Paged list query: delegates to the science-file service and wraps the page in a result.
 *
 * @param aiReportScienceFileVo filter criteria bound from the request parameters
 * @param pageNo   page number, defaults to 1
 * @param pageSize page size, defaults to 10
 * @param column   sort column, defaults to "common"
 * @param order    sort direction, defaults to "desc"
 * @return a successful result carrying the requested page
 */
@GetMapping(value = "/list")
public Result<?> queryPageList(AiReportScienceFileVo aiReportScienceFileVo,
                               @RequestParam(name = "pageNo", defaultValue = "1") Integer pageNo,
                               @RequestParam(name = "pageSize", defaultValue = "10") Integer pageSize,
                               @RequestParam(name = "column", defaultValue = "common") String column,
                               @RequestParam(name = "order", defaultValue = "desc") String order) {
    IPage<AiReportScienceFile> page =
            aiReportScienceFileService.queryPageList(aiReportScienceFileVo, pageNo, pageSize, column, order);
    return Result.OK(page);
}
/**
 * Walks the matching science files page by page (100 records per page, at most 299 pages,
 * always sorted by "id") and hands every record that has content to {@code generateTxt},
 * choosing the target directory from the record's data type.
 *
 * <p>The {@code pageNo}, {@code pageSize} and {@code column} request parameters are accepted
 * for interface compatibility but are not used: paging is driven by the loop and the sort
 * column is forced to "id".
 *
 * <p>NOTE(review): the target directories are hard-coded Windows paths, so this looks like a
 * one-off export utility rather than a production endpoint; confirm before exposing it.
 *
 * @param aiReportScienceFileVo filter criteria bound from the request parameters
 * @param pageNo   unused
 * @param pageSize unused
 * @param column   ignored, the sort column is always "id"
 * @param order    sort direction, defaults to "desc"
 * @return an empty successful result once the export loop has finished
 * @throws Exception whatever the service or {@code generateTxt} throws
 */
@GetMapping(value = "/listToTxt")
public Result<?> listToTxt(AiReportScienceFileVo aiReportScienceFileVo,
                           @RequestParam(name = "pageNo", defaultValue = "1") Integer pageNo,
                           @RequestParam(name = "pageSize", defaultValue = "10") Integer pageSize,
                           @RequestParam(name = "column", defaultValue = "common") String column,
                           @RequestParam(name = "order", defaultValue = "desc") String order) throws Exception {
    // The export always sorts by id so that paging is stable; set it once instead of per page.
    column = "id";
    for (int page = 1; page < 300; page++) {
        IPage<AiReportScienceFile> pageList = aiReportScienceFileService.queryPageList(aiReportScienceFileVo, page, 100, column, order);
        List<AiReportScienceFile> records = pageList.getRecords();
        if (CollectionUtil.isEmpty(records)) {
            // Past the last page: stop instead of issuing the remaining queries for nothing.
            break;
        }
        for (AiReportScienceFile hit : records) {
            if (StringUtils.isEmpty(hit.getContent())) {
                continue;
            }
            log.info("{}==id==========:{}", page, hit.getId());
            Integer dataType = hit.getDataType();
            if (dataType == null) {
                // Without a data type there is no target directory; skip rather than fail the whole export.
                continue;
            }
            String targetDir;
            if (dataType == 0) {
                targetDir = "F:\\期刊txt\\";
            } else if (dataType == 1 || dataType == 2) {
                targetDir = "F:\\学术论文txt\\";
            } else {
                // Other data types are not exported.
                continue;
            }
            generateTxt(hit.getContent(), hit.getTitle(), targetDir);
        }
    }
    return Result.OK();
}
/**
 * Recommends material content for a question.
 *
 * @param text     the question text; must not be blank
 * @param type     the type passed through to the service; must not be blank
 * @param pageNo   page number, defaults to 1
 * @param pageSize page size, defaults to 5
 * @return the page of suggestions, or an error result when {@code text} or {@code type} is blank
 */
@GetMapping(value = "/suggestContent")
public Result<?> suggestContent(String text, String type,
                                @RequestParam(name = "pageNo", defaultValue = "1") Integer pageNo,
                                @RequestParam(name = "pageSize", defaultValue = "5") Integer pageSize) {
    boolean missingArgument = StringUtils.isAnyBlank(text, type);
    if (missingArgument) {
        return Result.error("参数异常");
    }
    IPage<Map<String, Object>> suggestions =
            aiReportScienceFileService.suggestContent(text, type, pageNo, pageSize);
    return Result.OK(suggestions);
}
/**
 * Looks up the surrounding article context of a paragraph.
 *
 * @param textId id of the paragraph; must not be blank
 * @return the context map returned by the service, or an error result when {@code textId}
 *         is blank or the lookup throws (the exception message is returned to the caller)
 */
@GetMapping(value = "/getContentList")
public Result<?> getContentList(String textId) {
    if (StringUtils.isBlank(textId)) {
        return Result.error("参数异常");
    }
    try {
        Map<String, Object> contextByTextId = aiReportScienceFileService.getContextByTextId(textId);
        return Result.OK(contextByTextId);
    } catch (Exception e) {
        // Keep the stack trace in the log; only the message is sent back to the client.
        log.error("getContentList failed, textId={}", textId, e);
        return Result.error(e.getMessage());
    }
}
/**
 * Loads a science file by id and, when it has an OBS file path, fills in its preview URL
 * (the preview base URL followed by the Base64 encoding of that path).
 *
 * @param id id of the science file; must not be blank
 * @return the record (possibly {@code null} inside the result when the id is unknown),
 *         or an error result when {@code id} is blank
 */
@PostMapping(value = "/queryInfoById")
public Result<?> queryInfoById(String id) {
    if (StringUtils.isBlank(id)) {
        return Result.error("参数异常!");
    }
    AiReportScienceFile aiReportScienceFile = aiReportScienceFileService.getById(id);
    if (aiReportScienceFile != null && StringUtils.isNotEmpty(aiReportScienceFile.getFilePathObs())) {
        // Encode with an explicit charset: the no-arg getBytes() depends on the platform
        // default and would corrupt non-ASCII paths on servers that are not UTF-8.
        byte[] pathBytes = aiReportScienceFile.getFilePathObs().getBytes(StandardCharsets.UTF_8);
        String encodedString = Base64.getEncoder().encodeToString(pathBytes);
        aiReportScienceFile.setPreviewObs(kkfileUrl + encodedString);
    }
    return Result.OK(aiReportScienceFile);
}
/**
 * Saves the edited science file (typically its content field) by id.
 *
 * <p>When the stored record already has an auditor, the audit fields of the incoming
 * object are moved to its check fields and then cleared before the update is issued.
 *
 * @param aiReportScienceFile the edited record; its id must not be blank
 * @return the submitted record on success, or an error result when the id is blank or no
 *         record with that id exists
 */
@PostMapping(value = "/saveContent")
public Result<?> saveContent(@RequestBody AiReportScienceFile aiReportScienceFile) {
    if (StringUtils.isBlank(aiReportScienceFile.getId())) {
        return Result.error("参数异常!");
    }
    AiReportScienceFile scienceFileFromDb = aiReportScienceFileService.getById(aiReportScienceFile.getId());
    if (scienceFileFromDb == null) {
        // Unknown id: report it instead of failing with a NullPointerException below.
        return Result.error("数据不存在");
    }
    if (StringUtils.isNotEmpty(scienceFileFromDb.getAuditBy())) {
        // Already audited once: record this submission as a check rather than an audit.
        aiReportScienceFile.setCheckBy(aiReportScienceFile.getAuditBy());
        aiReportScienceFile.setCheckTime(aiReportScienceFile.getAuditTime());
        aiReportScienceFile.setAuditBy(null);
        aiReportScienceFile.setAuditTime(null);
    }
    aiReportScienceFileService.updateById(aiReportScienceFile);
    return Result.OK(aiReportScienceFile);
}
/**
 * Audits a science file: marks it with status 2 and the current time, saves it through
 * saveContent, removes the data previously stored for the file and then splits the stored
 * content into materials again.
 *
 * NOTE(review): the result returned by saveContent is ignored here.
 * NOTE(review): the material ids are collected only for the Kafka push, which is currently
 * commented out, so that query has no effect at the moment.
 *
 * @param request             the current HTTP request (not used in the body)
 * @param aiReportScienceFile the record to audit; its id must not be blank
 * @return an empty successful result, or an error result when the id is blank or the
 *         delete/split step throws
 */
@PostMapping(value = "/auditScience")
public Result<?> auditScience(HttpServletRequest request, @RequestBody AiReportScienceFile aiReportScienceFile) {
if (StringUtils.isBlank(aiReportScienceFile.getId())) {
return Result.error("参数异常!");
}
// If content was passed in, save it first and only then split it into the vector store
aiReportScienceFile.setStatus(2);
//TODO editor information
// LoginUser user = (LoginUser) SecurityUtils.getSubject().getPrincipal();
// aiReportScienceFile.setAuditBy(user.getRealname());
aiReportScienceFile.setAuditTime(new Date());
this.saveContent(aiReportScienceFile);
try {
// Re-read the record so the split below works on what is stored for this id
AiReportScienceFile scienceFile = aiReportScienceFileService.getById(aiReportScienceFile.getId());
aiReportScienceFileService.deleteDataByFileId(aiReportScienceFile.getId());
try {
// Push the ids of the paragraphs that are about to be deleted to Kafka
QueryWrapper<AiReportScienceFileMaterial> queryWrapper = new QueryWrapper<>();
queryWrapper.lambda().eq(AiReportScienceFileMaterial::getFileId, scienceFile.getId())
.select(AiReportScienceFileMaterial::getId);
List<AiReportScienceFileMaterial> list = aiReportScienceFileMaterialService.list(queryWrapper);
if (CollectionUtil.isNotEmpty(list)) {
List<String> collect = list.stream().map(AiReportScienceFileMaterial::getId).collect(Collectors.toList());
//ReportUtil.sendKafka(collect); //TODO send to kafka
}
} catch (Exception e) {
// Best effort: a failure while collecting the ids is logged and does not abort the audit
log.error("删除数据入向量库失败",e);
}
aiReportScienceFileMaterialService.splitScienceFileMaterial(scienceFile.getContent(), scienceFile.getId());
} catch (Exception e) {
log.error("拆分失败!", e);
return Result.error("拆分失败!");
}
return Result.OK();
}
/*
@Resource
private StreamBridge streamBridge;
@GetMapping(value = "/sendFileMaterial")
public void sendFileMaterial() {
QueryWrapper query = new QueryWrapper();
query.eq("deleted", "0");
query.eq("status","2");
query.gt("audit_time","2023-12-19 11:00:00");
query.select("id","origin","data_type");
query.orderByAsc("id");
List<AiReportScienceFile> listAiReportScienceFile = aiReportScienceFileService.list(query);
int i=0;
int j=0;
for(AiReportScienceFile obj:listAiReportScienceFile){
i++;
System.out.println("i====:"+i+"=====id============="+obj.getId());
QueryWrapper query1 = new QueryWrapper();
query1.eq("file_id", obj.getId());
query1.eq("deleted", "0");
query1.ne("content_type","img");
query1.ne("content_type","table");
List<AiReportScienceFileMaterial> listAiReportScienceFileMaterial = aiReportScienceFileMaterialService.list(query1);
//素材kafka推送
streamBridge = GetBeanUtil.getApplicationContext().getBean(StreamBridge.class);
log.debug("期刊论文素材推送kafka开始======");
int k=0;
int a=0;
for(AiReportScienceFileMaterial obj1:listAiReportScienceFileMaterial){
a++;
if(StringUtils.isNotEmpty(obj1.getContent()) && obj1.getContent().contains("<p") && obj1.getContent().contains("</p>")
&& countChineseCharacters(Utility.TransferHTML2Text(Utility.RemoveUselessHTMLTag(obj1.getContent())))<36){
System.out.println("过滤======a============:"+a);
continue;
}
//推送kafka
JSONObject jo = new JSONObject();
jo.put("origin", obj.getOrigin());
jo.put("fileId", obj.getId());
jo.put("status", "2");
jo.put("fileType", getFileType(obj.getDataType()));
jo.put("textId", obj1.getId());
jo.put("text", Utility.TransferHTML2Text(Utility.RemoveUselessHTMLTag(obj1.getContent())));
jo.put("textType", getTextType(obj1.getContentType()));
j++;
k++;
System.out.println("推送====j==================:"+j+"====a============:"+a+"====k============:"+k);
streamBridge.send("science_file_1219", jo);
log.debug("推送成功,段落id:"+obj1.getId());
}
// ReportUtil.sendKafka2(listAiReportScienceFileMaterial, obj.getId(), obj.getDataType(),obj.getOrigin(),"2");
}
}
*/
/**
 * Translates a content-type tag into its Chinese display label.
 *
 * @param text content-type tag such as {@code p}, {@code h1}, {@code img} or {@code table}; must not be null
 * @return "图片" for {@code img}, "表格" for {@code table}, "标题" for any other value that
 *         contains the letter {@code h}, and "内容" for everything else (including {@code p})
 */
private static String getTextType(String text) {
    switch (text) {
        case "img":
            return "图片";
        case "table":
            return "表格";
        default:
            // "p" and every unrecognised tag end up with the body-text label
            return text.contains("h") ? "标题" : "内容";
    }
}
/**
 * Translates the numeric data type of a science file into its Chinese display label.
 *
 * @param type data type code (0: journal, 1: doctoral thesis, 2: master's thesis, 3: book,
 *             4: research report); may be null
 * @return the label for the code; "期刊" for null and for any unknown code
 */
private static String getFileType(Integer type) {
    // A null Integer used to blow up on auto-unboxing; treat it like an unknown code.
    if (type == null) {
        return "期刊";
    }
    switch (type) {
        case 1:
            return "博士论文";
        case 2:
            return "硕士论文";
        case 3:
            return "图书";
        case 4:
            return "研报";
        default:
            return "期刊";
    }
}
/**
 * Counts the characters of {@code str} that lie in the range U+4E00..U+9FA5.
 *
 * @param str text to inspect; must not be null
 * @return number of characters inside that range
 */
public static int countChineseCharacters(String str) {
    int total = 0;
    for (int idx = 0, len = str.length(); idx < len; idx++) {
        char ch = str.charAt(idx);
        if (ch >= '\u4e00' && ch <= '\u9fa5') {
            total++;
        }
    }
    return total;
}
/**
 * Marks a science file as deleted by id.
 *
 * @param id id of the file; a blank id is rejected
 * @return an error result when the id is blank or no record exists, otherwise a success result
 */
@GetMapping(value = "/deleteById")
public Result<?> delete(@RequestParam(name = "id") String id) {
    if (StringUtils.isBlank(id)) {
        return Result.error("id不能为空");
    }
    final AiReportScienceFile target = aiReportScienceFileService.getById(id);
    if (null == target) {
        return Result.error("数据不存在");
    }
    // First let the service clean up the data tied to this file id, then flag the record itself.
    aiReportScienceFileService.deleteDataByFileId(id);
    target.setDeleted("1");
    aiReportScienceFileService.updateById(target);
    return Result.OK("删除成功!");
}
/**
 * Returns the list of audit persons supplied by the service layer.
 *
 * @return success result wrapping the list of names
 */
@GetMapping(value = "/getAuditPersonList")
public Result<?> getAuditPersonList() {
    final List<String> auditPersons = aiReportScienceFileService.getAuditPersonList();
    return Result.OK(auditPersons);
}
/**
 * Uploads journal papers / theses / books.
 *
 * @param request             the HTTP request, handed to the service together with the form data
 * @param aiReportScienceFile metadata bound from the request
 * @return a success result when the service call completes, an error result when it throws
 */
@PostMapping(value = "/uploadScience")
public Result<?> uploadScience(HttpServletRequest request, AiReportScienceFile aiReportScienceFile) {
    try {
        // NOTE(review): the Boolean returned by uploadAiMaterial was never inspected; its meaning
        // is not visible here, so it is still ignored.
        aiReportScienceFileService.uploadAiMaterial(aiReportScienceFile, request);
        return Result.OK("操作成功");
    } catch (Exception e) {
        // Previously the exception was dropped and the failure was reported through Result.OK,
        // which made a failed upload look like a success to the caller.
        log.error("操作失败", e);
        return Result.error("操作失败");
    }
}
/**
*
* @param
* @return
*/
/*
@PostMapping(value = "/uploadScienceTuShu")
public Result<?> uploadScienceTuShu(HttpServletRequest request, AiReportScienceFile aiReportScienceFile) {
try {
try {
int i = 0;
MultipartHttpServletRequest multipartRequest = (MultipartHttpServletRequest) request;
MultiValueMap<String, MultipartFile> files = multipartRequest.getMultiFileMap();
LinkedList<MultipartFile> aa = (LinkedList<MultipartFile>) files.get("file");
for (MultipartFile file : aa) {
i++;
long time1 = new Date().getTime();
String fileName = file.getOriginalFilename();//获取文件名
String prefix = fileName.substring(fileName.lastIndexOf(".") + 1);
AiReportScienceFile reportScienceFile = new AiReportScienceFile();
if (fileName.contains("_")) {
reportScienceFile.setTitle(fileName.substring(0, fileName.lastIndexOf("_")));
} else if (fileName.contains(".")) {
reportScienceFile.setTitle(fileName.substring(0, fileName.indexOf(".")));
}
if (!"docx".equals(prefix) && !"doc".equals(prefix) && !"pdf".equals(prefix)) {
throw new Exception("目前支持doc、docx、txt、pdf格式");
}
if (file.getOriginalFilename().endsWith("docx") || file.getOriginalFilename().endsWith("doc")) {
//文件上传
System.out.println(i + ":标题:" + reportScienceFile.getTitle() + "-开始拆分");
//获取目录
InputStream fileInputStream = file.getInputStream();
XWPFDocument document = new XWPFDocument(fileInputStream);
String content = ReportUtil.getwordHtmlOnlyText(document);
List<String> contentList = ReportUtil.beautifyContent2(content);//lxp
reportScienceFile.setContent(String.join("", contentList));
System.out.println(i + ":标题:" + reportScienceFile.getTitle() + "-拆分完成");
long time2 = new Date().getTime();
System.out.println("拆分耗时:"+(time2 - time1));
if(StringUtils.isNotEmpty(reportScienceFile.getContent())){
// 处理搜索结果...
String title1 = reportScienceFile.getTitle();
String content1 = reportScienceFile.getContent();
generateTxt(content1,title1,"D:\\图书txt\\");
long time3 = new Date().getTime();
System.out.println("txt耗时:"+(time3 - time2));
}
// byteArrayOutputStream.close();
document.close();
fileInputStream.close();
// inputStream.close();
} else if (file.getOriginalFilename().endsWith("pdf")) {
} else {
System.out.println("上传文件类型错误!");
}
}
} catch (Exception e) {
log.error(e.getMessage());
}
return Result.OK("操作成功");
} catch (Exception e) {
return Result.OK("操作失败");
}
}
*/
// Path of a sample .docx on a developer machine; in this section it is only referenced
// from the commented-out code inside main().
public static String path = "G:\\论文(word)\\论文(word)\\8.8期刊word31\\地方国资国企改革背景下内审体系的创新发展_赵金祥.docx";
// Offset; its use is not visible in this section of the file.
private static int offset = 100;// offset
/**
 * Ad-hoc manual check: Base64-encodes a sample URL, decodes it again and prints both values.
 *
 * <p>The stray {@code @Autowired} that used to sit above this method (left behind when the field
 * below it was commented out) has been removed; it was being applied to this static method.
 * The dead, commented-out document-parsing experiment was removed as well.
 *
 * @param args unused
 * @throws Exception kept for signature compatibility
 */
public static void main(String[] args) throws Exception {
    // String to encode
    String originalString = "http://zzsn.luyuen.com/report/reportScience/5becff31-198e-49b6-89cb-6be2d111e032.docx";
    // Encode with Base64, using an explicit charset instead of the platform default
    String encodedString = Base64.getEncoder().encodeToString(originalString.getBytes(StandardCharsets.UTF_8));
    System.out.println("Encoded String: " + encodedString);
    // Decode with Base64
    byte[] decodedBytes = Base64.getDecoder().decode(encodedString);
    String decodedString = new String(decodedBytes, StandardCharsets.UTF_8);
    System.out.println("Decoded String: " + decodedString);
    String aa = "";
    System.out.println(aa);
}
/**
 * Selects the content elements of an HTML document and turns paragraphs that look like
 * numbered section titles into heading elements.
 *
 * <p>Heading detection only starts after an element whose text contains "关键词" has been seen.
 * Each distinct numbering style (as reported by {@link #getTilteNum(String)}) is assigned the
 * next deeper heading level the first time it appears and keeps that level afterwards.
 *
 * @param text HTML source to analyse
 * @return the selected elements in document order; elements with empty text are left out, and so
 *         are title candidates for which no numbering key could be derived
 */
public static Elements getDirectory(String text) {
// Parse the HTML string
Document doc = Jsoup.parse(text);
// Select p elements without a nested img, img elements, tables and h1-h4 headings
Elements elements = doc.select("p:not(:has(img)),p > img,img,table,h1,h2,h3,h4");
Elements elements1 = new Elements();
// NOTE(review): hh is never read
String hh = "";
// Walk the selected elements and detect the article's section titles
// startflag switches to 1 once the "关键词" element has been seen
int startflag=0;
// Deepest heading level handed out so far
int maxlever = 0;
// Numbering-style key -> heading level (stored as a string)
Map<String, String> titleLev = new HashMap<String, String>();
for (Element element : elements) {
if(element.toString().contains("<img") || element.toString().contains("<table")){
// Images and tables are kept unchanged
elements1.add(element);
}else{
if(!element.text().equals("")){
// parentText may be rewritten into "<hN>...</hN>" below; parentText2 keeps the original text
String parentText = element.text();
String parentText2 = element.text();
if (parentText.contains("关键词") && startflag == 0) {
startflag = 1;
}
// Text that is more than half English letters is never treated as a title
if(!calculateEnglishRatio(parentText)){
// Decide whether this text is a section title: it must start with a Chinese numeral,
// a parenthesis or a digit, be shorter than 50 characters, not start with "中图分类号",
// and appear after the "关键词" element.
// NOTE(review): the two startsWith("(") checks are identical as written; one of them was
// presumably meant to test the full-width parenthesis - confirm.
if((parentText.matches("[一二三四五六七八九十]+.*") ||
parentText.startsWith("(") ||
parentText.startsWith("(") ||
parentText.matches("\\d+.*") ) &&
parentText.length()<50 &&
!parentText.startsWith("中图分类号") &&
startflag == 1
){
String tKey = getTilteNum(parentText);
if (null==tKey || tKey.trim().length()==0) {
// No numbering key: the element is skipped and is not added to the result
continue;
}
// Look up the heading level already assigned to this numbering style
String lever = titleLev.get(tKey);
if (null!=lever ) {
parentText = "<h"+lever+">"+parentText+"</h"+lever+">";
} else if (titleLev.size()==0) {
// First numbering style ever seen becomes level 1
maxlever++;
titleLev.put(tKey,String.valueOf(maxlever));
parentText = "<h1>"+parentText+"</h1>";
} else {
// A new numbering style gets the next deeper level
maxlever++;
titleLev.put(tKey,String.valueOf(maxlever));
parentText = "<h"+maxlever+">"+parentText+"</h"+maxlever+">";
}
// The stripped string is only used for the debug print and the tag detection below
parentText = parentText.replaceAll(" ","").replaceAll("&nbsp;","");
System.out.println(parentText);
}
if(parentText.startsWith("<h")){
// Takes the two characters after '<' as the tag name, so only a single level digit is kept;
// the element body is reset to the original, unmodified text
String tag = parentText.substring(1,3);
element.tagName(tag);
element.html(parentText2);
}
}
elements1.add(element);
}
}
}
return elements1;
}
/**
 * Derives a key describing the numbering style at the start of a title, so that titles
 * numbered in the same style share the same key.
 *
 * <ul>
 *   <li>digits only: empty string</li>
 *   <li>leading digits and dots, e.g. {@code 2.3}: the prefix with every digit replaced by 1 ({@code 1.1})</li>
 *   <li>leading Chinese numeral, when the title also contains "、", a space or ".": "一"</li>
 *   <li>leading parenthesis (ASCII or full-width) followed by digits: "(1)"; followed by a Chinese numeral: "(一)"</li>
 *   <li>anything else: empty string</li>
 * </ul>
 *
 * @param title title text; must not be null
 * @return the numbering-style key, or an empty string when none applies
 */
public static String getTilteNum(String title) {
    String result = "";
    if (Pattern.matches("\\d+", title)) {
        return result;
    }
    if (title.matches("\\d+.*")) {
        // Collect the leading run of digits and dots
        StringBuilder prefix = new StringBuilder();
        for (char c : title.toCharArray()) {
            if (Character.isDigit(c) || c == '.') {
                prefix.append(c);
            } else {
                break;
            }
        }
        result = prefix.toString().replaceAll("\\d", "1");
    } else if (title.matches("[一二三四五六七八九十]+.*")) {
        if (title.contains("、") || title.contains(" ") || title.contains(".")) {
            result = "一";
        }
    } else if (title.startsWith("(") || title.startsWith("\uFF08")) {
        // The original tested the ASCII parenthesis twice, so the second test was dead code;
        // the full-width form (U+FF08) is accepted as well now.
        String afterParen = title.substring(1);
        if (afterParen.matches("\\d+.*")) {
            result = "(1)";
        } else if (afterParen.matches("[一二三四五六七八九十]+.*")) {
            result = "(一)";
        }
    }
    return result;
}
/**
 * Tells whether the text consists of digits only.
 *
 * @param str text to test; must not be null
 * @return true when the pattern {@code ^\d+$} is found in {@code str}
 */
public static boolean chunshuzi(String str) {
    return Pattern.compile("^\\d+$").matcher(str).find();
}
/**
 * Converts a flat list of HTML fragments into rows that carry id, text, level, parent,
 * content type and a 1-based sort position.
 *
 * @param originList HTML fragments in document order
 * @return one map per fragment, in the same order
 */
public static List<Map<String, Object>> getList(List<String> originList) {
    // Stack of open ancestors, seeded with a sentinel entry
    Stack<Map<String, Object>> ancestors = new Stack<>();
    Map<String, Object> sentinel = new HashMap<>();
    sentinel.put("id", null);
    sentinel.put("text", null);
    sentinel.put("level", 1000);
    ancestors.push(sentinel);

    List<Map<String, Object>> rows = new ArrayList<>();
    int position = 0;
    for (String fragment : originList) {
        Map<String, Object> row = new HashMap<>();
        row.put("id", UUID.randomUUID().toString().replaceAll("-", ""));
        row.put("text", fragment);
        row.put("level", getLevel(fragment));
        // Resolve the parent first, then decide whether this row becomes an ancestor itself
        getParent(row, ancestors);
        if (canPush(row, ancestors)) {
            ancestors.push(row);
        }
        row.put("contentType", getContentType(fragment));
        position++;
        row.put("sort", position);
        rows.add(row);
    }
    ancestors.clear();
    return rows;
}
/**
 * Matches a bracketed reference number such as {@code [12]} at the start of a text.
 * The brackets may be ASCII or full-width (U+FF3B / U+FF3D).
 */
private static final Pattern REFERENCE_ENTRY_PREFIX = Pattern.compile("^[\\[\\uFF3B][0-9]+[\\]\\uFF3D]");

/**
 * Tells whether the text looks like a bibliography entry, i.e. starts with a bracketed number.
 *
 * <p>The original compiled two identical patterns on every call, so the second check could never
 * add anything. It is replaced by a single precompiled pattern that also accepts full-width brackets.
 *
 * @param str text to test; must not be null
 * @return true when {@code str} starts with a bracketed number
 */
public static boolean cankaowenxian(String str) {
    return REFERENCE_ENTRY_PREFIX.matcher(str).find();
}
/**
 * Determines the content type of an HTML fragment from its opening tag.
 *
 * @param text HTML fragment; must not be null
 * @return one of p, h1, h2, h3, h4, img, table, or an empty string when none matches
 */
private static String getContentType(String text) {
    // Checked in this order; the first tag whose "<tag" prefix matches wins
    final String[] knownTags = {"p", "h1", "h2", "h3", "h4", "img", "table"};
    for (String tag : knownTags) {
        if (text.startsWith("<" + tag)) {
            return tag;
        }
    }
    return "";
}
/**
 * Decides whether a row may be pushed onto the ancestor stack.
 *
 * @param source row to test; its "text" and "level" entries are read
 * @param stack  stack of rows pushed so far
 * @return true when the stack is empty, or when the row is a heading ("&lt;h" prefix) whose
 *         level is greater than the level of the row on top of the stack
 */
private static boolean canPush(Map<String, Object> source, Stack<Map<String, Object>> stack) {
    if (stack.isEmpty()) {
        return true;
    }
    final boolean isHeading = ((String) source.get("text")).startsWith("<h");
    if (!isHeading) {
        return false;
    }
    final int topLevel = (int) stack.peek().get("level");
    final int candidateLevel = (int) source.get("level");
    return candidateLevel > topLevel;
}
/**
 * Sets the "parent" entry of a row from the ancestor stack.
 *
 * <p>Rows on top of the stack whose level is not lower than the row's own level are popped.
 * The id of the remaining top row becomes the parent; when the stack runs empty the parent is "0".
 *
 * @param source row to update; its "level" entry is read and its "parent" entry is written
 * @param stack  ancestor stack; entries may be popped
 */
private static void getParent(Map<String, Object> source, Stack<Map<String, Object>> stack) {
    if (stack.isEmpty()) {
        source.put("parent", "0");
        // The original fell through to stack.peek() here, which throws EmptyStackException.
        return;
    }
    final int sourceLevel = (int) source.get("level");
    // Drop every ancestor that is at the same depth or deeper than the current row
    while (!stack.isEmpty() && sourceLevel <= (int) stack.peek().get("level")) {
        stack.pop();
    }
    if (stack.isEmpty()) {
        source.put("parent", "0");
    } else {
        source.put("parent", (String) stack.peek().get("id"));
    }
}
/**
 * Derives the hierarchy level of an HTML fragment from its opening heading tag.
 *
 * <p>Only the exact, attribute-free tags {@code <h1>} to {@code <h6>} at the very start of the
 * text count as headings. Everything else, including null and blank text, gets level 100.
 * Null or blank text used to yield {@code null}, which made the {@code (int)} casts in
 * {@code getParent} and {@code canPush} fail with a NullPointerException.
 *
 * @param text HTML fragment; may be null
 * @return 1-6 for {@code <h1>}..{@code <h6>}, otherwise 100; never null
 */
private static Integer getLevel(String text) {
    if (text != null && text.length() >= 4 && text.startsWith("<h") && text.charAt(3) == '>') {
        char digit = text.charAt(2);
        if (digit >= '1' && digit <= '6') {
            return digit - '0';
        }
    }
    return 100;
}
/**
 * Tells whether more than half of the characters of a text are English letters.
 *
 * @param str text to inspect; must not be null
 * @return true when the share of characters accepted by {@code isEnglish} exceeds 0.5
 */
public static boolean calculateEnglishRatio(String str) {
    final int total = str.length();
    int letters = 0;
    for (char ch : str.toCharArray()) {
        if (isEnglish(ch)) {
            letters++;
        }
    }
    // For an empty string this is 0.0 / 0, i.e. NaN, which compares as not greater than 0.5
    return ((double) letters / total) > 0.5;
}
/**
 * Tells whether a character is an ASCII letter.
 *
 * @param c character to test
 * @return true for a-z and A-Z
 */
public static boolean isEnglish(char c) {
    final boolean lowerCase = 'a' <= c && c <= 'z';
    final boolean upperCase = 'A' <= c && c <= 'Z';
    return lowerCase || upperCase;
}
/**
 * Tells whether the last character of a text is a digit.
 *
 * @param str text to inspect; may be null or empty
 * @return true when {@code str} ends with a digit; false for null or empty input
 *         (empty input used to throw StringIndexOutOfBoundsException)
 */
public static boolean isDigit(String str) {
    if (str == null || str.isEmpty()) {
        return false;
    }
    return Character.isDigit(str.charAt(str.length() - 1));
}
final String lineSeparator = System.getProperty("line.separator");

/**
 * Writes the plain text of an HTML document to {@code path + title + ".txt"}.
 *
 * <p>A line break is inserted after every closing p/h1-h4 tag and runs of spaces are collapsed.
 * The style, script, img and table elements are then removed and the remaining tags are stripped.
 * Nothing is written when fewer than 20 characters remain. A write failure is logged and does
 * not propagate to the caller.
 *
 * @param content HTML content
 * @param title   document title; "/" and ":" are replaced by "_" to build the file name
 * @param path    target directory, including the trailing separator
 */
private void generateTxt(String content, String title, String path) {
    // Literal replacements, so no regex is needed for the closing tags
    for (String closingTag : new String[]{"</p>", "</h1>", "</h2>", "</h3>", "</h4>"}) {
        content = content.replace(closingTag, closingTag + lineSeparator);
    }
    content = content.replaceAll(" +", " ");
    content = HtmlUtil.cleanHtmlTag(HtmlUtil.removeHtmlTag(content, "style", "script", "img", "table"));
    if (content.length() < 20) {
        return;
    }
    title = title.replaceAll("[/:]", "_");
    File target = new File(path + title + ".txt");
    try {
        FileUtil.writeString(title + "\n" + content, FileUtil.touch(target), StandardCharsets.UTF_8);
    } catch (IORuntimeException e) {
        // Still best-effort, but no longer silent: a missing output file must be traceable
        log.error("Failed to write txt file: " + target.getPath(), e);
    }
}
}
package com.zzsn.knowbase.entity;
import com.baomidou.mybatisplus.annotation.IdType;
import com.baomidou.mybatisplus.annotation.TableField;
import com.baomidou.mybatisplus.annotation.TableId;
import com.baomidou.mybatisplus.annotation.TableName;
import com.fasterxml.jackson.annotation.JsonFormat;
import lombok.Data;
import org.springframework.format.annotation.DateTimeFormat;
import java.io.Serializable;
import java.util.Date;
/**
 * Entity for the basic information of a journal paper, thesis or book,
 * mapped to table {@code ai_report_science_file}.
 */
@Data
@TableName("ai_report_science_file")
public class AiReportScienceFile implements Serializable {
/**
 * Primary key id
 */
@TableId(value = "id", type = IdType.ASSIGN_ID)
private String id;
/**
 * Material content
 */
@TableField("content")
private String content;
/**
 * Format of the material content (text/html/url/img)
 */
@TableField("content_type")
private String contentType;
/**
 * Material title
 */
@TableField("title")
private String title;
/**
 * Keywords (column {@code key_words})
 */
@TableField("key_words")
private String keyWords;
/**
 * Year
 */
@TableField("year")
private String year;
/**
 * Author
 */
@TableField("author")
private String author;
/**
 * Source
 */
@TableField("origin")
private String origin;
/**
 * Publish date
 */
@TableField("publish_date")
private String publishDate;
/**
 * Material type (0: journal, 1: doctoral thesis, 2: master's thesis, 3: book).
 * The controller's getFileType additionally maps 4 to "研报".
 */
@TableField("data_type")
private Integer dataType;
/**
 * Source file URL
 */
@TableField("zip_file_url")
private String zipFileUrl;
/**
 * Status (0: not reviewed, 1: review rejected, 2: review approved)
 */
@TableField("status")
private Integer status;
/**
 * Auditor
 */
@TableField("audit_by")
private String auditBy;
/**
 * Audit time
 */
@JsonFormat(timezone = "GMT+8", pattern = "yyyy-MM-dd HH:mm:ss")
@DateTimeFormat(pattern = "yyyy-MM-dd HH:mm:ss")
@TableField("audit_time")
private Date auditTime;
/**
 * Creator
 */
@TableField("create_by")
private String createBy;
/**
 * Creation time
 */
@JsonFormat(timezone = "GMT+8", pattern = "yyyy-MM-dd HH:mm:ss")
@DateTimeFormat(pattern = "yyyy-MM-dd HH:mm:ss")
@TableField("create_time")
private Date createTime;
/**
 * Last updater
 */
@TableField("update_by")
private String updateBy;
/**
 * Last update time
 */
@JsonFormat(timezone = "GMT+8", pattern = "yyyy-MM-dd HH:mm:ss")
@DateTimeFormat(pattern = "yyyy-MM-dd HH:mm:ss")
@TableField("update_time")
private Date updateTime;
/**
 * Deleted flag (0: no, 1: yes)
 */
@TableField("deleted")
private String deleted;
/**
 * File path. No explicit column mapping; the list query selects it as {@code file_path_obs}.
 */
private String filePathObs;
/**
 * Cover path. No explicit column mapping; the list query selects it as {@code cover_obs}.
 */
private String coverObs;
/**
 * Preview path. No explicit column mapping; the list query selects it as {@code preview_obs}.
 */
private String previewObs;
/**
 * Checker
 */
@TableField("check_by")
private String checkBy;
/**
 * Check time
 */
@JsonFormat(timezone = "GMT+8", pattern = "yyyy-MM-dd HH:mm:ss")
@DateTimeFormat(pattern = "yyyy-MM-dd HH:mm:ss")
@TableField("check_time")
private Date checkTime;
}
package com.zzsn.knowbase.entity;
import com.baomidou.mybatisplus.annotation.IdType;
import com.baomidou.mybatisplus.annotation.TableField;
import com.baomidou.mybatisplus.annotation.TableId;
import com.baomidou.mybatisplus.annotation.TableName;
import com.fasterxml.jackson.annotation.JsonFormat;
import lombok.Data;
import org.springframework.format.annotation.DateTimeFormat;
import java.io.Serializable;
import java.util.Date;
/**
 * Entity for one piece of split content (paragraph, heading, image or table) of a journal
 * paper, thesis or book, mapped to table {@code ai_report_science_file_material}.
 */
@Data
@TableName("ai_report_science_file_material")
public class AiReportScienceFileMaterial implements Serializable {
/**
 * Primary key id
 */
@TableId(value = "id", type = IdType.ASSIGN_ID)
private String id;
/**
 * Id of the file this piece belongs to
 */
@TableField("file_id")
private String fileId;
/**
 * Content of this piece
 */
@TableField("content")
private String contentType_placeholder_removed;
package com.zzsn.knowbase.mapper;
import com.baomidou.mybatisplus.core.mapper.BaseMapper;
import com.zzsn.knowbase.entity.AiReportScienceFile;
import org.apache.ibatis.annotations.Mapper;
/**
 * MyBatis mapper for {@link AiReportScienceFile}; it declares no methods of its own and
 * relies on what {@link BaseMapper} provides.
 *
 * @Version: V1.0
 */
@Mapper
public interface AiReportScienceFileMapper extends BaseMapper<AiReportScienceFile> {
}
package com.zzsn.knowbase.mapper;
import com.baomidou.mybatisplus.core.mapper.BaseMapper;
import com.zzsn.knowbase.entity.AiReportScienceFileMaterial;
import org.apache.ibatis.annotations.Mapper;
/**
 * MyBatis mapper for {@link AiReportScienceFileMaterial}; it declares no methods of its own and
 * relies on what {@link BaseMapper} provides.
 *
 * @Version: V1.0
 */
@Mapper
public interface AiReportScienceFileMaterialMapper extends BaseMapper<AiReportScienceFileMaterial> {
}
package com.zzsn.knowbase.service;
import com.baomidou.mybatisplus.extension.service.IService;
import com.zzsn.knowbase.entity.AiReportScienceFileMaterial;
/**
 * Service for the split content pieces of science files.
 *
 * @Version: V1.0
 */
public interface IAiReportScienceFileMaterialService extends IService<AiReportScienceFileMaterial> {

    /**
     * Replaces the stored content pieces of a file with the pieces extracted from the given HTML.
     *
     * @param html   HTML content of the file
     * @param fileId id of the file the pieces belong to
     */
    void splitScienceFileMaterial(String html, String fileId);
}
package com.zzsn.knowbase.service;
import com.baomidou.mybatisplus.core.metadata.IPage;
import com.baomidou.mybatisplus.extension.service.IService;
import com.zzsn.knowbase.entity.AiReportScienceFile;
import com.zzsn.knowbase.vo.AiReportScienceFileVo;
import javax.servlet.http.HttpServletRequest;
import java.util.List;
import java.util.Map;
/**
 * Service for science files (journal papers, theses, books).
 *
 * @Version: V1.0
 */
public interface IAiReportScienceFileService extends IService<AiReportScienceFile> {

    /**
     * Handles an upload of science file material.
     *
     * @param aiReportScienceFile metadata of the upload
     * @param request             the HTTP request of the upload
     * @return a Boolean result; its meaning is defined by the implementation
     * @throws Exception when the upload cannot be processed
     */
    Boolean uploadAiMaterial(AiReportScienceFile aiReportScienceFile, HttpServletRequest request) throws Exception;

    /**
     * Queries one page of non-deleted science files matching the given filter.
     *
     * @param aiReportScienceFileVo filter values
     * @param pageNo                page number
     * @param pageSize              page size
     * @param column                sort column, or blank / "common" for the default ordering
     * @param order                 sort direction
     * @return the requested page
     */
    IPage<AiReportScienceFile> queryPageList(AiReportScienceFileVo aiReportScienceFileVo, Integer pageNo, Integer pageSize, String column, String order);

    /**
     * Suggests material content for a text.
     *
     * @param text     text to find suggestions for
     * @param type     suggestion type
     * @param pageNo   page number
     * @param pageSize page size
     * @return one page of suggestions
     */
    IPage<Map<String, Object>> suggestContent(String text, String type, Integer pageNo, Integer pageSize);

    /**
     * Looks up context information for a content piece.
     *
     * @param textId id of the content piece
     * @return context data as a map
     */
    Map<String, Object> getContextByTextId(String textId);

    /**
     * Deletes the data that belongs to a file.
     *
     * @param id id of the file
     */
    void deleteDataByFileId(String id);

    /**
     * Lists the audit persons.
     *
     * @return audit person names
     */
    List<String> getAuditPersonList();
}
package com.zzsn.knowbase.service.impl;
import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
import com.zzsn.knowbase.entity.AiReportScienceFile;
import com.zzsn.knowbase.entity.AiReportScienceFileMaterial;
import com.zzsn.knowbase.mapper.AiReportScienceFileMaterialMapper;
import com.zzsn.knowbase.service.IAiReportScienceFileMaterialService;
import com.zzsn.knowbase.service.IAiReportScienceFileService;
import com.zzsn.knowbase.util.ReportUtil;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import javax.annotation.Resource;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
/**
* @Version: V1.0
*/
@Service
public class AiReportScienceFileMaterialServiceImpl extends ServiceImpl<AiReportScienceFileMaterialMapper, AiReportScienceFileMaterial> implements IAiReportScienceFileMaterialService {
@Autowired
private IAiReportScienceFileMaterialService aiReportScienceFileMaterialService;
@Autowired
private IAiReportScienceFileService aiReportScienceFileService;
// @Resource
// private StreamBridge streamBridge;
public void splitScienceFileMaterial(String html,String fileId){
QueryWrapper query1 = new QueryWrapper();
query1.eq("file_id",fileId);
aiReportScienceFileMaterialService.remove(query1);
List<AiReportScienceFileMaterial> list = new ArrayList<>();
AiReportScienceFile scienceFile = aiReportScienceFileService.getById(fileId);
// 使用Jsoup解析HTML字符串
// 解析HTML字符串
Document doc = Jsoup.parse(html);
// 提取所有的p标签、img标签和table标签
Elements elements = doc.select("p:not(:has(img)),p > img,img,table,h1,h2,h3,h4,h5,h6,h7,h8,h9");
Iterator<Element> iterator = elements.iterator();
List<String> contentList = new ArrayList<>();
while(iterator.hasNext()) {
Element element = iterator.next();
contentList.add(element.toString());
}
/*
获取所有段落层级数据
*/
List<Map<String, Object>> li = ReportUtil.getList(contentList);
List<AiReportScienceFileMaterial> listMaterial = new ArrayList<>();
for (Map<String, Object> str : li) {
AiReportScienceFileMaterial aiReportScienceFileMaterial = new AiReportScienceFileMaterial();
aiReportScienceFileMaterial.setParent(str.get("parent").toString());
aiReportScienceFileMaterial.setContentType(str.get("contentType").toString());
aiReportScienceFileMaterial.setFileId(fileId);
aiReportScienceFileMaterial.setContent(str.get("text").toString());
aiReportScienceFileMaterial.setLevel(String.valueOf(str.get("level")));
aiReportScienceFileMaterial.setSort((Integer) str.get("sort"));
aiReportScienceFileMaterial.setDeleted("0");
listMaterial.add(aiReportScienceFileMaterial);
}
aiReportScienceFileMaterialService.saveBatch(listMaterial);
QueryWrapper query = new QueryWrapper();
query.eq("file_id",fileId);
query.eq("deleted","0");
query.ne("content_type","img");
query.ne("content_type","table");
List<AiReportScienceFileMaterial> listAiReportScienceFileMaterial = aiReportScienceFileMaterialService.list(query);
//ReportUtil.sendKafka(listAiReportScienceFileMaterial,fileId,scienceFile.getDataType(),scienceFile.getOrigin(),"2");//审核通过
//TODO 发送kafka
}
}
package com.zzsn.knowbase.service.impl;
import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.collection.CollectionUtil;
import cn.hutool.http.HtmlUtil;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
import com.baomidou.mybatisplus.core.metadata.IPage;
import com.baomidou.mybatisplus.extension.plugins.pagination.Page;
import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
import com.zzsn.knowbase.entity.AiReportScienceFile;
import com.zzsn.knowbase.entity.AiReportScienceFileMaterial;
import com.zzsn.knowbase.mapper.AiReportScienceFileMapper;
import com.zzsn.knowbase.service.IAiReportScienceFileMaterialService;
import com.zzsn.knowbase.service.IAiReportScienceFileService;
import com.zzsn.knowbase.util.DateUtil;
import com.zzsn.knowbase.util.HttpUtil;
import com.zzsn.knowbase.util.MD5Util;
import com.zzsn.knowbase.util.ReportUtil;
import com.zzsn.knowbase.vo.AiReportScienceFileVo;
import org.apache.commons.lang3.StringUtils;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.redis.core.StringRedisTemplate;
import org.springframework.stereotype.Service;
import org.springframework.util.MultiValueMap;
import org.springframework.web.multipart.MultipartFile;
import org.springframework.web.multipart.MultipartHttpServletRequest;
import javax.annotation.Resource;
import javax.servlet.http.HttpServletRequest;
import java.io.IOException;
import java.io.InputStream;
import java.util.*;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
/**
* @Version: V1.0
*/
@Service
public class AiReportScienceFileServiceImpl extends ServiceImpl<AiReportScienceFileMapper, AiReportScienceFile> implements IAiReportScienceFileService {
// Injected reference to the interface this class itself implements; suggestContent uses it to list files.
@Autowired
private IAiReportScienceFileService aiReportScienceFileService;
// Service for the split content pieces of files.
@Autowired
private IAiReportScienceFileMaterialService aiReportScienceFileMaterialService;
// Redis access; suggestContent reads cached suggestion results through it.
@Resource
private StringRedisTemplate stringRedisTemplate;
// private static final String BASE_URL = "http://116.63.179.212:7862/";
// Hard-coded base URL; its use is not visible in this section - presumably the address of the
// suggestion model service (TODO confirm).
private static final String BASE_URL = "http://114.115.172.99:10013/";
/**
 * Queries one page of non-deleted science files matching the given filter. The {@code content}
 * column is not selected for list queries.
 *
 * <p>Data type 5 is a pseudo type that selects both thesis types (1 and 2). For it the publish
 * date bounds are reduced to their first four characters followed by "年".
 *
 * @param aiReportScienceFileVo filter values; date-only audit bounds (10 characters) are extended
 *                              in place to the start / end of the day
 * @param pageNo                page number
 * @param pageSize              page size
 * @param column                sort column (descending); blank or "common" selects the default
 *                              ordering (status ascending, then audit_time and update_time
 *                              descending), as does any value that is not a plain column identifier
 * @param order                 currently not evaluated
 * @return the requested page
 */
@Override
public IPage<AiReportScienceFile> queryPageList(AiReportScienceFileVo aiReportScienceFileVo, Integer pageNo, Integer pageSize, String column, String order) {
    IPage<AiReportScienceFile> page = new Page<>(pageNo, pageSize);
    QueryWrapper<AiReportScienceFile> queryWrapper = new QueryWrapper<>();
    if (StringUtils.isNotEmpty(aiReportScienceFileVo.getAuthor())) {
        queryWrapper.like("author", aiReportScienceFileVo.getAuthor());
    }
    if (StringUtils.isNotEmpty(aiReportScienceFileVo.getOrigin())) {
        queryWrapper.like("origin", aiReportScienceFileVo.getOrigin());
    }
    // Evaluated once and null-safe: the publish-date filters used to unbox a null data type
    // and fail with a NullPointerException when only a date range was supplied.
    final boolean thesisQuery = aiReportScienceFileVo.getDataType() != null
            && aiReportScienceFileVo.getDataType() == 5;
    if (thesisQuery) {
        queryWrapper.in("data_type", Arrays.asList(1, 2));
    } else if (aiReportScienceFileVo.getDataType() != null) {
        queryWrapper.eq("data_type", aiReportScienceFileVo.getDataType());
    }
    if (StringUtils.isNotBlank(aiReportScienceFileVo.getPubStartTime())) {
        String pubStart = aiReportScienceFileVo.getPubStartTime();
        // StringUtils.left does not throw for values shorter than four characters
        queryWrapper.ge("publish_date", thesisQuery ? StringUtils.left(pubStart, 4) + "年" : pubStart);
    }
    if (StringUtils.isNotBlank(aiReportScienceFileVo.getPubEndTime())) {
        String pubEnd = aiReportScienceFileVo.getPubEndTime();
        queryWrapper.le("publish_date", thesisQuery ? StringUtils.left(pubEnd, 4) + "年" : pubEnd);
    }
    if (StringUtils.isNotEmpty(aiReportScienceFileVo.getSearchWords())) {
        if ("标题".equals(aiReportScienceFileVo.getSearchType())) {
            queryWrapper.like("title", aiReportScienceFileVo.getSearchWords());
        } else {
            queryWrapper.like("content", aiReportScienceFileVo.getSearchWords());
        }
    }
    // The sort column is concatenated into the SQL text, so only plain identifiers are accepted;
    // anything else falls back to the default ordering.
    // NOTE(review): the order parameter is ignored - a custom column always sorts descending.
    final boolean customColumn = StringUtils.isNotBlank(column)
            && !"common".equals(column)
            && column.matches("[A-Za-z0-9_.]+");
    if (customColumn) {
        queryWrapper.orderBy(true, false, column);
    } else {
        queryWrapper.orderBy(true, true, "status")
                .orderBy(true, false, "audit_time", "update_time");
    }
    if (StringUtils.isNotBlank(aiReportScienceFileVo.getAuditBy())) {
        queryWrapper.like("audit_by", aiReportScienceFileVo.getAuditBy());
    }
    if (StringUtils.isNotBlank(aiReportScienceFileVo.getAuditTimeStart())) {
        if (aiReportScienceFileVo.getAuditTimeStart().length() == 10) {
            aiReportScienceFileVo.setAuditTimeStart(aiReportScienceFileVo.getAuditTimeStart() + " 00:00:00");
        }
        queryWrapper.ge("audit_time", aiReportScienceFileVo.getAuditTimeStart());
    }
    if (StringUtils.isNotBlank(aiReportScienceFileVo.getAuditTimeEnd())) {
        if (aiReportScienceFileVo.getAuditTimeEnd().length() == 10) {
            aiReportScienceFileVo.setAuditTimeEnd(aiReportScienceFileVo.getAuditTimeEnd() + " 23:59:59");
        }
        queryWrapper.le("audit_time", aiReportScienceFileVo.getAuditTimeEnd());
    }
    if (aiReportScienceFileVo.getStatus() != null) {
        queryWrapper.eq("status", aiReportScienceFileVo.getStatus());
    }
    // Added once; the original added this predicate twice
    queryWrapper.eq("deleted", "0");
    String[] queryColumn = {"id", "content_type", "file_path_obs", "title", "cover_obs", "preview_obs",
            "year", "author", "origin", "data_type", "check_by", "check_time",
            "zip_file_url", "status", "publish_date",
            "create_by", "create_time", "update_by", "update_time", "audit_by", "audit_time"
    };
    // The list does not query the content field
    queryWrapper.select(queryColumn);
    return this.baseMapper.selectPage(page, queryWrapper);
}
/**
 * Material recommendation.
 * <ol>
 *   <li>Obtain the most relevant text fragments, from the Redis cache when present, otherwise
 *       from the model service via {@code postSuggest}.</li>
 *   <li>Drop fragments whose paragraph row no longer exists in the database and asynchronously
 *       ask the vector store to delete them.</li>
 *   <li>For the requested page, load the owning files and paragraphs and assemble one map per
 *       fragment with the keys {@code author}, {@code publishDate}, {@code title},
 *       {@code content}, {@code textId} and {@code origin}.</li>
 * </ol>
 * Fragments whose owning file row cannot be found are skipped instead of failing the request.
 *
 * @param text     query text sent to the model service
 * @param type     comma separated type filter, may be {@code null}
 * @param pageNo   1-based page number
 * @param pageSize page size
 * @return a page whose total is the number of fragments that still exist in the database
 */
@Override
public IPage<Map<String, Object>> suggestContent(String text, String type, Integer pageNo, Integer pageSize) {
    // Try the cache first; the key is the MD5 of the query text plus the type filter.
    String key = MD5Util.MD5Encode("ReportScienceFile:"+text + type, "UTF-8");
    String value = stringRedisTemplate.opsForValue().get(key);
    JSONArray jsonArray = value != null ? JSONArray.parseArray(value) : postSuggest(text, type);
    if (jsonArray == null) {
        // Neither source produced an array; treat it as "no recommendations".
        jsonArray = new JSONArray();
    }
    List<Map<String, Object>> list = new ArrayList<>();
    if (!jsonArray.isEmpty()) {
        QueryWrapper<AiReportScienceFileMaterial> queryWrapper1 = getAiReportScienceFileMaterialQueryWrapper(jsonArray);
        List<AiReportScienceFileMaterial> existingMaterials = aiReportScienceFileMaterialService.list(queryWrapper1);
        List<String> existsId = existingMaterials.stream().map(AiReportScienceFileMaterial::getId).collect(Collectors.toList());
        if (jsonArray.size() > existsId.size()) {
            List<String> notExistsIds = new ArrayList<>();
            jsonArray.removeIf(object -> {
                JSONObject jsonObject = (JSONObject) object;
                String textId = jsonObject.getString("textId");
                boolean contains = existsId.contains(textId);
                if (!contains) {
                    notExistsIds.add(textId);
                }
                return !contains;
            });
            if (!notExistsIds.isEmpty()) {
                // Some fragments are gone from the database: remove them from the vector store too.
                CompletableFuture.runAsync(()-> deleteDataByContentIds(notExistsIds));
            }
        }
        // Cut the requested page out of the (already pruned) fragment list.
        int maxCount = pageNo * pageSize;
        int minCount = (pageNo - 1) * pageSize;
        List<Object> objects = new ArrayList<>();
        if (jsonArray.size() > maxCount) {
            objects = jsonArray.subList(minCount, maxCount);
        } else if (jsonArray.size() > minCount) {
            objects = jsonArray.subList(minCount, jsonArray.size());
        }
        List<String> fileIds = new ArrayList<>();
        List<String> fileMaterialIds = new ArrayList<>();
        List<String> titleTextIds = new ArrayList<>();
        for (int i = 0; i < objects.size(); i++) {
            JSONObject jsonObject = (JSONObject) objects.get(i);
            String id = jsonObject.getString("fileId");
            String textId = jsonObject.getString("textId");
            String textType = jsonObject.getString("textType");
            if ("标题".equals(textType)) {
                titleTextIds.add(textId);
            }
            fileIds.add(id);
            fileMaterialIds.add(textId);
        }
        if (!fileIds.isEmpty() && !fileMaterialIds.isEmpty()) {
            // Load the owning files (books / periodicals); only the columns needed for display.
            LambdaQueryWrapper<AiReportScienceFile> queryWrapper = new LambdaQueryWrapper<>();
            queryWrapper.in(AiReportScienceFile::getId, fileIds);
            queryWrapper.select(AiReportScienceFile::getId,AiReportScienceFile::getAuthor,AiReportScienceFile::getPublishDate,AiReportScienceFile::getTitle,AiReportScienceFile::getOrigin);
            List<AiReportScienceFile> scienceFiles = aiReportScienceFileService.list(queryWrapper);
            Map<String, AiReportScienceFile> collect = scienceFiles.stream().collect(Collectors.toMap(AiReportScienceFile::getId, v -> v));
            List<AiReportScienceFileMaterial> fileMaterials = aiReportScienceFileMaterialService.listByIds(fileMaterialIds);
            Map<String, AiReportScienceFileMaterial> fileMaterialMap = fileMaterials.stream().collect(Collectors.toMap(AiReportScienceFileMaterial::getId, v -> v));
            Map<String, String> titleContentMap = new HashMap<>(titleTextIds.size());
            if (!titleTextIds.isEmpty()) {
                // For fragments that are titles, append the paragraphs that follow the title.
                for (String titleTextId : titleTextIds) {
                    titleContentMap.put(titleTextId, getSubContent(titleTextId, fileMaterialMap));
                }
            }
            for (Object object : objects) {
                JSONObject jsonObject = (JSONObject) object;
                String id = jsonObject.getString("fileId");
                String textId = jsonObject.getString("textId");
                String content = jsonObject.getString("text");
                AiReportScienceFile scienceFile = collect.get(id);
                if (scienceFile == null) {
                    // The paragraph exists but its file row does not; skip it rather than fail with an NPE.
                    log.warn("suggestContent: file " + id + " of fragment " + textId + " not found, fragment skipped");
                    continue;
                }
                // Text that follows the title (empty for non-title fragments).
                String titleContent = Optional.ofNullable(titleContentMap.get(textId)).orElse("");
                Map<String, Object> map = new HashMap<>();
                map.put("author", scienceFile.getAuthor());
                map.put("publishDate", scienceFile.getPublishDate());
                map.put("title", scienceFile.getTitle());
                map.put("content", content + titleContent);
                map.put("textId", textId);
                map.put("origin", scienceFile.getOrigin());
                list.add(map);
            }
        }
        // Cache the pruned fragment list for ten minutes.
        stringRedisTemplate.opsForValue().set(key, JSON.toJSONString(jsonArray), 10, TimeUnit.MINUTES);
    }
    IPage<Map<String, Object>> resultPage = new Page<>(pageNo, pageSize);
    resultPage.setRecords(list);
    resultPage.setTotal(jsonArray.size());
    return resultPage;
}
/**
 * Builds the query used to check which recommended paragraphs still exist.
 * The query selects only the {@code id} column, restricted to the {@code textId}
 * values found in the given array.
 *
 * @param jsonArray recommendation entries, each a JSON object carrying a {@code textId}
 * @return a wrapper equivalent to {@code SELECT id ... WHERE id IN (textIds)}
 */
private static QueryWrapper<AiReportScienceFileMaterial> getAiReportScienceFileMaterialQueryWrapper(JSONArray jsonArray) {
    List<String> textIds = jsonArray.stream()
            .map(entry -> ((JSONObject) entry).getString("textId"))
            .collect(Collectors.toList());
    QueryWrapper<AiReportScienceFileMaterial> idOnlyQuery = new QueryWrapper<>();
    idOnlyQuery.in("id", textIds);
    // Fetching just the id keeps the existence check cheap.
    idOnlyQuery.select("id");
    return idOnlyQuery;
}
/**
 * Returns the text that follows a title paragraph: up to three subsequent paragraphs of the
 * same file (ordered by {@code sort}, images and tables excluded), HTML tags stripped and each
 * prefixed with a line break. Collection stops after the first paragraph whose level is "100".
 *
 * @param titleTextId     id of the title paragraph
 * @param fileMaterialMap paragraphs of the current page keyed by id
 * @return the following text, or an empty string when the title paragraph is not in the map
 *         or nothing follows it
 */
private String getSubContent(String titleTextId, Map<String, AiReportScienceFileMaterial> fileMaterialMap) {
    AiReportScienceFileMaterial scienceFileMaterial = fileMaterialMap.get(titleTextId);
    if (scienceFileMaterial == null) {
        // The paragraph vanished between the existence check and this lookup; nothing to append.
        return "";
    }
    LambdaQueryWrapper<AiReportScienceFileMaterial> fileMaterialQuery = new LambdaQueryWrapper<>();
    fileMaterialQuery.eq(AiReportScienceFileMaterial::getFileId, scienceFileMaterial.getFileId());
    fileMaterialQuery.gt(AiReportScienceFileMaterial::getSort, scienceFileMaterial.getSort());
    fileMaterialQuery.notIn(AiReportScienceFileMaterial::getContentType, "img","table");
    fileMaterialQuery.orderByAsc(AiReportScienceFileMaterial::getSort);
    fileMaterialQuery.last(" limit 3 ");
    List<AiReportScienceFileMaterial> following = aiReportScienceFileMaterialService.list(fileMaterialQuery);
    StringBuilder content = new StringBuilder();
    if (CollUtil.isNotEmpty(following)) {
        for (AiReportScienceFileMaterial fileMaterial : following) {
            content.append("\n").append(HtmlUtil.cleanHtmlTag(fileMaterial.getContent()));
            if ("100".equals(fileMaterial.getLevel())) {
                break;
            }
        }
    }
    return content.toString();
}
/**
 * Returns the context of a paragraph identified by {@code textId}.
 * For files whose data type is 0 (periodicals) the whole article is returned via
 * {@link #getPeriodicalContent}; for every other type the surrounding paragraphs are
 * returned via {@link #getContent}.
 *
 * @param textId id of the paragraph
 * @return a map with the keys {@code fileName}, {@code content}, {@code origin},
 *         {@code author} and {@code publishDate}; {@code null} when {@code textId} is blank
 * @throws IllegalArgumentException when the paragraph or its owning file no longer exists
 */
@Override
public Map<String, Object> getContextByTextId(String textId) {
    if (StringUtils.isBlank(textId)) {
        return null;
    }
    // Load the paragraph itself; its file id and sort position drive the rest of the lookup.
    AiReportScienceFileMaterial fileMaterial = aiReportScienceFileMaterialService.getById(textId);
    if (fileMaterial == null) {
        throw new IllegalArgumentException("素材文件已被删除或重新发起审核");
    }
    LambdaQueryWrapper<AiReportScienceFile> queryWrapper = new LambdaQueryWrapper<>();
    queryWrapper.eq(AiReportScienceFile::getId, fileMaterial.getFileId());
    queryWrapper.select(AiReportScienceFile::getId,AiReportScienceFile::getAuthor,AiReportScienceFile::getPublishDate,
            AiReportScienceFile::getTitle,AiReportScienceFile::getOrigin,AiReportScienceFile::getDataType);
    AiReportScienceFile scienceFile = aiReportScienceFileService.getOne(queryWrapper, false);
    if (scienceFile == null) {
        // The paragraph is orphaned: report it the same way as a missing paragraph.
        throw new IllegalArgumentException("素材文件已被删除或重新发起审核");
    }
    String content;
    // equals() instead of == so a null data type falls through to the generic branch.
    if (Integer.valueOf(0).equals(scienceFile.getDataType())) {
        // Periodical: return the whole article.
        content = getPeriodicalContent(fileMaterial);
    } else {
        // Any other file type: return the neighbouring paragraphs.
        content = getContent(fileMaterial);
    }
    Map<String, Object> map = new HashMap<>();
    map.put("fileName", scienceFile.getTitle());
    map.put("content", content);
    map.put("origin", scienceFile.getOrigin());
    map.put("author", scienceFile.getAuthor());
    map.put("publishDate", scienceFile.getPublishDate());
    return map;
}
/**
 * Assembles the text of a periodical article from all paragraphs of the file that owns
 * the given paragraph, in {@code sort} order.
 * Paragraphs before the index returned by {@link #startWithNum} are left out, and assembly
 * stops at the first paragraph whose tag-free text contains "参考文献" and is shorter than
 * ten characters.
 *
 * @param fileMaterial any paragraph of the article
 * @return the concatenated paragraph HTML, or an empty string when the file has no paragraphs
 */
public String getPeriodicalContent(AiReportScienceFileMaterial fileMaterial) {
    QueryWrapper<AiReportScienceFileMaterial> wrapper = new QueryWrapper<>();
    wrapper.eq("file_id", fileMaterial.getFileId());
    wrapper.orderByAsc("sort");
    wrapper.select("content");
    List<AiReportScienceFileMaterial> paragraphs = aiReportScienceFileMaterialService.list(wrapper);
    if (CollectionUtil.isEmpty(paragraphs)) {
        return "";
    }
    StringBuilder article = new StringBuilder();
    int firstIncluded = startWithNum(paragraphs);
    int index = 0;
    for (AiReportScienceFileMaterial paragraph : paragraphs) {
        String plainText = HtmlUtil.cleanHtmlTag(paragraph.getContent());
        // A short paragraph mentioning the references heading ends the article body.
        if (plainText.contains("参考文献") && plainText.length() < 10) {
            break;
        }
        // Only paragraphs at or after the computed start index are emitted.
        if (index >= firstIncluded) {
            article.append(HtmlUtil.removeAllHtmlAttr(paragraph.getContent(),"p","span"));
        }
        index++;
    }
    return article.toString();
}
/**
 * Decides at which paragraph index a periodical article should start.
 * The result is the index of the first paragraph whose content contains "摘要",
 * unless no such paragraph exists or it lies beyond the first half of the list,
 * in which case the whole article is used (index 0).
 *
 * @param list paragraphs of the article in reading order
 * @return the index of the first paragraph to include
 */
public int startWithNum(List<AiReportScienceFileMaterial> list) {
    if (CollectionUtil.isEmpty(list)) {
        return 0;
    }
    int abstractIndex = -1;
    for (int i = 0; i < list.size(); i++) {
        if (list.get(i).getContent().contains("摘要")) {
            abstractIndex = i;
            break;
        }
    }
    if (abstractIndex < 0) {
        // No abstract marker at all: keep the full article.
        return 0;
    }
    // A marker found in the second half is treated as not being the real abstract.
    boolean tooLate = (double) abstractIndex / list.size() > 0.5;
    return tooLate ? 0 : abstractIndex;
}
/**
 * Builds the context for a paragraph of a non-periodical file.
 * Starting from the given paragraph, neighbours are added alternately before and after it
 * (by {@code sort}) until the tag-free text reaches about 4000 characters or no neighbour is
 * left inside the window loaded by {@link #getListBySort}. The selected paragraphs are then
 * concatenated in {@code sort} order.
 *
 * @param fileMaterial the paragraph the context is built around
 * @return the concatenated paragraph HTML
 */
public String getContent(AiReportScienceFileMaterial fileMaterial) {
    final int targetLength = 4000;
    int totalSuggestContentNum = HtmlUtil.cleanHtmlTag(fileMaterial.getContent()).length();
    List<AiReportScienceFileMaterial> list = new ArrayList<>();
    list.add(fileMaterial);
    if (totalSuggestContentNum < targetLength) {
        int firstSort = fileMaterial.getSort();
        int lastSort = fileMaterial.getSort();
        List<AiReportScienceFileMaterial> listBySort = getListBySort(fileMaterial.getFileId(), firstSort, lastSort);
        // Keep the first row per sort value so duplicated sort numbers cannot abort the lookup.
        Map<Integer, AiReportScienceFileMaterial> collect = listBySort.stream()
                .collect(Collectors.toMap(AiReportScienceFileMaterial::getSort, v -> v, (first, duplicate) -> first));
        while (totalSuggestContentNum < targetLength) {
            // Grow the window by one paragraph in each direction per iteration.
            AiReportScienceFileMaterial firstFileMaterial = collect.get(--firstSort);
            AiReportScienceFileMaterial lastFileMaterial = collect.get(++lastSort);
            if (firstFileMaterial == null && lastFileMaterial == null) {
                break;
            }
            if (firstFileMaterial != null) {
                totalSuggestContentNum += HtmlUtil.cleanHtmlTag(firstFileMaterial.getContent()).length();
                list.add(firstFileMaterial);
            }
            if (lastFileMaterial != null) {
                totalSuggestContentNum += HtmlUtil.cleanHtmlTag(lastFileMaterial.getContent()).length();
                list.add(lastFileMaterial);
            }
        }
        list.sort(Comparator.comparingInt(AiReportScienceFileMaterial::getSort));
    }
    StringBuilder stringBuilder = new StringBuilder();
    for (AiReportScienceFileMaterial scienceFileMaterial : list) {
        stringBuilder.append(HtmlUtil.removeAllHtmlAttr(scienceFileMaterial.getContent(), "p","span"));
    }
    return stringBuilder.toString();
}
/**
 * Loads the paragraphs of a file whose {@code sort} lies in the window
 * {@code [firstSort - 25, lastSort + 45]}, clamped to {@code [0, Integer.MAX_VALUE]},
 * ordered by {@code sort} ascending.
 *
 * @param fileId    id of the owning file
 * @param firstSort lowest sort value of interest
 * @param lastSort  highest sort value of interest
 * @return the paragraphs inside the window
 */
public List<AiReportScienceFileMaterial> getListBySort(String fileId, int firstSort, int lastSort) {
    QueryWrapper<AiReportScienceFileMaterial> queryWrapper = new QueryWrapper<>();
    queryWrapper.eq("file_id", fileId);
    queryWrapper.orderByAsc("sort");
    // Compute the bounds in long arithmetic so the clamping actually guards against int overflow.
    int min = (int) Math.max(0L, (long) firstSort - 25);
    int max = (int) Math.min((long) lastSort + 45, (long) Integer.MAX_VALUE);
    queryWrapper.between("sort", min, max);
    return aiReportScienceFileMaterialService.list(queryWrapper);
}
/**
 * Asks the vector store to delete everything that belongs to one file by posting the file id
 * to the {@code books/books_delete} endpoint. An I/O failure is logged and not rethrown.
 *
 * @param id id of the file whose vector data should be removed
 */
@Override
public void deleteDataByFileId(String id) {
    Map<String, Object> requestBody = new HashMap<>();
    requestBody.put("knowledge_base_id", "yjzx_books_vdb");
    requestBody.put("fileId", new String[]{id});
    JSONObject payload = new JSONObject(requestBody);
    String endpoint = BASE_URL + "books/books_delete";
    try {
        HttpUtil.doPost(endpoint, payload, 10000);
    } catch (IOException e) {
        log.error(e.getMessage(), e);
    }
}
/**
 * Asks the vector store to delete the given paragraphs by posting their ids to the
 * {@code books/books_delete} endpoint. Does nothing for a null or empty list.
 * Failures are logged (with their cause) and never propagated, because this runs as a
 * best-effort background clean-up.
 *
 * @param contentIds ids of the paragraphs to remove from the vector store
 */
private void deleteDataByContentIds(List<String> contentIds) {
    if (CollectionUtil.isEmpty(contentIds)) {
        return;
    }
    try {
        Map<String, Object> param = new HashMap<>();
        param.put("knowledge_base_id", "yjzx_books_vdb");
        param.put("textId", contentIds.toArray());
        // TODO: also push the deleted ids to Kafka once available: ReportUtil.sendKafka(contentIds)
        try {
            HttpUtil.doPost(BASE_URL + "books/books_delete", new JSONObject(param), 10000);
        } catch (IOException e) {
            log.error("向量库数据删除失败", e);
        }
    } catch (Exception e) {
        // Keep the cause in the log; the original message alone made failures undiagnosable.
        log.error("删除素材推荐数据失败", e);
    }
}
/**
 * Queries the model service ({@code books/books_chat}) for fragments related to a text.
 * Example of the request body accepted by the service:
 * <pre>
 * {
 *   "knowledge_base_id": "yjzx_books_test",
 *   "question": "亚马逊科技能带来什么",
 *   "history": [],
 *   "llm_answer": false,
 *   "score_threshold": 600,
 *   "vector_search_top_k": 16384,
 *   "type": [],
 *   "ids": []
 * }
 * </pre>
 *
 * @param text query text
 * @param type comma separated type filter; {@code null} or blank means no filter
 * @return the {@code results} array of a response with code 200; an empty array when the
 *         call fails, the code differs or the response carries no results (never {@code null})
 */
private JSONArray postSuggest(String text, String type) {
    // A blank filter must become an empty array, not an array holding one empty string.
    String[] types = !StringUtils.isBlank(type) ? type.split(",") : new String[]{};
    Map<String, Object> param = new HashMap<>();
    param.put("question", text);
    param.put("knowledge_base_id", "yjzx_books_vdb");
    // param.put("llm_answer", false);
    param.put("score_threshold", 500);
    param.put("vector_search_top_k", 500);
    param.put("type", types);
    // param.put("ids",new String[]{});
    try {
        String responseStr = HttpUtil.doPost(BASE_URL + "books/books_chat", new JSONObject(param), 10000);
        JSONObject jsonObject = JSON.parseObject(responseStr);
        // equals() avoids unboxing a missing "code" field.
        if (jsonObject != null && Integer.valueOf(200).equals(jsonObject.getInteger("code"))) {
            JSONArray results = jsonObject.getJSONArray("results");
            if (results != null) {
                return results;
            }
        }
    } catch (Exception e) {
        log.error("素材推荐接口调用失败", e);
    }
    return new JSONArray();
}
/**
 * Lists the distinct auditors ({@code auditBy}) of all files that are not deleted
 * and have an auditor set.
 *
 * @return one entry per auditor
 */
@Override
public List<String> getAuditPersonList() {
    LambdaQueryWrapper<AiReportScienceFile> auditorQuery = new LambdaQueryWrapper<AiReportScienceFile>()
            .eq(AiReportScienceFile::getDeleted, 0)
            .isNotNull(AiReportScienceFile::getAuditBy)
            .groupBy(AiReportScienceFile::getAuditBy)
            .select(AiReportScienceFile::getAuditBy);
    List<AiReportScienceFile> rows = this.list(auditorQuery);
    List<String> auditors = new ArrayList<>(rows.size());
    for (AiReportScienceFile row : rows) {
        auditors.add(row.getAuditBy());
    }
    return auditors;
}
//// @Override
// public Boolean uploadAiMaterialOld(AiReportScienceFile aiReportScienceFile, HttpServletRequest request) throws Exception {
// try {
// int i = 0;
// int j = 0;
// MultipartHttpServletRequest multipartRequest = (MultipartHttpServletRequest) request;
// MultiValueMap<String, MultipartFile> files = multipartRequest.getMultiFileMap();
// LinkedList<MultipartFile> aa = (LinkedList<MultipartFile>) files.get("file");
// for (MultipartFile file : aa) {
// try{
// i++;
// String fileName = file.getOriginalFilename();//获取文件名
// String prefix = fileName.substring(fileName.lastIndexOf(".") + 1);
//
// String author = "";
// AiReportScienceFile reportScienceFile = new AiReportScienceFile();
// if (fileName.contains("_")) {
// reportScienceFile.setTitle(fileName.substring(0, fileName.lastIndexOf("_")));
// author = fileName.substring(fileName.lastIndexOf("_")+1,fileName.lastIndexOf("."));
// } else if (fileName.contains(".")) {
// reportScienceFile.setTitle(fileName.substring(0, fileName.indexOf(".")));
// }
// if (!"docx".equals(prefix) && !"doc".equals(prefix) && !"pdf".equals(prefix)) {
// throw new Exception("目前支持doc、docx、txt、pdf格式");
// }
//// QueryWrapper query = new QueryWrapper();
//// query.eq("title", reportScienceFile.getTitle());
//// if(!author.equals("")){
//// query.like("author", author);
//// }
//// List<AiReportScienceFile> list = aiReportScienceFileService.list(query);
//// if (list.size() == 0 || (
//// StringUtils.isNotEmpty(list.get(0).getContent()) && StringUtils.isNotEmpty(list.get(0).getFilePathObs())
//// )) {
//// System.out.println(i + ":标题:" + reportScienceFile.getTitle() + "-已拆分");
//// continue;
//// }
// j++;
// if (file.getOriginalFilename().endsWith("docx") || file.getOriginalFilename().endsWith("doc")) {
// //文件上传
// String text = DocUtil.convertDocStream2Html(file.getInputStream());
// reportScienceFile.setContent(text);
// ReportUtil.formatFile(reportScienceFile, prefix,file);
//// if(StringUtils.isNotEmpty(list.get(0).getContent()) && StringUtils.isEmpty(list.get(0).getFilePathObs())){
//// list.get(0).setFilePathObs(reportScienceFile.getFilePathObs());
//// aiReportScienceFileService.updateById(list.get(0));
//// System.out.println(j + ":标题:" + reportScienceFile.getTitle() + "-更新文件地址");
//// continue;
//// }
// System.out.println(j + ":标题:" + reportScienceFile.getTitle() + "-开始拆分");
// /*
// 获取目录
// */
// FileInputStream fileInputStream = (FileInputStream) file.getInputStream();
// XWPFDocument document = new XWPFDocument(fileInputStream);
// List<String> directoryList = new ArrayList<>();
// ReportUtil.setNoDirectory(document, directoryList);
//
// InputStream inputStream = null;
// java.io.ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
// document.write(byteArrayOutputStream);
// inputStream = new ByteArrayInputStream(byteArrayOutputStream.toByteArray());
// String content = DocUtil.convertDocStream2Html(inputStream);
// content = content.replaceAll("__space__one1", "<h1>");
// content = content.replaceAll("__space__one2", "</h1>");
// content = content.replaceAll("__space__two1", "<h2>");
// content = content.replaceAll("__space__two2", "</h2>");
// content = content.replaceAll("__space__three1", "<h3>");
// content = content.replaceAll("__space__three2", "</h3>");
// content = content.replaceAll("&nbsp;", "");
// if(content.contains("<body") && content.contains("</body>")){
// content = content.substring(content.indexOf("<body"), content.indexOf("</body>"));
// }
//
//
// /*
// 美化内容(去空格、段落美化)
// */
// Elements elements = ReportUtil.getDirectory(content);
//// List<String> contentList = ReportUtil.beautifyContent(content);//zgz
// List<String> contentList = ReportUtil.beautifyContent2(elements);//lxp
//
// reportScienceFile.setContent(String.join("", contentList));
// reportScienceFile.setDeleted("0");
// reportScienceFile.setPublishDate(DateUtil.getStringDate(new Date()));
// reportScienceFile.setStatus(0);
// reportScienceFile.setDataType(aiReportScienceFile.getDataType());
//
// System.out.println(j + ":标题:" + reportScienceFile.getTitle() + "-拆分完成");
// String fileId = "";
// Integer fileType = reportScienceFile.getDataType();
// String origin = "";
//// if (list.size() > 0) {
//// list.get(0).setContent(reportScienceFile.getContent());
//// list.get(0).setDeleted("0");
//// list.get(0).setFilePathObs(reportScienceFile.getFilePathObs());
//// list.get(0).setPreviewObs(reportScienceFile.getPreviewObs());
//// list.get(0).setCoverObs(reportScienceFile.getCoverObs());
//// fileId = list.get(0).getId();
//// fileType = list.get(0).getDataType();
//// origin = list.get(0).getOrigin();
//// aiReportScienceFileService.updateById(list.get(0));
//// } else {
//// aiReportScienceFileService.save(reportScienceFile);
//// fileId = reportScienceFile.getId();
//// }
// aiReportScienceFileService.save(reportScienceFile);
// fileId = reportScienceFile.getId();
//
// /*
// 获取所有段落层级数据
// */
// List<Map<String, Object>> li = ReportUtil.getList(contentList);
//
// /*
// 删除历史素材
// */
// QueryWrapper queryOld = new QueryWrapper();
// queryOld.eq("file_id", fileId);
// aiReportScienceFileMaterialService.remove(queryOld);
// // 输出提取的内容
// List<AiReportScienceFileMaterial> listMaterial = new ArrayList<>();
// for (Map<String, Object> str : li) {
// AiReportScienceFileMaterial aiReportScienceFileMaterial = new AiReportScienceFileMaterial();
// aiReportScienceFileMaterial.setId(str.get("id").toString());
// aiReportScienceFileMaterial.setParent(str.get("parent").toString());
// aiReportScienceFileMaterial.setContentType(str.get("contentType").toString());
// aiReportScienceFileMaterial.setFileId(fileId);
// aiReportScienceFileMaterial.setContent(str.get("text").toString());
// aiReportScienceFileMaterial.setLevel(String.valueOf(str.get("level")));
// aiReportScienceFileMaterial.setSort((Integer) str.get("sort"));
// aiReportScienceFileMaterial.setDeleted("0");
// listMaterial.add(aiReportScienceFileMaterial);
// }
// aiReportScienceFileMaterialService.saveBatch(listMaterial);
// QueryWrapper query1 = new QueryWrapper();
// query1.eq("file_id", fileId);
// query1.eq("deleted", "0");
// query1.ne("content_type","img");
// query1.ne("content_type","table");
// List<AiReportScienceFileMaterial> listAiReportScienceFileMaterial = aiReportScienceFileMaterialService.list(query1);
//
// /*
// 素材kafka推送
// */
// ReportUtil.sendKafka(listAiReportScienceFileMaterial, fileId, fileType,origin);
//
// byteArrayOutputStream.close();
// document.close();
// fileInputStream.close();
// inputStream.close();
// } else if (file.getOriginalFilename().endsWith("pdf")) {
//
// } else {
// System.out.println("上传文件类型错误!");
// }
// }catch (Exception e){
// System.out.println("处理异常!跳过");
// }
// }
// } catch (Exception e) {
// log.error(e.getMessage());
// }
// return true;
// }
/**
 * Imports the files uploaded under the multipart field {@code file} as material files.
 * <p>
 * For every {@code doc}/{@code docx} upload the method derives a title from the file name,
 * stores the file through {@code ReportUtil.formatFile}, converts the document to HTML,
 * saves the file record and then saves one material row per extracted paragraph.
 * {@code pdf} uploads are accepted but not processed yet; any other extension is skipped.
 * A failure while handling one file is logged and does not stop the remaining files.
 * <p>
 * The periodical-specific flow (matching an existing record by title/author and updating it
 * instead of inserting) and the Kafka push of the new paragraphs are intentionally disabled;
 * the author segment of the file name is therefore not evaluated.
 *
 * @param aiReportScienceFile carries the data type to assign to every imported file
 * @param request             the multipart upload request
 * @return always {@code true}; problems are reported through the log only
 * @throws Exception declared for interface compatibility; not thrown by this implementation
 */
@Override
public Boolean uploadAiMaterial(AiReportScienceFile aiReportScienceFile, HttpServletRequest request) throws Exception {
    try {
        MultipartHttpServletRequest multipartRequest = (MultipartHttpServletRequest) request;
        MultiValueMap<String, MultipartFile> files = multipartRequest.getMultiFileMap();
        // Use the List interface: the concrete list type is an implementation detail of Spring.
        List<MultipartFile> uploads = files.get("file");
        if (uploads == null || uploads.isEmpty()) {
            log.error("上传请求中没有文件");
            return true;
        }
        int j = 0;
        for (MultipartFile file : uploads) {
            String fileName = file.getOriginalFilename();
            try {
                if (fileName == null) {
                    log.error("上传文件缺少文件名,跳过");
                    continue;
                }
                String prefix = fileName.substring(fileName.lastIndexOf(".") + 1);
                if (!"docx".equals(prefix) && !"doc".equals(prefix) && !"pdf".equals(prefix)) {
                    log.error(fileName + ":目前支持doc、docx、pdf格式");
                    continue;
                }
                String title = deriveTitleFromFileName(fileName);
                if (title == null) {
                    log.error(fileName + ":无法解析标题,跳过");
                    continue;
                }
                AiReportScienceFile reportScienceFile = new AiReportScienceFile();
                reportScienceFile.setTitle(title);
                j++;
                if ("pdf".equals(prefix)) {
                    // PDF passes validation but there is no PDF import yet.
                    continue;
                }
                // Store the uploaded file itself.
                ReportUtil.formatFile(reportScienceFile, prefix, file);
                log.info(j + ":标题:" + reportScienceFile.getTitle() + "-开始拆分");
                String content;
                // try-with-resources closes the document and the stream even when parsing fails.
                try (InputStream fileInputStream = file.getInputStream();
                     XWPFDocument document = new XWPFDocument(fileInputStream)) {
                    content = ReportUtil.getwordHtml(document);
                }
                List<String> contentList = ReportUtil.beautifyContent2(content);
                reportScienceFile.setContent(String.join("", contentList));
                reportScienceFile.setDeleted("0");
                reportScienceFile.setPublishDate(DateUtil.getStringDate(new Date()));
                reportScienceFile.setStatus(0);
                reportScienceFile.setDataType(aiReportScienceFile.getDataType());
                log.info(j + ":标题:" + reportScienceFile.getTitle() + "-拆分完成");
                aiReportScienceFileService.save(reportScienceFile);
                String fileId = reportScienceFile.getId();
                // Paragraphs with their hierarchy information.
                List<Map<String, Object>> li = ReportUtil.getList(contentList);
                // Remove material rows previously stored for this file id.
                QueryWrapper<AiReportScienceFileMaterial> queryOld = new QueryWrapper<>();
                queryOld.eq("file_id", fileId);
                aiReportScienceFileMaterialService.remove(queryOld);
                List<AiReportScienceFileMaterial> listMaterial = new ArrayList<>();
                for (Map<String, Object> str : li) {
                    AiReportScienceFileMaterial aiReportScienceFileMaterial = new AiReportScienceFileMaterial();
                    aiReportScienceFileMaterial.setId(str.get("id").toString());
                    aiReportScienceFileMaterial.setParent(str.get("parent").toString());
                    aiReportScienceFileMaterial.setContentType(str.get("contentType").toString());
                    aiReportScienceFileMaterial.setFileId(fileId);
                    aiReportScienceFileMaterial.setContent(str.get("text").toString());
                    aiReportScienceFileMaterial.setLevel(String.valueOf(str.get("level")));
                    aiReportScienceFileMaterial.setSort((Integer) str.get("sort"));
                    aiReportScienceFileMaterial.setDeleted("0");
                    listMaterial.add(aiReportScienceFileMaterial);
                }
                aiReportScienceFileMaterialService.saveBatch(listMaterial);
                // TODO: Kafka push is disabled. When re-enabling, query the saved rows of this file
                // (deleted = "0", content_type not "img"/"table") and send them with status "0" (unaudited).
                log.info(j + ":标题:" + fileName + "-处理成功");
            } catch (Exception e) {
                // One broken file must not abort the batch, but the cause has to be visible.
                log.error("处理异常!跳过:" + fileName, e);
            }
        }
    } catch (Exception e) {
        log.error(e.getMessage(), e);
    }
    return true;
}

/**
 * Derives the material title from an uploaded file name.
 * The title is the part before the last "_", else before the last "-", else before the first ".".
 * Remaining "_" separated parts are joined with a space (with ":" after a part ending in "国资委"),
 * and the title is cut at the first "—" and at the first "...".
 *
 * @param fileName original file name, not {@code null}
 * @return the derived title, or {@code null} when the name contains none of "_", "-" and "."
 */
private static String deriveTitleFromFileName(String fileName) {
    String title;
    if (fileName.contains("_")) {
        title = fileName.substring(0, fileName.lastIndexOf("_"));
    } else if (fileName.contains("-")) {
        title = fileName.substring(0, fileName.lastIndexOf("-"));
    } else if (fileName.contains(".")) {
        title = fileName.substring(0, fileName.indexOf("."));
    } else {
        return null;
    }
    if (title.contains("_")) {
        StringBuilder joined = new StringBuilder();
        for (String part : title.split("_")) {
            if (part.length() == 0) {
                continue;
            }
            joined.append(part).append(part.endsWith("国资委") ? ":" : " ");
        }
        title = joined.toString().trim();
    }
    if (title.contains("—")) {
        title = title.substring(0, title.indexOf("—")).trim();
    }
    if (title.contains("...")) {
        title = title.substring(0, title.indexOf("..."));
    }
    return title;
}
}
package com.zzsn.knowbase.util;
import java.text.SimpleDateFormat;
import java.util.*;
/**
 * Date helpers built on {@link Date}, {@link Calendar} and {@link SimpleDateFormat}.
 * <p>
 * Every method creates its own formatter: {@link SimpleDateFormat} is not thread-safe, so no
 * formatter instance is shared between calls. Unless stated otherwise, calendar calculations
 * use the JVM default time zone.
 */
public class DateUtil {

    /**
     * Formats a date as {@code yyyy-MM-dd HH:mm:ss} in the JVM default time zone.
     *
     * @param date the date to format, not {@code null}
     * @return the formatted date
     */
    public static String dateToString(Date date) {
        return new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(date);
    }

    /**
     * Formats a date as {@code yyyy-MM-dd} in the Asia/Shanghai time zone.
     *
     * @param date the date to format, may be {@code null}
     * @return the formatted date, or an empty string for {@code null}
     */
    public static String getStringDate(Date date) {
        return DateUtil.format(date, "yyyy-MM-dd");
    }

    /**
     * Formats a date with the given pattern in the Asia/Shanghai time zone.
     *
     * @param d      the date to format, may be {@code null}
     * @param format a {@link SimpleDateFormat} pattern
     * @return the formatted date, or an empty string for {@code null}
     */
    public static String format(Date d, String format) {
        if (d == null) {
            return "";
        }
        SimpleDateFormat myFormatter = new SimpleDateFormat(format);
        myFormatter.setTimeZone(TimeZone.getTimeZone("Asia/Shanghai"));
        return myFormatter.format(d);
    }

    /**
     * Returns the start of the day (00:00:00.000) that contains the given date.
     *
     * @param date any instant of the day
     * @return the first millisecond of that day
     */
    public static Date getStartTime(Date date) {
        Calendar dateStart = Calendar.getInstance();
        dateStart.setTime(date);
        dateStart.set(Calendar.HOUR_OF_DAY, 0);
        dateStart.set(Calendar.MINUTE, 0);
        dateStart.set(Calendar.SECOND, 0);
        // Without this the milliseconds of the input would leak into the result.
        dateStart.set(Calendar.MILLISECOND, 0);
        return dateStart.getTime();
    }

    /**
     * Returns the end of the day (23:59:59.999) that contains the given date.
     *
     * @param date any instant of the day
     * @return the last millisecond of that day
     */
    public static Date getEndTime(Date date) {
        Calendar dateEnd = Calendar.getInstance();
        dateEnd.setTime(date);
        dateEnd.set(Calendar.HOUR_OF_DAY, 23);
        dateEnd.set(Calendar.MINUTE, 59);
        dateEnd.set(Calendar.SECOND, 59);
        dateEnd.set(Calendar.MILLISECOND, 999);
        return dateEnd.getTime();
    }

    /**
     * Returns the current year.
     *
     * @return the year as a string
     */
    public static String getSysYear() {
        Calendar date = Calendar.getInstance();
        return String.valueOf(date.get(Calendar.YEAR));
    }

    /**
     * Returns the day of the month of a {@code yyyy-MM-dd} date string.
     *
     * @param dateStr the date string
     * @return the day of the month
     * @throws NullPointerException when {@code dateStr} cannot be parsed
     */
    public static Integer getDay(String dateStr) {
        Calendar instance = Calendar.getInstance();
        Date date = stringToDate(dateStr, "yyyy-MM-dd");
        instance.setTime(date);
        return instance.get(Calendar.DAY_OF_MONTH);
    }

    /**
     * Returns the instant one hour before the given date.
     *
     * @param date the reference date
     * @return the date minus one hour
     */
    public static Date beforeOneHour(Date date) {
        Calendar instance = Calendar.getInstance();
        instance.setTime(date);
        instance.add(Calendar.HOUR_OF_DAY, -1);
        return instance.getTime();
    }

    /**
     * Parses a date string with the given pattern in the JVM default time zone.
     *
     * @param date      the text to parse
     * @param formatStr a {@link SimpleDateFormat} pattern
     * @return the parsed date, or {@code null} when parsing fails (the stack trace is printed)
     */
    public static Date stringToDate(String date, String formatStr) {
        Date parse = null;
        SimpleDateFormat parser = new SimpleDateFormat(formatStr);
        try {
            parse = parser.parse(date);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return parse;
    }

    /**
     * Lists every day from {@code startDate} to {@code endDate} (both inclusive) as
     * {@code yyyy-MM-dd} strings.
     *
     * @param startDate first day, {@code yyyy-MM-dd}
     * @param endDate   last day, {@code yyyy-MM-dd}
     * @return the days in ascending order; whatever was collected so far when a date
     *         cannot be parsed (the stack trace is printed)
     */
    public static List<String> betweenDate(String startDate, String endDate) {
        List<String> dateList = new ArrayList<>();
        String formatStr = "yyyy-MM-dd";
        try {
            Date dayOne = stringToDate(startDate, formatStr);
            Date dayTwo = stringToDate(endDate, formatStr);
            Calendar calendar = Calendar.getInstance();
            calendar.setTime(dayOne);
            dateList.add(startDate);
            // Format in the same (default) zone the inputs were parsed in, otherwise days can shift.
            SimpleDateFormat dayFormatter = new SimpleDateFormat(formatStr);
            while (dayTwo.after(calendar.getTime())) {
                calendar.add(Calendar.DAY_OF_MONTH, 1);
                dateList.add(dayFormatter.format(calendar.getTime()));
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return dateList;
    }

    /**
     * Returns midnight (00:00:00.000) of the day {@code number} days before today.
     *
     * @param number how many days to go back
     * @return the start of that day
     */
    public static Date getSomeDayBegin(int number) {
        Calendar calendar = Calendar.getInstance();
        calendar.add(Calendar.DAY_OF_MONTH, -number);
        calendar.set(Calendar.HOUR_OF_DAY, 0);
        calendar.set(Calendar.MINUTE, 0);
        calendar.set(Calendar.SECOND, 0);
        calendar.set(Calendar.MILLISECOND, 0);
        // Return the calendar's instant directly; re-parsing Date.toString() misreads zone
        // abbreviations such as "CST" and shifts the result.
        return calendar.getTime();
    }
}
package com.zzsn.knowbase.util;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;
import org.springframework.stereotype.Component;
/**
* 在线程的run方法中使用注解@autoware注入的bean,会报空指针异常,原因是因为线程中为了线程安全,防注入。
* 获取bean实例的工具类组件
*/
@Component
public class GetBeanUtil implements ApplicationContextAware {
//Spring应用上下文环境
private static ApplicationContext applicationContext;
/**
* 实现ApplicationContextAware接口的回调方法,设置上下文环境
*/
public void setApplicationContext(ApplicationContext context) {
GetBeanUtil.applicationContext = context;
}
/**
* 获取对象 这里重写了bean方法,起主要作用
*/
public static Object getBean(String name) {
return applicationContext.getBean(name);
}
public static ApplicationContext getApplicationContext() {
return applicationContext;
}
}
package com.zzsn.knowbase.util;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.*;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.entity.ContentType;
import org.apache.http.entity.StringEntity;
import org.apache.http.entity.mime.HttpMultipartMode;
import org.apache.http.entity.mime.MultipartEntityBuilder;
import org.apache.http.impl.client.BasicCredentialsProvider;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.ssl.SSLContextBuilder;
import org.apache.http.ssl.TrustStrategy;
import org.apache.http.util.EntityUtils;
import org.springframework.util.CollectionUtils;
import javax.net.ssl.SSLContext;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.security.KeyManagementException;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.*;
/**
 * @Description: HTTP utility class
 * @Author: zhangshuo
 * @Date: 2021-06-08
 * @Version: V1.0
 */
public class HttpUtil {
// Shared HTTP client, created once in the static initializer below.
private static final CloseableHttpClient httpClient;
// Name of the UTF-8 charset.
private static final String CHARSET = "utf-8";
// Static initializer: builds a request configuration with 10000 ms connect and socket
// timeouts, then creates the shared httpClient with that configuration as its default.
static {
RequestConfig config = RequestConfig.custom().setConnectTimeout(10000).setSocketTimeout(10000).build();
httpClient = HttpClientBuilder.create().setDefaultRequestConfig(config).build();
}
/**
 * Sends an HTTP GET through the shared client and returns the response body.
 *
 * <p>Entries of {@code params} with a {@code null} value are skipped; the remaining
 * entries are form-encoded with {@code charset} and appended to {@code url} after a
 * {@code '?'}. The body is decoded as UTF-8 unless the response declares a charset.
 *
 * @param url     target URL; a blank value yields {@code null}
 * @param params  query parameters, may be {@code null} or empty
 * @param charset charset name used to encode the query parameters
 * @return the response body, or {@code null} when the URL is blank, the status is not
 *         200, the response has no entity, or any exception occurs (the exception is
 *         printed, not rethrown)
 */
public static String doGet(String url, Map<String, String> params, String charset) {
    if (StringUtils.isBlank(url)) {
        return null;
    }
    try {
        if (params != null && !params.isEmpty()) {
            List<NameValuePair> pairs = new ArrayList<NameValuePair>(params.size());
            for (Map.Entry<String, String> entry : params.entrySet()) {
                String value = entry.getValue();
                if (value != null) {
                    pairs.add(new BasicNameValuePair(entry.getKey(), value));
                }
            }
            // Append the encoded parameters to the URL.
            url += "?" + EntityUtils.toString(new UrlEncodedFormEntity(pairs, charset));
        }
        HttpGet httpGet = new HttpGet(url);
        // try-with-resources releases the pooled connection on every path, including the
        // non-200 branch and exceptions thrown while reading the body.
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode != 200) {
                httpGet.abort();
                throw new RuntimeException("HttpClient,error status code :" + statusCode);
            }
            HttpEntity entity = response.getEntity();
            return entity == null ? null : EntityUtils.toString(entity, "utf-8");
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return null;
}
/**
 * Posts a JSON object to {@code url} and returns the response body.
 *
 * <p>A dedicated client is created for the call and closed together with the response
 * before returning, so no connection is left open. The body is decoded with the charset
 * declared by the response, falling back to UTF-8.
 *
 * @param url        target URL
 * @param jsonObject payload, serialized with {@code toJSONString()} and sent as UTF-8
 * @param ExTime     connect and socket timeout in milliseconds
 * @return the response body, or {@code null} when the response carries no entity
 * @throws IOException if the request fails or the body cannot be read
 */
public static String doPost(String url, JSONObject jsonObject, int ExTime)
        throws IOException {
    RequestConfig requestConfig = RequestConfig.custom().setSocketTimeout(ExTime).setConnectTimeout(ExTime).build();
    HttpPost httpPost = new HttpPost(url);
    httpPost.setHeader("Content-Type", "application/json;charset=UTF-8");
    httpPost.setHeader("Accept", "application/json");
    httpPost.setConfig(requestConfig);
    StringEntity se = new StringEntity(jsonObject.toJSONString(), "utf-8");
    se.setContentType("application/json");
    httpPost.setEntity(se);
    try (CloseableHttpClient client = HttpClients.createDefault();
         CloseableHttpResponse response = client.execute(httpPost)) {
        HttpEntity entity = response.getEntity();
        return entity == null ? null : EntityUtils.toString(entity, StandardCharsets.UTF_8);
    }
}
/**
 * Posts a JSON object to {@code url} with caller-supplied request headers and returns
 * the response body.
 *
 * <p>A dedicated client is created for the call and closed together with the response
 * before returning. The body is decoded with the charset declared by the response,
 * falling back to UTF-8.
 *
 * @param url        target URL
 * @param jsonObject payload, serialized with {@code toJSONString()} and sent as UTF-8
 * @param ExTime     connect and socket timeout in milliseconds
 * @param headers    request headers to set; may be {@code null} or empty
 * @return the response body, or {@code null} when the response carries no entity
 * @throws IOException if the request fails or the body cannot be read
 */
public static String doPostWithHeader(String url, JSONObject jsonObject, int ExTime ,Map<String,String> headers)
        throws IOException {
    RequestConfig requestConfig = RequestConfig.custom().setSocketTimeout(ExTime).setConnectTimeout(ExTime).build();
    HttpPost httpPost = new HttpPost(url);
    if (!CollectionUtils.isEmpty(headers)) {
        for (Map.Entry<String, String> entry : headers.entrySet()) {
            httpPost.setHeader(entry.getKey(), entry.getValue());
        }
    }
    httpPost.setConfig(requestConfig);
    StringEntity se = new StringEntity(jsonObject.toJSONString(), "utf-8");
    se.setContentType("application/json");
    httpPost.setEntity(se);
    try (CloseableHttpClient client = HttpClients.createDefault();
         CloseableHttpResponse response = client.execute(httpPost)) {
        HttpEntity entity = response.getEntity();
        return entity == null ? null : EntityUtils.toString(entity, StandardCharsets.UTF_8);
    }
}
/**
 * Posts a JSON array to {@code url} and returns the response body.
 *
 * <p>A dedicated client is created for the call and closed together with the response
 * before returning. The body is decoded with the charset declared by the response,
 * falling back to UTF-8.
 *
 * @param url    target URL
 * @param data   payload, serialized with {@code toJSONString()} and sent as UTF-8
 * @param ExTime connect and socket timeout in milliseconds
 * @return the response body, or {@code null} when the response carries no entity
 * @throws IOException if the request fails or the body cannot be read
 */
public static String doPost(String url, JSONArray data, int ExTime)
        throws IOException {
    RequestConfig requestConfig = RequestConfig.custom().setSocketTimeout(ExTime).setConnectTimeout(ExTime).build();
    HttpPost httpPost = new HttpPost(url);
    httpPost.setHeader("Content-Type", "application/json;charset=UTF-8");
    httpPost.setHeader("Accept", "application/json");
    httpPost.setConfig(requestConfig);
    StringEntity se = new StringEntity(data.toJSONString(), "utf-8");
    se.setContentType("application/json");
    httpPost.setEntity(se);
    try (CloseableHttpClient client = HttpClients.createDefault();
         CloseableHttpResponse response = client.execute(httpPost)) {
        HttpEntity entity = response.getEntity();
        return entity == null ? null : EntityUtils.toString(entity, StandardCharsets.UTF_8);
    }
}
/**
 * Sends an HTTPS GET using the certificate-ignoring client from
 * {@link #createSSLClientDefault()} and hands the open response to the caller.
 *
 * <p>Entries of {@code params} with a {@code null} value are skipped; the rest are
 * form-encoded with {@code charset} and appended to {@code url} after a {@code '?'}.
 * On success the caller owns the returned response and must close it. On any failure
 * (non-200 status or exception) the response and the client are closed here so that
 * nothing leaks.
 *
 * @param url     target URL; a blank value yields {@code null}
 * @param params  query parameters, may be {@code null} or empty
 * @param charset charset name used to encode the query parameters
 * @return the open response for a 200 status, otherwise {@code null}
 */
public static CloseableHttpResponse doGetSSL(String url, Map<String, String> params, String charset) {
    if (StringUtils.isBlank(url)) {
        return null;
    }
    CloseableHttpClient httpsClient = null;
    CloseableHttpResponse response = null;
    try {
        if (params != null && !params.isEmpty()) {
            List<NameValuePair> pairs = new ArrayList<NameValuePair>(params.size());
            for (Map.Entry<String, String> entry : params.entrySet()) {
                String value = entry.getValue();
                if (value != null) {
                    pairs.add(new BasicNameValuePair(entry.getKey(), value));
                }
            }
            url += "?" + EntityUtils.toString(new UrlEncodedFormEntity(pairs, charset));
        }
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("Content-Type", "application/x-www-form-urlencoded;charset=utf-8");
        httpGet.setHeader(HttpHeaders.CONNECTION, "close");
        // HTTPS content is fetched with a client that skips certificate trust checks.
        httpsClient = createSSLClientDefault();
        response = httpsClient.execute(httpGet);
        int statusCode = response.getStatusLine().getStatusCode();
        if (statusCode != 200) {
            httpGet.abort();
            throw new RuntimeException("HttpClient,error status code :" + statusCode);
        }
        return response;
    } catch (Exception e) {
        e.printStackTrace();
        // Nothing is handed to the caller on failure, so release what was opened.
        if (response != null) {
            try {
                response.close();
            } catch (IOException closeFailure) {
                closeFailure.printStackTrace();
            }
        }
        if (httpsClient != null) {
            try {
                httpsClient.close();
            } catch (IOException closeFailure) {
                closeFailure.printStackTrace();
            }
        }
    }
    return null;
}
/**
 * Builds a {@link CloseableHttpClient} whose SSL context accepts every certificate chain.
 *
 * <p>If the permissive SSL context cannot be built, the exception is printed and a
 * default client is returned instead.
 * NOTE(review): trusting every certificate removes protection against
 * man-in-the-middle attacks; confirm this is only used where that is acceptable.
 *
 * @return a client that skips certificate trust checks, or a default client on failure
 */
public static CloseableHttpClient createSSLClientDefault() {
    // Trust strategy that accepts every chain regardless of its content.
    TrustStrategy acceptEverything = new TrustStrategy() {
        @Override
        public boolean isTrusted(X509Certificate[] chain, String authType) throws CertificateException {
            return true;
        }
    };
    CloseableHttpClient client;
    try {
        SSLContext permissiveContext = new SSLContextBuilder().loadTrustMaterial(null, acceptEverything).build();
        SSLConnectionSocketFactory socketFactory = new SSLConnectionSocketFactory(permissiveContext);
        client = HttpClients.custom().setSSLSocketFactory(socketFactory).build();
    } catch (KeyManagementException | NoSuchAlgorithmException | KeyStoreException e) {
        e.printStackTrace();
        client = HttpClients.createDefault();
    }
    return client;
}
/**
 * Sends a GET to {@code url} through an authenticated HTTP proxy and hands the open
 * response to the caller.
 *
 * <p>The proxy host, port, user name and password are still placeholders (empty host,
 * port 0); they have to be filled in before this method can reach a real proxy.
 * On success the caller owns the returned response and must close it. On any failure
 * the response and the client are closed here.
 *
 * @param url target URL
 * @return the open response for a 200 status, otherwise {@code null} (the exception is
 *         printed, not rethrown)
 */
public static CloseableHttpResponse getProxyHttpClient(String url) {
    // TODO: obtain the proxy address and credentials.
    String proxyHost = "";
    int proxyPort = 0;
    String userName = "";
    String password = "";
    CloseableHttpClient proxyClient = null;
    CloseableHttpResponse response = null;
    try {
        // Proxy address plus 60 s connect and socket timeouts.
        HttpHost proxy = new HttpHost(proxyHost, proxyPort);
        RequestConfig config = RequestConfig.custom().setProxy(proxy).setConnectTimeout(60000).setSocketTimeout(60000).build();
        // Proxy credentials.
        CredentialsProvider provider = new BasicCredentialsProvider();
        provider.setCredentials(new AuthScope(proxy), new UsernamePasswordCredentials(userName, password));
        // Named proxyClient so it no longer shadows the class-level httpClient.
        proxyClient = HttpClients.custom().setDefaultRequestConfig(config)
                .setDefaultCredentialsProvider(provider)
                .build();
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("Content-Type", "application/x-www-form-urlencoded;charset=utf-8");
        httpGet.setHeader(HttpHeaders.CONNECTION, "close");
        response = proxyClient.execute(httpGet);
        int statusCode = response.getStatusLine().getStatusCode();
        if (statusCode != 200) {
            httpGet.abort();
            throw new RuntimeException("HttpClient,error status code :" + statusCode);
        }
        return response;
    } catch (Exception e) {
        e.printStackTrace();
        // Nothing is handed to the caller on failure, so release what was opened.
        if (response != null) {
            try {
                response.close();
            } catch (IOException closeFailure) {
                closeFailure.printStackTrace();
            }
        }
        if (proxyClient != null) {
            try {
                proxyClient.close();
            } catch (IOException closeFailure) {
                closeFailure.printStackTrace();
            }
        }
    }
    return null;
}
/**
 * Posts a raw string body to {@code url} with the given headers and returns the
 * response body.
 *
 * <p>The response is closed before the client (try-with-resources), so a failing
 * client close can no longer skip the response close. The body is decoded with the
 * charset declared by the response, falling back to UTF-8.
 *
 * @param url    target URL
 * @param data   request body, sent as UTF-8
 * @param header request headers to set; may be {@code null}
 * @return the response body, or {@code null} when the response has no entity or any
 *         exception occurs (the exception is printed, not rethrown)
 */
public static String sendPost1(String url, String data, Map<String,String> header) {
    String response = null;
    try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
        HttpPost method = new HttpPost(url);
        StringEntity stringentity = new StringEntity(data, StandardCharsets.UTF_8);
        // NOTE(review): this sets the Content-Encoding header to "UTF-8"; kept as in the
        // original, but confirm the receiving service really expects it.
        stringentity.setContentEncoding("UTF-8");
        if (header != null) {
            for (Map.Entry<String, String> item : header.entrySet()) {
                method.setHeader(item.getKey(), item.getValue());
            }
        }
        method.setEntity(stringentity);
        try (CloseableHttpResponse httpresponse = httpclient.execute(method)) {
            HttpEntity entity = httpresponse.getEntity();
            if (entity != null) {
                response = EntityUtils.toString(entity, StandardCharsets.UTF_8);
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return response;
}
/**
 * Posts {@code param} as an {@code application/x-www-form-urlencoded} body using
 * {@link HttpURLConnection} and returns the response text.
 *
 * <p>The body is produced by {@code createLinkString(param)} and written as UTF-8.
 * NOTE(review): {@code createLinkString} does not URL-encode keys or values, so
 * parameters containing {@code '&'}, {@code '='} or non-ASCII text reach the server
 * unescaped; confirm whether the receiving side expects that.
 *
 * @param httpUrl target URL
 * @param param   form parameters (key/value pairs)
 * @param expire  connect and read timeout in milliseconds
 * @return the response body with each line terminated by {@code "\r\n"}, or
 *         {@code null} when the status is not 200 or an I/O error occurs (the error is
 *         printed, not rethrown)
 */
public static String doPostForm(String httpUrl, Map param,Integer expire) {
    HttpURLConnection connection = null;
    String result = null;
    try {
        URL url = new URL(httpUrl);
        connection = (HttpURLConnection) url.openConnection();
        connection.setRequestMethod("POST");
        connection.setConnectTimeout(expire);
        connection.setReadTimeout(expire);
        // Output must be enabled to send a request body; input is enabled by default.
        connection.setDoOutput(true);
        connection.setDoInput(true);
        connection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
        try (OutputStream os = connection.getOutputStream()) {
            // Explicit UTF-8 so the bytes do not depend on the platform default charset.
            os.write(createLinkString(param).getBytes(StandardCharsets.UTF_8));
        }
        if (connection.getResponseCode() == 200) {
            try (BufferedReader br = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
                StringBuilder sbf = new StringBuilder();
                String line;
                while ((line = br.readLine()) != null) {
                    sbf.append(line);
                    sbf.append("\r\n");
                }
                result = sbf.toString();
            }
        }
    } catch (MalformedURLException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // connection is still null when the URL is malformed or the connection cannot be opened.
        if (connection != null) {
            connection.disconnect();
        }
    }
    return result;
}
/**
 * Joins the entries of {@code params} into {@code key=value} pairs separated by
 * {@code '&'}, ordered by key in natural String order.
 *
 * <p>Keys and values are written as-is; no URL encoding is applied. A {@code null}
 * value is rendered as the text {@code "null"}.
 *
 * @param params the parameters to join
 * @return the joined string, empty when {@code params} has no entries
 */
public static String createLinkString(Map<String, String> params) {
    List<String> sortedNames = new ArrayList<String>(params.keySet());
    Collections.sort(sortedNames);
    StringBuilder joined = new StringBuilder();
    String separator = "";
    for (String name : sortedNames) {
        // The separator is empty for the first pair, so no leading or trailing '&' is produced.
        joined.append(separator).append(name).append("=").append(params.get(name));
        separator = "&";
    }
    return joined.toString();
}
/**
 * Uploads a byte array as a multipart file; see
 * {@link #post(String, InputStream, String)}.
 *
 * @param url      target URL
 * @param bytes    file content
 * @param filename file name reported in the multipart part
 * @throws IOException declared for compatibility; failures are printed, not rethrown
 */
public static void postByte(String url, byte[] bytes,String filename) throws IOException {
    post(url, new ByteArrayInputStream(bytes), filename);
}
/**
 * Uploads a stream as the {@code "file"} part of a multipart/form-data POST.
 *
 * <p>The response body is drained and discarded. The client and the response are
 * closed before returning. Any failure is printed and swallowed, as before.
 *
 * @param url         target URL
 * @param inputStream file content
 * @param filename    file name reported in the multipart part; the original author
 *                    notes the receiving side does not see the part without a name
 * @throws IOException declared for compatibility; failures are printed, not rethrown
 */
public static void post(String url,InputStream inputStream ,String filename) throws IOException {
    try (CloseableHttpClient client = HttpClients.createDefault()) {
        HttpPost httpPost = new HttpPost(url);
        MultipartEntityBuilder builder = MultipartEntityBuilder.create();
        builder.setCharset(StandardCharsets.UTF_8);
        builder.setMode(HttpMultipartMode.BROWSER_COMPATIBLE);
        builder.setContentType(ContentType.MULTIPART_FORM_DATA);
        // Further form-data parts could be added on the same builder.
        builder.addBinaryBody("file", inputStream, ContentType.MULTIPART_FORM_DATA, filename);
        httpPost.setEntity(builder.build());
        try (CloseableHttpResponse response = client.execute(httpPost)) {
            // The result was never used, so the body is drained instead of being parsed as JSON.
            EntityUtils.consume(response.getEntity());
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
/**
 * Posts the non-null entries of {@code params} as a JSON object and returns the
 * response body.
 *
 * <p>When {@code params} is {@code null} the request is sent without a body instead of
 * failing with a {@link NullPointerException}. The client and the response are closed
 * before returning.
 *
 * @param url     target URL
 * @param params  values to send; entries with a {@code null} value are left out
 * @param charset charset name used to decode the response when it declares none
 * @param ExTime  connect and socket timeout in milliseconds
 * @return the response body, or an empty string when the response has no entity or an
 *         I/O error occurs (the error is printed, not rethrown)
 */
public static String sendPost(String url, Map<String, Object> params,String charset,int ExTime) {
    String content = "";
    RequestConfig requestConfig = RequestConfig.custom().setSocketTimeout(ExTime).setConnectTimeout(ExTime).build();
    HttpPost httpPost = new HttpPost(url);
    httpPost.setConfig(requestConfig);
    if (null != params) {
        JSONObject jsonObject = new JSONObject();
        for (Map.Entry<String, Object> mapEntry : params.entrySet()) {
            if (mapEntry.getValue() != null) {
                jsonObject.put(mapEntry.getKey(), mapEntry.getValue());
            }
        }
        // Explicit UTF-8 so non-ASCII text survives the round trip.
        httpPost.addHeader("Content-type","application/json; charset=utf-8");
        httpPost.setHeader("Accept", "application/json");
        httpPost.setEntity(new StringEntity(jsonObject.toString(),"UTF-8"));
    }
    System.out.println("execurting request:" + httpPost.getURI());
    try (CloseableHttpClient client = HttpClients.createDefault();
         CloseableHttpResponse httpResponse = client.execute(httpPost)) {
        HttpEntity httpEntity = httpResponse.getEntity();
        if (httpEntity != null) {
            content = EntityUtils.toString(httpEntity, charset);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return content;
}
}
package com.zzsn.knowbase.util;
import java.security.MessageDigest;
/**
 * Computes MD5 digests of strings as lowercase hexadecimal text.
 *
 * <p>MD5 is not collision resistant; use this for checksums and legacy
 * compatibility only, not for passwords or signatures.
 */
public class MD5Util {

    /** Lowercase hexadecimal digits indexed by nibble value. */
    private static final char[] HEX_DIGITS = "0123456789abcdef".toCharArray();

    /** Renders every byte of {@code b} as two lowercase hex digits. */
    private static String byteArrayToHexString(byte[] b) {
        StringBuilder resultSb = new StringBuilder(b.length * 2);
        for (byte value : b) {
            resultSb.append(HEX_DIGITS[(value >> 4) & 0x0f]).append(HEX_DIGITS[value & 0x0f]);
        }
        return resultSb.toString();
    }

    /**
     * Returns the MD5 digest of {@code origin} as 32 lowercase hex characters.
     *
     * @param origin      text to hash; {@code null} yields {@code null}
     * @param charsetName charset used to turn {@code origin} into bytes; when
     *                    {@code null} or empty the platform default charset is used
     * @return the hex digest, or {@code null} when {@code origin} is {@code null}
     * @throws IllegalArgumentException if {@code charsetName} is not a supported
     *         charset; earlier versions silently returned the unhashed input instead
     * @throws IllegalStateException if the runtime provides no MD5 implementation
     */
    public static String MD5Encode(String origin, String charsetName) {
        if (origin == null) {
            return null;
        }
        try {
            MessageDigest md = MessageDigest.getInstance("MD5");
            byte[] input = (charsetName == null || "".equals(charsetName))
                    ? origin.getBytes()
                    : origin.getBytes(charsetName);
            return byteArrayToHexString(md.digest(input));
        } catch (java.io.UnsupportedEncodingException e) {
            // Never hand the plain input back as if it were a digest.
            throw new IllegalArgumentException("Unsupported charset: " + charsetName, e);
        } catch (java.security.NoSuchAlgorithmException e) {
            throw new IllegalStateException("MD5 digest is not available", e);
        }
    }
}
package com.zzsn.knowbase.util;
import com.github.tobato.fastdfs.proto.storage.DownloadByteArray;
import com.obs.services.ObsClient;
import com.obs.services.model.*;
//import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
/**
* Description: obs桶文件操作
* Author: EDY
* Date: 2023/10/9
*/
@Component
public class ObsUtil {
/** OBS SDK client injected by Spring and used for the bucket operations of this class. */
@Autowired
ObsClient obsClient;
/** Bucket name used by every method that does not take an explicit bucket argument. */
private String bucketName = "zzsn";
/**
 * Checks whether the given bucket exists by issuing a head-bucket request.
 *
 * @param bucket bucket name to probe
 * @return the result reported by the OBS client
 */
public Boolean existsBucket(String bucket){
    boolean present = obsClient.headBucket(bucket);
    return present;
}
/**
 * Checks whether the bucket configured in this class exists.
 *
 * @return the result reported by the OBS client
 */
public Boolean existsBucket(){
    boolean present = obsClient.headBucket(bucketName);
    return present;
}
/**
 * Creates a "folder", which in OBS is a zero-length object whose key ends with "/".
 * Only the deepest level has to be created: for src1/src2/src3/ it is enough to create
 * src1/src2/src3/ without creating src1/ and src1/src2/ first.
 *
 * @param keySuffixWithSlash folder key, ending with "/"
 * @return {@code true} when the put request answers with status 200
 */
public boolean mkdir(String keySuffixWithSlash){
    ByteArrayInputStream emptyBody = new ByteArrayInputStream(new byte[0]);
    PutObjectResult outcome = obsClient.putObject(bucketName, keySuffixWithSlash, emptyBody);
    return outcome.getStatusCode() == 200;
}
/**
 * Lists every object stored under a folder prefix in the configured bucket.
 *
 * <p>The listing is read page by page, following the marker until the service reports
 * that the result is no longer truncated, in the same way {@code getCount} does. A
 * single listing request returns only one page, so larger folders were cut off before.
 *
 * @param folderPrefix folder key, ending with "/"
 * @return all objects whose key starts with {@code folderPrefix}
 */
public List<ObsObject> getPathFileList(String folderPrefix){
    List<ObsObject> res = new ArrayList<>();
    ListObjectsRequest request = new ListObjectsRequest(bucketName);
    request.setPrefix(folderPrefix);
    ObjectListing result;
    do {
        result = obsClient.listObjects(request);
        res.addAll(result.getObjects());
        // Continue after the last key of this page.
        request.setMarker(result.getNextMarker());
    } while (result.isTruncated());
    return res;
}
/**
 * Counts the objects stored under a folder prefix in the configured bucket.
 *
 * <p>Pages of at most 1000 keys are requested and summed until the service reports
 * that the listing is no longer truncated.
 *
 * @param folderPrefix folder key prefix
 * @return number of objects found under the prefix
 */
public Integer getCount (String folderPrefix){
    ListObjectsRequest pageRequest = new ListObjectsRequest(bucketName);
    pageRequest.setPrefix(folderPrefix);
    pageRequest.setMaxKeys(1000);
    int total = 0;
    boolean morePages = true;
    while (morePages) {
        ObjectListing page = obsClient.listObjects(pageRequest);
        List<S3Object> pageEntries = page.getObjectSummaries();
        total += pageEntries.size();
        // Resume the next request after the last key of this page.
        pageRequest.setMarker(page.getNextMarker());
        morePages = page.isTruncated();
    }
    return total;
}
/**
 * Deletes an object from the configured bucket.
 *
 * <p>Any 2xx status counts as success. The OBS delete-object operation answers with
 * 204 No Content on success, so comparing against 200 reported successful deletions
 * as failures.
 *
 * @param objectKey object key, either inside a folder or directly in the bucket
 * @return {@code true} when the service answers with a 2xx status
 */
public boolean delFile (String objectKey){
    DeleteObjectResult deleteObjectResult = obsClient.deleteObject(bucketName, objectKey);
    int status = deleteObjectResult.getStatusCode();
    return status >= 200 && status < 300;
}
/**
 * Uploads a byte array to the configured bucket.
 *
 * @param objectKey destination object key
 * @param bytes     content to store
 * @return the result returned by the OBS client
 */
public PutObjectResult uploadFile(String objectKey,byte[] bytes){
    ByteArrayInputStream payload = new ByteArrayInputStream(bytes);
    return obsClient.putObject(bucketName, objectKey, payload);
}
/**
 * Uploads the content of a stream to the configured bucket.
 *
 * @param objectKey   destination object key
 * @param inputStream content to store
 * @return the result returned by the OBS client
 */
public PutObjectResult uploadFile(String objectKey ,InputStream inputStream){
    return obsClient.putObject(bucketName, objectKey, inputStream);
}
/**
 * Opens the content stream of an object in the configured bucket.
 *
 * @param objectKey object key
 * @return the object's content stream as returned by the OBS client
 */
public InputStream getObjectStream(String objectKey){
    return obsClient.getObject(bucketName, objectKey).getObjectContent();
}
/**
 * Reads an object of the configured bucket fully into memory.
 *
 * <p>The content stream is closed once it has been read, including when reading fails.
 *
 * @param objectKey object key
 * @return the complete object content
 * @throws RuntimeException wrapping the {@link IOException} if reading or closing fails
 */
public byte[] getObjectByte(String objectKey){
    ObsObject obsObject = obsClient.getObject(bucketName, objectKey);
    try (InputStream objectContent = obsObject.getObjectContent();
         ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream()) {
        byte[] buffer = new byte[4096];
        int bytesRead;
        while ((bytesRead = objectContent.read(buffer)) != -1) {
            byteArrayOutputStream.write(buffer, 0, bytesRead);
        }
        return byteArrayOutputStream.toByteArray();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
/**
 * Streams an object of the configured bucket to the HTTP response for preview.
 *
 * <p>The object key is taken from the {@code attachmentPath} request parameter and the
 * content type is derived from the last path segment. When the servlet context does not
 * know the file type, {@code application/octet-stream} is used instead of the literal
 * text "null".
 * NOTE(review): the request parameter is used as the object key without any check, so
 * a caller can read any object in the bucket; confirm that this is intended.
 *
 * @param request  request carrying the {@code attachmentPath} parameter
 * @param response response the content is written to
 * @return {@code false} when the path is blank or the object is empty, otherwise
 *         {@code true} (also when writing fails; the failure is printed)
 * @throws IOException if the response output stream cannot be obtained
 */
public boolean previewImg(HttpServletRequest request, HttpServletResponse response) throws IOException {
    String filePath = request.getParameter("attachmentPath");
    if (StringUtils.isBlank(filePath)) {
        return false;
    }
    byte[] content = getObjectByte(filePath);
    if (content == null || content.length == 0) {
        return false;
    }
    response.addHeader("Pragma", "No-cache");
    response.addHeader("Cache-Control", "no-store,No-cache");
    response.setCharacterEncoding("UTF-8");
    // The last path segment is the file name used for the MIME lookup.
    String[] segments = filePath.split("/");
    String s = segments.length == 0 ? filePath : segments[segments.length - 1];
    String mimeType = request.getServletContext().getMimeType(s);
    System.out.println("文件类型为" + mimeType);
    if (mimeType == null) {
        mimeType = "application/octet-stream";
    }
    response.setContentType(mimeType + ";charset=utf-8");
    OutputStream out = response.getOutputStream();
    // Closing the buffered stream also closes the underlying response stream.
    try (BufferedOutputStream bos = new BufferedOutputStream(out)) {
        bos.write(content, 0, content.length);
        bos.flush();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return true;
}
}
package com.zzsn.knowbase.util;
import com.alibaba.fastjson.JSONObject;
import com.baomidou.mybatisplus.core.toolkit.IdWorker;
import com.obs.services.model.PutObjectResult;
import com.zzsn.knowbase.constant.Constants;
import com.zzsn.knowbase.constant.DirEnum;
import com.zzsn.knowbase.entity.AiReportScienceFile;
import com.zzsn.knowbase.entity.AiReportScienceFileMaterial;
import com.zzsn.knowbase.vo.DocEntity;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.apache.poi.xwpf.usermodel.*;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Entities;
import org.jsoup.select.Elements;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
import org.springframework.beans.BeansException;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.multipart.MultipartFile;
import javax.annotation.Resource;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* @Description: 报告素材工具类
* @Version: V1.0
*/
@Slf4j
public class ReportUtil {
// NOTE(review): this field is static and the class shows no Spring stereotype annotation
// in the visible part of the file; Spring does not inject @Autowired static fields, so
// obsUtil is presumably null unless it is assigned elsewhere - confirm before relying on it.
@Autowired
private static ObsUtil obsUtil;
// @Resource
// private static StreamBridge streamBridge;
/**
 * Turns a flat list of HTML fragments into records that carry a generated id, the
 * fragment text, its heading level, the id of its parent heading, its content type and
 * a 1-based sort index.
 *
 * <p>Headings ({@code <h1>} .. {@code <h9>}) form the hierarchy: every fragment is
 * attached to the nearest preceding heading with a smaller level, or to "0" when there
 * is none. Blank or {@code null} fragments are kept in the result as plain content
 * (level 100, empty content type) instead of failing with a
 * {@link NullPointerException}.
 *
 * @param originList fragments in document order
 * @return one map per fragment with the keys id, text, level, parent, contentType, sort
 */
public static List<Map<String, Object>> getList(List<String> originList) {
    Stack<Map<String,Object>> stack = new Stack<>();
    // Sentinel entry so the stack is not empty when the first fragment is processed.
    Map<String, Object> map1 = new HashMap<>();
    map1.put("id", null);
    map1.put("text", null);
    map1.put("level", 1000);
    stack.push(map1);
    List<Map<String,Object>> list = new ArrayList<>();
    int sort = 1;
    for (String text : originList) {
        Map<String, Object> map = new HashMap<>();
        map.put("id", IdWorker.getIdStr());
        map.put("text", text);
        // getLevel returns null for blank text; treat that as plain content.
        Integer level = getLevel(text);
        map.put("level", level == null ? Integer.valueOf(100) : level);
        getParent(map, stack);
        // An empty stack always accepts the entry; otherwise canPush inspects the text,
        // which must not be null at that point.
        if (stack.isEmpty() || (text != null && canPush(map, stack))) {
            stack.push(map);
        }
        map.put("contentType", text == null ? "" : getContentType(text));
        map.put("sort", sort++);
        list.add(map);
    }
    stack.clear();
    return list;
}
/**
 * Derives the content type from the tag a fragment starts with.
 *
 * @param text HTML fragment
 * @return "p", "h1" .. "h9", "img" or "table" when the fragment starts with that tag
 *         prefix, otherwise an empty string
 */
private static String getContentType(String text){
    // Checked in this order; the first tag whose "<tag" prefix matches wins.
    final String[] recognisedTags = {
            "p", "h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "h9", "img", "table"
    };
    for (String tag : recognisedTags) {
        if (text.startsWith("<" + tag)) {
            return tag;
        }
    }
    return "";
}
/**
 * Maps a content type to its Chinese display label.
 *
 * @param text content type such as "p", "h1", "img" or "table"
 * @return "标题" for any value containing "h", "图片" for "img", "表格" for "table",
 *         and "内容" for "p" and everything else
 */
private static String getTextType(String text){
    // None of "p", "img" and "table" contains an 'h', so heading detection can run first.
    if (text.contains("h")) {
        return "标题";
    }
    switch (text) {
        case "img":
            return "图片";
        case "table":
            return "表格";
        default:
            return "内容";
    }
}
/**
 * Maps a numeric file type code to its Chinese display label.
 *
 * @param type file type code
 * @return "博士论文" for 1, "硕士论文" for 2, "图书" for 3, "研报" for 4, and "期刊" for
 *         0 and every other value
 */
private static String getFileType(Integer type){
    switch (type) {
        case 1:
            return "博士论文";
        case 2:
            return "硕士论文";
        case 3:
            return "图书";
        case 4:
            return "研报";
        default:
            // Covers 0 as well as unknown codes.
            return "期刊";
    }
}
/**
 * Decides whether an entry may be pushed onto the heading stack.
 *
 * @param source entry holding at least "text" and "level"
 * @param stack  current heading stack
 * @return {@code true} when the stack is empty, or when the entry's text starts with
 *         "&lt;h" and its level is greater than the level of the top stack entry
 */
private static boolean canPush(Map<String, Object> source, Stack<Map<String, Object>> stack){
    if (!stack.isEmpty()) {
        String html = (String) source.get("text");
        if (html.startsWith("<h")) {
            int topLevel = (int) stack.peek().get("level");
            int candidateLevel = (int) source.get("level");
            return candidateLevel > topLevel;
        }
        // Non-heading content never goes onto a non-empty stack.
        return false;
    }
    return true;
}
/**
 * Sets the "parent" key of {@code source} to the id of the nearest stacked entry whose
 * level is smaller than the level of {@code source}.
 *
 * <p>Stack entries whose level is greater than or equal to the source level are popped
 * on the way. When no suitable entry remains, or the stack is empty to begin with, the
 * parent is "0". The empty-stack case used to fall through to {@code stack.peek()} and
 * fail with an {@link EmptyStackException}; it now returns right away.
 *
 * @param source entry holding at least "level"; receives "parent"
 * @param stack  heading stack; entries hold "id" and "level"
 */
private static void getParent(Map<String, Object> source, Stack<Map<String, Object>> stack){
    if (stack.isEmpty()) {
        source.put("parent", "0");
        return;
    }
    int sourceLevel = (int) source.get("level");
    // Drop every stacked entry that cannot be an ancestor of the source.
    while (!stack.isEmpty() && sourceLevel <= (int) stack.peek().get("level")) {
        stack.pop();
    }
    if (stack.isEmpty()) {
        source.put("parent", "0");
    } else {
        source.put("parent", (String) stack.peek().get("id"));
    }
}
/**
 * Determines the hierarchy level of an HTML fragment from its leading heading tag.
 *
 * <p>A fragment starting with {@code <h1} .. {@code <h9} (with or without attributes)
 * gets the level 1 .. 9. {@code <h6} followed by attributes used to be reported as
 * level 100 because its switch case was missing; it is now level 6 like the other
 * headings.
 *
 * @param text HTML fragment
 * @return {@code null} for blank text, the heading digit for heading fragments, and
 *         100 for everything else
 */
private static Integer getLevel(String text) {
    if (StringUtils.isBlank(text)) {
        return null;
    }
    if (text.length() >= 3 && text.startsWith("<h")) {
        char digit = text.charAt(2);
        if (digit >= '1' && digit <= '9') {
            return digit - '0';
        }
    }
    // Not a heading: plain content sits below every heading level.
    return 100;
}
/**
 * Parses an HTML string and returns its content elements, retagging paragraphs that
 * look like numbered section titles as heading elements.
 *
 * <p>Elements whose markup contains an img or table tag are returned unchanged.
 * Elements with empty text are dropped. For the remaining elements a heading level is
 * assigned per numbering style: the first time a numbering key (see getTilteNum) is
 * seen it receives the next free level, and later titles with the same key reuse it.
 * NOTE(review): a title candidate for which getTilteNum returns a blank key is skipped
 * by the continue statement and therefore missing from the result - confirm that
 * dropping it is intended.
 *
 * @param text HTML source
 * @return the selected elements in document order, with detected titles retagged
 */
public static Elements getDirectory(String text) {
// Parse the HTML string.
Document doc = Jsoup.parse(text);
// Select paragraphs without images, images, tables and h1-h4 headings.
Elements elements = doc.select("p:not(:has(img)),p > img,img,table,h1,h2,h3,h4");
Elements elements1 = new Elements();
// hh and startflag are never read by active code in this method.
String hh = "";
int startflag=0;
// Highest heading level handed out so far.
int maxlever = 0;
// Numbering key -> heading level assigned to that numbering style.
Map<String, String> titleLev = new HashMap<String, String>();
for (Element element : elements) {
// Images and tables are passed through untouched.
if(element.toString().contains("<img") || element.toString().contains("<table")){
elements1.add(element);
}else{
if(!element.text().equals("")){
// parentText may be wrapped in heading markup below; parentText2 keeps the plain text.
String parentText = element.text().replaceAll("<p>","").replaceAll("</p>","");
String parentText2 = element.text().replaceAll("<p>","").replaceAll("</p>","");
/* if (parentText.contains("关键词") && startflag == 0) {
startflag = 1;
}*/
// calculateEnglishRatio is defined outside this view; presumably it flags mostly-English text - TODO confirm.
if(!calculateEnglishRatio(parentText)){
// Title candidate: starts with a Chinese numeral, a parenthesis or a digit,
// is shorter than 50 characters and is not the "中图分类号" line.
if((parentText.matches("[一二三四五六七八九十]+.*") ||
parentText.startsWith("(") ||
parentText.startsWith("(") ||
parentText.matches("\\d+.*") ) &&
parentText.length()<50 &&
!parentText.startsWith("中图分类号")
){
String tKey = getTilteNum(parentText);
if (null==tKey || tKey.trim().length()==0) {
continue;
}
// Look up the level already assigned to this numbering style.
String lever = titleLev.get(tKey);
if (null!=lever ) {
parentText = "<h"+lever+">"+parentText+"</h"+lever+">";
} else if (titleLev.size()==0) {
// First numbering style seen: it becomes level 1.
maxlever++;
titleLev.put(tKey,String.valueOf(maxlever));
parentText = "<h1>"+parentText+"</h1>";
} else {
// New numbering style: it gets the next deeper level.
maxlever++;
titleLev.put(tKey,String.valueOf(maxlever));
parentText = "<h"+maxlever+">"+parentText+"</h"+maxlever+">";
}
parentText = parentText.replaceAll(" ","").replaceAll("&nbsp;","");
}
if(parentText.startsWith("<h")){
// Characters 1-2 of "<hN>" give the tag name "hN"; only the first digit is kept
// should the level ever reach 10 or more.
String tag = parentText.substring(1,3);
element.tagName(tag);
element.html(parentText2);
}
}
elements1.add(element);
}
}
}
return elements1;
}
/**
 * Derives a numbering-style key from a title so that titles numbered in the same style
 * share one key.
 *
 * <ul>
 *   <li>Digits only: empty key.</li>
 *   <li>Leading digits: the leading run of digits and dots with every digit replaced
 *       by "1" (for example "1.2 x" gives "1.1"); empty when the first number found is
 *       greater than 20.</li>
 *   <li>Leading Chinese numeral: "一" when the title contains "、", a space or ".",
 *       otherwise empty.</li>
 *   <li>Leading "(": "(1)" for a number (empty when greater than 20), "(一)" for a
 *       Chinese numeral.</li>
 *   <li>Anything else: empty key.</li>
 * </ul>
 *
 * @param title title text
 * @return the numbering-style key, or an empty string when none applies
 */
public static String getTilteNum(String title) {
    if (Pattern.matches("\\d+", title)) {
        return "";
    }
    Pattern numberThenOther = Pattern.compile("\\d+\\D");
    if (title.matches("\\d+.*")) {
        if (firstNumberExceedsTwenty(numberThenOther.matcher(title))) {
            return "";
        }
        // Collect the leading run of digits and dots.
        StringBuilder numbering = new StringBuilder();
        for (int idx = 0; idx < title.length(); idx++) {
            char ch = title.charAt(idx);
            if (!Character.isDigit(ch) && ch != '.') {
                break;
            }
            numbering.append(ch);
        }
        return numbering.toString().replaceAll("\\d", "1");
    }
    if (title.matches("[一二三四五六七八九十]+.*")) {
        boolean hasSeparator = title.contains("、") || title.contains(" ") || title.contains(".");
        return hasSeparator ? "一" : "";
    }
    if (title.startsWith("(") || title.startsWith("(")) {
        String afterParen = title.substring(1);
        if (afterParen.matches("\\d+.*")) {
            return firstNumberExceedsTwenty(numberThenOther.matcher(afterParen)) ? "" : "(1)";
        }
        if (afterParen.matches("[一二三四五六七八九十]+.*")) {
            return "(一)";
        }
    }
    return "";
}

/**
 * Reports whether the first "digits followed by a non-digit" match of the matcher
 * denotes a number greater than 20. Only the first five digits are looked at.
 */
private static boolean firstNumberExceedsTwenty(Matcher matcher) {
    if (!matcher.find()) {
        return false;
    }
    String found = matcher.group();
    String digits = found.substring(0, found.length() - 1);
    if (digits.length() > 5) {
        digits = digits.substring(0, 5);
    }
    return Integer.valueOf(digits) > 20;
}
/**
 * Marks the title paragraphs of a Word document with level markers and collects their
 * names.
 *
 * <p>For every paragraph that {@code getTitleLvl} reports as a title and for which
 * {@code getDocEntity} yields a named entity, the paragraph's runs are replaced by one
 * run holding the text wrapped in {@code __space__one1/2}, {@code __space__two1/2} or
 * {@code __space__three1/2}, and the entity name is appended to {@code directoryList}.
 * Three-part numbers such as "1.2.3" are tested before two-part numbers; in the
 * earlier order the two-part pattern matched them first and the three-level branch
 * could never run.
 *
 * @param doc           document whose paragraphs are rewritten in place
 * @param directoryList receives the names of the detected titles in document order
 */
public static void setNoDirectory(XWPFDocument doc, List<String> directoryList) {
    // Chinese numeral in parentheses, e.g. "(一)".
    Pattern parenthesisedNumeral = Pattern.compile("\\([一二三四五六七八九十]+\\)");
    for (XWPFParagraph paragraph : doc.getParagraphs()) {
        String text = paragraph.getText().trim().replaceAll(" ","");
        String titleLvl = getTitleLvl(doc, paragraph);
        if (StringUtils.isEmpty(titleLvl)) {
            continue;
        }
        DocEntity docEntity = getDocEntity(paragraph, text, "0");
        if (docEntity == null || StringUtils.isEmpty(docEntity.getName())) {
            continue;
        }
        String parentText = paragraph.getText().trim().replaceAll(" ","");
        String marker;
        if (parentText.matches("[一二三四五六七八九十]+、.*")
                || Constants.SCIENCE_FILE.contains(parentText)
                || parentText.matches("^\\d+\\.[^\\d].*")
                || parentText.matches("^\\d[\\u4e00-\\u9fa5]+$")) {
            marker = "one";
        } else if (parentText.matches("^\\d+\\.\\d+\\.\\d+.*$")) {
            marker = "three";
        } else if (parentText.matches("\\d+\\.\\d+.*")
                || parenthesisedNumeral.matcher(parentText).find()) {
            marker = "two";
        } else {
            // Not a recognised numbering style: leave the paragraph untouched.
            continue;
        }
        // Replace all runs of the paragraph by a single run with the marked-up text.
        paragraph.getCTP().setRArray(new CTR[]{});
        paragraph.createRun().setText("__space__" + marker + "1" + parentText + "__space__" + marker + "2");
        directoryList.add(docEntity.name);
    }
}
/**
 * Converts the text paragraphs of a Word document to HTML, emitting detected titles as
 * {@code <h1>}, {@code <h2>} or {@code <h3>} and everything else through
 * {@code getHtml}.
 *
 * <p>A paragraph is treated as a title when {@code getTitleLvl} reports a level,
 * {@code getDocEntity} yields a named entity, the text has at most 50 characters and
 * its numbering matches one of the known styles. The runs of a title paragraph are
 * removed from the document as a side effect. Three-part numbers such as "1.2.3" are
 * tested before two-part numbers; in the earlier order the two-part pattern matched
 * them first and {@code <h3>} could never be produced.
 *
 * @param doc document to convert; its title paragraphs are emptied
 * @return the generated HTML
 */
public static String getwordHtmlOnlyText(XWPFDocument doc) {
    // Chinese numeral in parentheses, e.g. "(一)".
    Pattern parenthesisedNumeral = Pattern.compile("\\([一二三四五六七八九十]+\\)");
    String resulthtml = "";
    for (XWPFParagraph paragraph : doc.getParagraphs()) {
        String text = paragraph.getText().trim();
        if (StringUtils.isEmpty(text)) {
            continue;
        }
        String headingTag = null;
        String titleLvl = getTitleLvl(doc, paragraph);
        if (StringUtils.isNotEmpty(titleLvl)) {
            DocEntity docEntity = getDocEntity(paragraph, text, "0");
            boolean titleCandidate = docEntity != null
                    && !StringUtils.isEmpty(docEntity.getName())
                    && text.length() <= 50;
            if (titleCandidate) {
                if (text.matches("[一二三四五六七八九十]+、.*")
                        || Constants.SCIENCE_FILE.contains(text)
                        || text.matches("^\\d+\\.[^\\d].*")
                        || text.matches("^\\d[\\u4e00-\\u9fa5]+$")) {
                    headingTag = "h1";
                } else if (text.matches("^\\d+\\.\\d+\\.\\d+.*$")) {
                    headingTag = "h3";
                } else if (text.matches("\\d+\\.\\d+.*")
                        || parenthesisedNumeral.matcher(text).find()) {
                    headingTag = "h2";
                }
            }
        }
        if (headingTag == null) {
            resulthtml = getHtml(resulthtml, text);
        } else {
            // Title paragraphs lose their runs in the source document.
            paragraph.getCTP().setRArray(new CTR[]{});
            resulthtml += "<" + headingTag + ">" + text + "</" + headingTag + ">\n";
        }
    }
    return resulthtml;
}
/**
 * Converts the body of a Word document into a simple HTML string: embedded pictures become
 * base64 {@code <img>} tags, paragraphs become {@code <h1>}/{@code <h2>}/{@code <h3>} or
 * {@code <p>} elements, and tables are rendered through {@code convertTableToHtml}.
 *
 * <p>Pictures that cannot be decoded by {@link ImageIO}, or that are narrower or lower than
 * 80 pixels, are skipped.
 *
 * <p>Side effect: every paragraph emitted as a heading has its runs removed from {@code doc}.
 *
 * @param doc the document to convert; heading paragraphs are modified in place
 * @return the generated HTML, one element per line
 * @throws IOException if reading an embedded picture fails
 */
public static String getwordHtml(XWPFDocument doc) throws IOException {
    // Compiled once per call; the original compiled four identical copies per paragraph.
    // NOTE(review): the four copies were probably meant to cover half-/full-width bracket
    // combinations; only ASCII parentheses are matched here - confirm.
    Pattern parenNumeral = Pattern.compile("\\([一二三四五六七八九十]+\\)");
    String resulthtml = "";
    for (IBodyElement element : doc.getBodyElements()) {
        if (element instanceof XWPFParagraph) {
            XWPFParagraph paragraph = (XWPFParagraph) element;
            // Pictures embedded in the paragraph's runs are emitted before the paragraph text.
            for (XWPFRun run : paragraph.getRuns()) {
                for (XWPFPicture picture : run.getEmbeddedPictures()) {
                    XWPFPictureData pictureData = picture.getPictureData();
                    byte[] imageData = pictureData.getData();
                    BufferedImage image = ImageIO.read(new ByteArrayInputStream(imageData));
                    // ImageIO.read returns null when no registered reader understands the data;
                    // skip such pictures instead of failing with a NullPointerException.
                    if (image == null) {
                        continue;
                    }
                    if (image.getWidth() < 80 || image.getHeight() < 80) {
                        continue;
                    }
                    String fileName = pictureData.getFileName();
                    // java.util.Base64 yields the same text as the former JAXB DatatypeConverter
                    // call and is part of the JDK since Java 8.
                    String base64Image = java.util.Base64.getEncoder().encodeToString(imageData);
                    // The file extension is used as the image subtype of the data URI.
                    String imageType = fileName.substring(fileName.lastIndexOf(".") + 1);
                    resulthtml += "<img src=\"data:image/" + imageType + ";base64," + base64Image + "\" alt=\"" + fileName + "\"/>\n";
                }
            }
            String text = paragraph.getText().trim();
            if (StringUtils.isEmpty(text)) {
                continue;
            }
            String tag = null;
            if (StringUtils.isNotEmpty(getTitleLvl(doc, paragraph))) {
                DocEntity docEntity = getDocEntity(paragraph, text, "0");
                if (docEntity != null && !StringUtils.isEmpty(docEntity.getName()) && text.length() <= 50) {
                    if (text.matches("[一二三四五六七八九十]+、.*") || Constants.SCIENCE_FILE.contains(text)
                            || text.matches("^\\d+\\.[^\\d].*")
                            || text.matches("^\\d[\\u4e00-\\u9fa5]+$")) {
                        tag = "h1";
                    } else if (text.matches("^\\d+\\.\\d+\\.\\d+.*$")) {
                        // Must be tested before the two-level pattern: "1.2.3 ..." also matches
                        // "\d+\.\d+.*", which previously made this branch unreachable.
                        tag = "h3";
                    } else if (text.matches("\\d+\\.\\d+.*") || parenNumeral.matcher(text).find()) {
                        tag = "h2";
                    }
                }
            }
            if (tag == null) {
                resulthtml = getHtml(resulthtml, text);
            } else {
                // Remove the runs of the paragraph that has been turned into a heading.
                paragraph.getCTP().setRArray(new CTR[]{});
                resulthtml += "<" + tag + ">" + text + "</" + tag + ">\n";
            }
        } else if (element instanceof XWPFTable) {
            resulthtml += convertTableToHtml((XWPFTable) element) + "\n";
        }
    }
    return resulthtml;
}
/**
 * Appends {@code text} to {@code resulthtml} as a {@code <p>} element followed by a newline.
 *
 * <p>Text consisting solely of digits is treated as a page number and dropped. When the text
 * contains at least one CJK unified ideograph, all space characters are removed from it first.
 *
 * @param resulthtml the HTML accumulated so far
 * @param text       the paragraph text to append
 * @return {@code resulthtml}, extended by the new paragraph unless the text was dropped
 */
public static String getHtml(String resulthtml, String text) {
    // Digits only: this is a page number, nothing to add.
    if (Pattern.matches("\\d+", text)) {
        return resulthtml;
    }
    boolean containsHanzi = text.chars().anyMatch(
            ch -> Character.UnicodeBlock.of((char) ch) == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS);
    String body = containsHanzi ? text.replace(" ", "") : text;
    return resulthtml + "<p>" + body + "</p>\n";
}
/**
 * Renders a Word table as an HTML {@code <table>} with one {@code <tr>} per row and one
 * {@code <td>} per cell. Each cell carries its text plus {@code colspan} and {@code rowspan}
 * attributes.
 *
 * @param table the table to render
 * @return the HTML of the generated table
 */
private static String convertTableToHtml(XWPFTable table) {
    Document doc = new Document("");
    doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
    Element tableElement = doc.createElement("table");
    doc.appendChild(tableElement);
    for (XWPFTableRow row : table.getRows()) {
        Element rowElement = doc.createElement("tr");
        tableElement.appendChild(rowElement);
        for (XWPFTableCell cell : row.getTableCells()) {
            Element cellElement = doc.createElement("td");
            cellElement.text(cell.getText());
            boolean hasProperties = cell.getCTTc().getTcPr() != null;
            // colspan comes from the cell's gridSpan; 1 when none is set.
            int colspan = hasProperties && cell.getCTTc().getTcPr().getGridSpan() != null
                    ? cell.getCTTc().getTcPr().getGridSpan().getVal().intValue()
                    : 1;
            // Cells carrying a vMerge element get rowspan 0, all others rowspan 1.
            // NOTE(review): no real row count is computed for merged cells - confirm this is intended.
            int rowspan = hasProperties && cell.getCTTc().getTcPr().getVMerge() != null ? 0 : 1;
            cellElement.attr("colspan", String.valueOf(colspan));
            cellElement.attr("rowspan", String.valueOf(rowspan));
            rowElement.appendChild(cellElement);
        }
    }
    return doc.html();
}
/**
 * Determines a heading level marker for a paragraph.
 *
 * <p>The following sources are tried in order and the first hit is returned:
 * the paragraph's own outline level, the outline level of its style, the outline level of
 * that style's base style, the paragraph's style ID, and finally the numbering pattern of
 * the text itself ("1", "2" or "3").
 *
 * @param doc  the document that owns the paragraph; used to resolve styles
 * @param para the paragraph to inspect
 * @return the outline level or style ID as a string, "1"/"2"/"3" for text recognised by its
 *         numbering, or an empty string when nothing applies
 */
private static String getTitleLvl(XWPFDocument doc, XWPFParagraph para) {
    String text = para.getText().trim().replaceAll(" ", "");
    try {
        // Outline level set directly on the paragraph.
        if (para.getCTP().getPPr().getOutlineLvl() != null) {
            return String.valueOf(para.getCTP().getPPr().getOutlineLvl().getVal());
        }
    } catch (Exception ignored) {
        // Best effort: a missing link in the getter chain means "not set here"; try the next source.
    }
    try {
        // Outline level set on the paragraph's style.
        if (doc.getStyles().getStyle(para.getStyle()).getCTStyle().getPPr().getOutlineLvl() != null) {
            return String.valueOf(doc.getStyles().getStyle(para.getStyle()).getCTStyle().getPPr().getOutlineLvl().getVal());
        }
    } catch (Exception ignored) {
        // Best effort, see above.
    }
    try {
        // Outline level set on the style the paragraph's style is based on.
        if (doc.getStyles().getStyle(doc.getStyles().getStyle(para.getStyle()).getCTStyle().getBasedOn().getVal()).getCTStyle().getPPr().getOutlineLvl() != null) {
            String styleName = doc.getStyles().getStyle(para.getStyle()).getCTStyle().getBasedOn().getVal();
            return String.valueOf(doc.getStyles().getStyle(styleName).getCTStyle().getPPr().getOutlineLvl().getVal());
        }
    } catch (Exception ignored) {
        // Best effort, see above.
    }
    try {
        if (para.getStyleID() != null) {
            return para.getStyleID();
        }
    } catch (Exception ignored) {
        // Best effort, see above.
    }
    // No style information: fall back to the numbering pattern of the text.
    if (text.isEmpty()) {
        return "";
    }
    if (text.matches("[一二三四五六七八九十]+、.*") || Constants.SCIENCE_FILE.contains(text)
            || text.matches("^\\d+\\.[^\\d].*")
            || text.matches("^\\d[\\u4e00-\\u9fa5]+$")) {
        return "1";
    }
    // Must be tested before the two-level pattern: "1.2.3 ..." also matches "\d+\.\d+.*",
    // which previously made level "3" unreachable.
    if (text.matches("^\\d+\\.\\d+\\.\\d+.*$")) {
        return "3";
    }
    // One pattern replaces four identical copies.
    // NOTE(review): the copies were probably meant to cover half-/full-width bracket
    // combinations; only ASCII parentheses are matched here - confirm.
    Matcher parenNumeral = Pattern.compile("\\([一二三四五六七八九十]+\\)").matcher(text);
    if (parenNumeral.find() || text.matches("\\d+\\.\\d+.*")) {
        return "2";
    }
    return "";
}
/**
 * Builds a {@code DocEntity} with the given name and a freshly generated id
 * (a random UUID without dashes).
 *
 * @param p        optional paragraph; when present, its left indentation (as a string)
 *                 is stored as the parent id
 * @param name     name to store in the entity
 * @param parnetId parent id used only when {@code p} is {@code null}
 * @return the populated entity, never {@code null}
 */
private static DocEntity getDocEntity(XWPFParagraph p, String name, String parnetId) {
    DocEntity entity = new DocEntity();
    entity.setName(name);
    entity.setId(UUID.randomUUID().toString().replace("-", ""));
    entity.setParentId(p == null ? parnetId : String.valueOf(p.getIndentationLeft()));
    return entity;
}
/**
 * Collects the text of every {@code <p>} element found in an HTML string.
 * Each text has the literal sequence {@code &nbsp;} and all space characters removed,
 * and is also printed to standard output.
 *
 * @param htmlString the HTML to parse
 * @return the cleaned paragraph texts in document order
 */
public static List<String> extractPText(String htmlString) {
    // Parse with Jsoup and pick every paragraph element.
    Elements paragraphs = Jsoup.parse(htmlString).select("p");
    List<String> texts = new ArrayList<>();
    for (Element paragraph : paragraphs) {
        String cleaned = paragraph.text().replaceAll("&nbsp;", "").replaceAll(" ", "");
        System.out.println(cleaned);
        texts.add(cleaned);
    }
    return texts;
}
/**
 * Uploads the given file through {@code ObsUtil} under a random name inside the
 * {@code DirEnum.SCIENCE_FILE} directory and stores the resulting path on
 * {@code reportTemplate}.
 *
 * @param reportTemplate record whose {@code filePathObs} is set to the uploaded object's path
 * @param extension      file extension (without the dot) appended to the random object name
 * @param file           the uploaded file whose bytes are stored
 * @throws Exception if reading the file or uploading it fails
 */
public static void formatFile(AiReportScienceFile reportTemplate, String extension, MultipartFile file) throws Exception {
    // TODO file upload
    obsUtil = GetBeanUtil.getApplicationContext().getBean(ObsUtil.class);
    byte[] payload = file.getBytes();
    String objectName = DirEnum.SCIENCE_FILE.getPath() + UUID.randomUUID() + "." + extension;
    PutObjectResult uploaded = obsUtil.uploadFile(objectName, payload);
    reportTemplate.setFilePathObs(Constants.OBS_FILE_PATH_URL_PREFIX_NOS + uploaded.getObjectKey());
    // Disabled: preview path (pdf)
    // String content = reportTemplate.getContent();
    // byte[] pdfBytes = DocUtil.convertDocHtml2Pdf(content, false);
    // PutObjectResult putObjectResult1 = obsUtil.uploadFile(DirEnum.SCIENCE_FILE.getPath() + UUID.randomUUID() + ".pdf", pdfBytes);
    // reportTemplate.setPreviewObs(Constants.OBS_FILE_PATH_URL_PREFIX_NOS + putObjectResult1.getObjectKey());
    // Disabled: cover path (png)
    // byte[] pngBytes = DocUtil.convertDocHtml2Png(content, false);
    // PutObjectResult putObjectResult2 = obsUtil.uploadFile(DirEnum.SCIENCE_FILE.getPath() + UUID.randomUUID() + ".png", pngBytes);
    // reportTemplate.setCoverObs(Constants.OBS_FILE_PATH_URL_PREFIX_NOS + putObjectResult2.getObjectKey());
}
/**
 * Cleans up an HTML document and returns its elements (paragraphs, images, tables and
 * h1-h4 headings) as a list of HTML strings.
 *
 * <p>Pass 1 drops empty and digit-only elements as well as elements mentioning
 * "http://www.cnki.net", "(c)" or "china", and strips spaces from text that is not mostly
 * English. Pass 2 copies everything before the first heading unchanged; after it,
 * paragraph fragments that do not look like a finished sentence are buffered and prepended
 * to the next paragraph that does.
 *
 * <p>Fix: fragments still buffered when the input ends used to be dropped silently; they
 * are now emitted as a final paragraph.
 *
 * @param content the HTML to clean up
 * @return the resulting HTML snippets in document order
 * @throws Exception declared for callers; not thrown explicitly by this method
 */
public static List<String> beautifyContent(String content) throws Exception {
    List<String> contentList = new ArrayList<>();
    Document doc = Jsoup.parse(content);
    // Select paragraphs, images, tables and headings.
    Elements elements = doc.select("p:not(:has(img)),p > img,img,table,h1,h2,h3,h4");
    Elements elements1 = new Elements();
    // Pass 1: filter noise and remove superfluous spaces.
    for (Element element : elements) {
        String html = element.toString();
        if (html.contains("<img") || html.contains("<table")) {
            elements1.add(element);
            continue;
        }
        String elementText = element.text();
        boolean keep = !elementText.equals("") && !chunshuzi(elementText)
                && !elementText.contains("http://www.cnki.net")
                && !elementText.contains("(c)") && !elementText.contains("china");
        if (!keep) {
            continue;
        }
        if (!calculateEnglishRatio(elementText)) {
            element.text(elementText.replaceAll(" ", "").replaceAll("&nbsp;", ""));
        }
        elements1.add(element);
    }
    // Pass 2: merge paragraph fragments.
    boolean beforeFirstHeading = true;
    String pending = "";
    Iterator<Element> iterator = elements1.iterator();
    while (iterator.hasNext()) {
        Element element = iterator.next();
        String str = element.toString();
        if (str.contains("<p") && str.contains("</p>") && !str.contains("<table")) {
            element.clearAttributes();
            element.attr("style", "font-size:12pt;text-indent:2em");
            str = element.toString();
        }
        if (str.contains("<h") && str.contains("</h")) {
            beforeFirstHeading = false;
        }
        // Everything in front of the first heading is passed through unchanged.
        if (beforeFirstHeading) {
            contentList.add(str);
            continue;
        }
        String elementText = element.text();
        boolean mergeCandidate = str.contains("<p") && str.contains("</p>") && !str.contains("<table")
                && !elementText.equals("")
                && !cankaowenxian(elementText.replaceAll(" ", ""))
                && !calculateEnglishRatio(elementText);
        if (!mergeCandidate) {
            // NOTE(review): buffered fragments are not flushed here, so they can end up after a
            // heading or table; beautifyContent2 flushes at this point - confirm which is wanted.
            contentList.add(element.toString());
            continue;
        }
        boolean fragment = !elementText.endsWith("。") && !isDigit(elementText)
                && !elementText.startsWith("关键词") && !elementText.startsWith("目录")
                && !elementText.contains("参考文献");
        if (fragment) {
            // Not a finished sentence: buffer it and drop the element from the working list.
            pending = pending + elementText;
            iterator.remove();
            continue;
        }
        if (!pending.equals("")) {
            // Prepend the buffered fragments to this paragraph.
            element.clearAttributes();
            element.attr("style", "font-size:12pt;text-indent:2em");
            element.text(pending + elementText);
            pending = "";
        }
        contentList.add(element.toString());
    }
    // Emit fragments that were still buffered when the input ended, using the same markup
    // as the styled paragraphs above (text is inserted unescaped, as in beautifyContent2).
    if (!pending.isEmpty()) {
        contentList.add("<p style=\"font-size:12pt;text-indent:2em\">" + pending + "</p>");
    }
    return contentList;
}
/**
 * Cleans up an HTML document and returns its elements (paragraphs, images, tables and
 * h1-h4 headings) as a list of HTML strings.
 *
 * <p>Pass 1 drops empty and digit-only elements as well as elements starting with "(c)",
 * "(C)", "作者简介" or "收稿日期", and strips spaces from text that is not mostly English.
 * Pass 2 copies everything before the first heading unchanged; after it, paragraph
 * fragments that do not look like a finished sentence are buffered and prepended to the
 * next paragraph that does. The buffer is flushed as its own paragraph before any element
 * that is not merged.
 *
 * <p>Fix: fragments still buffered when the input ends used to be dropped silently; they
 * are now flushed as a final paragraph.
 *
 * @param text the HTML to clean up
 * @return the resulting HTML snippets in document order
 * @throws Exception declared for callers; not thrown explicitly by this method
 */
public static List<String> beautifyContent2(String text) throws Exception {
    Document doc = Jsoup.parse(text);
    // Select paragraphs, images, tables and headings.
    Elements elements = doc.select("p:not(:has(img)),p > img,img,table,h1,h2,h3,h4");
    List<String> contentList = new ArrayList<>();
    Elements elements1 = new Elements();
    // Pass 1: filter noise and remove superfluous spaces.
    for (Element element : elements) {
        String html = element.toString();
        if (html.contains("<img") || html.contains("<table")) {
            elements1.add(element);
            continue;
        }
        String elementText = element.text();
        boolean dropped = elementText.equals("") || chunshuzi(elementText)
                || elementText.startsWith("(c)") || elementText.startsWith("(C)")
                || elementText.startsWith("作者简介") || elementText.startsWith("收稿日期");
        if (dropped) {
            continue;
        }
        if (!calculateEnglishRatio(elementText)) {
            element.text(elementText.replaceAll(" ", "").replaceAll("&nbsp;", ""));
        }
        elements1.add(element);
    }
    // Pass 2: merge paragraph fragments.
    boolean beforeFirstHeading = true;
    String pending = "";
    Iterator<Element> iterator = elements1.iterator();
    while (iterator.hasNext()) {
        Element element = iterator.next();
        String str = element.toString();
        if (str.contains("<p") && str.contains("</p>") && !str.contains("<table")) {
            element.clearAttributes();
            element.attr("style", "font-size:12pt;text-indent:2em");
            str = element.toString();
        }
        if (str.contains("<h") && str.contains("</h")) {
            beforeFirstHeading = false;
        }
        String elementText = element.text();
        boolean mergeCandidate = !beforeFirstHeading
                && str.contains("<p") && str.contains("</p>") && !str.contains("<table")
                && !elementText.equals("")
                && !cankaowenxian(elementText.replaceAll(" ", ""))
                && !calculateEnglishRatio(elementText);
        if (!mergeCandidate) {
            // Flush buffered fragments first so they stay in front of this element.
            if (StringUtils.isNotEmpty(pending)) {
                contentList.add("<p style=\"font-size:12pt;text-indent:2em\">" + pending + "</p>");
                pending = "";
            }
            contentList.add(str);
            continue;
        }
        // NOTE(review): the original tested endsWith("页)") twice; one copy was probably meant
        // to be the full-width bracket form - confirm.
        boolean fragment = !elementText.endsWith("。") && !isDigit(elementText)
                && !elementText.startsWith("关键词") && !elementText.startsWith("目录")
                && !elementText.contains("参考文献") && !elementText.endsWith("页)");
        if (fragment) {
            // Not a finished sentence: buffer it and drop the element from the working list.
            pending = pending + elementText;
            iterator.remove();
            continue;
        }
        if (!pending.equals("")) {
            // Prepend the buffered fragments to this paragraph.
            element.clearAttributes();
            element.attr("style", "font-size:12pt;text-indent:2em");
            element.text(pending + elementText);
            pending = "";
        }
        contentList.add(element.toString());
    }
    // Flush fragments that were still buffered when the input ended.
    if (StringUtils.isNotEmpty(pending)) {
        contentList.add("<p style=\"font-size:12pt;text-indent:2em\">" + pending + "</p>");
    }
    return contentList;
}
/**
 * Tells whether more than half of the characters of {@code str} are English letters
 * as defined by {@code isEnglish}. An empty string yields {@code false}.
 *
 * @param str the text to inspect
 * @return {@code true} when the share of English letters exceeds 50 percent
 */
public static boolean calculateEnglishRatio(String str) {
    long letters = str.chars().filter(ch -> isEnglish((char) ch)).count();
    // For an empty string this is 0.0 / 0 = NaN, which is not greater than 0.5.
    return (double) letters / str.length() > 0.5;
}
/**
 * Tells whether a character is an ASCII letter (a-z or A-Z).
 *
 * @param c the character to test
 * @return {@code true} for ASCII letters, {@code false} otherwise
 */
public static boolean isEnglish(char c) {
    if ('a' <= c && c <= 'z') {
        return true;
    }
    return 'A' <= c && c <= 'Z';
}
/**
 * Tells whether the last character of {@code str} is a digit according to
 * {@link Character#isDigit(char)}.
 *
 * <p>A {@code null} or empty string yields {@code false}; previously an empty string
 * caused a {@link StringIndexOutOfBoundsException}.
 *
 * @param str the text to inspect
 * @return {@code true} when the text ends with a digit
 */
public static boolean isDigit (String str) {
    if (str == null || str.isEmpty()) {
        return false;
    }
    return Character.isDigit(str.charAt(str.length() - 1));
}
/**
 * Manual smoke check: prints whether a sample bibliography line is recognised
 * by {@code cankaowenxian}.
 *
 * @param args unused
 */
public static void main(String[] args) {
    System.out.println(cankaowenxian("[1]张维迎.所有制、治理结构及委托—代理关系:兼评崔之元和周其仁的一些观点[J].经济研究,1996(9):3-15,53."));
}
/**
 * Matches a bracketed reference number such as "[12]" at the very start of a line.
 * Compiled once instead of twice on every call.
 * NOTE(review): the original compiled two identical patterns; the second one was probably
 * meant for full-width brackets - confirm.
 */
private static final Pattern REFERENCE_ENTRY_PREFIX = Pattern.compile("^\\[[0-9]+\\]");

/**
 * Tells whether a line looks like a bibliography entry, i.e. starts with a number in
 * square brackets ("[1]...").
 *
 * @param str the text to inspect
 * @return {@code true} when the text starts with "[digits]"
 */
public static boolean cankaowenxian (String str) {
    return REFERENCE_ENTRY_PREFIX.matcher(str).find();
}
/**
 * Tells whether a text consists of digits only once middle dots and spaces are removed.
 *
 * @param str the text to inspect
 * @return {@code true} when only digits remain
 */
public static boolean chunshuzi (String str) {
    String compact = str.replace("·", "").replace(" ", "");
    return Pattern.compile("^\\d+$").matcher(compact).find();
}
/*
public static void sendKafka(List<AiReportScienceFileMaterial> list, String fileId, Integer fileType, String origin, String status){
streamBridge = GetBeanUtil.getApplicationContext().getBean(StreamBridge.class);
log.debug("期刊论文素材推送kafka开始======");
for(AiReportScienceFileMaterial obj:list){
if(StringUtils.isNotEmpty(obj.getContent()) && obj.getContent().contains("<p") && obj.getContent().contains("</p>")
&& countChineseCharacters(Utility.TransferHTML2Text(Utility.RemoveUselessHTMLTag(obj.getContent())))<36){
continue;
}
//推送kafka
JSONObject jo = new JSONObject();
jo.put("origin", origin);
jo.put("fileId", fileId);
jo.put("status", status);
jo.put("fileType", getFileType(fileType));
jo.put("textId", obj.getId());
jo.put("text", Utility.TransferHTML2Text(Utility.RemoveUselessHTMLTag(obj.getContent())));
jo.put("textType", getTextType(obj.getContentType()));
streamBridge.send("science_file", jo);
log.debug("推送成功,段落id:"+obj.getId());
}
}
public static void sendKafka2(List<AiReportScienceFileMaterial> list, String fileId, Integer fileType,String origin,String status){
streamBridge = GetBeanUtil.getApplicationContext().getBean(StreamBridge.class);
log.debug("期刊论文素材推送kafka开始======");
for(AiReportScienceFileMaterial obj:list){
if(StringUtils.isNotEmpty(obj.getContent()) && obj.getContent().contains("<p") && obj.getContent().contains("</p>")
&& countChineseCharacters(Utility.TransferHTML2Text(Utility.RemoveUselessHTMLTag(obj.getContent())))<36){
continue;
}
//推送kafka
JSONObject jo = new JSONObject();
jo.put("origin", origin);
jo.put("fileId", fileId);
jo.put("status", status);
jo.put("fileType", getFileType(fileType));
jo.put("textId", obj.getId());
jo.put("text", Utility.TransferHTML2Text(Utility.RemoveUselessHTMLTag(obj.getContent())));
jo.put("textType", getTextType(obj.getContentType()));
streamBridge.send("science_file_1219", jo);
log.debug("推送成功,段落id:"+obj.getId());
}
}
*/
/**
* 推送审核后需要删除的textId
*/
/*
public static void sendKafka(List<String> contentIds){
try {
streamBridge = GetBeanUtil.getApplicationContext().getBean(StreamBridge.class);
log.debug("期刊论文素材审核删除段落开始推送======");
for (String contentId : contentIds) {
if (StringUtils.isNotBlank(contentId)) {
//推送kafka
JSONObject jo = new JSONObject();
jo.put("textId", contentId);
streamBridge.send("delete_science_material", jo);
log.debug("推送成功,段落id:{}", contentId);
}
}
} catch (BeansException e) {
log.error("推送需删除的数据失败:{}", e.getMessage());
}
}
*/
/**
 * Counts the characters of {@code str} that lie in the range U+4E00 to U+9FA5.
 *
 * @param str the text to inspect
 * @return the number of characters in that range
 */
public static int countChineseCharacters(String str) {
    return (int) str.chars()
            .filter(ch -> ch >= '\u4e00' && ch <= '\u9fa5')
            .count();
}
}
package com.zzsn.knowbase.vo;
import com.fasterxml.jackson.annotation.JsonFormat;
import lombok.Data;
import org.springframework.format.annotation.DateTimeFormat;
import java.util.Date;
/**
 * View object for a science file (journal/thesis material): the record's own fields plus
 * search and date-range fields.
 */
@Data
public class AiReportScienceFileVo {
/** Primary key. */
private String id;
/** Material content. */
private String content;
/** Content format (text/html/url/img). */
private String contentType;
/** Material title. */
private String title;
/** Keywords (the original comment repeated "title"; the DDL column key_words is described as keywords). */
private String keyWords;
/** Year. */
private String year;
/** Author. */
private String author;
/** Source. */
private String origin;
/** Publication date. */
private String publishDate;
/**
 * Material type (0: journal, 1: doctoral thesis, 2: master's thesis, 3: book).
 * NOTE(review): the DDL comment on data_type lists different meanings - confirm which is current.
 */
private Integer dataType;
/** Source file URL. */
private String zipFileUrl;
/** Status (0: not reviewed, 1: rejected, 2: approved). */
private Integer status;
/** Reviewer. */
private String auditBy;
/** Review time. */
@JsonFormat(timezone = "GMT+8", pattern = "yyyy-MM-dd HH:mm:ss")
@DateTimeFormat(pattern = "yyyy-MM-dd HH:mm:ss")
private Date auditTime;
/** Creator. */
private String createBy;
/** Creation time. */
@JsonFormat(timezone = "GMT+8", pattern = "yyyy-MM-dd HH:mm:ss")
@DateTimeFormat(pattern = "yyyy-MM-dd HH:mm:ss")
private Date createTime;
/** Last updater. */
private String updateBy;
/** Last update time. */
@JsonFormat(timezone = "GMT+8", pattern = "yyyy-MM-dd HH:mm:ss")
@DateTimeFormat(pattern = "yyyy-MM-dd HH:mm:ss")
private Date updateTime;
/** Deleted flag (0: no, 1: yes). */
private String deleted;
/** File path. */
private String filePathObs;
/** Cover path. */
private String coverObs;
/** Preview path. */
private String previewObs;
/** Search keywords. */
private String searchWords;
/** Field to search in: "title" or "content". */
private String searchType;
/** Presumably the lower bound of an audit-time filter - TODO confirm against the query. */
private String auditTimeStart;
/** Presumably the upper bound of an audit-time filter - TODO confirm against the query. */
private String auditTimeEnd;
/** Presumably the lower bound of a publication-time filter - TODO confirm against the query. */
private String pubStartTime;
/** Presumably the upper bound of a publication-time filter - TODO confirm against the query. */
private String pubEndTime;
}
package com.zzsn.knowbase.vo;
import lombok.Data;
import java.util.List;
/**
 * Node describing a piece of a parsed document.
 * The fields are public and are also read directly (e.g. {@code docEntity.name}) by the
 * document utilities, in addition to the Lombok-generated accessors.
 */
@Data
public class DocEntity {
/** Name of the node; the document utilities set it to the paragraph text. */
public String name;
/** Content of the node - presumably the body text below the heading; TODO confirm. */
public String content;
/** Child nodes - presumably the sub-sections of this node; TODO confirm. */
public List<DocEntity> section;
/** Parent marker; the document utilities store the paragraph's left indentation or a caller-supplied parent id here. */
public String parentId;
/** Identifier; the document utilities generate it as a random UUID without dashes. */
public String id;
}
package com.zzsn.knowbase.vo;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.zzsn.knowbase.constant.CommonConstant;
import lombok.Data;
import java.io.Serializable;
/**
 * Uniform envelope for data returned by the API.
 * @author scott
 * @email jeecgos@163.com
 * @date 2019-01-19
 */
@Data
public class Result<T> implements Serializable {
    private static final long serialVersionUID = 1L;

    /** Success flag. */
    private boolean success = true;

    /** Message describing the outcome. */
    private String message = "操作成功!";

    /** Result code. */
    private Integer code = 0;

    /** Returned data object. */
    private T result;

    /** Timestamp taken when the instance is created. */
    private long timestamp = System.currentTimeMillis();

    /** Message returned by the Python interface. */
    private String handleMsg;

    /**
     * Status returned by the Python interface.
     * NOTE(review): with Lombok, an "is"-prefixed boolean field yields isHandleSuccess()/setHandleSuccess() - confirm
     * the serialized property name is the intended one.
     */
    private boolean isHandleSuccess;

    /** Logs - presumably also returned by the Python interface; TODO confirm. */
    private String logs;

    /** Additional returned data object. */
    private T resultData;

    public Result() {
    }

    /** Marks this instance as successful (code 200) with the given message and returns it. */
    public Result<T> success(String message) {
        this.success = true;
        this.code = CommonConstant.SC_OK_200;
        this.message = message;
        return this;
    }

    /** Creates a successful result (code 200) with the message "成功". */
    @Deprecated
    public static Result<Object> ok() {
        Result<Object> res = new Result<>();
        res.success = true;
        res.code = CommonConstant.SC_OK_200;
        res.message = "成功";
        return res;
    }

    /** Creates a successful result (code 200) with the given message. */
    @Deprecated
    public static Result<Object> ok(String msg) {
        Result<Object> res = new Result<>();
        res.success = true;
        res.code = CommonConstant.SC_OK_200;
        res.message = msg;
        return res;
    }

    /** Creates a successful result (code 200) carrying the given data; the default message is kept. */
    @Deprecated
    public static Result<Object> ok(Object data) {
        Result<Object> res = new Result<>();
        res.success = true;
        res.code = CommonConstant.SC_OK_200;
        res.result = data;
        return res;
    }

    /** Typed variant: successful result (code 200) with the message "成功". */
    public static <T> Result<T> OK() {
        Result<T> res = new Result<>();
        res.success = true;
        res.code = CommonConstant.SC_OK_200;
        res.message = "成功";
        return res;
    }

    /** Typed variant: successful result (code 200) carrying the given data; the default message is kept. */
    public static <T> Result<T> OK(T data) {
        Result<T> res = new Result<>();
        res.success = true;
        res.code = CommonConstant.SC_OK_200;
        res.result = data;
        return res;
    }

    /** Typed variant: successful result (code 200) with the given message and data. */
    public static <T> Result<T> OK(String msg, T data) {
        Result<T> res = new Result<>();
        res.success = true;
        res.code = CommonConstant.SC_OK_200;
        res.message = msg;
        res.result = data;
        return res;
    }

    /** Creates a failed result with the internal-server-error code and the given message. */
    public static Result<Object> error(String msg) {
        return error(CommonConstant.SC_INTERNAL_SERVER_ERROR_500, msg);
    }

    /** Creates a failed result with the given code and message. */
    public static Result<Object> error(int code, String msg) {
        Result<Object> res = new Result<>();
        res.code = code;
        res.message = msg;
        res.success = false;
        return res;
    }

    /** Marks this instance as failed (internal-server-error code) with the given message and returns it. */
    public Result<T> error500(String message) {
        this.success = false;
        this.code = CommonConstant.SC_INTERNAL_SERVER_ERROR_500;
        this.message = message;
        return this;
    }

    /**
     * Result returned when access is not authorised.
     */
    public static Result<Object> noauth(String msg) {
        return error(CommonConstant.SC_JEECG_NO_AUTHZ, msg);
    }

    /** Excluded from JSON output. */
    @JsonIgnore
    private String onlTable;
}
\ No newline at end of file
# Application configuration: HTTP port, MySQL datasource, Redis (Lettuce pool) and MyBatis-Plus.
# NOTE(review): the datasource below uses local root/root credentials - presumably for development only; confirm.
server:
port: 9088
spring:
datasource:
url: jdbc:mysql://localhost:3306/know?serverTimezone=UTC&useUnicode=true&characterEncoding=utf-8&AllowPublicKeyRetrieval=True
username: root
password: root
redis:
database: 0
host: localhost
lettuce:
pool:
max-active: 8 # Maximum number of connections the pool can allocate at a time; a negative value means no limit.
max-idle: 8 # Maximum number of idle connections in the pool; a negative value means no limit.
max-wait: -1ms # Maximum time an allocation may block when the pool is exhausted before an exception is thrown; a negative value blocks indefinitely.
min-idle: 0 # Target minimum number of idle connections to keep in the pool.
shutdown-timeout: 100ms
port: 6379
mybatis-plus:
mapper-locations: classpath*:com/zzsn/knowbase/mapper/xml/*Mapper.xml
configuration:
log-impl: org.apache.ibatis.logging.stdout.StdOutImpl
map-underscore-to-camel-case: true
\ No newline at end of file
-- clb_project.ai_report_science_file definition
-- Basic information of an uploaded document: content, metadata, review/check state and storage paths.
-- Primary key: id. Secondary index: index_title on title.
-- NOTE(review): the data_type comment below lists different meanings than the comment on
-- AiReportScienceFileVo.dataType - confirm which mapping is current.
CREATE TABLE `ai_report_science_file` (
`id` varchar(36) NOT NULL COMMENT '主键',
`content` mediumtext COMMENT '文档内容',
`content_type` varchar(50) DEFAULT NULL COMMENT '资料内容格式(text/pdf/word)',
`file_path_obs` varchar(500) DEFAULT NULL COMMENT '文件地址',
`title` varchar(500) DEFAULT NULL COMMENT '文档标题',
`year` int(11) DEFAULT NULL COMMENT '所属年份',
`author` varchar(256) DEFAULT NULL COMMENT '作者',
`origin` varchar(255) DEFAULT NULL COMMENT '来源',
`publish_date` varchar(100) DEFAULT NULL COMMENT '发布时间',
`data_type` int(11) DEFAULT NULL COMMENT '资料类型(0:期刊、1:论文2:图书 3:研报)',
`zip_file_url` varchar(500) DEFAULT NULL COMMENT '资料压缩包文件地址',
`degree_awarding_unit` varchar(255) DEFAULT NULL COMMENT '学位授予单位',
`degree_awarding_time` varchar(255) DEFAULT NULL COMMENT '学位授予时间',
`status` tinyint(4) DEFAULT NULL COMMENT '状态(0:未审核、1:审核不通过 2:审核通过)',
`audit_by` varchar(100) DEFAULT NULL COMMENT '审核人',
`audit_time` datetime DEFAULT NULL COMMENT '审核时间',
`create_by` varchar(50) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL COMMENT '创建人',
`create_time` datetime DEFAULT NULL COMMENT '创建日期',
`update_by` varchar(50) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL COMMENT '更新人',
`update_time` datetime DEFAULT NULL COMMENT '更新日期',
`deleted` varchar(10) DEFAULT NULL COMMENT '是否删除(0否 1是)',
`cover_obs` varchar(500) DEFAULT NULL COMMENT '封面路径',
`preview_obs` varchar(500) DEFAULT NULL COMMENT '预览路径',
`key_words` varchar(255) DEFAULT NULL COMMENT '关键词',
`check_by` varchar(100) DEFAULT NULL COMMENT '校验人',
`check_time` datetime DEFAULT NULL COMMENT '校验时间',
PRIMARY KEY (`id`) USING BTREE,
KEY `index_title` (`title`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='基本信息表';
-- clb_project.ai_report_science_file_material definition
-- Split structure of a document: one row per chapter/paragraph piece, linked to its document
-- through file_id and to its parent piece through parent, ordered by sort.
-- NOTE(review): idx_parent covers the same leading column as idx_parent_fileId_sort -
-- it may be redundant; confirm before dropping.
CREATE TABLE `ai_report_science_file_material` (
`id` varchar(36) NOT NULL COMMENT '主键',
`file_id` varchar(36) DEFAULT NULL COMMENT '文档id',
`content` mediumtext COMMENT '层级内容',
`content_type` varchar(50) DEFAULT NULL COMMENT '内容类型(chaper:章节标题 chaperContent:章节内容 paragraph:段落 标题 paragraphContent:段落内容)',
`level` varchar(50) DEFAULT NULL COMMENT '层级',
`parent` varchar(50) DEFAULT NULL COMMENT '父级id',
`sort` int(11) DEFAULT NULL COMMENT '排序',
`create_by` varchar(50) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL COMMENT '创建人',
`create_time` datetime DEFAULT NULL COMMENT '创建日期',
`update_by` varchar(50) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL COMMENT '更新人',
`update_time` datetime DEFAULT NULL COMMENT '更新日期',
`deleted` varchar(10) DEFAULT NULL COMMENT '是否删除(0否 1是)',
PRIMARY KEY (`id`) USING BTREE,
KEY `idx_parent` (`parent`),
KEY `idx_fileId_sort` (`file_id`,`sort`,`content_type`),
KEY `idx_parent_fileId_sort` (`parent`,`file_id`,`sort`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='拆分结构内容表';
\ No newline at end of file
package com.zzsn.knowbase;
import org.junit.jupiter.api.Test;
import org.springframework.boot.test.context.SpringBootTest;
/**
 * Smoke test for the application: annotated with {@code @SpringBootTest} and containing a
 * single empty test method.
 */
@SpringBootTest
class KnowBaseApplicationTests {
/** Intentionally empty; the test has no assertions of its own. */
@Test
void contextLoads() {
}
}
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论