|
@@ -1,24 +1,20 @@
|
|
|
package com.smppw.modaq.application.components.report.parser.pdf;
|
|
|
|
|
|
import cn.hutool.core.collection.ListUtil;
|
|
|
-import cn.hutool.core.exceptions.ExceptionUtil;
|
|
|
-import cn.hutool.core.map.MapUtil;
|
|
|
import cn.hutool.core.util.StrUtil;
|
|
|
-import cn.hutool.http.HttpUtil;
|
|
|
-import cn.hutool.json.JSONObject;
|
|
|
-import cn.hutool.json.JSONUtil;
|
|
|
import com.smppw.modaq.application.components.CustomPDFTextStripper;
|
|
|
import com.smppw.modaq.application.components.ReportParseUtils;
|
|
|
import com.smppw.modaq.application.components.report.parser.AbstractReportParser;
|
|
|
import com.smppw.modaq.common.enums.ReportParseStatus;
|
|
|
import com.smppw.modaq.common.enums.ReportType;
|
|
|
import com.smppw.modaq.common.exception.ReportParseException;
|
|
|
+import com.smppw.modaq.domain.dto.report.ReportData;
|
|
|
+import com.smppw.modaq.domain.dto.report.ReportParserParams;
|
|
|
import com.smppw.modaq.domain.dto.report.*;
|
|
|
import com.smppw.modaq.domain.mapper.EmailFieldMappingMapper;
|
|
|
import org.apache.pdfbox.Loader;
|
|
|
import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
|
|
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
|
|
-import org.springframework.beans.factory.annotation.Value;
|
|
|
import technology.tabula.CustomObjectExtractor;
|
|
|
import technology.tabula.Page;
|
|
|
import technology.tabula.PageIterator;
|
|
@@ -42,14 +38,14 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
|
|
|
*/
|
|
|
protected List<String> textList;
|
|
|
|
|
|
- @Value("${email.report.ai-parser-url}")
|
|
|
- private String aiParserUrl;
|
|
|
+// @Value("${email.report.ai-parser-url}")
|
|
|
+// private String aiParserUrl;
|
|
|
|
|
|
- protected String aiFileId;
|
|
|
-
|
|
|
- protected String aiParserContent;
|
|
|
-
|
|
|
- protected Boolean aiParse = false;
|
|
|
+// protected String aiFileId;
|
|
|
+//
|
|
|
+// protected String aiParserContent;
|
|
|
+//
|
|
|
+// protected Boolean aiParse = false;
|
|
|
|
|
|
public AbstractPDReportParser(EmailFieldMappingMapper fieldMappingMapper) {
|
|
|
super(fieldMappingMapper);
|
|
@@ -68,9 +64,9 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
|
|
|
String text = stripper.getText(document);
|
|
|
this.textList = StrUtil.split(text, System.lineSeparator());
|
|
|
this.textList.removeIf(StrUtil::isBlank);
|
|
|
-// if (this.textList.isEmpty()) {
|
|
|
-// throw new ReportParseException(ReportParseStatus.REPORT_IS_SCAN, filename);
|
|
|
-// }
|
|
|
+ if (this.textList.isEmpty()) {
|
|
|
+ throw new ReportParseException(ReportParseStatus.REPORT_IS_SCAN, filename);
|
|
|
+ }
|
|
|
// Parse all tables (strip watermark characters from cell text)
|
|
|
List<Table> tables = ListUtil.list(true);
|
|
|
SpreadsheetExtractionAlgorithm spreadsheetExtractionAlgorithm = new SpreadsheetExtractionAlgorithm();
|
|
@@ -93,45 +89,36 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
|
|
|
tables.add(table);
|
|
|
}
|
|
|
}
|
|
|
- } else if (params.getReportType() == ReportType.LETTER) {
|
|
|
- this.aiParse = true;
|
|
|
- Map<String, Object> paramsMap = MapUtil.newHashMap(4);
|
|
|
- paramsMap.put("filepath", filepath);
|
|
|
- paramsMap.put("file_id", params.getAiFileId());
|
|
|
- String body = null;
|
|
|
- try {
|
|
|
- body = HttpUtil.get(this.aiParserUrl, paramsMap);
|
|
|
- JSONObject jsonObject = JSONUtil.parseObj(body);
|
|
|
- this.aiFileId = MapUtil.getStr(jsonObject, "file_id");
|
|
|
- String content = StrUtil.split(jsonObject.getStr("content"), "```").get(1);
|
|
|
- this.aiParserContent = "{" + StrUtil.subAfter(content, "{", false) + "}";
|
|
|
- } catch (Exception e) {
|
|
|
- this.logger.warn("{} ai解析失败,解析结果{},错误原因:{}",
|
|
|
- filename, body, ExceptionUtil.stacktraceToString(e));
|
|
|
- }
|
|
|
+ } else {
|
|
|
+// this.aiParse = true;
|
|
|
+// Map<String, Object> paramsMap = MapUtil.newHashMap(4);
|
|
|
+// paramsMap.put("filepath", filepath);
|
|
|
+// paramsMap.put("file_id", params.getAiFileId());
|
|
|
+// String body = null;
|
|
|
+// try {
|
|
|
+// body = HttpUtil.get(this.aiParserUrl, paramsMap);
|
|
|
+// JSONObject jsonObject = JSONUtil.parseObj(body);
|
|
|
+// this.aiFileId = MapUtil.getStr(jsonObject, "file_id");
|
|
|
+// String content = StrUtil.split(jsonObject.getStr("content"), "```").get(1);
|
|
|
+// this.aiParserContent = "{" + StrUtil.subAfter(content, "{", false) + "}";
|
|
|
+// } catch (Exception e) {
|
|
|
+// this.logger.warn("{} ai解析失败,解析结果{},错误原因:{}", filename, body, ExceptionUtil.stacktraceToString(e));
|
|
|
+// }
|
|
|
+ throw new ReportParseException(ReportParseStatus.NOT_A_FIXED_FORMAT, filename);
|
|
|
}
|
|
|
i++;
|
|
|
}
|
|
|
- if (tables.isEmpty() && StrUtil.isBlank(this.aiParserContent)) {
|
|
|
+ if (tables.isEmpty()) {
|
|
|
throw new ReportParseException(ReportParseStatus.REPORT_IS_SCAN, filename);
|
|
|
}
|
|
|
this.initTableInfo(tables);
|
|
|
}
|
|
|
- try {
|
|
|
- // 报告基本信息
|
|
|
- ReportBaseInfoDTO reportInfo = this.buildReportInfo(params);
|
|
|
- // 解析报告中主体基金的基本信息
|
|
|
- ReportFundInfoDTO reportFundInfo = this.buildFundInfo(params);
|
|
|
- // 解析其他表格信息并且设置结果字段
|
|
|
- T reportData = this.parseExtInfoAndSetData(reportInfo, reportFundInfo);
|
|
|
- // 数据清洗后返回
|
|
|
- this.cleaningReportData(reportData);
|
|
|
- return reportData;
|
|
|
- } catch (ReportParseException e) {
|
|
|
- throw e;
|
|
|
- } catch (Exception e) {
|
|
|
- throw new ReportParseException(ReportParseStatus.NOT_A_FIXED_FORMAT, filename);
|
|
|
+ T reportData = this.buildReportData(params, filename);
|
|
|
+ if (!reportData.wasSuccessful()) {
|
|
|
+ // Throw an exception to facilitate AI parsing
|
|
|
+ throw new ReportParseException(ReportParseStatus.PARSE_CORE_INFO_FAIL, filename);
|
|
|
}
|
|
|
+ return reportData;
|
|
|
}
|
|
|
|
|
|
/**
|
|
@@ -141,23 +128,6 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
|
|
|
*/
|
|
|
protected abstract void initTableInfo(List<Table> tables);
|
|
|
|
|
|
- /**
|
|
|
- * 绑定基金基本信息
|
|
|
- *
|
|
|
- * @param params /
|
|
|
- * @return /
|
|
|
- */
|
|
|
- protected abstract ReportFundInfoDTO buildFundInfo(ReportParserParams params);
|
|
|
-
|
|
|
- /**
|
|
|
- * 解析报告的其他信息并设置到对象中
|
|
|
- *
|
|
|
- * @param reportInfo 报告基本信息
|
|
|
- * @param fundInfo 报告中基金基本信息
|
|
|
- * @return /
|
|
|
- */
|
|
|
- protected abstract T parseExtInfoAndSetData(ReportBaseInfoDTO reportInfo,
|
|
|
- ReportFundInfoDTO fundInfo);
|
|
|
|
|
|
@Override
|
|
|
protected void cleaningReportData(T reportData) {
|
|
@@ -169,9 +139,9 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
|
|
|
super.init();
|
|
|
// Reset to null first
|
|
|
this.textList = null;
|
|
|
- this.aiFileId = null;
|
|
|
- this.aiParserContent = null;
|
|
|
- this.aiParse = false;
|
|
|
+// this.aiFileId = null;
|
|
|
+// this.aiParserContent = null;
|
|
|
+// this.aiParse = false;
|
|
|
}
|
|
|
|
|
|
/**
|