|
@@ -5,11 +5,11 @@ import cn.hutool.core.exceptions.ExceptionUtil;
|
|
import cn.hutool.core.map.MapUtil;
|
|
import cn.hutool.core.map.MapUtil;
|
|
import cn.hutool.core.util.StrUtil;
|
|
import cn.hutool.core.util.StrUtil;
|
|
import com.simuwang.base.common.conts.Constants;
|
|
import com.simuwang.base.common.conts.Constants;
|
|
-import com.simuwang.base.common.enums.ReportType;
|
|
|
|
import com.simuwang.base.common.exception.ReportParseException;
|
|
import com.simuwang.base.common.exception.ReportParseException;
|
|
import com.simuwang.base.mapper.EmailFieldMappingMapper;
|
|
import com.simuwang.base.mapper.EmailFieldMappingMapper;
|
|
import com.simuwang.base.pojo.dto.report.*;
|
|
import com.simuwang.base.pojo.dto.report.*;
|
|
import com.simuwang.daq.components.CustomPDFTextStripper;
|
|
import com.simuwang.daq.components.CustomPDFTextStripper;
|
|
|
|
+import com.simuwang.daq.components.ReportParseUtils;
|
|
import com.simuwang.daq.components.report.parser.AbstractReportParser;
|
|
import com.simuwang.daq.components.report.parser.AbstractReportParser;
|
|
import org.apache.pdfbox.Loader;
|
|
import org.apache.pdfbox.Loader;
|
|
import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
|
|
import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
|
|
@@ -21,13 +21,10 @@ import technology.tabula.Table;
|
|
import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
|
|
import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
|
|
|
|
|
|
import java.io.IOException;
|
|
import java.io.IOException;
|
|
-import java.util.Calendar;
|
|
|
|
import java.util.List;
|
|
import java.util.List;
|
|
import java.util.Map;
|
|
import java.util.Map;
|
|
import java.util.Objects;
|
|
import java.util.Objects;
|
|
import java.util.function.Function;
|
|
import java.util.function.Function;
|
|
-import java.util.regex.Matcher;
|
|
|
|
-import java.util.regex.Pattern;
|
|
|
|
import java.util.stream.Collectors;
|
|
import java.util.stream.Collectors;
|
|
|
|
|
|
/**
|
|
/**
|
|
@@ -68,7 +65,7 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
|
|
throw new ReportParseException(ReportParseStatus.REPORT_IS_SCAN);
|
|
throw new ReportParseException(ReportParseStatus.REPORT_IS_SCAN);
|
|
}
|
|
}
|
|
// 报告名称和类型一般在第一第二行
|
|
// 报告名称和类型一般在第一第二行
|
|
- if (this.matchReportType(this.textList.get(0)) == null && this.matchReportType(this.textList.get(1)) == null) {
|
|
|
|
|
|
+ if (ReportParseUtils.matchReportType(this.textList.get(0)) == null && ReportParseUtils.matchReportType(this.textList.get(1)) == null) {
|
|
throw new ReportParseException(ReportParseStatus.NOT_A_REPORT);
|
|
throw new ReportParseException(ReportParseStatus.NOT_A_REPORT);
|
|
}
|
|
}
|
|
// 解析所有表格(单元格字符去水印)
|
|
// 解析所有表格(单元格字符去水印)
|
|
@@ -158,8 +155,8 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
|
|
String reportName = params.getFilename();
|
|
String reportName = params.getFilename();
|
|
ReportBaseInfoDTO reportInfo = new ReportBaseInfoDTO(fileId);
|
|
ReportBaseInfoDTO reportInfo = new ReportBaseInfoDTO(fileId);
|
|
reportInfo.setReportName(reportName);
|
|
reportInfo.setReportName(reportName);
|
|
- reportInfo.setReportType(this.matchReportType(reportName));
|
|
|
|
- reportInfo.setReportDate(this.matchReportDate(reportName));
|
|
|
|
|
|
+ reportInfo.setReportType(ReportParseUtils.matchReportType(reportName));
|
|
|
|
+ reportInfo.setReportDate(ReportParseUtils.matchReportDate(reportName));
|
|
return reportInfo;
|
|
return reportInfo;
|
|
}
|
|
}
|
|
|
|
|
|
@@ -179,7 +176,7 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
|
|
List<DTO> dtos = tables.stream().filter(Objects::nonNull)
|
|
List<DTO> dtos = tables.stream().filter(Objects::nonNull)
|
|
.map(e -> this.buildDto(fileId, e, clazz, function)).collect(Collectors.toList());
|
|
.map(e -> this.buildDto(fileId, e, clazz, function)).collect(Collectors.toList());
|
|
// 分级基金匹配
|
|
// 分级基金匹配
|
|
- List<String> levels = this.matchTieredFund(String.join(",", this.textList));
|
|
|
|
|
|
+ List<String> levels = ReportParseUtils.matchTieredFund(String.join(",", this.textList));
|
|
levels.add(0, "母基金");
|
|
levels.add(0, "母基金");
|
|
for (int i = 0; i < dtos.size(); i++) {
|
|
for (int i = 0; i < dtos.size(); i++) {
|
|
if (levels.size() <= i) {
|
|
if (levels.size() <= i) {
|
|
@@ -213,118 +210,118 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
|
|
return null;
|
|
return null;
|
|
}
|
|
}
|
|
|
|
|
|
- /**
|
|
|
|
- * 匹配分级基金名称
|
|
|
|
- *
|
|
|
|
- * @param text 文本内容
|
|
|
|
- * @return /
|
|
|
|
- */
|
|
|
|
- protected List<String> matchTieredFund(String text) {
|
|
|
|
- List<String> matches = ListUtil.list(false);
|
|
|
|
- if (StrUtil.isBlank(text)) {
|
|
|
|
- return matches;
|
|
|
|
- }
|
|
|
|
- // 使用正则表达式查找匹配项
|
|
|
|
- Pattern pattern = Pattern.compile("[A-F]级|基金[A-F]");
|
|
|
|
- Matcher matcher = pattern.matcher(text);
|
|
|
|
- // 收集所有匹配项
|
|
|
|
- while (matcher.find()) {
|
|
|
|
- matches.add(matcher.group());
|
|
|
|
- }
|
|
|
|
- // 提取字母并按字母顺序排序
|
|
|
|
- return matches.stream()
|
|
|
|
- .map(s -> s.replaceAll("[^A-F]", ""))
|
|
|
|
- .distinct()
|
|
|
|
- .sorted()
|
|
|
|
- .map(letter -> letter + "级")
|
|
|
|
- .collect(Collectors.toList());
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- /**
|
|
|
|
- * 匹配报告日期
|
|
|
|
- *
|
|
|
|
- * @param string 文本内容
|
|
|
|
- * @return 报告日期
|
|
|
|
- */
|
|
|
|
- private String matchReportDate(String string) {
|
|
|
|
- if (string == null) {
|
|
|
|
- return null;
|
|
|
|
- }
|
|
|
|
- // 编译正则表达式模式
|
|
|
|
- Pattern pat1 = Pattern.compile("(2\\d{3}).*([一二三四1234])季度"); // 2023年XXX3季度
|
|
|
|
- Pattern pat2 = Pattern.compile("\\d{4}-\\d{2}-\\d{2}"); // 2023-12-31
|
|
|
|
- Pattern pat3 = Pattern.compile("(2\\d{3})年年度"); // 2023年年度
|
|
|
|
- Pattern pat4 = Pattern.compile("(\\d{4})年(\\d{1,2})月"); // 2023年12月
|
|
|
|
- Pattern pat5 = Pattern.compile("\\d{4}\\d{2}\\d{2}"); // 20231231
|
|
|
|
- Pattern pat6 = Pattern.compile("(2\\d{3})年度"); // 2023年度
|
|
|
|
- // 创建Matcher对象
|
|
|
|
- Matcher matcher1 = pat1.matcher(string);
|
|
|
|
- Matcher matcher2 = pat2.matcher(string);
|
|
|
|
- Matcher matcher3 = pat3.matcher(string);
|
|
|
|
- Matcher matcher4 = pat4.matcher(string);
|
|
|
|
- Matcher matcher5 = pat5.matcher(string);
|
|
|
|
- Matcher matcher6 = pat6.matcher(string);
|
|
|
|
- // 尝试匹配
|
|
|
|
- if (matcher1.find()) {
|
|
|
|
- String year = matcher1.group(1);
|
|
|
|
- String quarter = matcher1.group(2);
|
|
|
|
- return switch (quarter) {
|
|
|
|
- case "一", "1" -> year + "-03-31";
|
|
|
|
- case "二", "2" -> year + "-06-30";
|
|
|
|
- case "三", "3" -> year + "-09-30";
|
|
|
|
- case "四", "4" -> year + "-12-31";
|
|
|
|
- default -> null;
|
|
|
|
- };
|
|
|
|
- } else if (matcher2.find()) {
|
|
|
|
- return matcher2.group();
|
|
|
|
- } else if (matcher5.find()) {
|
|
|
|
- return matcher5.group();
|
|
|
|
- } else if (matcher3.find()) {
|
|
|
|
- return matcher3.group(1) + "-12-31";
|
|
|
|
- } else if (matcher6.find()) {
|
|
|
|
- return matcher6.group(1) + "-12-31";
|
|
|
|
- } else if (matcher4.find()) {
|
|
|
|
- String year = matcher4.group(1);
|
|
|
|
- String month = matcher4.group(2);
|
|
|
|
- int lastDayOfMonth = getLastDayOfMonth(Integer.parseInt(year), Integer.parseInt(month));
|
|
|
|
- return year + "-" + padZero(month) + "-" + padZero(lastDayOfMonth + "");
|
|
|
|
- } else {
|
|
|
|
- return null;
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- /**
|
|
|
|
- * 匹配报告类型,如“季度”、“年度”
|
|
|
|
- *
|
|
|
|
- * @param string 输入字符串
|
|
|
|
- * @return 匹配到的报告类型子字符串,如果没有匹配到则返回null
|
|
|
|
- */
|
|
|
|
- private String matchReportType(String string) {
|
|
|
|
- if (string == null) {
|
|
|
|
- return null;
|
|
|
|
- }
|
|
|
|
- // 所有报告的正则识别方式
|
|
|
|
- String patterns = ReportType.getAllPatterns();
|
|
|
|
- // 编译正则表达式模式
|
|
|
|
- Pattern pattern = Pattern.compile(patterns);
|
|
|
|
- // 创建Matcher对象
|
|
|
|
- Matcher matcher = pattern.matcher(string);
|
|
|
|
- // 尝试匹配
|
|
|
|
- if (matcher.find()) {
|
|
|
|
- return matcher.group();
|
|
|
|
- } else {
|
|
|
|
- return null;
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- private int getLastDayOfMonth(int year, int month) {
|
|
|
|
- Calendar calendar = Calendar.getInstance();
|
|
|
|
- calendar.set(Calendar.YEAR, year);
|
|
|
|
- calendar.set(Calendar.MONTH, month - 1); // Calendar.MONTH 是从0开始的
|
|
|
|
- return calendar.getActualMaximum(Calendar.DAY_OF_MONTH);
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- private String padZero(String number) {
|
|
|
|
- return String.format("%02d", Integer.parseInt(number));
|
|
|
|
- }
|
|
|
|
|
|
+// /**
|
|
|
|
+// * 匹配分级基金名称
|
|
|
|
+// *
|
|
|
|
+// * @param text 文本内容
|
|
|
|
+// * @return /
|
|
|
|
+// */
|
|
|
|
+// protected List<String> matchTieredFund(String text) {
|
|
|
|
+// List<String> matches = ListUtil.list(false);
|
|
|
|
+// if (StrUtil.isBlank(text)) {
|
|
|
|
+// return matches;
|
|
|
|
+// }
|
|
|
|
+// // 使用正则表达式查找匹配项
|
|
|
|
+// Pattern pattern = Pattern.compile("[A-F]级|基金[A-F]");
|
|
|
|
+// Matcher matcher = pattern.matcher(text);
|
|
|
|
+// // 收集所有匹配项
|
|
|
|
+// while (matcher.find()) {
|
|
|
|
+// matches.add(matcher.group());
|
|
|
|
+// }
|
|
|
|
+// // 提取字母并按字母顺序排序
|
|
|
|
+// return matches.stream()
|
|
|
|
+// .map(s -> s.replaceAll("[^A-F]", ""))
|
|
|
|
+// .distinct()
|
|
|
|
+// .sorted()
|
|
|
|
+// .map(letter -> letter + "级")
|
|
|
|
+// .collect(Collectors.toList());
|
|
|
|
+// }
|
|
|
|
+//
|
|
|
|
+// /**
|
|
|
|
+// * 匹配报告日期
|
|
|
|
+// *
|
|
|
|
+// * @param string 文本内容
|
|
|
|
+// * @return 报告日期
|
|
|
|
+// */
|
|
|
|
+// private String matchReportDate(String string) {
|
|
|
|
+// if (string == null) {
|
|
|
|
+// return null;
|
|
|
|
+// }
|
|
|
|
+// // 编译正则表达式模式
|
|
|
|
+// Pattern pat1 = Pattern.compile("(2\\d{3}).*([一二三四1234])季度"); // 2023年XXX3季度
|
|
|
|
+// Pattern pat2 = Pattern.compile("\\d{4}-\\d{2}-\\d{2}"); // 2023-12-31
|
|
|
|
+// Pattern pat3 = Pattern.compile("(2\\d{3})年年度"); // 2023年年度
|
|
|
|
+// Pattern pat4 = Pattern.compile("(\\d{4})年(\\d{1,2})月"); // 2023年12月
|
|
|
|
+// Pattern pat5 = Pattern.compile("\\d{4}\\d{2}\\d{2}"); // 20231231
|
|
|
|
+// Pattern pat6 = Pattern.compile("(2\\d{3})年度"); // 2023年度
|
|
|
|
+// // 创建Matcher对象
|
|
|
|
+// Matcher matcher1 = pat1.matcher(string);
|
|
|
|
+// Matcher matcher2 = pat2.matcher(string);
|
|
|
|
+// Matcher matcher3 = pat3.matcher(string);
|
|
|
|
+// Matcher matcher4 = pat4.matcher(string);
|
|
|
|
+// Matcher matcher5 = pat5.matcher(string);
|
|
|
|
+// Matcher matcher6 = pat6.matcher(string);
|
|
|
|
+// // 尝试匹配
|
|
|
|
+// if (matcher1.find()) {
|
|
|
|
+// String year = matcher1.group(1);
|
|
|
|
+// String quarter = matcher1.group(2);
|
|
|
|
+// return switch (quarter) {
|
|
|
|
+// case "一", "1" -> year + "-03-31";
|
|
|
|
+// case "二", "2" -> year + "-06-30";
|
|
|
|
+// case "三", "3" -> year + "-09-30";
|
|
|
|
+// case "四", "4" -> year + "-12-31";
|
|
|
|
+// default -> null;
|
|
|
|
+// };
|
|
|
|
+// } else if (matcher2.find()) {
|
|
|
|
+// return matcher2.group();
|
|
|
|
+// } else if (matcher5.find()) {
|
|
|
|
+// return matcher5.group();
|
|
|
|
+// } else if (matcher3.find()) {
|
|
|
|
+// return matcher3.group(1) + "-12-31";
|
|
|
|
+// } else if (matcher6.find()) {
|
|
|
|
+// return matcher6.group(1) + "-12-31";
|
|
|
|
+// } else if (matcher4.find()) {
|
|
|
|
+// String year = matcher4.group(1);
|
|
|
|
+// String month = matcher4.group(2);
|
|
|
|
+// int lastDayOfMonth = getLastDayOfMonth(Integer.parseInt(year), Integer.parseInt(month));
|
|
|
|
+// return year + "-" + padZero(month) + "-" + padZero(lastDayOfMonth + "");
|
|
|
|
+// } else {
|
|
|
|
+// return null;
|
|
|
|
+// }
|
|
|
|
+// }
|
|
|
|
+//
|
|
|
|
+// /**
|
|
|
|
+// * 匹配报告类型,如“季度”、“年度”
|
|
|
|
+// *
|
|
|
|
+// * @param string 输入字符串
|
|
|
|
+// * @return 匹配到的报告类型子字符串,如果没有匹配到则返回null
|
|
|
|
+// */
|
|
|
|
+// private String matchReportType(String string) {
|
|
|
|
+// if (string == null) {
|
|
|
|
+// return null;
|
|
|
|
+// }
|
|
|
|
+// // 所有报告的正则识别方式
|
|
|
|
+// String patterns = ReportType.getAllPatterns();
|
|
|
|
+// // 编译正则表达式模式
|
|
|
|
+// Pattern pattern = Pattern.compile(patterns);
|
|
|
|
+// // 创建Matcher对象
|
|
|
|
+// Matcher matcher = pattern.matcher(string);
|
|
|
|
+// // 尝试匹配
|
|
|
|
+// if (matcher.find()) {
|
|
|
|
+// return matcher.group();
|
|
|
|
+// } else {
|
|
|
|
+// return null;
|
|
|
|
+// }
|
|
|
|
+// }
|
|
|
|
+//
|
|
|
|
+// private int getLastDayOfMonth(int year, int month) {
|
|
|
|
+// Calendar calendar = Calendar.getInstance();
|
|
|
|
+// calendar.set(Calendar.YEAR, year);
|
|
|
|
+// calendar.set(Calendar.MONTH, month - 1); // Calendar.MONTH 是从0开始的
|
|
|
|
+// return calendar.getActualMaximum(Calendar.DAY_OF_MONTH);
|
|
|
|
+// }
|
|
|
|
+//
|
|
|
|
+// private String padZero(String number) {
|
|
|
|
+// return String.format("%02d", Integer.parseInt(number));
|
|
|
|
+// }
|
|
}
|
|
}
|