|
@@ -0,0 +1,659 @@
|
|
|
+package com.simuwang.daq.service;
|
|
|
+
|
|
|
+import cn.hutool.core.collection.CollUtil;
|
|
|
+import cn.hutool.core.collection.ListUtil;
|
|
|
+import cn.hutool.core.exceptions.ExceptionUtil;
|
|
|
+import cn.hutool.core.text.StrBuilder;
|
|
|
+import cn.hutool.core.util.StrUtil;
|
|
|
+import com.simuwang.base.common.enums.WaterMarkEnum;
|
|
|
+import com.simuwang.base.common.util.ExcelUtil;
|
|
|
+import com.simuwang.base.pojo.dto.ValuationPdfTransformToExcelDTO;
|
|
|
+import org.apache.commons.io.FileUtils;
|
|
|
+import org.apache.pdfbox.Loader;
|
|
|
+import org.apache.pdfbox.pdmodel.PDDocument;
|
|
|
+import org.apache.pdfbox.text.PDFTextStripper;
|
|
|
+import org.apache.poi.ss.usermodel.Cell;
|
|
|
+import org.apache.poi.ss.usermodel.Row;
|
|
|
+import org.apache.poi.ss.usermodel.Sheet;
|
|
|
+import org.apache.poi.ss.usermodel.Workbook;
|
|
|
+import org.apache.poi.ss.util.CellRangeAddress;
|
|
|
+import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
|
|
+import org.slf4j.Logger;
|
|
|
+import org.slf4j.LoggerFactory;
|
|
|
+import org.springframework.beans.factory.annotation.Value;
|
|
|
+import org.springframework.stereotype.Component;
|
|
|
+import org.springframework.web.multipart.MultipartFile;
|
|
|
+import technology.tabula.*;
|
|
|
+import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
|
|
|
+
|
|
|
+import java.io.File;
|
|
|
+import java.io.IOException;
|
|
|
+import java.io.OutputStream;
|
|
|
+import java.nio.file.Files;
|
|
|
+import java.nio.file.Paths;
|
|
|
+import java.util.Arrays;
|
|
|
+import java.util.List;
|
|
|
+import java.util.stream.Collectors;
|
|
|
+
|
|
|
+@Component
|
|
|
+public class PdfToExcelService {
|
|
|
+
|
|
|
+ private static final Logger log = LoggerFactory.getLogger(PdfToExcelService.class);
|
|
|
+
|
|
|
+ @Value("${email.file.path}")
|
|
|
+ private String path;
|
|
|
+
|
|
|
+ private static final List<String> EXTRA_FIELD_LIST = ListUtil.toList("成本", "市值", "估值增值");
|
|
|
+ private static final List<String> NUMBER_FIELD_LIST = ListUtil.toList("数量", "单位成本", "成本", "成本占净值",
|
|
|
+ "成本占净值%", "成本占净值", "市价", "市值", "市值占净值比", "市值占净值比%");
|
|
|
+ private static final List<String> TITLE_FIELD_LIST = ListUtil.toList("单位净值:", "位净值:", "估值日期:", "值日期:", "单位:",
|
|
|
+ "科目代码", "科目名", "数量", "单位成本", "位成本", "成本", "成本占净值%", "本币", "十亿千百十",
|
|
|
+ "成本占净值", "市价", "市值", "值", "市值占净值%", "市值占净", "估值增值", "停牌信息", "权益信息");
|
|
|
+ public static final List<String> TOTAL_MARKET_FIELD = ListUtil.toList("资合计", "资类合计:", "资类合计:", "资净值", "基金资净值:",
|
|
|
+ "基金资净值:", "资产资净值", "资资净值:", "资资净值:", "信托资净值:", "信托资净值:");
|
|
|
+
|
|
|
+ /**
|
|
|
+ * PDF类型的估值表文件转为excel类型
|
|
|
+ *
|
|
|
+ * @param fileList 待转换的文件
|
|
|
+ * @return 转换为excel类型的文件
|
|
|
+ */
|
|
|
+ public List<ValuationPdfTransformToExcelDTO> pdfToExcel(List<File> fileList) {
|
|
|
+ if (fileList == null) {
|
|
|
+ return CollUtil.newArrayList();
|
|
|
+ }
|
|
|
+ List<File> pdfFileList = fileList.stream()
|
|
|
+ .filter(e -> ExcelUtil.isPdf(e.getName())).collect(Collectors.toList());
|
|
|
+ if (CollUtil.isEmpty(pdfFileList)) {
|
|
|
+ return CollUtil.newArrayList();
|
|
|
+ }
|
|
|
+ List<ValuationPdfTransformToExcelDTO> pdfToExcelDTOList = CollUtil.newArrayList();
|
|
|
+ String excelUploadDir = path + File.separator + "valuation_table_excel" + File.separator;
|
|
|
+ for (File multipartFile : pdfFileList) {
|
|
|
+ String excelFilePath = excelUploadDir + multipartFile.getName().replace(".pdf", "") + ".xlsx";
|
|
|
+ File savefile = new File(excelFilePath);
|
|
|
+ if (!savefile.exists()) {
|
|
|
+ if (!savefile.getParentFile().exists()) {
|
|
|
+ savefile.getParentFile().mkdirs();
|
|
|
+ savefile.getParentFile().setExecutable(true);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ try (OutputStream outputStream = Files.newOutputStream(Paths.get(excelFilePath))) {
|
|
|
+ PDDocument document = Loader.loadPDF(multipartFile);
|
|
|
+ //采用pdfBox读取PDF内容
|
|
|
+ List<String> lines = readTextFromPDFUsingPdfBox(document);
|
|
|
+ //判断PDF是否存在水印 以及 水印类型(waterMarkType为null,代表没有水印)
|
|
|
+ WaterMarkEnum waterMark = getPdfWaterMarkType(lines);
|
|
|
+ //创建一个新的Excel工作簿
|
|
|
+ Workbook workbook = new XSSFWorkbook();
|
|
|
+ Sheet sheet = workbook.createSheet("Sheet1");
|
|
|
+ //1.处理header行(估值表,专用表,日期和单位净值)
|
|
|
+ Integer headerRowNumber = handlerExcelHeader(lines, sheet, waterMark);
|
|
|
+
|
|
|
+ //采用tabula技术提取表格内容
|
|
|
+ List<String> lineList = extractTableFormPDFUsingTabula(document, waterMark);
|
|
|
+ //2.额外处理标题行
|
|
|
+ Integer rowNumber = handlerTitleField(lineList, sheet, headerRowNumber);
|
|
|
+ //招商证券股份有限公司的估值表有特殊情况,存在两个标题字段列
|
|
|
+ boolean isSpecial = rowNumber == 7;
|
|
|
+ //3.合并标题行单元格
|
|
|
+ mergeExcelCell(sheet, isSpecial);
|
|
|
+ //4.遍历每一行并将其添加到Excel工作表中
|
|
|
+ readDateToExcel(sheet, lineList, rowNumber, waterMark);
|
|
|
+
|
|
|
+ //将Excel工作簿写入输出流
|
|
|
+ workbook.write(outputStream);
|
|
|
+ document.close();
|
|
|
+
|
|
|
+ ValuationPdfTransformToExcelDTO toExcelDTO = new ValuationPdfTransformToExcelDTO();
|
|
|
+ toExcelDTO.setOriginalFileName(multipartFile.getName());
|
|
|
+ toExcelDTO.setExcelFile(new File(excelFilePath));
|
|
|
+ toExcelDTO.setFileUrl(multipartFile.getAbsolutePath());
|
|
|
+ pdfToExcelDTOList.add(toExcelDTO);
|
|
|
+
|
|
|
+ } catch (Exception e) {
|
|
|
+ log.error("pdf转为excel报错,堆栈:{}", ExceptionUtil.stacktraceToString(e));
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return pdfToExcelDTOList;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 估值表_纽达投资利道永晟五号私募证券投资基金_SGL183_202312292024-02-26.pdf
|
|
|
+ *
|
|
|
+ * @param lines pdfBox读取PDF内容中的水印文字
|
|
|
+ * @param waterMark 水印
|
|
|
+ */
|
|
|
+ private List<String> removeWaterMark(List<String> lines, WaterMarkEnum waterMark) {
|
|
|
+ if (waterMark == null) {
|
|
|
+ return lines;
|
|
|
+ }
|
|
|
+ String regex = waterMark.getRegex();
|
|
|
+ return lines.stream().map(e -> e.replaceAll(regex, "")).filter(StrUtil::isNotBlank).collect(Collectors.toList());
|
|
|
+ }
|
|
|
+
|
|
|
+ private WaterMarkEnum getPdfWaterMarkType(List<String> lines) {
|
|
|
+ if (CollUtil.isEmpty(lines)) {
|
|
|
+ return null;
|
|
|
+ }
|
|
|
+ //pdf文本内容
|
|
|
+ String content = String.join("", lines);
|
|
|
+ for (WaterMarkEnum waterMarkEnum : WaterMarkEnum.values()) {
|
|
|
+ boolean containsAllKeywords = true;
|
|
|
+ String waterMarkText = waterMarkEnum.getText();
|
|
|
+ String[] waterMarkTextArray = StrUtil.cut(waterMarkText, 1);
|
|
|
+ for (String text : waterMarkTextArray) {
|
|
|
+ if (!content.contains(text)) {
|
|
|
+ containsAllKeywords = false;
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if (containsAllKeywords) {
|
|
|
+ return waterMarkEnum;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return null;
|
|
|
+ }
|
|
|
+
|
|
|
+ private List<String> extractTableFormPDFUsingTabula(PDDocument document, WaterMarkEnum waterMark) {
|
|
|
+ StrBuilder tableDataText = new StrBuilder();
|
|
|
+ PageIterator extract = new ObjectExtractor(document).extract();
|
|
|
+ String regex = waterMark != null ? waterMark.getRegex() : null;
|
|
|
+ if (StrUtil.isBlank(regex)) {
|
|
|
+ while (extract.hasNext()) {
|
|
|
+ Page next = extract.next();
|
|
|
+ List<Table> tableList = new SpreadsheetExtractionAlgorithm().extract(next);
|
|
|
+ for (Table table : tableList) {
|
|
|
+ List<List<RectangularTextContainer>> rows = table.getRows();
|
|
|
+ for (List<RectangularTextContainer> row : rows) {
|
|
|
+ String collect = row.stream().map(RectangularTextContainer::getText)
|
|
|
+ .map(e -> e.replace(" ", ""))
|
|
|
+ .collect(Collectors.joining(" "));
|
|
|
+ tableDataText.append("\n").append(collect);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ while (extract.hasNext()) {
|
|
|
+ Page next = extract.next();
|
|
|
+ List<Table> tableList = new SpreadsheetExtractionAlgorithm().extract(next);
|
|
|
+ for (Table table : tableList) {
|
|
|
+ List<List<RectangularTextContainer>> rows = table.getRows();
|
|
|
+ for (List<RectangularTextContainer> row : rows) {
|
|
|
+ String collect = row.stream().map(RectangularTextContainer::getText)
|
|
|
+ .map(e -> e.replace(" ", ""))
|
|
|
+ .map(e -> e.replaceAll(regex, ""))
|
|
|
+ .collect(Collectors.joining(" "));
|
|
|
+ tableDataText.append("\n").append(collect);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ //将文本内容按行分割
|
|
|
+ return Arrays.stream(tableDataText.toString().trim().split("\\n")).collect(Collectors.toList());
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ private List<String> readTextFromPDFUsingPdfBox(PDDocument document) {
|
|
|
+ PDFTextStripper pdfTextStripper = new PDFTextStripper();
|
|
|
+ pdfTextStripper.setSortByPosition(true);
|
|
|
+ String text = "";
|
|
|
+ try {
|
|
|
+ text = pdfTextStripper.getText(document);
|
|
|
+ } catch (IOException e) {
|
|
|
+ log.error("error: read text from pdf...", e);
|
|
|
+ }
|
|
|
+ //将文本内容按行分割
|
|
|
+ return Arrays.stream(text.split("\\n")).collect(Collectors.toList());
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 处理excel的表头行(估值表行,专用表行,日期和单位净值行)
|
|
|
+ *
|
|
|
+ * @param lines PDF每行内容
|
|
|
+ * @param sheet 表格
|
|
|
+ * @param waterMark 水印
|
|
|
+ * @return 返回header所占的行数
|
|
|
+ */
|
|
|
+ private Integer handlerExcelHeader(List<String> lines, Sheet sheet, WaterMarkEnum waterMark) {
|
|
|
+ //去掉水印文字(去掉pdfBox读取PDF内容中的水印文字)
|
|
|
+ List<String> noWaterMarkText = removeWaterMark(lines, waterMark);
|
|
|
+ //去掉以空格和换行字符开头的数据行:" \r"(国泰君安证券股份有限公司_嘉禾三号私募证券投资基金_专用表)
|
|
|
+ noWaterMarkText = noWaterMarkText.stream().filter(this::filterHeadLine).collect(Collectors.toList());
|
|
|
+ //遇到"科目代码"行之前的前三行数据为header行
|
|
|
+ List<String> headLineList = CollUtil.newArrayList();
|
|
|
+ for (String line : noWaterMarkText) {
|
|
|
+ if ((StrUtil.isNotBlank(line) && line.trim().startsWith("科目代码"))) {
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ headLineList.add(line);
|
|
|
+ }
|
|
|
+ //head行过滤掉非以"科目"开头的数据行
|
|
|
+ headLineList = headLineList.stream().filter(e -> e.startsWith("科目") || filterTitleField(e)).collect(Collectors.toList());
|
|
|
+
|
|
|
+ //将head行写入表格
|
|
|
+ for (int i = 0; i < headLineList.size(); i++) {
|
|
|
+ Row row = sheet.createRow(i);
|
|
|
+ List<String> split = splitHeaderRow(headLineList.get(i));
|
|
|
+ for (int i1 = 0; i1 < split.size(); i1++) {
|
|
|
+ Cell cell = row.createCell(i1);
|
|
|
+ cell.setCellValue(split.get(i1));
|
|
|
+
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return sheet.getLastRowNum() + 1;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 分隔和合并head行的内容
|
|
|
+ *
|
|
|
+ * @param headRowValue head行的内容
|
|
|
+ * @return 处理后的head行的内容
|
|
|
+ */
|
|
|
+ private List<String> splitHeaderRow(String headRowValue) {
|
|
|
+ if (StrUtil.isBlank(headRowValue)) {
|
|
|
+ return CollUtil.newArrayList();
|
|
|
+ }
|
|
|
+ //国泰君安证券股份有限公司_嘉禾三号私募证券投资基金_专用表(pdfBox识别不到"_")
|
|
|
+ if (headRowValue.contains("国泰君安证券股份有限公司") && !headRowValue.contains("_")) {
|
|
|
+ List<String> collect = Arrays.stream(headRowValue.split(" ")).filter(StrUtil::isNotBlank).collect(Collectors.toList());
|
|
|
+ return ListUtil.toList(String.join("_", collect));
|
|
|
+ }
|
|
|
+ return Arrays.stream(headRowValue.split(" ")).filter(StrUtil::isNotBlank).collect(Collectors.toList());
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ private boolean filterHeadLine(String text) {
|
|
|
+ if (StrUtil.isBlank(text)) {
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+ if (text.trim().startsWith("成本")) {
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+ String replace = text.replace("\r", "").replace(" ", "");
|
|
|
+ return !StrUtil.isBlank(replace);
|
|
|
+ }
|
|
|
+
|
|
|
+ private void readDateToExcel(Sheet sheet, List<String> lines, Integer rowNumber, WaterMarkEnum waterMark) {
|
|
|
+ int baseRowNumber = 0;
|
|
|
+ for (int i = 0; i <= sheet.getLastRowNum(); i++) {
|
|
|
+ if ("科目代码".equals(sheet.getRow(i).getCell(0).getStringCellValue())) {
|
|
|
+ baseRowNumber = i;
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ for (String field : lines) {
|
|
|
+ //判断field数据是否要写入表格
|
|
|
+ boolean isNeedToWrite = canNeedToWrite(field);
|
|
|
+ if (!isNeedToWrite) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ Row row = sheet.createRow(rowNumber);
|
|
|
+ List<String> split = Arrays.stream(field.split(" ")).collect(Collectors.toList());
|
|
|
+ for (int i1 = 0; i1 < split.size(); i1++) {
|
|
|
+ Cell cell = row.createCell(i1);
|
|
|
+ //处理空格的问题
|
|
|
+ String cellValue = StrUtil.isBlank(split.get(i1)) ? null : split.get(i1).replace("*", "");
|
|
|
+ //要求为数值的字段,去掉非字符字段
|
|
|
+ if (StrUtil.isNotBlank(cellValue) && NUMBER_FIELD_LIST.contains(sheet.getRow(baseRowNumber).getCell(i1).getStringCellValue())) {
|
|
|
+ cellValue = filterNotNumber(cellValue);
|
|
|
+ }
|
|
|
+
|
|
|
+ //特殊处理:科目代码后面带有字母的,如1102.01.01.605056 SH、SQ、CFX等
|
|
|
+ if (i1 == 0 && i1 + 1 < split.size()) {
|
|
|
+ String nextCellValue = StrUtil.isBlank(split.get(i1 + 1)) ? null : split.get(i1 + 1).replace("*", "");
|
|
|
+ if (StrUtil.isNotBlank(nextCellValue) && nextCellValue.matches("^[A-Z]+$")) {
|
|
|
+ cellValue = (cellValue + " " + nextCellValue).trim();
|
|
|
+ split.remove(1);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ cellValue = removeInfluenceOfWaterMark(cellValue, waterMark);
|
|
|
+ cell.setCellValue(cellValue);
|
|
|
+ }
|
|
|
+ rowNumber++;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 处理水印文字造成的影响(总资产、资产净值两个字段的影响)
|
|
|
+ *
|
|
|
+ * @param cellValue 单元格内容
|
|
|
+ * @param waterMark 水印
|
|
|
+ * @return 无水印影响的单元格内容
|
|
|
+ */
|
|
|
+ private String removeInfluenceOfWaterMark(String cellValue, WaterMarkEnum waterMark) {
|
|
|
+ if (StrUtil.isBlank(cellValue) || waterMark == null) {
|
|
|
+ return cellValue;
|
|
|
+ }
|
|
|
+ String influenceText = StrUtil.isBlank(waterMark.getInfluenceText()) ? null : waterMark.getInfluenceText();
|
|
|
+ if (StrUtil.isBlank(influenceText)) {
|
|
|
+ return cellValue;
|
|
|
+ }
|
|
|
+ String rectifyText = waterMark.getRectifyText();
|
|
|
+ return RestoreCellValue(cellValue, influenceText, rectifyText);
|
|
|
+ }
|
|
|
+
|
|
|
+ private String RestoreCellValue(String cellValue, String influenceText, String rectifyText) {
|
|
|
+ if (StrUtil.isBlank(cellValue) || StrUtil.isBlank(influenceText) || StrUtil.isBlank(rectifyText)) {
|
|
|
+ return cellValue;
|
|
|
+ }
|
|
|
+ StringBuilder sb = new StringBuilder();
|
|
|
+
|
|
|
+ if ("产".equals(influenceText)) {
|
|
|
+ for (int i = 0; i < cellValue.length(); i++) {
|
|
|
+ if (cellValue.charAt(i) == '资') {
|
|
|
+ sb.append(cellValue.charAt(i));
|
|
|
+ sb.append("产");
|
|
|
+ } else {
|
|
|
+ sb.append(cellValue.charAt(i));
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if ("资".equals(influenceText)) {
|
|
|
+ for (int i = 0; i < cellValue.length(); i++) {
|
|
|
+ if (cellValue.charAt(i) == '产') {
|
|
|
+ sb.append("资");
|
|
|
+ sb.append(cellValue.charAt(i));
|
|
|
+ } else {
|
|
|
+ sb.append(cellValue.charAt(i));
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return sb.toString();
|
|
|
+ }
|
|
|
+
|
|
|
+ private String filterNotNumber(String cellValue) {
|
|
|
+ StringBuilder numberStr = new StringBuilder();
|
|
|
+ for (int i = 0; i < cellValue.length(); i++) {
|
|
|
+ char c = cellValue.charAt(i);
|
|
|
+ if (Character.isDigit(c) || c == '.' || c=='-') {
|
|
|
+ numberStr.append(c);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return numberStr.toString();
|
|
|
+ }
|
|
|
+
|
|
|
+ private boolean canNeedToWrite(String field) {
|
|
|
+ //过滤以字段列开头的行
|
|
|
+ boolean isCanContinue = filterTitleField(field);
|
|
|
+ if (!isCanContinue) {
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+ //过滤包含"参考","第","页"的行
|
|
|
+ if (field.contains("参考") || (field.contains("第") && field.contains("页"))) {
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+
|
|
|
+ //过滤掉空行
|
|
|
+ List<String> titleFieldList = Arrays.stream(field.split(" ")).filter(StrUtil::isNotBlank).collect(Collectors.toList());
|
|
|
+ if (CollUtil.isEmpty(titleFieldList)) {
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+ //过滤只有"%"的数据行
|
|
|
+ boolean allPercent = titleFieldList.stream().allMatch("%"::equals);
|
|
|
+ return !allPercent;
|
|
|
+ }
|
|
|
+
|
|
|
+ private boolean filterTitleField(String field) {
|
|
|
+ if (StrUtil.isBlank(field)) {
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+ for (String titleField : TITLE_FIELD_LIST) {
|
|
|
+ if (field.trim().startsWith(titleField)) {
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return true;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 处理excel的字段标题列
|
|
|
+ *
|
|
|
+ * @param lines PDF内容
|
|
|
+ * @param sheet excel工作簿
|
|
|
+ * @param headerRowNumber header行所占的行数
|
|
|
+ */
|
|
|
+ private Integer handlerTitleField(List<String> lines, Sheet sheet, Integer headerRowNumber) {
|
|
|
+ Integer rowNumber = headerRowNumber;
|
|
|
+ //1.先筛选出属于标题行的内容
|
|
|
+ List<String> titleFieldList = getTitleFieldRow(lines);
|
|
|
+ lines.removeAll(titleFieldList);
|
|
|
+
|
|
|
+ //找到以"科目代码"开头的数据行
|
|
|
+ int skipNumber = 0;
|
|
|
+ for (String field : titleFieldList) {
|
|
|
+ if (field.trim().startsWith("科目代码")) {
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ skipNumber++;
|
|
|
+ }
|
|
|
+ titleFieldList = titleFieldList.stream().skip(skipNumber).collect(Collectors.toList());
|
|
|
+
|
|
|
+ //2.处理字段标题列
|
|
|
+ //1-字段行占1行:正常的字段列11列
|
|
|
+ if (titleFieldList.size() == 1) {
|
|
|
+ handleOneRowTitleField(titleFieldList, sheet, rowNumber);
|
|
|
+ return rowNumber + 1;
|
|
|
+ } else if (titleFieldList.size() == 2) {
|
|
|
+ //2-字段行占2行:第2行字段行一般是无用的字段:例如 %等
|
|
|
+ //处理: 识别不到"原币"、"本币"所在的数据行,导致表格的数据对不上字段列
|
|
|
+ String secondRowValue = titleFieldList.get(titleFieldList.size() - 1);
|
|
|
+ if (secondRowValue.trim().contains("十亿千百十万千")) {
|
|
|
+ int length = secondRowValue.trim().split(" ").length;
|
|
|
+ if (length >= 6) {
|
|
|
+ String extraRowValue = "原币 本币 原币 本币 原币 本币";
|
|
|
+ titleFieldList.add(1, extraRowValue);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if (titleFieldList.size() == 3) {
|
|
|
+ handleThreeRowTitleField(titleFieldList, sheet, rowNumber);
|
|
|
+ rowNumber += 3;
|
|
|
+ } else {
|
|
|
+ handleOneRowTitleField(titleFieldList, sheet, rowNumber);
|
|
|
+ rowNumber++;
|
|
|
+ }
|
|
|
+ return rowNumber;
|
|
|
+ } else if (titleFieldList.size() == 3) {
|
|
|
+ //3-字段行占3行:有原币或者原币、本币字段的17列
|
|
|
+ handleThreeRowTitleField(titleFieldList, sheet, rowNumber);
|
|
|
+ return rowNumber + 3;
|
|
|
+ } else if (titleFieldList.size() == 4) {
|
|
|
+ //4-字段行占4行:存在两行"科目代码"的特殊字段列,14列(招商证券pdf估值表)
|
|
|
+ handleFourRowTitleField(titleFieldList, sheet, rowNumber);
|
|
|
+ return rowNumber + 4;
|
|
|
+ }
|
|
|
+
|
|
|
+ return rowNumber;
|
|
|
+ }
|
|
|
+
|
|
|
+ private void handleFourRowTitleField(List<String> titleFieldList, Sheet sheet, Integer rowNumber) {
|
|
|
+ List<String> fisrtTitleList = Arrays.stream(titleFieldList.get(0).trim().replace("\r", "").split(" ")).collect(Collectors.toList());
|
|
|
+ List<String> secondTitleList = Arrays.stream(titleFieldList.get(1).trim().replace("\r", "").split(" ")).collect(Collectors.toList());
|
|
|
+ List<String> thirdTitleList = Arrays.stream(titleFieldList.get(2).trim().replace("\r", "").split(" ")).collect(Collectors.toList());
|
|
|
+ List<String> fourTitleList = Arrays.stream(titleFieldList.get(3).trim().replace("\r", "").split(" ")).collect(Collectors.toList());
|
|
|
+ //第一列"科目代码"
|
|
|
+ Row firstRow = sheet.createRow(rowNumber);
|
|
|
+ for (int i = 0; i < fisrtTitleList.size(); i++) {
|
|
|
+ Cell firstCell = firstRow.createCell(i);
|
|
|
+ firstCell.setCellValue(fisrtTitleList.get(i));
|
|
|
+ }
|
|
|
+ //第二个"科目代码"
|
|
|
+ Row secondRow = sheet.createRow(rowNumber + 1);
|
|
|
+ Row thirdRow = sheet.createRow(rowNumber + 2);
|
|
|
+ Row fourRow = sheet.createRow(rowNumber + 3);
|
|
|
+
|
|
|
+ int count = 0;
|
|
|
+ for (int index = 0; index < secondTitleList.size(); index++) {
|
|
|
+ Cell secondRowCell = secondRow.createCell(index);
|
|
|
+ Cell thirdCell = thirdRow.createCell(index);
|
|
|
+ Cell fourCell = fourRow.createCell(index);
|
|
|
+
|
|
|
+ String field = secondTitleList.get(index);
|
|
|
+ if (EXTRA_FIELD_LIST.contains(field)) {
|
|
|
+ secondRowCell.setCellValue(field);
|
|
|
+ thirdCell.setCellValue(thirdTitleList.get(count));
|
|
|
+ fourCell.setCellValue(fourTitleList.get(count));
|
|
|
+ count++;
|
|
|
+ } else {
|
|
|
+ secondRowCell.setCellValue(field);
|
|
|
+ thirdCell.setCellValue(field);
|
|
|
+ fourCell.setCellValue(field);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ private void handleThreeRowTitleField(List<String> titleFieldList, Sheet sheet, Integer rowNumber) {
|
|
|
+ //方案一:先填充每个字段,再合并单元格
|
|
|
+ List<String> baseTitleList = CollUtil.newArrayList();
|
|
|
+ List<String> nextTitleList = Arrays.stream(titleFieldList.get(1).trim().replace("\r", "").split(" ")).collect(Collectors.toList());
|
|
|
+ List<String> next2TitleList = Arrays.stream(titleFieldList.get(2).trim().replace("\r", "").split(" ")).collect(Collectors.toList());
|
|
|
+
|
|
|
+ //特殊处理下:如果包含了本币和原币,那么baseTitleList需要重复"成本", "市值", "估值增值"字段
|
|
|
+ List<String> titleList = Arrays.stream(titleFieldList.get(0).trim().replace("\r", "").split(" ")).collect(Collectors.toList());
|
|
|
+ if (nextTitleList.contains("原币") && nextTitleList.contains("本币")) {
|
|
|
+ for (String titleField : titleList) {
|
|
|
+ baseTitleList.add(titleField);
|
|
|
+ if (EXTRA_FIELD_LIST.contains(titleField)) {
|
|
|
+ baseTitleList.add(titleField);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ } else if (nextTitleList.contains("本币") || nextTitleList.contains("原币")) {
|
|
|
+ baseTitleList.addAll(titleList);
|
|
|
+ }
|
|
|
+
|
|
|
+ Row firstRow = sheet.createRow(rowNumber);
|
|
|
+ Row secondRow = sheet.createRow(rowNumber + 1);
|
|
|
+ Row thirdRow = sheet.createRow(rowNumber + 2);
|
|
|
+
|
|
|
+ int count = 0;
|
|
|
+ for (int index = 0; index < baseTitleList.size(); index++) {
|
|
|
+ Cell firstCell = firstRow.createCell(index);
|
|
|
+ Cell secondRowCell = secondRow.createCell(index);
|
|
|
+ Cell thirdCell = thirdRow.createCell(index);
|
|
|
+
|
|
|
+ //特殊处理"成本", "市值", "估值增值"字段
|
|
|
+ String field = baseTitleList.get(index);
|
|
|
+ if (EXTRA_FIELD_LIST.contains(field)) {
|
|
|
+ if (nextTitleList.contains("原币") || nextTitleList.contains("本币")) {
|
|
|
+ firstCell.setCellValue(field);
|
|
|
+ secondRowCell.setCellValue(nextTitleList.get(count));
|
|
|
+ thirdCell.setCellValue(next2TitleList.get(count));
|
|
|
+ count++;
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ firstCell.setCellValue(field);
|
|
|
+ secondRowCell.setCellValue(field);
|
|
|
+ thirdCell.setCellValue(field);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ private void handleOneRowTitleField(List<String> titleFieldList, Sheet sheet, Integer rowNumber) {
|
|
|
+ Row row = sheet.createRow(rowNumber);
|
|
|
+ List<String> baseTitleList = Arrays.stream(titleFieldList.get(0).split(" "))
|
|
|
+ .filter(e -> !" ".equals(e) && StrUtil.isNotBlank(e)).collect(Collectors.toList());
|
|
|
+ for (int i = 0; i < baseTitleList.size(); i++) {
|
|
|
+ Cell cell = row.createCell(i);
|
|
|
+ cell.setCellValue(baseTitleList.get(i));
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 找到字段标题行
|
|
|
+ *
|
|
|
+ * @param lines PDF表格的所有行
|
|
|
+ * @return 字段标题行
|
|
|
+ */
|
|
|
+ private List<String> getTitleFieldRow(List<String> lines) {
|
|
|
+ List<String> titleFieldList = CollUtil.newArrayList();
|
|
|
+ for (String line : lines) {
|
|
|
+ //如果当前行的第一个值是整数(意味着当前的前一行为科目代码行了),那么说明已经找完所有标题行内容
|
|
|
+ List<String> rowValueList = Arrays.stream(line.split(" ")).collect(Collectors.toList());
|
|
|
+ String firstValueOfRow = CollUtil.isNotEmpty(rowValueList) ? rowValueList.get(0) : null;
|
|
|
+ if (isInteger(firstValueOfRow)) {
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ if (StrUtil.isNotBlank(line)) {
|
|
|
+ titleFieldList.add(line);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return titleFieldList;
|
|
|
+ }
|
|
|
+
|
|
|
+ private void mergeExcelCell(Sheet sheet, boolean isSpecial) {
|
|
|
+ if (sheet == null || sheet.getLastRowNum() < 4) {
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ //先找到"科目代码"字段列在excel中的行号
|
|
|
+ Integer mixRowNumber = null;
|
|
|
+ Integer maxRowNumber = null;
|
|
|
+ for (int i = 0; i < sheet.getLastRowNum(); i++) {
|
|
|
+ Row row = sheet.getRow(i);
|
|
|
+ //找第一个"科目代码"出现的行数
|
|
|
+ if ("科目代码".equals(row.getCell(0).toString())) {
|
|
|
+ mixRowNumber = mixRowNumber != null ? mixRowNumber : i;
|
|
|
+ mixRowNumber = isSpecial ? mixRowNumber + 1 : mixRowNumber;
|
|
|
+ isSpecial = false;
|
|
|
+ }
|
|
|
+ //找最后一个"科目代码"出现的行数
|
|
|
+ if (mixRowNumber != null && "科目代码".equals(row.getCell(0).toString()) && i == sheet.getLastRowNum() - 1) {
|
|
|
+ maxRowNumber = i + 1;
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ //参数分别为:起始行号、结束行号、起始列号、结束列号
|
|
|
+ if (mixRowNumber != null && maxRowNumber != null && !mixRowNumber.equals(maxRowNumber)) {
|
|
|
+ Row firstRow = sheet.getRow(mixRowNumber);
|
|
|
+ Row lastRow = sheet.getRow(maxRowNumber);
|
|
|
+ int rowNum = firstRow.getLastCellNum();
|
|
|
+ for (int i = 0; i < rowNum; i++) {
|
|
|
+ //不同行,同一列的上下单元格合并
|
|
|
+ if (firstRow.getCell(i).toString().equals(lastRow.getCell(i).toString())) {
|
|
|
+ sheet.addMergedRegion(new CellRangeAddress(mixRowNumber, maxRowNumber, i, i));
|
|
|
+ }
|
|
|
+ //同一行的左右两个单元格合并
|
|
|
+ if (i + 1 < rowNum && firstRow.getCell(i).toString().equals(firstRow.getCell(i + 1).toString())) {
|
|
|
+ sheet.addMergedRegion(new CellRangeAddress(mixRowNumber, mixRowNumber, i, i + 1));
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ }
|
|
|
+
|
|
|
+ public boolean isInteger(String str) {
|
|
|
+ if (StrUtil.isBlank(str)) {
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+ return str.matches("^-?\\d+$");
|
|
|
+ }
|
|
|
+
|
|
|
+ public File saveFile(MultipartFile file, String excelUploadDir, String newExcelName) {
|
|
|
+ File targetFile = null;
|
|
|
+ try {
|
|
|
+ targetFile = new File(excelUploadDir, newExcelName);
|
|
|
+ FileUtils.writeByteArrayToFile(targetFile, file.getBytes());
|
|
|
+ } catch (IOException e) {
|
|
|
+ e.printStackTrace();
|
|
|
+ log.error("save file error,error -> {}", e.getMessage());
|
|
|
+ return targetFile;
|
|
|
+ }
|
|
|
+ return targetFile;
|
|
|
+ }
|
|
|
+
|
|
|
+ public File saveFile(File file, String excelUploadDir, String newExcelName) {
|
|
|
+ File targetFile = null;
|
|
|
+ try {
|
|
|
+ targetFile = new File(excelUploadDir, newExcelName);
|
|
|
+ FileUtils.writeByteArrayToFile(targetFile, Files.readAllBytes(file.toPath()));
|
|
|
+ } catch (IOException e) {
|
|
|
+ e.printStackTrace();
|
|
|
+ log.error("save file error,error -> {}", e.getMessage());
|
|
|
+ return targetFile;
|
|
|
+ }
|
|
|
+ return targetFile;
|
|
|
+ }
|
|
|
+
|
|
|
+}
|