Przeglądaj źródła

feat:邮件解析-支持pdf估值表邮件解析

mozuwen 7 miesięcy temu
rodzic
commit
5283c90160

+ 74 - 0
service-base/src/main/java/com/simuwang/base/common/enums/WaterMarkEnum.java

@@ -0,0 +1,74 @@
+package com.simuwang.base.common.enums;
+
+/**
+ * @author mozuwen
+ * @description Enumeration of the PDF watermark types recognised when converting valuation-table PDFs
+ * @date 2024/03/14
+ */
+public enum WaterMarkEnum {
+
+    /**
+     * Guotai Junan Securities watermark
+     * NOTE(review): the literal text below mentions a different broker name than this comment - confirm which is intended.
+     */
+    GUOTAI_JUNAN_SECURITIES("仅供中原证券逐鹿中原杯使用", "",  "", ""),
+
+    /**
+     * China Merchants Securities watermark
+     */
+    CHINA_MERCHANTS_SECURITIES_1("业绩证明复制无效谨防假冒", "业|绩|证|明|复|制|无|效|谨|防|假|冒",  "", ""),
+
+    /**
+     * China Merchants Securities watermark
+     */
+    CHINA_MERCHANTS_SECURITIES_2("产品尽调复制无效谨防假冒", "产|品|尽|调|复|制|无|效|谨|防|假|冒",  "产", "资"),
+
+    /**
+     * China Merchants Securities watermark
+     */
+    CHINA_MERCHANTS_SECURITIES_3("公司及产品审计复制无效谨防假冒", "公|司|及|产|品|审|计|复|制|无|效|谨|防|假|冒",  "产", "资"),
+
+    /**
+     * China Merchants Securities watermark
+     */
+    CHINA_MERCHANTS_SECURITIES_4("其他场景复制无效谨防假冒", "其|他|场|景|复|制|无|效|谨|防|假|冒",  "", ""),
+
+    /**
+     * China Merchants Securities watermark
+     */
+    CHINA_MERCHANTS_SECURITIES_5("复制无效谨防假冒", "复|制|无|效|谨|防|假|冒",  "", ""),
+
+    /**
+     * CITIC Construction Securities watermark (translated literally from the original comment)
+     */
+    CITIC_SECURITIES("仅供申请打新资格及网下配售使用", "仅|供|申|请|打|新|资|格|及|网|下|配|售|使|用",  "资", "产");
+
+
+    // Full watermark sentence.
+    private final String text;
+    // Regular expression (an alternation of the watermark's single characters); may be empty.
+    private final String regex;
+    // Character of the watermark that also occurs in legitimate cell text; may be empty.
+    private final String influenceText;
+    // Companion value of influenceText; may be empty.
+    private final String rectifyText;
+
+    WaterMarkEnum(String text, String regex, String influenceText, String rectifyText) {
+        this.text = text;
+        this.regex = regex;
+        this.influenceText = influenceText;
+        this.rectifyText = rectifyText;
+    }
+
+    public String getText() {
+        return text;
+    }
+
+    public String getRegex() {
+        return regex;
+    }
+
+    public String getInfluenceText() {
+        return influenceText;
+    }
+
+    public String getRectifyText() {
+        return rectifyText;
+    }
+
+}

+ 1 - 2
service-base/src/main/java/com/simuwang/base/pojo/dos/ValuationTableDO.java

@@ -6,7 +6,6 @@ import com.baomidou.mybatisplus.annotation.TableName;
 import lombok.Data;
 
 import java.math.BigDecimal;
-import java.time.LocalDate;
 import java.util.Date;
 
 @Data
@@ -53,7 +52,7 @@ public class ValuationTableDO {
     @TableField(value = "original_file")
     private String originalFile;
     /**
-     * 转换后的文件名称(原文件名加密)
+     * pdf转化为excel文件的路径
      */
     @TableField(value = "converted_file")
     private String convertedFile;

+ 25 - 0
service-base/src/main/java/com/simuwang/base/pojo/dto/ValuationPdfTransformToExcelDTO.java

@@ -0,0 +1,25 @@
+package com.simuwang.base.pojo.dto;
+
+import lombok.Data;
+
+import java.io.File;
+
+/**
+ * Carries one PDF valuation table together with the Excel file generated from it.
+ */
+@Data
+public class ValuationPdfTransformToExcelDTO {
+
+    /**
+     * Name of the PDF file
+     */
+    private String originalFileName;
+
+    /**
+     * Excel file produced by converting the PDF
+     */
+    private File excelFile;
+
+    /**
+     * Location of the PDF file
+     */
+    private String fileUrl;
+
+}

+ 2 - 3
service-daq/src/main/java/com/simuwang/daq/service/EmailParseApiServiceImpl.java

@@ -7,7 +7,6 @@ import cn.hutool.core.map.MapUtil;
 import cn.hutool.core.util.StrUtil;
 import com.simuwang.base.common.conts.DateConst;
 import com.simuwang.base.common.util.EmailUtil;
-import com.simuwang.base.common.util.ExcelUtil;
 import com.simuwang.base.mapper.EmailFileInfoMapper;
 import com.simuwang.base.mapper.EmailParseInfoMapper;
 import com.simuwang.base.pojo.dos.EmailFileInfoDO;
@@ -81,7 +80,7 @@ public class EmailParseApiServiceImpl implements EmailParseApiService {
                 fileNameNavMap.put(emailContentInfoDTO, fundNavDTOList);
                 emailFundNavDTOList.addAll(fundNavDTOList);
             } catch (Exception e) {
-                log.error("堆栈信息:{}", ExceptionUtil.stacktraceToString(e));
+                log.error("重新解析邮件失败,邮件id:{},堆栈信息:{}", emailId, ExceptionUtil.stacktraceToString(e));
             }
         }
         // 保存相关信息 -> 邮件信息表,邮件文件表,邮件净值表,邮件规模表,基金净值表
@@ -112,7 +111,7 @@ public class EmailParseApiServiceImpl implements EmailParseApiService {
     }
 
     public static String getEmailContent(String filePath) {
-        if (StrUtil.isNotBlank(filePath) && ExcelUtil.isPdf(filePath)) {
+        if (StrUtil.isNotBlank(filePath) && filePath.endsWith("html")) {
             StringBuilder content = new StringBuilder();
             try (BufferedReader reader = new BufferedReader(new FileReader(filePath))) {
                 String line;

+ 659 - 0
service-daq/src/main/java/com/simuwang/daq/service/PdfToExcelService.java

@@ -0,0 +1,659 @@
+package com.simuwang.daq.service;
+
+import cn.hutool.core.collection.CollUtil;
+import cn.hutool.core.collection.ListUtil;
+import cn.hutool.core.exceptions.ExceptionUtil;
+import cn.hutool.core.text.StrBuilder;
+import cn.hutool.core.util.StrUtil;
+import com.simuwang.base.common.enums.WaterMarkEnum;
+import com.simuwang.base.common.util.ExcelUtil;
+import com.simuwang.base.pojo.dto.ValuationPdfTransformToExcelDTO;
+import org.apache.commons.io.FileUtils;
+import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.text.PDFTextStripper;
+import org.apache.poi.ss.usermodel.Cell;
+import org.apache.poi.ss.usermodel.Row;
+import org.apache.poi.ss.usermodel.Sheet;
+import org.apache.poi.ss.usermodel.Workbook;
+import org.apache.poi.ss.util.CellRangeAddress;
+import org.apache.poi.xssf.usermodel.XSSFWorkbook;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.stereotype.Component;
+import org.springframework.web.multipart.MultipartFile;
+import technology.tabula.*;
+import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
+
+@Component
+public class PdfToExcelService {
+
+    private static final Logger log = LoggerFactory.getLogger(PdfToExcelService.class);
+
+    // Base directory taken from the "email.file.path" property; generated Excel files are written beneath it.
+    @Value("${email.file.path}")
+    private String path;
+
+    // Title fields that get their own sub-header values when the title spans several rows.
+    private static final List<String> EXTRA_FIELD_LIST = ListUtil.toList("成本", "市值", "估值增值");
+    // Column titles whose data cells are reduced to digits, '.' and '-' before being written.
+    private static final List<String> NUMBER_FIELD_LIST = ListUtil.toList("数量", "单位成本", "成本", "成本占净值",
+            "成本占净值%", "成本占净值", "市价", "市值", "市值占净值比", "市值占净值比%");
+    // Prefixes (including truncated variants) that mark a line as a title/header line rather than a data line.
+    private static final List<String> TITLE_FIELD_LIST = ListUtil.toList("单位净值:", "位净值:", "估值日期:", "值日期:", "单位:",
+            "科目代码", "科目名", "数量", "单位成本", "位成本", "成本", "成本占净值%", "本币", "十亿千百十",
+            "成本占净值", "市价", "市值", "值", "市值占净值%", "市值占净", "估值增值", "停牌信息", "权益信息");
+    // NOTE(review): not referenced inside this class; presumably consumed by other classes - confirm.
+    public static final List<String> TOTAL_MARKET_FIELD = ListUtil.toList("资合计", "资类合计:", "资类合计:", "资净值", "基金资净值:",
+            "基金资净值:", "资产资净值", "资资净值:", "资资净值:", "信托资净值:", "信托资净值:");
+
+    /**
+     * Converts every PDF valuation table in {@code fileList} into an Excel (.xlsx) file.
+     * <p>
+     * Non-PDF (and null) entries are ignored. A file whose conversion fails is logged and skipped,
+     * so the returned list only contains successfully converted files.
+     *
+     * @param fileList files to convert; may be null
+     * @return one DTO per successfully converted PDF, never null
+     */
+    public List<ValuationPdfTransformToExcelDTO> pdfToExcel(List<File> fileList) {
+        if (fileList == null) {
+            return CollUtil.newArrayList();
+        }
+        List<File> pdfFileList = fileList.stream()
+                .filter(e -> e != null && ExcelUtil.isPdf(e.getName())).collect(Collectors.toList());
+        if (CollUtil.isEmpty(pdfFileList)) {
+            return CollUtil.newArrayList();
+        }
+        List<ValuationPdfTransformToExcelDTO> pdfToExcelDTOList = CollUtil.newArrayList();
+        String excelUploadDir = path + File.separator + "valuation_table_excel" + File.separator;
+        for (File pdfFile : pdfFileList) {
+            String excelFilePath = excelUploadDir + pdfFile.getName().replace(".pdf", "") + ".xlsx";
+            File parentDir = new File(excelFilePath).getParentFile();
+            if (!parentDir.exists()) {
+                parentDir.mkdirs();
+                parentDir.setExecutable(true);
+            }
+            // try-with-resources guarantees the PDF document and the workbook are released
+            // even when parsing throws half-way through.
+            try (PDDocument document = Loader.loadPDF(pdfFile);
+                 Workbook workbook = new XSSFWorkbook()) {
+                // Read the raw text with PDFBox
+                List<String> lines = readTextFromPDFUsingPdfBox(document);
+                // Detect the watermark type (null means the PDF carries no known watermark)
+                WaterMarkEnum waterMark = getPdfWaterMarkType(lines);
+                Sheet sheet = workbook.createSheet("Sheet1");
+                // 1. Header rows that precede the table
+                Integer headerRowNumber = handlerExcelHeader(lines, sheet, waterMark);
+
+                // Extract the table content with Tabula
+                List<String> lineList = extractTableFormPDFUsingTabula(document, waterMark);
+                // 2. Column-title rows
+                Integer rowNumber = handlerTitleField(lineList, sheet, headerRowNumber);
+                // China Merchants Securities tables are special: they contain two title rows starting with "科目代码"
+                boolean isSpecial = rowNumber == 7;
+                // 3. Merge the title cells
+                mergeExcelCell(sheet, isSpecial);
+                // 4. Write the data rows
+                readDateToExcel(sheet, lineList, rowNumber, waterMark);
+
+                // Open the target file only once the workbook is complete, so a failed
+                // conversion does not leave an empty .xlsx behind.
+                try (OutputStream outputStream = Files.newOutputStream(Paths.get(excelFilePath))) {
+                    workbook.write(outputStream);
+                }
+
+                ValuationPdfTransformToExcelDTO toExcelDTO = new ValuationPdfTransformToExcelDTO();
+                toExcelDTO.setOriginalFileName(pdfFile.getName());
+                toExcelDTO.setExcelFile(new File(excelFilePath));
+                toExcelDTO.setFileUrl(pdfFile.getAbsolutePath());
+                pdfToExcelDTOList.add(toExcelDTO);
+
+            } catch (Exception e) {
+                log.error("pdf转为excel报错,堆栈:{}", ExceptionUtil.stacktraceToString(e));
+            }
+        }
+        return pdfToExcelDTOList;
+    }
+
+    /**
+     * Strips watermark characters from the text lines read by PDFBox and drops lines left blank.
+     * Example input file: 估值表_纽达投资利道永晟五号私募证券投资基金_SGL183_202312292024-02-26.pdf
+     *
+     * @param lines     text lines read from the PDF
+     * @param waterMark detected watermark; when null the lines are returned untouched
+     * @return the lines without watermark characters
+     */
+    private List<String> removeWaterMark(List<String> lines, WaterMarkEnum waterMark) {
+        if (waterMark == null) {
+            return lines;
+        }
+        String pattern = waterMark.getRegex();
+        List<String> cleanedLines = CollUtil.newArrayList();
+        for (String line : lines) {
+            String stripped = line.replaceAll(pattern, "");
+            if (StrUtil.isNotBlank(stripped)) {
+                cleanedLines.add(stripped);
+            }
+        }
+        return cleanedLines;
+    }
+
+    /**
+     * Determines which known watermark, if any, is present in the PDF text.
+     * A watermark matches when every single character of its text occurs somewhere in the content;
+     * the first matching enum constant (in declaration order) wins.
+     *
+     * @param lines text lines read from the PDF
+     * @return the matching watermark, or null when none matches or there is no text
+     */
+    private WaterMarkEnum getPdfWaterMarkType(List<String> lines) {
+        if (CollUtil.isEmpty(lines)) {
+            return null;
+        }
+        // Whole PDF text as one string
+        String content = String.join("", lines);
+        for (WaterMarkEnum candidate : WaterMarkEnum.values()) {
+            String[] singleChars = StrUtil.cut(candidate.getText(), 1);
+            boolean everyCharPresent = Arrays.stream(singleChars).allMatch(content::contains);
+            if (everyCharPresent) {
+                return candidate;
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Extracts all table rows of the PDF with Tabula's spreadsheet algorithm.
+     * Each row becomes one line whose cells are joined by a single space; spaces inside a cell are
+     * removed first and, when the watermark has a non-blank regex, watermark characters as well.
+     *
+     * @param document  the loaded PDF
+     * @param waterMark detected watermark, may be null
+     * @return the extracted rows, one string per table row
+     */
+    private List<String> extractTableFormPDFUsingTabula(PDDocument document, WaterMarkEnum waterMark) {
+        String regex = waterMark != null ? waterMark.getRegex() : null;
+        boolean stripWaterMark = !StrUtil.isBlank(regex);
+        StrBuilder tableDataText = new StrBuilder();
+        PageIterator pages = new ObjectExtractor(document).extract();
+        while (pages.hasNext()) {
+            Page page = pages.next();
+            List<Table> tables = new SpreadsheetExtractionAlgorithm().extract(page);
+            for (Table table : tables) {
+                for (List<RectangularTextContainer> row : table.getRows()) {
+                    List<String> cellTexts = CollUtil.newArrayList();
+                    for (RectangularTextContainer cell : row) {
+                        String cellText = cell.getText().replace(" ", "");
+                        cellTexts.add(stripWaterMark ? cellText.replaceAll(regex, "") : cellText);
+                    }
+                    tableDataText.append("\n").append(String.join(" ", cellTexts));
+                }
+            }
+        }
+        // Split the accumulated text back into lines
+        String[] rowLines = tableDataText.toString().trim().split("\\n");
+        return Arrays.stream(rowLines).collect(Collectors.toList());
+    }
+
+
+    /**
+     * Reads the whole text of the PDF with PDFBox (sorted by position) and splits it on "\n".
+     * When reading fails the error is logged and the text is treated as empty.
+     *
+     * @param document the loaded PDF
+     * @return the text lines of the document
+     */
+    private List<String> readTextFromPDFUsingPdfBox(PDDocument document) {
+        PDFTextStripper stripper = new PDFTextStripper();
+        stripper.setSortByPosition(true);
+        String content;
+        try {
+            content = stripper.getText(document);
+        } catch (IOException e) {
+            log.error("error: read text from pdf...", e);
+            content = "";
+        }
+        // Split the text into lines
+        List<String> textLines = CollUtil.newArrayList();
+        for (String textLine : content.split("\\n")) {
+            textLines.add(textLine);
+        }
+        return textLines;
+    }
+
+    /**
+     * Writes the header rows that precede the column-title row starting with "科目代码" into the sheet.
+     *
+     * @param lines     text lines of the PDF
+     * @param sheet     target sheet
+     * @param waterMark detected watermark, may be null
+     * @return number of rows occupied by the header (last row index + 1)
+     */
+    private Integer handlerExcelHeader(List<String> lines, Sheet sheet, WaterMarkEnum waterMark) {
+        List<String> headLineList = CollUtil.newArrayList();
+        // Watermark characters are removed from the PDFBox text first
+        for (String line : removeWaterMark(lines, waterMark)) {
+            // Skip blank lines, whitespace/"\r"-only lines and lines starting with "成本"
+            // (seen in 国泰君安证券股份有限公司_嘉禾三号私募证券投资基金_专用表)
+            if (!filterHeadLine(line)) {
+                continue;
+            }
+            // Everything before the "科目代码" line belongs to the header
+            if (line.trim().startsWith("科目代码")) {
+                break;
+            }
+            // Keep lines starting with "科目" and lines that do not start with a title field
+            if (line.startsWith("科目") || filterTitleField(line)) {
+                headLineList.add(line);
+            }
+        }
+
+        // Write the header lines to the sheet, one row per line
+        int rowIndex = 0;
+        for (String headLine : headLineList) {
+            Row row = sheet.createRow(rowIndex++);
+            int columnIndex = 0;
+            for (String value : splitHeaderRow(headLine)) {
+                row.createCell(columnIndex++).setCellValue(value);
+            }
+        }
+        return sheet.getLastRowNum() + 1;
+    }
+
+    /**
+     * Splits a header line on spaces into its non-blank parts.
+     * Lines containing "国泰君安证券股份有限公司" but no "_" are re-joined with "_" into a single
+     * value, because PDFBox does not recognise the "_" of that title.
+     *
+     * @param headRowValue content of the header line
+     * @return the cell values for the header row; empty when the line is blank
+     */
+    private List<String> splitHeaderRow(String headRowValue) {
+        if (StrUtil.isBlank(headRowValue)) {
+            return CollUtil.newArrayList();
+        }
+        List<String> parts = CollUtil.newArrayList();
+        for (String part : headRowValue.split(" ")) {
+            if (StrUtil.isNotBlank(part)) {
+                parts.add(part);
+            }
+        }
+        boolean underscoreLost = headRowValue.contains("国泰君安证券股份有限公司") && !headRowValue.contains("_");
+        if (underscoreLost) {
+            return ListUtil.toList(String.join("_", parts));
+        }
+        return parts;
+    }
+
+
+    /**
+     * Tells whether a line may be part of the header.
+     *
+     * @param text a line of PDF text
+     * @return false for blank lines, lines starting with "成本" and lines consisting only of
+     * spaces and "\r"; true otherwise
+     */
+    private boolean filterHeadLine(String text) {
+        if (StrUtil.isBlank(text) || text.trim().startsWith("成本")) {
+            return false;
+        }
+        String withoutSpacesAndCr = text.replace("\r", "").replace(" ", "");
+        return StrUtil.isNotBlank(withoutSpacesAndCr);
+    }
+
+    /**
+     * Writes the data lines extracted from the PDF into the sheet, starting at {@code rowNumber}.
+     * <p>
+     * Cells under a numeric column title are reduced to digits, '.' and '-'; an upper-case suffix
+     * following the subject code (e.g. "1102.01.01.605056 SH") is merged into the first cell; and
+     * characters removed together with the watermark are restored.
+     *
+     * @param sheet     target sheet that already contains the title rows
+     * @param lines     table lines extracted from the PDF
+     * @param rowNumber index of the first row to write
+     * @param waterMark detected watermark, may be null
+     */
+    private void readDateToExcel(Sheet sheet, List<String> lines, Integer rowNumber, WaterMarkEnum waterMark) {
+        // Locate the title row (first row whose first cell is "科目代码"); falls back to row 0
+        int baseRowNumber = 0;
+        for (int i = 0; i <= sheet.getLastRowNum(); i++) {
+            if ("科目代码".equals(cellText(sheet.getRow(i), 0))) {
+                baseRowNumber = i;
+                break;
+            }
+        }
+        Row titleRow = sheet.getRow(baseRowNumber);
+        for (String field : lines) {
+            // Skip lines that must not be written (title lines, page footers, blank lines ...)
+            if (!canNeedToWrite(field)) {
+                continue;
+            }
+            Row row = sheet.createRow(rowNumber);
+            List<String> split = Arrays.stream(field.split(" ")).collect(Collectors.toList());
+            for (int i1 = 0; i1 < split.size(); i1++) {
+                Cell cell = row.createCell(i1);
+                // Blank tokens become empty cells; '*' markers are dropped
+                String cellValue = StrUtil.isBlank(split.get(i1)) ? null : split.get(i1).replace("*", "");
+                // Numeric columns: strip every non-numeric character. The title lookup is
+                // null-safe because a data row may have more columns than the title row.
+                if (StrUtil.isNotBlank(cellValue) && NUMBER_FIELD_LIST.contains(cellText(titleRow, i1))) {
+                    cellValue = filterNotNumber(cellValue);
+                }
+
+                // Special case: subject code followed by letters, e.g. 1102.01.01.605056 SH, SQ, CFX
+                if (i1 == 0 && i1 + 1 < split.size()) {
+                    String nextCellValue = StrUtil.isBlank(split.get(i1 + 1)) ? null : split.get(i1 + 1).replace("*", "");
+                    if (StrUtil.isNotBlank(nextCellValue) && nextCellValue.matches("^[A-Z]+$")) {
+                        // Avoid producing the literal "null XX" when the first token is blank
+                        String prefix = cellValue == null ? "" : cellValue;
+                        cellValue = (prefix + " " + nextCellValue).trim();
+                        split.remove(1);
+                    }
+                }
+                cellValue = removeInfluenceOfWaterMark(cellValue, waterMark);
+                cell.setCellValue(cellValue);
+            }
+            rowNumber++;
+        }
+    }
+
+    /**
+     * Null-safe read of a cell's text.
+     *
+     * @return the cell's string form, or null when the row or the cell does not exist
+     */
+    private String cellText(Row row, int columnIndex) {
+        if (row == null) {
+            return null;
+        }
+        Cell cell = row.getCell(columnIndex);
+        return cell == null ? null : cell.toString();
+    }
+
+    /**
+     * Repairs a cell value damaged by watermark removal (affects fields such as total assets
+     * and net asset value).
+     *
+     * @param cellValue cell content
+     * @param waterMark detected watermark, may be null
+     * @return the repaired content; the input itself when it is blank, there is no watermark or
+     * the watermark declares no influence text
+     */
+    private String removeInfluenceOfWaterMark(String cellValue, WaterMarkEnum waterMark) {
+        boolean nothingToRepair = StrUtil.isBlank(cellValue)
+                || waterMark == null
+                || StrUtil.isBlank(waterMark.getInfluenceText());
+        if (nothingToRepair) {
+            return cellValue;
+        }
+        return RestoreCellValue(cellValue, waterMark.getInfluenceText(), waterMark.getRectifyText());
+    }
+
+    /**
+     * Re-inserts the character that was stripped from a cell together with the watermark.
+     * <ul>
+     *   <li>influence text "产": a "产" is appended after every '资'</li>
+     *   <li>influence text "资": a "资" is inserted before every '产'</li>
+     * </ul>
+     * Any other influence text leaves the value unchanged (previously an empty string was
+     * returned, which silently wiped the cell).
+     *
+     * @param cellValue     cell content
+     * @param influenceText character removed by the watermark regex
+     * @param rectifyText   companion character; only checked for blankness
+     * @return the restored cell content
+     */
+    private String RestoreCellValue(String cellValue, String influenceText, String rectifyText) {
+        if (StrUtil.isBlank(cellValue) || StrUtil.isBlank(influenceText) || StrUtil.isBlank(rectifyText)) {
+            return cellValue;
+        }
+        if ("产".equals(influenceText)) {
+            return cellValue.replace("资", "资产");
+        }
+        if ("资".equals(influenceText)) {
+            return cellValue.replace("产", "资产");
+        }
+        // Unknown influence text: nothing we know how to repair, keep the value as it is
+        return cellValue;
+    }
+
+    /**
+     * Keeps only digits, '.' and '-' of the given value, in their original order.
+     *
+     * @param cellValue cell content, must not be null
+     * @return the value without any other character
+     */
+    private String filterNotNumber(String cellValue) {
+        StringBuilder kept = new StringBuilder();
+        for (char ch : cellValue.toCharArray()) {
+            boolean numericChar = Character.isDigit(ch) || ch == '.' || ch == '-';
+            if (numericChar) {
+                kept.append(ch);
+            }
+        }
+        return kept.toString();
+    }
+
+    /**
+     * Decides whether an extracted line is a data line that has to be written to the sheet.
+     *
+     * @param field an extracted table line
+     * @return false for blank lines, lines starting with a title field, lines containing "参考",
+     * lines containing both "第" and "页", and lines whose non-blank tokens are all "%"
+     */
+    private boolean canNeedToWrite(String field) {
+        // Lines starting with a title field (and blank lines) are never written
+        if (!filterTitleField(field)) {
+            return false;
+        }
+        // Lines containing "参考", or both "第" and "页", are skipped
+        boolean pageMarker = field.contains("第") && field.contains("页");
+        if (field.contains("参考") || pageMarker) {
+            return false;
+        }
+        // A line is worth writing as soon as one non-blank token is something other than "%"
+        for (String token : field.split(" ")) {
+            if (StrUtil.isNotBlank(token) && !"%".equals(token)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Checks that a line does not start with one of the known title fields.
+     *
+     * @param field a line of text
+     * @return false when the line is blank or its trimmed form starts with an entry of
+     * TITLE_FIELD_LIST; true otherwise
+     */
+    private boolean filterTitleField(String field) {
+        if (StrUtil.isBlank(field)) {
+            return false;
+        }
+        String trimmed = field.trim();
+        return TITLE_FIELD_LIST.stream().noneMatch(trimmed::startsWith);
+    }
+
+    /**
+     * Writes the column-title rows of the table into the sheet.
+     * The title lines are removed from {@code lines} (the list is modified in place).
+     *
+     * @param lines           table lines extracted from the PDF
+     * @param sheet           target sheet
+     * @param headerRowNumber number of rows already occupied by the header
+     * @return index of the first row after the title rows; equals headerRowNumber when the
+     * number of title lines is not 1, 2, 3 or 4 (nothing is written in that case)
+     */
+    private Integer handlerTitleField(List<String> lines, Sheet sheet, Integer headerRowNumber) {
+        Integer rowNumber = headerRowNumber;
+        //1. Pick out the lines that belong to the title
+        List<String> titleFieldList = getTitleFieldRow(lines);
+        lines.removeAll(titleFieldList);
+
+        //Find the line starting with "科目代码" and drop everything before it
+        int skipNumber = 0;
+        for (String field : titleFieldList) {
+            if (field.trim().startsWith("科目代码")) {
+                break;
+            }
+            skipNumber++;
+        }
+        titleFieldList = titleFieldList.stream().skip(skipNumber).collect(Collectors.toList());
+
+        //2. Handle the title columns
+        //1 - title occupies 1 line: the regular layout with 11 columns
+        if (titleFieldList.size() == 1) {
+            handleOneRowTitleField(titleFieldList, sheet, rowNumber);
+            return rowNumber + 1;
+        } else if (titleFieldList.size() == 2) {
+            //2 - title occupies 2 lines: the second line usually holds useless fields such as "%"
+            //Workaround: the line holding "原币"/"本币" was not recognised, which would misalign data and title columns
+            String secondRowValue = titleFieldList.get(titleFieldList.size() - 1);
+            if (secondRowValue.trim().contains("十亿千百十万千")) {
+                int length = secondRowValue.trim().split(" ").length;
+                if (length >= 6) {
+                    String extraRowValue = "原币 本币 原币 本币 原币 本币";
+                    titleFieldList.add(1, extraRowValue);
+                }
+            }
+            if (titleFieldList.size() == 3) {
+                handleThreeRowTitleField(titleFieldList, sheet, rowNumber);
+                rowNumber += 3;
+            } else {
+                handleOneRowTitleField(titleFieldList, sheet, rowNumber);
+                rowNumber++;
+            }
+            return rowNumber;
+        } else if (titleFieldList.size() == 3) {
+            //3 - title occupies 3 lines: 17 columns with "原币" or "原币"/"本币" sub-fields
+            handleThreeRowTitleField(titleFieldList, sheet, rowNumber);
+            return rowNumber + 3;
+        } else if (titleFieldList.size() == 4) {
+            //4 - title occupies 4 lines: special layout with two "科目代码" lines, 14 columns (China Merchants Securities PDFs)
+            handleFourRowTitleField(titleFieldList, sheet, rowNumber);
+            return rowNumber + 4;
+        }
+
+        return rowNumber;
+    }
+
+    /**
+     * Writes a four-line title block (two lines starting with "科目代码") into four sheet rows.
+     * The first line is written as is; the remaining three rows are built from the second line,
+     * with the third and fourth lines supplying sub-values for the "成本"/"市值"/"估值增值" columns.
+     *
+     * @param titleFieldList the four title lines
+     * @param sheet          target sheet
+     * @param rowNumber      index of the first row to write
+     */
+    private void handleFourRowTitleField(List<String> titleFieldList, Sheet sheet, Integer rowNumber) {
+        List<String> fisrtTitleList = Arrays.stream(titleFieldList.get(0).trim().replace("\r", "").split(" ")).collect(Collectors.toList());
+        List<String> secondTitleList = Arrays.stream(titleFieldList.get(1).trim().replace("\r", "").split(" ")).collect(Collectors.toList());
+        List<String> thirdTitleList = Arrays.stream(titleFieldList.get(2).trim().replace("\r", "").split(" ")).collect(Collectors.toList());
+        List<String> fourTitleList = Arrays.stream(titleFieldList.get(3).trim().replace("\r", "").split(" ")).collect(Collectors.toList());
+        //First "科目代码" line
+        Row firstRow = sheet.createRow(rowNumber);
+        for (int i = 0; i < fisrtTitleList.size(); i++) {
+            Cell firstCell = firstRow.createCell(i);
+            firstCell.setCellValue(fisrtTitleList.get(i));
+        }
+        //Second "科目代码" line and the two rows below it
+        Row secondRow = sheet.createRow(rowNumber + 1);
+        Row thirdRow = sheet.createRow(rowNumber + 2);
+        Row fourRow = sheet.createRow(rowNumber + 3);
+
+        // count walks through the third/fourth lines, which only hold values for the extra fields
+        int count = 0;
+        for (int index = 0; index < secondTitleList.size(); index++) {
+            Cell secondRowCell = secondRow.createCell(index);
+            Cell thirdCell = thirdRow.createCell(index);
+            Cell fourCell = fourRow.createCell(index);
+
+            String field = secondTitleList.get(index);
+            if (EXTRA_FIELD_LIST.contains(field)) {
+                secondRowCell.setCellValue(field);
+                thirdCell.setCellValue(thirdTitleList.get(count));
+                fourCell.setCellValue(fourTitleList.get(count));
+                count++;
+            } else {
+                // Regular fields are repeated in all three rows
+                secondRowCell.setCellValue(field);
+                thirdCell.setCellValue(field);
+                fourCell.setCellValue(field);
+            }
+        }
+    }
+
+    /**
+     * Writes a three-line title block into three sheet rows.
+     * Regular fields are repeated in all three rows; the "成本"/"市值"/"估值增值" columns take
+     * their second- and third-row values from the second and third title lines.
+     * NOTE(review): when the second line contains neither "原币" nor "本币" the base title list
+     * stays empty and the three rows are created without cells - confirm this is intended.
+     *
+     * @param titleFieldList the three title lines
+     * @param sheet          target sheet
+     * @param rowNumber      index of the first row to write
+     */
+    private void handleThreeRowTitleField(List<String> titleFieldList, Sheet sheet, Integer rowNumber) {
+        //Approach: fill every field first, the cells are merged afterwards
+        List<String> baseTitleList = CollUtil.newArrayList();
+        List<String> nextTitleList = Arrays.stream(titleFieldList.get(1).trim().replace("\r", "").split(" ")).collect(Collectors.toList());
+        List<String> next2TitleList = Arrays.stream(titleFieldList.get(2).trim().replace("\r", "").split(" ")).collect(Collectors.toList());
+
+        //Special case: with both "本币" and "原币" present, baseTitleList repeats the "成本", "市值", "估值增值" fields
+        List<String> titleList = Arrays.stream(titleFieldList.get(0).trim().replace("\r", "").split(" ")).collect(Collectors.toList());
+        if (nextTitleList.contains("原币") && nextTitleList.contains("本币")) {
+            for (String titleField : titleList) {
+                baseTitleList.add(titleField);
+                if (EXTRA_FIELD_LIST.contains(titleField)) {
+                    baseTitleList.add(titleField);
+                }
+            }
+        } else if (nextTitleList.contains("本币") || nextTitleList.contains("原币")) {
+            baseTitleList.addAll(titleList);
+        }
+
+        Row firstRow = sheet.createRow(rowNumber);
+        Row secondRow = sheet.createRow(rowNumber + 1);
+        Row thirdRow = sheet.createRow(rowNumber + 2);
+
+        // count walks through the second/third lines, which only hold values for the extra fields
+        int count = 0;
+        for (int index = 0; index < baseTitleList.size(); index++) {
+            Cell firstCell = firstRow.createCell(index);
+            Cell secondRowCell = secondRow.createCell(index);
+            Cell thirdCell = thirdRow.createCell(index);
+
+            //Special handling of the "成本", "市值", "估值增值" fields
+            String field = baseTitleList.get(index);
+            if (EXTRA_FIELD_LIST.contains(field)) {
+                if (nextTitleList.contains("原币") || nextTitleList.contains("本币")) {
+                    firstCell.setCellValue(field);
+                    secondRowCell.setCellValue(nextTitleList.get(count));
+                    thirdCell.setCellValue(next2TitleList.get(count));
+                    count++;
+                }
+            } else {
+                firstCell.setCellValue(field);
+                secondRowCell.setCellValue(field);
+                thirdCell.setCellValue(field);
+            }
+        }
+    }
+
+
+    /**
+     * Writes a single-line title into one sheet row: the first title line is split on spaces
+     * and every non-blank part becomes a cell.
+     *
+     * @param titleFieldList title lines; only the first one is used
+     * @param sheet          target sheet
+     * @param rowNumber      index of the row to write
+     */
+    private void handleOneRowTitleField(List<String> titleFieldList, Sheet sheet, Integer rowNumber) {
+        Row row = sheet.createRow(rowNumber);
+        int columnIndex = 0;
+        for (String title : titleFieldList.get(0).split(" ")) {
+            if (StrUtil.isNotBlank(title)) {
+                row.createCell(columnIndex++).setCellValue(title);
+            }
+        }
+    }
+
+    /**
+     * Collects the title lines at the beginning of the extracted table.
+     * Scanning stops at the first line whose first space-separated value is an integer, which
+     * marks the first data line; blank lines before it are skipped.
+     *
+     * @param lines all lines of the PDF table
+     * @return the non-blank lines preceding the first data line
+     */
+    private List<String> getTitleFieldRow(List<String> lines) {
+        List<String> titleFieldList = CollUtil.newArrayList();
+        for (String line : lines) {
+            String[] rowValues = line.split(" ");
+            String firstValueOfRow = rowValues.length > 0 ? rowValues[0] : null;
+            // An integer in the first column means the title block is over
+            if (isInteger(firstValueOfRow)) {
+                break;
+            }
+            if (StrUtil.isNotBlank(line)) {
+                titleFieldList.add(line);
+            }
+        }
+        return titleFieldList;
+    }
+
+    /**
+     * Merges the cells of a multi-row column title.
+     * Does nothing when the sheet is null or its last row index is below 4.
+     * NOTE(review): cells are dereferenced without null checks; a row without a first cell, or
+     * title rows of different width, would raise a NullPointerException - confirm inputs.
+     *
+     * @param sheet     sheet that contains the header and title rows
+     * @param isSpecial true for the layout with two "科目代码" rows; merging then starts one row
+     *                  below the first "科目代码" row
+     */
+    private void mergeExcelCell(Sheet sheet, boolean isSpecial) {
+        if (sheet == null || sheet.getLastRowNum() < 4) {
+            return;
+        }
+        //Find the row indexes of the "科目代码" title rows
+        Integer mixRowNumber = null;
+        Integer maxRowNumber = null;
+        for (int i = 0; i < sheet.getLastRowNum(); i++) {
+            Row row = sheet.getRow(i);
+            //Row of the first "科目代码" (shifted by one in the special layout)
+            if ("科目代码".equals(row.getCell(0).toString())) {
+                mixRowNumber = mixRowNumber != null ? mixRowNumber : i;
+                mixRowNumber = isSpecial ? mixRowNumber + 1 : mixRowNumber;
+                isSpecial = false;
+            }
+            //End row: set to the sheet's last row when the second-to-last row starts with "科目代码"
+            if (mixRowNumber != null && "科目代码".equals(row.getCell(0).toString()) && i == sheet.getLastRowNum() - 1) {
+                maxRowNumber = i + 1;
+                break;
+            }
+        }
+
+        //CellRangeAddress arguments: first row, last row, first column, last column
+        if (mixRowNumber != null && maxRowNumber != null && !mixRowNumber.equals(maxRowNumber)) {
+            Row firstRow = sheet.getRow(mixRowNumber);
+            Row lastRow = sheet.getRow(maxRowNumber);
+            int rowNum = firstRow.getLastCellNum();
+            for (int i = 0; i < rowNum; i++) {
+                //Same column, first and last title row equal: merge vertically
+                if (firstRow.getCell(i).toString().equals(lastRow.getCell(i).toString())) {
+                    sheet.addMergedRegion(new CellRangeAddress(mixRowNumber, maxRowNumber, i, i));
+                }
+                //Same row, neighbouring cells equal: merge horizontally
+                if (i + 1 < rowNum && firstRow.getCell(i).toString().equals(firstRow.getCell(i + 1).toString())) {
+                    sheet.addMergedRegion(new CellRangeAddress(mixRowNumber, mixRowNumber, i, i + 1));
+                }
+            }
+        }
+
+    }
+
+    /**
+     * Tells whether the string is an optionally negative sequence of digits.
+     *
+     * @param str value to check, may be null
+     * @return true when the value matches {@code ^-?\d+$}; false for blank or null input
+     */
+    public boolean isInteger(String str) {
+        return !StrUtil.isBlank(str) && str.matches("^-?\\d+$");
+    }
+
+    /**
+     * Stores an uploaded file under the given directory and name.
+     * An I/O failure is logged (with its stack trace) and does not propagate.
+     *
+     * @param file           uploaded file whose bytes are written
+     * @param excelUploadDir target directory
+     * @param newExcelName   target file name
+     * @return the target file; it is returned even when writing failed, so callers that need
+     * certainty must check that it exists
+     */
+    public File saveFile(MultipartFile file, String excelUploadDir, String newExcelName) {
+        File targetFile = new File(excelUploadDir, newExcelName);
+        try {
+            FileUtils.writeByteArrayToFile(targetFile, file.getBytes());
+        } catch (IOException e) {
+            // Pass the exception as the trailing argument so the logger records the stack trace
+            // instead of printing it to stderr.
+            log.error("save file error,error -> {}", e.getMessage(), e);
+        }
+        return targetFile;
+    }
+
+    /**
+     * Copies a file to the given directory under a new name.
+     * An I/O failure is logged (with its stack trace) and does not propagate.
+     *
+     * @param file           source file whose bytes are copied
+     * @param excelUploadDir target directory
+     * @param newExcelName   target file name
+     * @return the target file; it is returned even when copying failed, so callers that need
+     * certainty must check that it exists
+     */
+    public File saveFile(File file, String excelUploadDir, String newExcelName) {
+        File targetFile = new File(excelUploadDir, newExcelName);
+        try {
+            FileUtils.writeByteArrayToFile(targetFile, Files.readAllBytes(file.toPath()));
+        } catch (IOException e) {
+            // Pass the exception as the trailing argument so the logger records the stack trace
+            // instead of printing it to stderr.
+            log.error("save file error,error -> {}", e.getMessage(), e);
+        }
+        return targetFile;
+    }
+
+}

+ 27 - 3
service-daq/src/main/java/com/simuwang/daq/service/ValuationEmailParser.java

@@ -2,11 +2,13 @@ package com.simuwang.daq.service;
 
 import cn.hutool.core.collection.CollUtil;
 import cn.hutool.core.collection.ListUtil;
+import cn.hutool.core.map.MapUtil;
 import cn.hutool.core.util.StrUtil;
 import com.simuwang.base.common.conts.EmailTypeConst;
 import com.simuwang.base.common.util.ExcelUtil;
 import com.simuwang.base.pojo.dto.EmailContentInfoDTO;
 import com.simuwang.base.pojo.dto.EmailFundNavDTO;
+import com.simuwang.base.pojo.dto.ValuationPdfTransformToExcelDTO;
 import com.simuwang.base.pojo.valuation.AssetsValuationResult;
 import com.simuwang.base.pojo.valuation.ParseValuationInfo;
 import com.simuwang.base.pojo.valuation.ValuationNeedParseParam;
@@ -28,9 +30,11 @@ import java.util.stream.Collectors;
 public class ValuationEmailParser extends AbstractEmailParser {
 
     private final ValuationParseService valuationParseService;
+    private final PdfToExcelService pdfToExcelService;
 
-    public ValuationEmailParser(ValuationParseService valuationParseService) {
+    public ValuationEmailParser(ValuationParseService valuationParseService, PdfToExcelService pdfToExcelService) {
         this.valuationParseService = valuationParseService;
+        this.pdfToExcelService = pdfToExcelService;
     }
 
     @Override
@@ -41,8 +45,9 @@ public class ValuationEmailParser extends AbstractEmailParser {
     @Override
     public List<EmailFundNavDTO> parse(EmailContentInfoDTO emailContentInfoDTO, Map<String, List<String>> emailFieldMap) {
         List<EmailFundNavDTO> emailFundNavDTOList = CollUtil.newArrayList();
-        if (emailContentInfoDTO == null || StrUtil.isBlank(emailContentInfoDTO.getFilePath())
-                || !ExcelUtil.isExcel(emailContentInfoDTO.getFileName())) {
+        boolean isSatisfiedParse  = emailContentInfoDTO != null && StrUtil.isNotBlank(emailContentInfoDTO.getFilePath())
+                && (ExcelUtil.isExcel(emailContentInfoDTO.getFileName()) || ExcelUtil.isPdf(emailContentInfoDTO.getFileName()));
+        if (!isSatisfiedParse) {
             return emailFundNavDTOList;
         }
         List<ValuationNeedParseParam> valuationNeedParseParams = buildValuationNeedParseParam(emailContentInfoDTO);
@@ -56,6 +61,7 @@ public class ValuationEmailParser extends AbstractEmailParser {
     }
 
     private List<ValuationNeedParseParam> buildValuationNeedParseParam(EmailContentInfoDTO emailContentInfoDTO) {
+        // pdf格式文件转成excel
         File file = new File(emailContentInfoDTO.getFilePath());
         ValuationNeedParseParam parseParam = new ValuationNeedParseParam();
         parseParam.setFile(file);
@@ -63,9 +69,27 @@ public class ValuationEmailParser extends AbstractEmailParser {
         parseParam.setOriginFileName(emailContentInfoDTO.getFileName());
         parseParam.setFundId(null);
         parseParam.setFromEmail(1);
+        transformPdfToExcel(parseParam);
         return ListUtil.toList(parseParam);
     }
 
+    /**
+     * Replaces a PDF valuation file on the given parameter with its Excel conversion.
+     * <p>
+     * Returns without changes when the origin file name is not blank and {@code ExcelUtil.isPdf} does
+     * not accept it; a blank origin file name does not trigger this early return. Otherwise the current
+     * file's absolute path is stored as the file URL, the file is passed to
+     * {@code pdfToExcelService.pdfToExcel}, and the parameter's file is set to the Excel file of the
+     * first result whose original file name equals the origin file name, or to {@code null} when no
+     * such result exists.
+     *
+     * @param valuationNeedParseParam parse parameter that is updated in place
+     */
+    private void transformPdfToExcel(ValuationNeedParseParam valuationNeedParseParam) {
+        String originFileName = valuationNeedParseParam.getOriginFileName();
+        if (StrUtil.isNotBlank(originFileName) && !ExcelUtil.isPdf(originFileName)) {
+            return;
+        }
+        File file = valuationNeedParseParam.getFile();
+        valuationNeedParseParam.setFileUrl(file.getAbsolutePath());
+        // Convert the PDF file to Excel
+        List<ValuationPdfTransformToExcelDTO> toExcelDTOList = pdfToExcelService.pdfToExcel(ListUtil.toList(file));
+        // Search the results directly instead of collecting them into a map: a single lookup does not
+        // need one, and Collectors.toMap without a merge function throws on duplicate file names.
+        ValuationPdfTransformToExcelDTO toExcelDTO = null;
+        if (CollUtil.isNotEmpty(toExcelDTOList)) {
+            toExcelDTO = toExcelDTOList.stream()
+                    .filter(dto -> StrUtil.equals(originFileName, dto.getOriginalFileName()))
+                    .findFirst()
+                    .orElse(null);
+        }
+        valuationNeedParseParam.setFile(toExcelDTO != null ? toExcelDTO.getExcelFile() : null);
+    }
+
     private EmailFundNavDTO convertToFundNavDTO(List<AssetsValuationResult.Record> parseSucessList) {
         if (CollUtil.isEmpty(parseSucessList)) {
             return null;

+ 3 - 3
service-deploy/src/main/test/java/com/simuwang/datadaq/DataTrusteeApplicationTests.java

@@ -36,8 +36,8 @@ class DataTrusteeApplicationTests {
         emailInfoDTO.setProtocol("imap");
         Map<Integer, List<String>> emailTypeMap = MapUtil.newHashMap();
         emailTypeMap.put(1, List.of("净值"));
-        Date startDate = DateUtil.parse("2024-09-14 09:10:00", DateConst.YYYY_MM_DD_HH_MM_SS);
-        Date endDate = DateUtil.parse("2024-09-14 10:00:00", DateConst.YYYY_MM_DD_HH_MM_SS);
+        Date startDate = DateUtil.parse("2024-09-14 10:10:00", DateConst.YYYY_MM_DD_HH_MM_SS);
+        Date endDate = DateUtil.parse("2024-09-14 12:00:00", DateConst.YYYY_MM_DD_HH_MM_SS);
         try {
             emailParseService.parseEmail(emailInfoDTO, startDate, endDate);
         } catch (Exception e) {
@@ -47,6 +47,6 @@ class DataTrusteeApplicationTests {
 
     @Test
     public void testReparseEmail() {
-        emailParseApiService.reparseEmail(6);
+        emailParseApiService.reparseEmail(7);
     }
 }