Jelajahi Sumber

feat:邮件解析-支持解析邮件pdf附件

mozuwen 7 bulan lalu
induk
melakukan
bf72557534

+ 24 - 0
service-base/pom.xml

@@ -174,6 +174,30 @@
             <groupId>org.springframework.boot</groupId>
             <artifactId>spring-boot-starter-quartz</artifactId>
         </dependency>
+
+        <dependency>
+            <groupId>org.apache.pdfbox</groupId>
+            <artifactId>pdfbox</artifactId>
+            <version>3.0.1</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>org.slf4j</groupId>
+                    <artifactId>slf4j-simple</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+
+        <dependency>
+            <groupId>technology.tabula</groupId>
+            <artifactId>tabula</artifactId>
+            <version>1.0.5</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>org.slf4j</groupId>
+                    <artifactId>slf4j-simple</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
     </dependencies>
 
 <!--    <build>-->

+ 3 - 1
service-base/src/main/java/com/simuwang/base/common/util/ExcelUtil.java

@@ -155,7 +155,7 @@ public class ExcelUtil {
                     NumberFormat numberFormat = NumberFormat.getNumberInstance();
                     numberFormat.setMaximumFractionDigits(15);
                     double formulaResult = cell.getNumericCellValue();
-                    cellValue = numberFormat.format(formulaResult).replaceAll(",","");
+                    cellValue = numberFormat.format(formulaResult).replaceAll(",", "");
                 }
                 break;
             case BOOLEAN:
@@ -178,6 +178,8 @@ public class ExcelUtil {
                 cellValue = "";
                 break;
         }
+        // 去掉换行符号
+        cellValue = StrUtil.isNotBlank(cellValue) ? cellValue.replaceAll("[\\r\\n]+", "") : "";
         return cellValue;
     }
 }

+ 1 - 1
service-daq/src/main/java/com/simuwang/daq/service/EmailParseService.java

@@ -406,7 +406,7 @@ public class EmailParseService {
         // 获取邮件日期大于等于startDate的邮件(搜索条件只支持按天)
         SearchTerm startDateTerm = new ReceivedDateTerm(ComparisonTerm.GE, startDate);
         Message[] messages = folder.search(startDateTerm);
-        String path = "/data/file";
+        String path = "/data/file/nav";
         Map<String, List<EmailContentInfoDTO>> emailMessageMap = MapUtil.newHashMap();
         for (Message message : messages) {
             List<EmailContentInfoDTO> emailContentInfoDTOList = CollUtil.newArrayList();

+ 111 - 46
service-daq/src/main/java/com/simuwang/daq/service/NavEmailParser.java

@@ -13,6 +13,8 @@ import com.simuwang.base.common.util.ExcelUtil;
 import com.simuwang.base.common.util.StringUtil;
 import com.simuwang.base.pojo.dto.EmailContentInfoDTO;
 import com.simuwang.base.pojo.dto.EmailFundNavDTO;
+import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.poi.ss.usermodel.Cell;
 import org.apache.poi.ss.usermodel.Row;
 import org.apache.poi.ss.usermodel.Sheet;
@@ -25,10 +27,14 @@ import org.jsoup.select.Elements;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.springframework.stereotype.Component;
+import technology.tabula.*;
+import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
 
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.OutputStream;
+import java.nio.file.Files;
+import java.nio.file.Paths;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
@@ -66,8 +72,14 @@ public class NavEmailParser extends AbstractEmailParser {
         // 2.解析邮件excel附件
         if (StrUtil.isNotBlank(emailContentInfoDTO.getFilePath()) && ExcelUtil.isExcel(emailContentInfoDTO.getFileName())) {
             List<EmailFundNavDTO> fundNavDTOList = parseExcelFile(emailContentInfoDTO.getFilePath(), emailFieldMap);
-            emailFundNavDTOList.addAll(fundNavDTOList);
+            Optional.ofNullable(fundNavDTOList).ifPresent(emailFundNavDTOList::addAll);
         }
+        // 3.解析邮件pdf附件
+        if (StrUtil.isNotBlank(emailContentInfoDTO.getFilePath()) && ExcelUtil.isPdf(emailContentInfoDTO.getFileName())) {
+            List<EmailFundNavDTO> fundNavDTOList = parsePdfFile(emailContentInfoDTO, emailContentInfoDTO.getFilePath(), emailFieldMap);
+            Optional.ofNullable(fundNavDTOList).ifPresent(emailFundNavDTOList::addAll);
+        }
+
         // 校验净值数据格式
         if (CollUtil.isNotEmpty(emailFundNavDTOList)) {
             emailFundNavDTOList = emailFundNavDTOList.stream().filter(super::dataFormat).collect(Collectors.toList());
@@ -76,6 +88,55 @@ public class NavEmailParser extends AbstractEmailParser {
     }
 
     /**
+     * Parses a PDF attachment of an email.
+     * <p>
+     * Every table that tabula detects in the PDF is copied into a single-sheet xlsx
+     * workbook, which is then handed to {@code parseExcelFile} so that PDF and Excel
+     * attachments share the same field-recognition logic.
+     *
+     * @param emailContentInfoDTO email information (address, date, title, attachment name)
+     * @param filePath            location of the downloaded PDF attachment
+     * @param emailFieldMap       mapping table of email field recognition rules
+     * @return the net-value data parsed from the PDF; an empty list when the PDF
+     *         could not be converted
+     */
+    private List<EmailFundNavDTO> parsePdfFile(EmailContentInfoDTO emailContentInfoDTO, String filePath, Map<String, List<String>> emailFieldMap) {
+        // NOTE(review): the email address and attachment name come from the mail itself and are
+        // concatenated into a filesystem path unchecked - confirm they are sanitised upstream.
+        String excelFilePath = "/data/file/pdf/" + emailContentInfoDTO.getEmailAddress() + "/" + emailContentInfoDTO.getEmailDate().substring(0, 10).replaceAll("-", "")
+                + "/" + emailContentInfoDTO.getFileName().replace(".pdf", ".xlsx").replace(".PDF", ".xlsx");
+        File savefile = new File(excelFilePath);
+        File parentDir = savefile.getParentFile();
+        if (!parentDir.exists()) {
+            parentDir.mkdirs();
+            parentDir.setExecutable(true);
+        }
+        // The PDF is loaded before the output stream is opened so that an unreadable PDF
+        // does not leave an empty xlsx behind; all three resources are closed automatically.
+        try (PDDocument document = Loader.loadPDF(new File(filePath));
+             Workbook workbook = new XSSFWorkbook();
+             OutputStream outputStream = Files.newOutputStream(Paths.get(excelFilePath))) {
+            PageIterator extract = new ObjectExtractor(document).extract();
+            Sheet sheet = workbook.createSheet("Sheet1");
+            SpreadsheetExtractionAlgorithm extractionAlgorithm = new SpreadsheetExtractionAlgorithm();
+            // Running row index shared by all tables and pages: restarting at 0 for each
+            // table would make createRow() overwrite the rows written for earlier tables.
+            int sheetRowNum = 0;
+            while (extract.hasNext()) {
+                Page next = extract.next();
+                List<Table> tableList = extractionAlgorithm.extract(next);
+                for (Table table : tableList) {
+                    List<List<RectangularTextContainer>> rows = table.getRows();
+                    for (List<RectangularTextContainer> textContainerList : rows) {
+                        Row sheetRow = sheet.createRow(sheetRowNum++);
+                        for (int cellNum = 0; cellNum < textContainerList.size(); cellNum++) {
+                            Cell cell = sheetRow.createCell(cellNum);
+                            RectangularTextContainer textContainer = textContainerList.get(cellNum);
+                            if (textContainer != null) {
+                                cell.setCellValue(textContainer.getText());
+                            }
+                        }
+                    }
+                }
+            }
+            // Write the Excel workbook to the output stream
+            workbook.write(outputStream);
+        } catch (Exception e) {
+            log.error("解析邮件pdf附件报错 -> 邮件主题:{},邮件日期:{},堆栈信息:{}", emailContentInfoDTO.getEmailTitle(), emailContentInfoDTO.getEmailDate(), ExceptionUtil.stacktraceToString(e));
+            // The xlsx is missing, truncated or stale at this point - do not try to parse it.
+            return CollUtil.newArrayList();
+        }
+        return parseExcelFile(excelFilePath, emailFieldMap);
+    }
+
+    /**
      * 解析邮件excel附件
      *
      * @param filePath      邮件excel附件地址
@@ -95,6 +156,40 @@ public class NavEmailParser extends AbstractEmailParser {
         return parseSheetData(sheet, fieldPositionMap);
     }
 
+    /**
+     * Parses the body of an email.
+     * <p>
+     * The first {@code <table>} of the HTML body is copied into a single-sheet xlsx
+     * workbook, which is then handed to {@code parseExcelFile} so that body and
+     * attachment parsing share the same field-recognition logic.
+     *
+     * @param emailContentInfoDTO email information (address, date, title, file name)
+     * @param emailContent        HTML content of the email body
+     * @param emailFieldMap       mapping table of email field recognition rules
+     * @return the net-value data parsed from the body; an empty list when the body is
+     *         blank, contains no table, or could not be converted
+     */
+    private List<EmailFundNavDTO> parseEmailContent(EmailContentInfoDTO emailContentInfoDTO, String emailContent, Map<String, List<String>> emailFieldMap) {
+        if (StrUtil.isBlank(emailContent)) {
+            return CollUtil.newArrayList();
+        }
+        Document doc = Jsoup.parse(emailContent);
+        Element table = doc.select("table").first();
+        if (table == null) {
+            // first() yields null when the body has no <table>; there is nothing to parse then.
+            return CollUtil.newArrayList();
+        }
+        Elements rows = table.select("tr");
+        String excelFilePath = "/data/file/content/" + emailContentInfoDTO.getEmailAddress() + "/" + emailContentInfoDTO.getEmailDate().substring(0, 10).replaceAll("-", "") + "/"
+                + emailContentInfoDTO.getFileName().replace(".html", ".xlsx");
+        File savefile = new File(excelFilePath);
+        File parentDir = savefile.getParentFile();
+        if (!parentDir.exists()) {
+            parentDir.mkdirs();
+            parentDir.setExecutable(true);
+        }
+        // Both the workbook and the stream are closed automatically, even on failure.
+        try (Workbook workbook = new XSSFWorkbook();
+             OutputStream outputStream = new FileOutputStream(savefile)) {
+            Sheet sheet = workbook.createSheet("Sheet1");
+            writeDataToSheet(sheet, rows);
+            // Write the Excel workbook to the output stream
+            workbook.write(outputStream);
+        } catch (Exception e) {
+            log.error("解析正文报错 -> 邮件主题:{},邮件日期:{},堆栈信息:{}", emailContentInfoDTO.getEmailTitle(), emailContentInfoDTO.getEmailDate(), ExceptionUtil.stacktraceToString(e));
+            // The xlsx is missing or truncated at this point - do not try to parse it.
+            return CollUtil.newArrayList();
+        }
+        return parseExcelFile(excelFilePath, emailFieldMap);
+    }
+
     private List<EmailFundNavDTO> parseSheetData(Sheet sheet, Map<String, Pair<Integer, Integer>> fieldPositionMap) {
         List<EmailFundNavDTO> fundNavDTOList = CollUtil.newArrayList();
         // 通过表头所在位置判断是行数据还是列数据
@@ -190,15 +285,14 @@ public class NavEmailParser extends AbstractEmailParser {
         fundNavDTO.setPriceDate(fieldValueMap.get(EmailFieldConst.PRICE_DATE));
         fundNavDTO.setNav(fieldValueMap.get(EmailFieldConst.NAV));
         fundNavDTO.setCumulativeNavWithdrawal(fieldValueMap.get(EmailFieldConst.CUMULATIVE_NAV_WITHDRAWAL));
+
+        // pdf解析到的值带有",",比如:"10,656,097.37"
         String assetNet = fieldValueMap.get(EmailFieldConst.ASSET_NET);
-        if (StrUtil.isBlank(assetNet)) {
-            assetNet = null;
-        }
+        assetNet = StrUtil.isNotBlank(assetNet) ? assetNet.replaceAll(",", "") : null;
+
         fundNavDTO.setAssetNet(assetNet);
         String assetShares = fieldValueMap.get(EmailFieldConst.ASSET_NET);
-        if (StrUtil.isBlank(assetShares)) {
-            assetShares = null;
-        }
+        assetShares = StrUtil.isNotBlank(assetShares) ? assetShares.replaceAll(",", "") : null;
         fundNavDTO.setAssetShare(assetShares);
         return fundNavDTO;
     }
@@ -233,6 +327,7 @@ public class NavEmailParser extends AbstractEmailParser {
         String fundName = columnFieldMap.get(EmailFieldConst.FUND_NAME) != null && sheetRow.getCell(columnFieldMap.get(EmailFieldConst.FUND_NAME)).getStringCellValue() != null ?
                 ExcelUtil.getCellValue(sheetRow.getCell(columnFieldMap.get(EmailFieldConst.FUND_NAME))) : null;
         emailFundNavDTO.setFundName(fundName);
+
         String registerNumber = columnFieldMap.get(EmailFieldConst.REGISTER_NUMBER) != null && sheetRow.getCell(columnFieldMap.get(EmailFieldConst.REGISTER_NUMBER)) != null ?
                 ExcelUtil.getCellValue(sheetRow.getCell(columnFieldMap.get(EmailFieldConst.REGISTER_NUMBER))) : null;
         emailFundNavDTO.setRegisterNumber(registerNumber);
@@ -243,16 +338,14 @@ public class NavEmailParser extends AbstractEmailParser {
         emailFundNavDTO.setVirtualNav(virtualNav);
         String assetNet = columnFieldMap.get(EmailFieldConst.ASSET_NET) != null && sheetRow.getCell(columnFieldMap.get(EmailFieldConst.ASSET_NET)) != null ?
                 ExcelUtil.getCellValue(sheetRow.getCell(columnFieldMap.get(EmailFieldConst.ASSET_NET))) : null;
-        if (StrUtil.isBlank(assetNet)) {
-            assetNet = null;
-        }
+        // pdf解析到的值带有",",比如:"10,656,097.37"
+        assetNet = StrUtil.isNotBlank(assetNet) ? assetNet.replaceAll(",", "") : null;
         emailFundNavDTO.setAssetNet(assetNet);
         String assetShares = columnFieldMap.get(EmailFieldConst.ASSET_SHARE) != null && sheetRow.getCell(columnFieldMap.get(EmailFieldConst.ASSET_SHARE)) != null ?
                 ExcelUtil.getCellValue(sheetRow.getCell(columnFieldMap.get(EmailFieldConst.ASSET_SHARE))) : null;
-        if (StrUtil.isBlank(assetShares)) {
-            assetShares = null;
-        }
+        assetShares = StrUtil.isNotBlank(assetShares) ? assetShares.replaceAll(",", "") : null;
         emailFundNavDTO.setAssetShare(assetShares);
+
         fundNavDTOList.add(emailFundNavDTO);
         return fundNavDTOList;
     }
@@ -282,17 +375,15 @@ public class NavEmailParser extends AbstractEmailParser {
         String virtualNav = columnFieldMap.get(EmailFieldConst.PARENT_VIRTUAL_NAV) != null && sheetRow.getCell(columnFieldMap.get(EmailFieldConst.PARENT_VIRTUAL_NAV)) != null ?
                 ExcelUtil.getCellValue(sheetRow.getCell(columnFieldMap.get(EmailFieldConst.PARENT_VIRTUAL_NAV))) : null;
         emailFundNavDTO.setVirtualNav(virtualNav);
+
         String assetNet = columnFieldMap.get(EmailFieldConst.PARENT_ASSET_NET) != null && sheetRow.getCell(columnFieldMap.get(EmailFieldConst.PARENT_ASSET_NET)) != null ?
                 ExcelUtil.getCellValue(sheetRow.getCell(columnFieldMap.get(EmailFieldConst.PARENT_ASSET_NET))) : null;
-        if (StrUtil.isBlank(assetNet) || !StrUtil.isNumeric(assetNet)) {
-            assetNet = null;
-        }
+        assetNet = StrUtil.isNotBlank(assetNet) ? assetNet.replaceAll(",", "") : null;
         emailFundNavDTO.setAssetNet(assetNet);
         String assetShares = columnFieldMap.get(EmailFieldConst.PARENT_ASSET_SHARE) != null && sheetRow.getCell(columnFieldMap.get(EmailFieldConst.PARENT_ASSET_SHARE)) != null ?
                 ExcelUtil.getCellValue(sheetRow.getCell(columnFieldMap.get(EmailFieldConst.PARENT_ASSET_SHARE))) : null;
-        if (StrUtil.isBlank(assetShares) || !StrUtil.isNumeric(assetShares)) {
-            assetShares = null;
-        }
+        assetShares = StrUtil.isNotBlank(assetShares) ? assetShares.replaceAll(",", "") : null;
+
         emailFundNavDTO.setAssetShare(assetShares);
         return emailFundNavDTO;
     }
@@ -411,32 +502,6 @@ public class NavEmailParser extends AbstractEmailParser {
         return null;
     }
 
-    private List<EmailFundNavDTO> parseEmailContent(EmailContentInfoDTO emailContentInfoDTO, String emailContent, Map<String, List<String>> emailFieldMap) {
-        Document doc = Jsoup.parse(emailContent);
-        Element table = doc.select("table").first();
-        Elements rows = table.select("tr");
-        String excelFilePath = "/data/file/content/" + emailContentInfoDTO.getEmailDate().substring(0, 10).replaceAll("-","")
-                + "/" + emailContentInfoDTO.getFileName().replace(".html", ".xlsx");
-        File savefile = new File(excelFilePath);
-        if (!savefile.exists()) {
-            if (!savefile.getParentFile().exists()) {
-                savefile.getParentFile().mkdirs();
-                savefile.getParentFile().setExecutable(true);
-            }
-        }
-        try (OutputStream outputStream = new FileOutputStream(savefile)) {
-            // 创建一个新的Excel工作簿
-            Workbook workbook = new XSSFWorkbook();
-            Sheet sheet = workbook.createSheet("Sheet1");
-            writeDataToSheet(sheet, rows);
-            // 将Excel工作簿写入输出流
-            workbook.write(outputStream);
-        } catch (Exception e) {
-            log.error("解析正文报错 -> 堆栈信息:{}", ExceptionUtil.stacktraceToString(e));
-        }
-        return parseExcelFile(excelFilePath, emailFieldMap);
-    }
-
     private void writeDataToSheet(Sheet sheet, Elements rows) {
         int rowSize = rows.size();
         for (int rowNum = 0; rowNum < rowSize; rowNum++) {
@@ -445,7 +510,7 @@ public class NavEmailParser extends AbstractEmailParser {
             Element elementRow = rows.get(rowNum);
             Elements cells = elementRow.select("td");
             int cellSize = cells.size();
-            for (int cellNum = 0; cellNum< cellSize; cellNum++) {
+            for (int cellNum = 0; cellNum < cellSize; cellNum++) {
                 Cell sheetRowCell = sheetRow.createCell(cellNum);
                 sheetRowCell.setCellValue(cells.get(cellNum).text());
             }

+ 3 - 3
service-deploy/src/main/test/java/com/simuwang/datadaq/DataTrusteeApplicationTests.java

@@ -1,4 +1,4 @@
-package java.com.simuwang.datadaq;
+package com.simuwang.datadaq;
 
 import cn.hutool.core.date.DateUtil;
 import cn.hutool.core.map.MapUtil;
@@ -32,8 +32,8 @@ class DataTrusteeApplicationTests {
         emailInfoDTO.setProtocol("imap");
         Map<Integer, List<String>> emailTypeMap = MapUtil.newHashMap();
         emailTypeMap.put(1, List.of("净值"));
-        Date startDate = DateUtil.parse("2024-09-10 09:40:00", DateConst.YYYY_MM_DD_HH_MM_SS);
-        Date endDate = DateUtil.parse("2024-09-10 10:00:00", DateConst.YYYY_MM_DD_HH_MM_SS);
+        Date startDate = DateUtil.parse("2024-09-10 15:30:00", DateConst.YYYY_MM_DD_HH_MM_SS);
+        Date endDate = DateUtil.parse("2024-09-10 18:00:00", DateConst.YYYY_MM_DD_HH_MM_SS);
         try {
             emailParseService.parseEmail(emailInfoDTO, startDate, endDate);
         } catch (Exception e) {