|
@@ -14,10 +14,7 @@ import com.simuwang.daq.components.report.parser.AbstractReportParser;
|
|
import org.apache.pdfbox.Loader;
|
|
import org.apache.pdfbox.Loader;
|
|
import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
|
|
import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
|
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
|
-import technology.tabula.ObjectExtractor;
|
|
|
|
-import technology.tabula.Page;
|
|
|
|
-import technology.tabula.PageIterator;
|
|
|
|
-import technology.tabula.Table;
|
|
|
|
|
|
+import technology.tabula.*;
|
|
import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
|
|
import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
|
|
|
|
|
|
import java.io.IOException;
|
|
import java.io.IOException;
|
|
@@ -42,7 +39,7 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
|
|
// 解析报告名称和表格
|
|
// 解析报告名称和表格
|
|
String reportName = null;
|
|
String reportName = null;
|
|
try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile(params.getFilepath()))) {
|
|
try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile(params.getFilepath()))) {
|
|
- CustomPDFTextStripper stripper = new CustomPDFTextStripper();
|
|
|
|
|
|
+ CustomPDFTextStripper stripper = new CustomPDFTextStripper(document, 0);
|
|
stripper.setSortByPosition(true);
|
|
stripper.setSortByPosition(true);
|
|
String text = stripper.getText(document).replace(Constants.WATERMARK_REPLACE, Constants.EMPTY);
|
|
String text = stripper.getText(document).replace(Constants.WATERMARK_REPLACE, Constants.EMPTY);
|
|
List<String> textList = StrUtil.split(text, System.lineSeparator());
|
|
List<String> textList = StrUtil.split(text, System.lineSeparator());
|
|
@@ -55,7 +52,7 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
|
|
}
|
|
}
|
|
// 解析所有表格
|
|
// 解析所有表格
|
|
SpreadsheetExtractionAlgorithm extractionAlgorithm = new SpreadsheetExtractionAlgorithm();
|
|
SpreadsheetExtractionAlgorithm extractionAlgorithm = new SpreadsheetExtractionAlgorithm();
|
|
- PageIterator pageIterator = new ObjectExtractor(document).extract();
|
|
|
|
|
|
+ PageIterator pageIterator = new CustomObjectExtractor(document).extract();
|
|
while (pageIterator.hasNext()) {
|
|
while (pageIterator.hasNext()) {
|
|
Page page = pageIterator.next();
|
|
Page page = pageIterator.next();
|
|
List<Table> tables = extractionAlgorithm.extract(page);
|
|
List<Table> tables = extractionAlgorithm.extract(page);
|