|
@@ -4,35 +4,39 @@ import cn.hutool.core.exceptions.ExceptionUtil;
|
|
|
import cn.hutool.core.io.IORuntimeException;
|
|
|
import cn.hutool.core.map.MapUtil;
|
|
|
import cn.hutool.core.util.StrUtil;
|
|
|
-import cn.hutool.http.HttpUtil;
|
|
|
import cn.hutool.json.JSONObject;
|
|
|
import cn.hutool.json.JSONUtil;
|
|
|
-import com.smppw.modaq.common.enums.ReportMonthlyType;
|
|
|
+import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
|
|
|
+import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
|
|
|
+import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
|
|
|
+import com.alibaba.dashscope.common.MultiModalMessage;
|
|
|
+import com.alibaba.dashscope.common.Role;
|
|
|
+import com.alibaba.dashscope.exception.NoApiKeyException;
|
|
|
+import com.alibaba.dashscope.exception.UploadFileException;
|
|
|
import com.smppw.modaq.common.enums.ReportParseStatus;
|
|
|
import com.smppw.modaq.common.exception.ReportParseException;
|
|
|
import com.smppw.modaq.domain.dto.report.ocr.OCRLetterParseData;
|
|
|
-import com.smppw.modaq.domain.dto.report.ocr.OCRParseData;
|
|
|
import org.slf4j.Logger;
|
|
|
import org.slf4j.LoggerFactory;
|
|
|
|
|
|
+import java.util.Arrays;
|
|
|
+import java.util.Collections;
|
|
|
import java.util.Map;
|
|
|
-import java.util.Objects;
|
|
|
|
|
|
public class OCRReportParser {
|
|
|
private final Logger logger = LoggerFactory.getLogger(this.getClass());
|
|
|
|
|
|
- public OCRLetterParseData parseLetterData(String filename, String ocrApi, String ocrImgUrl) throws ReportParseException {
|
|
|
- Map<String, Object> paramsMap = MapUtil.newHashMap(4);
|
|
|
- paramsMap.put("image_url", ocrImgUrl);
|
|
|
- paramsMap.put("user_msg", """
|
|
|
+ public OCRLetterParseData parseLetterData(String filename, String ocrImgUrl) throws ReportParseException {
|
|
|
+ String userMsg = """
|
|
|
请提取文件中的基金名称、产品代码、投资人姓名、证件类型、证件号码、基金账户、交易账号、业务类型、申请日期、申请金额、申请份额、确认日期、确认金额、确认份额、单位净值。
|
|
|
要求准确无误的提取上述关键信息、不要遗漏和捏造虚假信息。
|
|
|
返回数据格式以json方式输出,格式为:{"基金名称":"","产品代码":"","投资人姓名":"","证件类型":"","证件号码":"","基金账户":"","交易账号":"","业务类型":"","申请日期":"","申请金额":"","申请份额":"","确认日期":"","确认金额":"","确认份额":"","单位净值":""}
|
|
|
- """);
|
|
|
+ """;
|
|
|
OCRLetterParseData res = new OCRLetterParseData();
|
|
|
String objectStr = null;
|
|
|
try {
|
|
|
- objectStr = this.parseOcrResult(ocrApi, paramsMap);
|
|
|
+ ocrImgUrl = "/" + ocrImgUrl.replaceAll("\\\\", "/");
|
|
|
+ objectStr = this.parseOcrResult(this.call(ocrImgUrl, userMsg));
|
|
|
JSONObject jsonObject = JSONUtil.parseObj(objectStr);
|
|
|
String fundName = this.cleanData(jsonObject.getStr("基金名称"));
|
|
|
String fundCode = this.cleanData(jsonObject.getStr("产品代码"));
|
|
@@ -103,97 +107,129 @@ public class OCRReportParser {
|
|
|
throw new ReportParseException(ReportParseStatus.SYSTEM_ERROR);
|
|
|
} finally {
|
|
|
if (logger.isInfoEnabled()) {
|
|
|
- this.logger.info("确认单{} OCR识别参数{},OCR识别结果:{},处理后的结果是:{}",
|
|
|
- filename, paramsMap, objectStr, res);
|
|
|
+ this.logger.info("确认单{} OCR结果:{},处理后的结果是:{}",
|
|
|
+ filename, objectStr, res);
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- public ReportMonthlyType parseMonthlyType(String filename, String ocrApi, String ocrImgUrl) throws ReportParseException {
|
|
|
- Map<String, Object> paramsMap = MapUtil.newHashMap(4);
|
|
|
- paramsMap.put("image_url", ocrImgUrl);
|
|
|
- paramsMap.put("user_msg", """
|
|
|
- 请帮我判断报告的类型,判断依据是:如果有基金概况和净值月报则为协会版,如果有业绩曲线或者基金概况和净值月报都没有则为管理人版,都不满足是返回null。
|
|
|
- 返回数据格式以json方式输出,格式为:{"报告类型":""}
|
|
|
- """);
|
|
|
- ReportMonthlyType res = ReportMonthlyType.FAILED;
|
|
|
- String objectStr = null;
|
|
|
- try {
|
|
|
- objectStr = this.parseOcrResult(ocrApi, paramsMap);
|
|
|
- JSONObject jsonObject = JSONUtil.parseObj(objectStr);
|
|
|
- String type = this.cleanData(jsonObject.getStr("报告类型"));
|
|
|
- if (StrUtil.isNotBlank(type) && Objects.equals("协会版", type)) {
|
|
|
- res = ReportMonthlyType.AMAC;
|
|
|
- } else if (StrUtil.isNotBlank(type) && Objects.equals("管理人版", type)) {
|
|
|
- res = ReportMonthlyType.MANAGER;
|
|
|
- }
|
|
|
- return res;
|
|
|
- } catch (IORuntimeException e) {
|
|
|
- this.logger.warn("报告{} 解析出错:{}", filename, ReportParseStatus.AI_NOT_FOUND.getMsg());
|
|
|
- throw new ReportParseException(ReportParseStatus.AI_NOT_FOUND);
|
|
|
- } catch (Exception e) {
|
|
|
- this.logger.warn("报告{} OCR提取月报类型错误:{}", filename, ExceptionUtil.stacktraceToString(e));
|
|
|
- throw new ReportParseException(ReportParseStatus.SYSTEM_ERROR);
|
|
|
- } finally {
|
|
|
- if (logger.isInfoEnabled()) {
|
|
|
- this.logger.info("报告{} OCR提取月报类型参数{},OCR提取月报类型结果:{},处理后的结果是:{}",
|
|
|
- filename, paramsMap, objectStr, res);
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
+// public ReportMonthlyType parseMonthlyType(String filename, String ocrApi, String ocrImgUrl) throws ReportParseException {
|
|
|
+// Map<String, Object> paramsMap = MapUtil.newHashMap(4);
|
|
|
+// paramsMap.put("image_url", ocrImgUrl);
|
|
|
+// paramsMap.put("user_msg", """
|
|
|
+// 请帮我判断报告的类型,判断依据是:如果有基金概况和净值月报则为协会版,如果有业绩曲线或者基金概况和净值月报都没有则为管理人版,都不满足是返回null。
|
|
|
+// 返回数据格式以json方式输出,格式为:{"报告类型":""}
|
|
|
+// """);
|
|
|
+// ReportMonthlyType res = ReportMonthlyType.FAILED;
|
|
|
+// String objectStr = null;
|
|
|
+// try {
|
|
|
+// objectStr = this.parseOcrResult(ocrApi, paramsMap);
|
|
|
+// JSONObject jsonObject = JSONUtil.parseObj(objectStr);
|
|
|
+// String type = this.cleanData(jsonObject.getStr("报告类型"));
|
|
|
+// if (StrUtil.isNotBlank(type) && Objects.equals("协会版", type)) {
|
|
|
+// res = ReportMonthlyType.AMAC;
|
|
|
+// } else if (StrUtil.isNotBlank(type) && Objects.equals("管理人版", type)) {
|
|
|
+// res = ReportMonthlyType.MANAGER;
|
|
|
+// }
|
|
|
+// return res;
|
|
|
+// } catch (IORuntimeException e) {
|
|
|
+// this.logger.warn("报告{} 解析出错:{}", filename, ReportParseStatus.AI_NOT_FOUND.getMsg());
|
|
|
+// throw new ReportParseException(ReportParseStatus.AI_NOT_FOUND);
|
|
|
+// } catch (Exception e) {
|
|
|
+// this.logger.warn("报告{} OCR提取月报类型错误:{}", filename, ExceptionUtil.stacktraceToString(e));
|
|
|
+// throw new ReportParseException(ReportParseStatus.SYSTEM_ERROR);
|
|
|
+// } finally {
|
|
|
+// if (logger.isInfoEnabled()) {
|
|
|
+// this.logger.info("报告{} OCR提取月报类型参数{},OCR提取月报类型结果:{},处理后的结果是:{}",
|
|
|
+// filename, paramsMap, objectStr, res);
|
|
|
+// }
|
|
|
+// }
|
|
|
+// }
|
|
|
+//
|
|
|
+// public OCRParseData parse(String filename, String ocrApi, String ocrImgUrl) throws ReportParseException {
|
|
|
+// Map<String, Object> paramsMap = MapUtil.newHashMap(4);
|
|
|
+// paramsMap.put("image_url", ocrImgUrl);
|
|
|
+// paramsMap.put("user_msg", """
|
|
|
+// 请提取文件中的基金名称、基金公司、产品代码,并判断是否有红色印章和是否有电话。
|
|
|
+// 要求准确无误的提取上述关键信息、不要遗漏和捏造虚假信息。
|
|
|
+// 返回数据格式以json方式输出,格式为:{"基金名称":"","基金公司":"","产品代码":"","是否有红色印章":"","是否有电话":""}
|
|
|
+// """);
|
|
|
+// OCRParseData res = new OCRParseData();
|
|
|
+// String objectStr = null;
|
|
|
+// try {
|
|
|
+// objectStr = this.parseOcrResult(ocrApi, paramsMap);
|
|
|
+// JSONObject jsonObject = JSONUtil.parseObj(objectStr);
|
|
|
+// String fundName = this.cleanData(jsonObject.getStr("基金名称"));
|
|
|
+// String fundCode = this.cleanData(jsonObject.getStr("产品代码"));
|
|
|
+// String companyName = ReportParseUtils.cleaningValue(jsonObject.getStr("基金公司"));
|
|
|
+// String seals = this.cleanData(jsonObject.getStr("是否有红色印章"));
|
|
|
+// String phone = this.cleanData(jsonObject.getStr("是否有电话"));
|
|
|
+// if (StrUtil.isNotBlank(fundName) && (fundName.contains("基金") || fundName.contains("资产管理")) && !fundName.contains("公司")) {
|
|
|
+// res.setFundName(fundName);
|
|
|
+// }
|
|
|
+// if (StrUtil.isNotBlank(companyName) && companyName.contains("有限公司")) {
|
|
|
+// res.setCompanyName(StrUtil.subBefore(companyName, "有限公司", true) + "有限公司");
|
|
|
+// }
|
|
|
+// if (StrUtil.isNotBlank(fundCode)) {
|
|
|
+// res.setFundCode(ReportParseUtils.matchFundCode(fundCode));
|
|
|
+// }
|
|
|
+// if (StrUtil.isNotBlank(seals)) {
|
|
|
+// res.setWithSeals(true);
|
|
|
+// }
|
|
|
+// if (StrUtil.isNotBlank(phone)) {
|
|
|
+// res.setWithContacts(true);
|
|
|
+// }
|
|
|
+// return res;
|
|
|
+// } catch (IORuntimeException e) {
|
|
|
+// this.logger.warn("报告{} 解析错误:{}", filename, ReportParseStatus.AI_NOT_FOUND.getMsg());
|
|
|
+// throw new ReportParseException(ReportParseStatus.AI_NOT_FOUND);
|
|
|
+// } catch (Exception e) {
|
|
|
+// this.logger.warn("报告{} OCR识别错误:{}", filename, ExceptionUtil.stacktraceToString(e));
|
|
|
+// throw new ReportParseException(ReportParseStatus.SYSTEM_ERROR);
|
|
|
+// } finally {
|
|
|
+// if (logger.isInfoEnabled()) {
|
|
|
+// this.logger.info("报告{} OCR识别参数{},OCR识别结果:{},处理后的结果是:{}",
|
|
|
+// filename, paramsMap, objectStr, res);
|
|
|
+// }
|
|
|
+// }
|
|
|
+// }
|
|
|
|
|
|
- public OCRParseData parse(String filename, String ocrApi, String ocrImgUrl) throws ReportParseException {
|
|
|
- Map<String, Object> paramsMap = MapUtil.newHashMap(4);
|
|
|
- paramsMap.put("image_url", ocrImgUrl);
|
|
|
- paramsMap.put("user_msg", """
|
|
|
- 请提取文件中的基金名称、基金公司、产品代码,并判断是否有红色印章和是否有电话。
|
|
|
- 要求准确无误的提取上述关键信息、不要遗漏和捏造虚假信息。
|
|
|
- 返回数据格式以json方式输出,格式为:{"基金名称":"","基金公司":"","产品代码":"","是否有红色印章":"","是否有电话":""}
|
|
|
- """);
|
|
|
- OCRParseData res = new OCRParseData();
|
|
|
- String objectStr = null;
|
|
|
- try {
|
|
|
- objectStr = this.parseOcrResult(ocrApi, paramsMap);
|
|
|
- JSONObject jsonObject = JSONUtil.parseObj(objectStr);
|
|
|
- String fundName = this.cleanData(jsonObject.getStr("基金名称"));
|
|
|
- String fundCode = this.cleanData(jsonObject.getStr("产品代码"));
|
|
|
- String companyName = ReportParseUtils.cleaningValue(jsonObject.getStr("基金公司"));
|
|
|
- String seals = this.cleanData(jsonObject.getStr("是否有红色印章"));
|
|
|
- String phone = this.cleanData(jsonObject.getStr("是否有电话"));
|
|
|
- if (StrUtil.isNotBlank(fundName) && (fundName.contains("基金") || fundName.contains("资产管理")) && !fundName.contains("公司")) {
|
|
|
- res.setFundName(fundName);
|
|
|
- }
|
|
|
- if (StrUtil.isNotBlank(companyName) && companyName.contains("有限公司")) {
|
|
|
- res.setCompanyName(StrUtil.subBefore(companyName, "有限公司", true) + "有限公司");
|
|
|
- }
|
|
|
- if (StrUtil.isNotBlank(fundCode)) {
|
|
|
- res.setFundCode(ReportParseUtils.matchFundCode(fundCode));
|
|
|
- }
|
|
|
- if (StrUtil.isNotBlank(seals)) {
|
|
|
- res.setWithSeals(true);
|
|
|
- }
|
|
|
- if (StrUtil.isNotBlank(phone)) {
|
|
|
- res.setWithContacts(true);
|
|
|
- }
|
|
|
- return res;
|
|
|
- } catch (IORuntimeException e) {
|
|
|
- this.logger.warn("报告{} 解析错误:{}", filename, ReportParseStatus.AI_NOT_FOUND.getMsg());
|
|
|
- throw new ReportParseException(ReportParseStatus.AI_NOT_FOUND);
|
|
|
- } catch (Exception e) {
|
|
|
- this.logger.warn("报告{} OCR识别错误:{}", filename, ExceptionUtil.stacktraceToString(e));
|
|
|
- throw new ReportParseException(ReportParseStatus.SYSTEM_ERROR);
|
|
|
- } finally {
|
|
|
- if (logger.isInfoEnabled()) {
|
|
|
- this.logger.info("报告{} OCR识别参数{},OCR识别结果:{},处理后的结果是:{}",
|
|
|
- filename, paramsMap, objectStr, res);
|
|
|
- }
|
|
|
+    /**
+     * Sends one local image together with a text prompt to the DashScope multimodal
+     * OCR model and returns the "text" entry of the first content part of the first choice.
+     *
+     * @param localPath path of the image on the local file system; it is prefixed with "file://"
+     * @param userMsg   prompt handed to the model in the "text" content part
+     * @return the value stored under "text" in the model reply (null when that entry is absent)
+     * @throws NoApiKeyException   declared by the SDK call; presumably raised when no API key can be resolved - TODO confirm
+     * @throws UploadFileException declared by the SDK call; presumably raised when the local file cannot be uploaded - TODO confirm
+     */
+    private Object call(String localPath, String userMsg) throws NoApiKeyException, UploadFileException {
+        String filePath = "file://" + localPath;
+        MultiModalConversation conv = new MultiModalConversation();
+        Map<String, Object> map = MapUtil.newHashMap();
+        map.put("image", filePath);
+        // Maximum pixel threshold of the input image: larger images are scaled down,
+        // keeping the aspect ratio, until the total pixel count is below max_pixels.
+        map.put("max_pixels", "6422528");
+        // Minimum pixel threshold of the input image: smaller images are scaled up,
+        // keeping the aspect ratio, until the total pixel count is above min_pixels.
+        map.put("min_pixels", "3136");
+        // Enable automatic rotation correction of the image.
+        map.put("enable_rotate", true);
+        MultiModalMessage userMessage = MultiModalMessage.builder().role(Role.USER.getValue())
+                .content(Arrays.asList(
+                        map,
+                        // When no built-in task is set, qwen-vl-ocr-latest accepts a custom prompt in the
+                        // text field below; without one it uses the default prompt: Please output only the
+                        // text content from the image without any additional descriptions or formatting.
+                        // qwen-vl-ocr-1028 always uses the fixed prompt "Read all the text in the image."
+                        // and does not support a custom prompt in text.
+                        Collections.singletonMap("text", userMsg))).build();
+        String dashscopeApiKey = System.getenv("DASHSCOPE_API_KEY");
+        if (StrUtil.isBlank(dashscopeApiKey)) {
+            // Security: never fall back to a key embedded in source code. A blank value is
+            // normalized to null so that no explicit key is configured on the request.
+            // NOTE(review): presumably the SDK then applies its own key lookup and raises
+            // NoApiKeyException when nothing is found - confirm. The key that used to be
+            // hard-coded here has been committed to version control and should be revoked.
+            dashscopeApiKey = null;
         }
+        MultiModalConversationParam param = MultiModalConversationParam.builder()
+                // The key is taken from the DASHSCOPE_API_KEY environment variable only.
+                .apiKey(dashscopeApiKey)
+                .model("qwen-vl-ocr-latest")
+                .message(userMessage)
+                .topP(0.001)
+                .temperature(0.1f)
+                .maxLength(8192)
+                .build();
+        MultiModalConversationResult result = conv.call(param);
+        return result.getOutput().getChoices().get(0).getMessage().getContent().get(0).get("text");
     }
|
|
|
|
|
|
- private String parseOcrResult(String ocrApi, Map<String, Object> paramsMap) {
|
|
|
- String body = HttpUtil.get(ocrApi, paramsMap);
|
|
|
- JSONObject jsonResult = JSONUtil.parseObj(body);
|
|
|
- String content = StrUtil.split(jsonResult.getStr("content"), "```").get(1);
|
|
|
+    /**
+     * Extracts the JSON object text from the raw model reply.
+     *
+     * @param context raw model reply; its toString() form is inspected
+     * @return "{" followed by everything after the first "{" of the selected text, followed by "}"
+     */
+    private String parseOcrResult(Object context) {
+        String raw = context.toString();
+        // Use the text that follows the first ``` fence when the reply has one. A reply without
+        // any fence is used as a whole instead of failing with IndexOutOfBoundsException on get(1).
+        String content = raw.contains("```") ? StrUtil.split(raw, "```").get(1) : raw;
+        // NOTE(review): when the selected text already ends with "}", the result carries one extra
+        // closing brace; presumably the JSON parser used by the caller tolerates it - confirm.
         return "{" + StrUtil.subAfter(content, "{", false) + "}";
     }
|
|
|
|
|
@@ -208,14 +244,6 @@ public class OCRReportParser {
|
|
|
if ("无".equals(trim) || "否".equals(trim)) {
|
|
|
return null;
|
|
|
}
|
|
|
- String value = ReportParseUtils.cleaningValue(trim);
|
|
|
- if (value == null) {
|
|
|
- return null;
|
|
|
- }
|
|
|
- // 识别到多个基金
|
|
|
- if (value.contains("、") || value.contains(",")) {
|
|
|
- return value.replaceAll("[、,]", ",");
|
|
|
- }
|
|
|
- return value;
|
|
|
+ return trim;
|
|
|
}
|
|
|
}
|