package com.walker.openocr.table;

import com.walker.openocr.RecognizeResult;
import com.walker.openocr.TextBlock;
import com.walker.openocr.util.TableConfigUtils;
import com.walker.openocr.util.TableObjectUtils;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

/**
 * Recognition result for a table-like document.
 *
 * <p>Text blocks that match a configured item are grouped into rows (the row cache);
 * all remaining text blocks are kept in {@link #otherCellObjectList} until they are
 * either attached to a row by {@link #sortCellObjectList()} or used as second-line
 * values by {@link #calculateValue()}.
 *
 * <p>{@code logger} and {@code rowCache} are not declared here; they are inherited.
 */
public class TableObject extends RecognizeResult {

    /** Table configuration; validated as non-null by the constructor. */
    private final TableConfig tableConfig;

    /**
     * Cells that have not been assigned to a row yet. Holds both the value cells
     * that are still needed and cells that turn out to be irrelevant.
     */
    protected List<CellObject> otherCellObjectList = new ArrayList<>();

    /**
     * Config item id -> cell currently considered the best match for that item.
     * Several text blocks can match the same config item, so candidates are compared
     * and only the best one is kept. Example: the item "死亡伤残赔偿限额" is matched by
     * both "死亡伤残赔偿限额180000元" and "无责任死亡伤残赔偿限额18000元".
     */
    private Map<String, CellObject> configItemBlockCache = new HashMap<>();

    /**
     * @param tableConfig table configuration, must not be {@code null}
     * @throws IllegalArgumentException if {@code tableConfig} is {@code null}
     */
    public TableObject(TableConfig tableConfig) {
        if (tableConfig == null) {
            throw new IllegalArgumentException("TableConfig is required!");
        }
        this.tableConfig = tableConfig;
    }

    /**
     * @return the table configuration; {@code null} (after logging an error) only if
     *         the field is unexpectedly unset
     */
    @Override
    public TableConfig getRecognizeConfig() {
        if (this.tableConfig == null) {
            logger.error("TableConfig 不存在!");
            return null;
        }
        return this.tableConfig;
    }

    /**
     * Returns the parsed table data (added 2022-09-02).
     *
     * @return map of business config id -> cell object; only configurable cells of
     *         the row cache are included, and the map is empty when no row exists
     */
    public Map<String, CellObject> getTableDataMap() {
        Map<String, CellObject> result = new HashMap<>();
        if (this.rowCache.size() == 0) {
            return result;
        }
        for (List<CellObject> row : this.rowCache.values()) {
            for (CellObject cell : row) {
                if (cell.isConfigurable()) {
                    result.put(cell.getId(), cell);
                }
            }
        }
        return result;
    }

    /**
     * Computes the value belonging to every configured (title) cell and stores it
     * on that cell via {@code setValue}.
     *
     * <p>The value is resolved independently for each cell, so a cell whose value
     * cannot be found gets {@code null} instead of inheriting the value computed
     * for the previous cell of the same row.
     *
     * @throws UnsupportedOperationException for multi-line full rows and for cells
     *         whose line configuration is none of the supported kinds
     */
    @Override
    public void calculateValue() {
        for (List<CellObject> rowData : this.rowCache.values()) {
            final int cellSize = rowData.size();
            for (int i = 0; i < cellSize; i++) {
                CellObject current = rowData.get(i);
                if (!current.isConfigurable()) {
                    // Only configured title cells receive a value.
                    continue;
                }
                current.setValue(this.resolveCellValue(rowData, i, current));
            }
        }
    }

    /** Dispatches on the cell's configuration to the matching value parser. */
    private String resolveCellValue(List<CellObject> rowData, int index, CellObject current) {
        // NOTE(review): the original loop header and this predicate were lost in the
        // damaged source; "isConfigFullRow" is reconstructed from the surrounding
        // naming pattern - confirm the exact method name on CellObject.
        if (current.isConfigFullRow()) {
            return this.resolveFullRowValue(rowData, current);
        }
        if (current.isConfigTwoLine()) {
            return this.resolveTwoLineValue(rowData, index, current);
        }
        if (current.isConfigSingleLine()) {
            return this.resolveSingleLineValue(rowData, index, current);
        }
        throw new UnsupportedOperationException("未实现代码,不确定的单元格:" + current.getSource().getText());
    }

    /** Resolves the value of a cell that occupies a complete row. */
    private String resolveFullRowValue(List<CellObject> rowData, CellObject current) {
        if (current.isConfigTwoLine()) {
            List<CellObject> nextLineCells = this.acquireTwoLineCellInfo(current);
            return TableObjectUtils.parseFullRowTwoLineValue(rowData, current, nextLineCells);
        }
        if (current.isConfigSingleLine()) {
            return TableObjectUtils.parseFullRowTwoLineValue(rowData, current, null);
        }
        if (current.isConfigMoreLine()) {
            throw new UnsupportedOperationException("未实现完整行(多行文本)代码");
        }
        return null;
    }

    /** Resolves the value of an ordinary cell whose value spans two lines. */
    private String resolveTwoLineValue(List<CellObject> rowData, int index, CellObject current) {
        if (this.isExistValueCell(index, rowData)) {
            CellObject firstLineCell = rowData.get(index + 1);
            CellObject nextLineCell = this.acquireTwoLineCell(firstLineCell);
            if (nextLineCell != null) {
                logger.debug("找到第二行值:" + nextLineCell.getSource().getText());
                return TableObjectUtils.parseCellTwoLineValue(firstLineCell, nextLineCell, false);
            }
            return null;
        }
        // No value cell follows the title in this row, most likely because it was not
        // recognised as the same row; look for the nearest row among the spare cells.
        CellObject nextLineCell = this.acquireTwoLineCell(current);
        if (nextLineCell != null) {
            logger.debug("找到第二行值2:" + nextLineCell.getSource().getText());
            return TableObjectUtils.parseCellTwoLineValue(nextLineCell, null, false);
        }
        return null;
    }

    /** Resolves the value of an ordinary single-line cell. */
    private String resolveSingleLineValue(List<CellObject> rowData, int index, CellObject current) {
        // First check whether title and value are glued together in one text block.
        String cellValue = TableObjectUtils.parseSplitTitleAndValue(current, true);
        if (cellValue != null) {
            return cellValue;
        }
        if (this.isExistValueCell(index, rowData)) {
            // The next cell in the row is the value cell.
            return rowData.get(index + 1).getSource().getText();
        }
        // NOTE(review): same call and arguments as above, kept from the original;
        // presumably it yields null again - confirm whether another mode was intended.
        return TableObjectUtils.parseSplitTitleAndValue(current, true);
    }

    /** @return {@code true} if a cell follows position {@code currentIndex} in the row */
    private boolean isExistValueCell(int currentIndex, List<CellObject> rowData) {
        return currentIndex + 1 < rowData.size();
    }

    /**
     * Looks up the second-line (value) cell for the given cell among the spare cells.
     *
     * @param cellObject the reference cell; may be a configured title cell or an
     *                   already determined first-line value cell
     * @return the result of {@code TableObjectUtils.findNextRowCell}; callers treat
     *         {@code null} as "not found"
     */
    protected CellObject acquireTwoLineCell(CellObject cellObject) {
        return TableObjectUtils.findNextRowCell(cellObject, this.otherCellObjectList,
                tableConfig.getMultiLineTolerance(), tableConfig.getCellTolerance());
    }

    /**
     * Looks up the next-row cells for a title cell configured as two-line.
     *
     * @param configurableCellObject title cell with a two-line configuration
     * @return the result of {@code TableObjectUtils.findNextRowInfo}
     * @throws IllegalArgumentException if the cell is not configured as two-line
     */
    protected List<CellObject> acquireTwoLineCellInfo(CellObject configurableCellObject) {
        if (!configurableCellObject.isConfigTwoLine()) {
            throw new IllegalArgumentException("acquireTwoLineCellInfo:单元格不是两行配置,无法调用该方法");
        }
        return TableObjectUtils.findNextRowInfo(configurableCellObject, this.otherCellObjectList,
                tableConfig.getMultiLineTolerance(), tableConfig.getCellTolerance());
    }

    /**
     * To be called after all text blocks were added: moves every spare cell that lies
     * in the same row as an existing row's first cell into that row, then sorts the
     * cells of each row by column.
     */
    @Override
    public void sortCellObjectList() {
        for (Iterator<CellObject> it = this.otherCellObjectList.iterator(); it.hasNext();) {
            CellObject target = it.next();
            for (Map.Entry<Integer, List<CellObject>> entry : this.rowCache.entrySet()) {
                List<CellObject> rowSet = entry.getValue();
                if (TableObjectUtils.isInSameRow(rowSet.get(0), target, tableConfig.getCellTolerance())) {
                    logger.debug("找到匹配行:" + entry.getKey() + ", target=" + target.getSource().getText());
                    rowSet.add(target);
                    // Remove through the iterator so the list can be modified while iterating.
                    it.remove();
                    // A cell belongs to one row only; continue with the next spare cell.
                    break;
                }
            }
        }
        // Sort the columns of every row.
        for (List<CellObject> row : this.rowCache.values()) {
            TableObjectUtils.sortColumnCellList(row);
        }
    }

    /**
     * Wraps a text block into a cell. If the text matches a config item the cell is
     * placed into the row cache, otherwise it gets a generated id and is stored in
     * {@link #otherCellObjectList}.
     *
     * @throws UnsupportedOperationException if the matched item is a {@code ColumnConfigItem}
     */
    @Override
    public void addTextBlock(TextBlock textBlock) {
        CellObject cellObject = new CellObject();
        cellObject.setSource(textBlock);

        ConfigItem configItem = this.tableConfig.findConfigItem(textBlock.getText());
        if (configItem == null) {
            cellObject.setId(this.generateCellId());
            this.otherCellObjectList.add(cellObject);
            return;
        }

        if (configItem instanceof ColumnConfigItem) {
            throw new UnsupportedOperationException("未实现代码:ColumnConfigItem");
        }
        if (configItem instanceof CellConfigItem) {
            CellConfigItem cellConfigItem = (CellConfigItem) configItem;
            cellObject.setCellConfigItem(cellConfigItem);
            cellObject.setId(cellConfigItem.getId());
            cellObject.setRowNum(cellConfigItem.getOrderNum());
        }
        this.putCellObjectToRowCache(cellObject, configItem);
    }

    /**
     * Registers a cell for its config item. When another cell already matched the
     * same item, the better-scoring one is kept and the displaced cell is removed
     * from its row, so a row never holds two cells for one config item.
     */
    private void putCellObjectToRowCache(CellObject cellObject, ConfigItem configItem) {
        CellObject existing = this.configItemBlockCache.get(configItem.getId());
        if (existing == null) {
            this.configItemBlockCache.put(configItem.getId(), cellObject);
            this.putOneRowCache(cellObject);
            return;
        }
        // Another text block also contains the keyword; decide by similarity score.
        CellObject matched = this.findMaxScoreCellObject(existing, cellObject, configItem);
        if (matched != existing) {
            // The previous candidate lost: take it out of its row before adding the winner.
            List<CellObject> previousRow = this.rowCache.get(existing.getRowNum());
            if (previousRow != null) {
                previousRow.remove(existing);
            }
        }
        this.configItemBlockCache.put(configItem.getId(), matched);
        this.putOneRowCache(matched);
    }

    /**
     * Compares both cells' text with the config item's name.
     *
     * @return the new cell only if it scores strictly higher, otherwise the existing one
     */
    private CellObject findMaxScoreCellObject(CellObject textBlockExist, CellObject cellObject, ConfigItem configItem) {
        float existScore = TableConfigUtils.getSimpleSimilarScore(configItem.getName(), textBlockExist.getSource().getText());
        float newScore = TableConfigUtils.getSimpleSimilarScore(configItem.getName(), cellObject.getSource().getText());
        if (newScore > existScore) {
            logger.debug("新文本快更加匹配配置项:" + configItem.getName() + ", newScore=" + newScore + ", text=" + cellObject.getSource().getText());
            return cellObject;
        }
        logger.debug("已有文本快更适合:" + configItem.getName() + ", existScore=" + existScore + ", text=" + textBlockExist.getSource().getText());
        return textBlockExist;
    }

    /** Adds the cell to the row keyed by its row number, creating the row if needed. */
    private void putOneRowCache(CellObject cellObject) {
        List<CellObject> row = this.rowCache.get(cellObject.getRowNum());
        if (row == null) {
            row = new LinkedList<>();
            this.rowCache.put(cellObject.getRowNum(), row);
        }
        if (!row.contains(cellObject)) {
            row.add(cellObject);
        }
    }

    /** @return an id for a non-configured cell, derived from {@code System.nanoTime()} */
    private String generateCellId() {
        return String.valueOf(System.nanoTime());
    }

    /** Prints the row cache via the superclass, then every cell still unassigned. */
    @Override
    public void printRowCache() {
        super.printRowCache();
        logger.debug("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~");
        for (CellObject other : this.otherCellObjectList) {
            logger.debug(other.toString());
        }
    }
}