package com.walker.openocr.util;

import com.walker.infrastructure.utils.StringUtils;
import com.walker.openocr.Constants;
import com.walker.openocr.TextBlock;
import com.walker.openocr.table.CellObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * Static helpers for locating, ordering and extracting values from recognized table cells.
 */
public class TableObjectUtils {

    private static final Logger logger = LoggerFactory.getLogger(TableObjectUtils.class);

    /**
     * Checks whether any of the given text blocks contains a keyword that must be excluded.
     * <pre>
     *     1) Example: the title must not contain "中油联合"; if it does, this returns true
     * </pre>
     * @param dataList text blocks to inspect; {@code null} is treated as empty
     * @param tableKeysNot keywords that must not appear; {@code null} is treated as empty
     * @return {@code true} if at least one block's text contains at least one keyword
     * @date 2023-11-01
     */
    public static final boolean containTableKeyNot(List<TextBlock> dataList, List<String> tableKeysNot){
        if(dataList == null || tableKeysNot == null){
            return false;
        }
        for(TextBlock textBlock : dataList){
            String blockText = textBlock.getText();
            for(String notKey : tableKeysNot){
                if(blockText.contains(notKey)){
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Normalizes recognized text by replacing the configured Chinese punctuation
     * (left/right parentheses and colon) with the English counterparts, then trimming.
     * @param text recognized text
     * @return normalized, trimmed text
     */
    public static final String formatText(String text){
        // NOTE(review): replaceAll treats the first argument as a regular expression; this is kept
        // as-is because the Constants values are not visible here — confirm they are plain literals.
        text = text.replaceAll(Constants.LEFT_PARENTHESES, Constants.LEFT_PARENTHESES_EN);
        text = text.replaceAll(Constants.RIGHT_PARENTHESES, Constants.RIGHT_PARENTHESES_EN);
        text = text.replaceAll(Constants.COLON_ZH, Constants.COLON_EN);
        return text.trim();
    }

    /**
     * Decides whether two cells are on the same row.
     * @param source first cell
     * @param target second cell
     * @param cellTolerance position tolerance, in pixels
     * @return {@code true} if either the start y coordinates or the end y coordinates
     *         differ by at most {@code cellTolerance}
     */
    public static final boolean isInSameRow(CellObject source, CellObject target, int cellTolerance){
        // First compare the start y coordinates.
        float sourceY = source.getSource().getStartPosition()[1];
        float targetY = target.getSource().getStartPosition()[1];
        if(Math.abs(sourceY - targetY) <= cellTolerance){
            return true;
        }
        // Then compare the end y coordinates.
        sourceY = source.getSource().getEndPosition()[1];
        targetY = target.getSource().getEndPosition()[1];
        return Math.abs(sourceY - targetY) <= cellTolerance;
    }

    /**
     * Assigns a 1-based column order to the cells by ascending start x coordinate and then
     * sorts the list with {@link Collections#sort(List)}.
     * <p>
     * Cells are visited in strictly increasing start x; when several cells share the same
     * start x only the first one encountered receives a new order, the others keep the order
     * they already had. A {@code null} or empty list is left untouched.
     * @param list cells of one row; sorted in place
     */
    public static final void sortColumnCellList(List<CellObject> list){
        if(list == null || list.isEmpty()){
            return;
        }
        int columnSort = 0;
        CellObject currentMin = findMinColumnCellObject(list, null);
        // Stop as soon as no cell lies further right; continuing would restart the search
        // from the left-most cell and overwrite orders that were already assigned.
        while(currentMin != null){
            columnSort++;
            currentMin.setOrderColumn(columnSort);
            currentMin = findMinColumnCellObject(list, currentMin);
        }
        Collections.sort(list);
    }

    /**
     * Finds the cell with the smallest start x coordinate that is strictly greater than
     * the start x of {@code minCell}.
     * @param list cells to search
     * @param minCell lower bound (exclusive); {@code null} means no lower bound
     * @return the matching cell, or {@code null} if none qualifies
     */
    private static CellObject findMinColumnCellObject(List<CellObject> list, CellObject minCell){
        CellObject find = null;
        for(CellObject co : list){
            // Skip cells at or left of the bound: they have already been handled.
            if(minCell != null
                    && co.getSource().getStartPosition()[0] <= minCell.getSource().getStartPosition()[0]){
                continue;
            }
            if(find == null
                    || co.getSource().getStartPosition()[0] < find.getSource().getStartPosition()[0]){
                find = co;
            }
        }
        return find;
    }

    /**
     * Collects the cells that lie about one row away from the current cell.
     * <p>
     * A cell qualifies when the absolute difference {@code d} between its start y and the
     * current cell's start y satisfies {@code d + cellTolerance >= lineHeight} and
     * {@code d <= 2 * lineHeight}. Because the absolute difference is used, cells above the
     * current one can qualify as well.
     * @param current current cell (already matched against a configuration item)
     * @param others candidate cells
     * @param lineHeight configured height of one row
     * @param cellTolerance position tolerance, in pixels
     * @return the qualifying cells (possibly empty), or {@code null} when {@code others}
     *         is {@code null} or empty
     */
    public static List<CellObject> findNextRowInfo(CellObject current
            , List<CellObject> others, int lineHeight, int cellTolerance){
        if(others == null || others.size() == 0){
            return null;
        }
        List<CellObject> resultList = new ArrayList<>(4);
        float currentY = current.getSource().getStartPosition()[1];
        for(CellObject co : others){
            float nextRowY = co.getSource().getStartPosition()[1];
            float distance = Math.abs(currentY - nextRowY);
            if((distance + cellTolerance) >= lineHeight && distance <= 2*lineHeight){
                logger.debug("distance={}, lineHeight={}, text={}", distance, lineHeight, co.getSource().getText());
                resultList.add(co);
            }
        }
        return resultList;
    }

    /**
     * Searches the neighbouring row for one specific cell (column), e.g. a value wrapped
     * over two lines: 五羊-本田WH125T-9A两 / 轮摩托车.
     * <p>
     * For a configurable (title) cell, the first candidate is returned whose start x is not
     * left of the title's end x and whose start y is not below the title's end y. For any
     * other cell, the first candidate is returned whose start x is within
     * {@code cellTolerance} of the current cell's start x.
     * @param current not necessarily a matched cell; may simply be a cell holding a value
     * @param others candidate cells
     * @param lineHeight configured height of one row
     * @param cellTolerance position tolerance, in pixels
     * @return exactly one cell, or {@code null} if none matches
     */
    public static final CellObject findNextRowCell(CellObject current
            , List<CellObject> others, int lineHeight, int cellTolerance){
        List<CellObject> rowCellList = findNextRowInfo(current, others, lineHeight, cellTolerance);
        if(rowCellList == null || rowCellList.size() == 0){
            return null;
        }
        if(current.isConfigurable()){
            // Title cell: the value column is normally to its right and spans several lines;
            // return the upper line (the second line is ignored here).
            float currentX = current.getSource().getEndPosition()[0];
            float currentY = current.getSource().getEndPosition()[1];
            for(CellObject co : rowCellList){
                float targetX = co.getSource().getStartPosition()[0];
                if(targetX < currentX){
                    // Cells on the left are ignored.
                    continue;
                }
                if(co.getSource().getStartPosition()[1] <= currentY){
                    logger.debug("标题格 右侧 找到多行值单元格靠上一个:{}", co.getSource().getText());
                    return co;
                }
            }
        } else {
            // Plain value cell: look for the cell in the same column one row below (or above),
            // because a multi-line value next to a single-line title is vertically offset.
            float currentX = current.getSource().getStartPosition()[0];
            for(CellObject co : rowCellList){
                float targetX = co.getSource().getStartPosition()[0];
                if(Math.abs(currentX - targetX) <= cellTolerance){
                    logger.debug("找到匹配的下一行单元格:{}", co.getSource().getText());
                    return co;
                }
            }
        }
        return null;
    }

    /**
     * Extracts the value part from a cell's text.
     * <ul>
     *   <li>If {@code splitTitleValue} is set and the text contains the English colon after
     *       its first character, the text is split on the colon and the second part is
     *       returned; any split that does not yield exactly two parts returns an empty string.</li>
     *   <li>Otherwise, for a configurable cell, the first occurrence of the configured title is
     *       removed and the remainder is returned if it is at least
     *       {@code getMinValueSize()} characters long; otherwise an empty string.</li>
     *   <li>Otherwise the text is returned unchanged.</li>
     * </ul>
     * @param cellObject cell to parse
     * @param splitTitleValue whether title and value are expected to be separated by a colon
     * @return the extracted value, or an empty string if none could be determined
     */
    public static final String parseSplitTitleAndValue(CellObject cellObject, boolean splitTitleValue){
        String text = cellObject.getSource().getText();
        if(splitTitleValue && text.indexOf(Constants.COLON_EN) > 0){
            // NOTE(review): a value that itself contains a colon yields more than two parts
            // and is therefore reported as empty — confirm this is intended.
            String[] titleValue = text.split(Constants.COLON_EN);
            if(titleValue.length != 2){
                logger.error("内容通过:分隔为空={}", text);
                return StringUtils.EMPTY_STRING;
            }
            return titleValue[1];
        }

        // No colon separator: the value is whatever remains after removing the title.
        if(cellObject.isConfigurable()){
            // Configured cell: handle title and value being glued together.
            String title = cellObject.getCellConfigItem().getName();
            // Start index of the matched title inside the text.
            int titleIndexStart = text.indexOf(title);
            // Remove the title literally; a regex-based replace would misinterpret titles
            // containing characters such as '(' or ')'.
            String perhapsValue = text;
            if(titleIndexStart >= 0){
                perhapsValue = text.substring(0, titleIndexStart)
                        + text.substring(titleIndexStart + title.length());
            }
            // The title sits at the very end of the text, so the remainder precedes the title
            // and is not trusted as its value.
            if(titleIndexStart > 0 && perhapsValue.length() == titleIndexStart){
                logger.debug("未检索到值:匹配的标题在后面,说明值可能不正确:{}", text);
                return StringUtils.EMPTY_STRING;
            }
            if(perhapsValue.length() >= cellObject.getMinValueSize()){
                logger.debug("在标题格找到可能的值:{}, title={}", perhapsValue, title);
                return perhapsValue;
            }
            return StringUtils.EMPTY_STRING;
        }

        // Non-configured cell (plain value).
        return text;
    }

    /**
     * Builds the value of a full-row item, which may span two lines. Multi-line data
     * (more than two lines) is not covered here.
     * <p>
     * The result starts with the value parsed from {@code current} (without colon splitting).
     * If {@code rowData} is given, a {@code ;} is appended followed by the text of every
     * non-configurable cell whose column order is greater than 0, each terminated by
     * {@code ;}. The text of every non-configurable cell of {@code nextLineCells} is
     * appended the same way.
     * @param rowData title row (several columns); may be {@code null}
     * @param current current title cell
     * @param nextLineCells cells of the next row; may be {@code null}
     * @return the concatenated value
     */
    public static final String parseFullRowTwoLineValue(List<CellObject> rowData
            , CellObject current, List<CellObject> nextLineCells){
        String result = parseSplitTitleAndValue(current, false);
        StringBuilder sb = new StringBuilder(result);
        if(rowData != null){
            sb.append(";");
            for(CellObject co : rowData){
                if(!co.isConfigurable() && co.getOrderColumn() > 0){
                    sb.append(co.getSource().getText()).append(";");
                }
            }
        }

        // Append the next row's content.
        if(nextLineCells != null){
            for(CellObject co : nextLineCells){
                if(!co.isConfigurable()){
                    sb.append(co.getSource().getText()).append(";");
                }
            }
        }
        return sb.toString();
    }

    /**
     * Builds the value of a cell whose content is spread over two text blocks.
     * <p>
     * The two parts are concatenated in top-to-bottom order, decided by comparing the
     * start y coordinates of both cells.
     * @param currentValue the current value cell
     * @param nextLineValue the cell found in the same column of the neighbouring row;
     *                      when {@code null} only {@code currentValue} is parsed
     * @param splitTitleValue passed through to
     *                        {@link #parseSplitTitleAndValue(CellObject, boolean)}
     * @return the combined value
     */
    public static final String parseCellTwoLineValue(CellObject currentValue, CellObject nextLineValue, boolean splitTitleValue){
        if(nextLineValue == null){
            return parseSplitTitleAndValue(currentValue, splitTitleValue);
        }
        StringBuilder sb = new StringBuilder();
        if(currentValue.getSource().getStartPosition()[1] < nextLineValue.getSource().getStartPosition()[1]){
            sb.append(parseSplitTitleAndValue(currentValue, splitTitleValue));
            sb.append(nextLineValue.getSource().getText());
        } else {
            sb.append(nextLineValue.getSource().getText());
            sb.append(parseSplitTitleAndValue(currentValue, splitTitleValue));
        }
        return sb.toString();
    }
}