package com.walker.openocr.util;
|
|
import com.walker.infrastructure.utils.StringUtils;
|
import com.walker.openocr.Constants;
|
import com.walker.openocr.TextBlock;
|
import com.walker.openocr.table.CellObject;
|
import org.slf4j.Logger;
|
import org.slf4j.LoggerFactory;
|
|
import java.util.ArrayList;
|
import java.util.Collections;
|
import java.util.List;
|
|
public class TableObjectUtils {
|
|
private static final transient Logger logger = LoggerFactory.getLogger(TableObjectUtils.class);
|
|
/**
|
* 给定文本集合中,是否包含应该排除的关键词。
|
* <pre>
|
* 1) 例如:标题中不能包含:中油联合,如果包含了则返回<code>true</code>
|
* </pre>
|
* @param dataList
|
* @param tableKeysNot
|
* @return
|
* @date 2023-11-01
|
*/
|
public static final boolean containTableKeyNot(List<TextBlock> dataList, List<String> tableKeysNot){
|
for(TextBlock textBlock : dataList){
|
for(String notKey : tableKeysNot){
|
if(textBlock.getText().indexOf(notKey) >= 0){
|
return true;
|
}
|
}
|
}
|
return false;
|
}
|
|
/**
|
* 格式化识别文本,把必要的中文符号转成英文。
|
* @param text
|
* @return
|
*/
|
public static final String formatText(String text){
|
text = text.replaceAll(Constants.LEFT_PARENTHESES, Constants.LEFT_PARENTHESES_EN);
|
text = text.replaceAll(Constants.RIGHT_PARENTHESES, Constants.RIGHT_PARENTHESES_EN);
|
text = text.replaceAll(Constants.COLON_ZH, Constants.COLON_EN);
|
return text.trim();
|
}
|
|
/**
|
* 两个单元格是否在一行
|
* @param source
|
* @param target
|
* @param cellTolerance 位置像素容差值
|
* @return
|
*/
|
public static final boolean isInSameRow(CellObject source, CellObject target, int cellTolerance){
|
// 用开始的y坐标比较一次
|
float sourceY = source.getSource().getStartPosition()[1];
|
float targetY = target.getSource().getStartPosition()[1];
|
if((Math.abs(sourceY - targetY)) <= cellTolerance){
|
return true;
|
}
|
|
// 用结束的y坐标再比较一次
|
sourceY = source.getSource().getEndPosition()[1];
|
targetY = target.getSource().getEndPosition()[1];
|
if((Math.abs(sourceY - targetY)) <= cellTolerance){
|
return true;
|
}
|
return false;
|
}
|
|
public static final void sortColumnCellList(List<CellObject> list){
|
int columnSort = 1;
|
CellObject min = findMinColumnCellObject(list, null);
|
min.setOrderColumn(columnSort);
|
|
CellObject currentMin = min;
|
for(CellObject cellObject : list){
|
columnSort++;
|
currentMin = findMinColumnCellObject(list, currentMin);
|
if(currentMin != null){
|
currentMin.setOrderColumn(columnSort);
|
}
|
}
|
Collections.sort(list);
|
}
|
|
private static CellObject findMinColumnCellObject(List<CellObject> list, CellObject minCell){
|
CellObject find = null;
|
for(CellObject co : list){
|
// 已经比较过的单元格忽略
|
if(minCell != null
|
&& co.getSource().getStartPosition()[0] <= minCell.getSource().getStartPosition()[0]){
|
continue;
|
}
|
if(find == null){
|
find = co;
|
continue;
|
}
|
if(co.getSource().getStartPosition()[0] < find.getSource().getStartPosition()[0]){
|
find = co;
|
}
|
}
|
return find;
|
}
|
|
/**
|
* 找到当前单元格下一行单元格集合
|
* @param current 当前单元格(已经匹配配置项)
|
* @param others 给定集合
|
* @param lineHeight 定义的每行高度
|
* @return
|
*/
|
public static List<CellObject> findNextRowInfo(CellObject current
|
, List<CellObject> others, int lineHeight, int cellTolerance){
|
if(others == null || others.size() == 0){
|
return null;
|
}
|
List<CellObject> resultList = new ArrayList<>(4);
|
float currentY = current.getSource().getStartPosition()[1];
|
float nextRowY = 0;
|
float distance = 0;
|
for(CellObject co : others){
|
nextRowY = co.getSource().getStartPosition()[1];
|
distance = Math.abs(currentY - nextRowY);
|
if((distance + cellTolerance) >= lineHeight && distance <= 2*lineHeight){
|
logger.debug("distance=" + distance + ", lineHeight=" + lineHeight + ", text=" + co.getSource().getText());
|
resultList.add(co);
|
}
|
}
|
return resultList;
|
}
|
|
/**
|
* 搜索下一行某个确定单元格(列),如:五羊-本田WH125T-9A两 / 轮摩托车
|
* @param current 该单元格不一定是匹配过的,也可能仅是一个存在值的单元格。
|
* @param others
|
* @param lineHeight
|
* @param cellTolerance
|
* @return 精确返回一个单元格
|
*/
|
public static final CellObject findNextRowCell(CellObject current
|
, List<CellObject> others, int lineHeight, int cellTolerance){
|
List<CellObject> rowCellList = findNextRowInfo(current, others, lineHeight, cellTolerance);
|
if(rowCellList == null || rowCellList.size() == 0){
|
return null;
|
}
|
if(current.isConfigurable()){
|
// 标题单元格,一般值列在右侧,而且是多行,返回最靠上一行(第二行不管)
|
float currentX = current.getSource().getEndPosition()[0];
|
float currentY = current.getSource().getEndPosition()[1];
|
float targetX = 0;
|
CellObject topCell = null;
|
for(CellObject co : rowCellList){
|
targetX = co.getSource().getStartPosition()[0];
|
if(targetX < currentX){
|
// 左侧的不管
|
continue;
|
}
|
if(co.getSource().getStartPosition()[1] <= currentY){
|
logger.debug("标题格 右侧 找到多行值单元格靠上一个:" + co.getSource().getText());
|
return co;
|
}
|
}
|
|
} else {
|
// 查找普通值单元格的下一行(或上一行),因为点多行时值为多行,标题列为单行,会错位。
|
float currentX = current.getSource().getStartPosition()[0];
|
float targetX = 0;
|
for(CellObject co : rowCellList){
|
targetX = co.getSource().getStartPosition()[0];
|
if(Math.abs(currentX - targetX) <= cellTolerance){
|
logger.debug("找到匹配的下一行单元格:" + co.getSource().getText());
|
return co;
|
}
|
}
|
}
|
return null;
|
}
|
|
public static final String parseSplitTitleAndValue(CellObject cellObject, boolean splitTitleValue){
|
String text = cellObject.getSource().getText();
|
if( splitTitleValue && text.indexOf(Constants.COLON_EN) > 0){
|
String[] titleValue = text.split(Constants.COLON_EN);
|
if(titleValue.length != 2){
|
logger.error("内容通过:分隔为空=" + text);
|
// return null;
|
return StringUtils.EMPTY_STRING;
|
}
|
return titleValue[1];
|
}
|
// 没有冒号分隔,需要去掉标题就是值
|
if(cellObject.isConfigurable()){
|
// 对于配置的单元格,需要处理标题和值粘一块的情况
|
// 标题匹配的开始索引值
|
// 如:name=车辆信核定载客, value=车辆信
|
int titleIndexStart = text.indexOf(cellObject.getCellConfigItem().getName());
|
String perhapsValue = text.replaceFirst(cellObject.getCellConfigItem().getName(), "");
|
if(titleIndexStart > 0 && perhapsValue.length() == titleIndexStart){
|
logger.debug("未检索到值:匹配的标题在后面,说明值可能不正确:" + text);
|
// return null;
|
return StringUtils.EMPTY_STRING;
|
}
|
if(perhapsValue.length() >= cellObject.getMinValueSize()){
|
logger.debug("在标题格找到可能的值:" + perhapsValue + ", title=" + cellObject.getCellConfigItem().getName());
|
return perhapsValue;
|
}
|
// return null;
|
return StringUtils.EMPTY_STRING;
|
}
|
// 如果单元格是非配置(普通值)
|
return text;
|
}
|
|
/**
|
* 解析整行单元格值数据,可能包含两行。不包含:多行数据(单独方法处理)
|
* @param rowData 标题行(有多列)
|
* @param current 当前标题单元格
|
* @param nextLineCells 下一行信息
|
* @return
|
*/
|
public static final String parseFullRowTwoLineValue(List<CellObject> rowData
|
, CellObject current, List<CellObject> nextLineCells){
|
|
String result = parseSplitTitleAndValue(current, false);
|
StringBuilder sb = new StringBuilder(result);
|
if(rowData != null){
|
sb.append(";");
|
for(CellObject co : rowData){
|
if(!co.isConfigurable() && co.getOrderColumn() > 0){
|
sb.append(co.getSource().getText()).append(";");
|
}
|
}
|
}
|
// 加上下一行信息
|
if(nextLineCells != null){
|
for(CellObject co : nextLineCells){
|
if(!co.isConfigurable()){
|
sb.append(co.getSource().getText()).append(";");
|
}
|
}
|
}
|
return sb.toString();
|
}
|
|
/**
|
* 解析两行文本块对应的值。
|
* @param currentValue 给定当前值单元格
|
* @param nextLineValue 找到的下一行同列单元格值
|
* @return
|
*/
|
public static final String parseCellTwoLineValue(CellObject currentValue, CellObject nextLineValue, boolean splitTitleValue){
|
if(nextLineValue == null){
|
return parseSplitTitleAndValue(currentValue, splitTitleValue);
|
}
|
StringBuilder sb = new StringBuilder();
|
if(currentValue.getSource().getStartPosition()[1] < nextLineValue.getSource().getStartPosition()[1]){
|
sb.append(parseSplitTitleAndValue(currentValue, splitTitleValue));
|
sb.append(nextLineValue.getSource().getText());
|
} else {
|
sb.append(nextLineValue.getSource().getText());
|
sb.append(parseSplitTitleAndValue(currentValue, splitTitleValue));
|
}
|
// String result = parseSplitTitleAndValue(currentValue);
|
// if(nextLineValue != null){
|
// sb.append(";");
|
// sb.append(nextLineValue.getSource().getText());
|
// }
|
return sb.toString();
|
}
|
}
|