package com.walker.openocr.util;
import com.walker.infrastructure.utils.StringUtils;
import com.walker.openocr.Constants;
import com.walker.openocr.TextBlock;
import com.walker.openocr.table.CellObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
 * Static helpers for working with OCR text blocks and table cells:
 * keyword exclusion checks, text normalisation, row/column layout
 * comparisons, and extraction of cell values (including values that
 * wrap onto a second line).
 *
 * <p>Position arrays returned by {@code getStartPosition()} /
 * {@code getEndPosition()} are read as {@code [0] = x} and
 * {@code [1] = y} throughout this class.</p>
 */
public class TableObjectUtils {

    private static final Logger logger = LoggerFactory.getLogger(TableObjectUtils.class);

    /**
     * Checks whether any text block contains one of the excluded keywords.
     *
     * <p>Example: if a title must not contain a given company name and one
     * of the blocks does contain it, this returns {@code true}.</p>
     *
     * @param dataList     text blocks to inspect; {@code null} is treated as empty
     * @param tableKeysNot keywords that must not appear; {@code null} is treated as empty
     * @return {@code true} if at least one block's text contains at least one keyword
     * @date 2023-11-01
     */
    public static final boolean containTableKeyNot(List<TextBlock> dataList, List<String> tableKeysNot){
        if(dataList == null || tableKeysNot == null){
            return false;
        }
        for(TextBlock textBlock : dataList){
            // Blocks without text cannot contain a keyword; skip instead of failing.
            String blockText = (textBlock == null) ? null : textBlock.getText();
            if(blockText == null){
                continue;
            }
            for(String notKey : tableKeysNot){
                if(notKey != null && blockText.contains(notKey)){
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Normalises recognised text by converting selected Chinese punctuation
     * (left/right parentheses and colon, as defined in {@link Constants})
     * to the English counterparts, then trims the result.
     *
     * <p>NOTE(review): {@code replaceAll} treats its first argument as a
     * regular expression; this assumes the {@code Constants} values contain
     * no regex metacharacters - confirm against {@code Constants}.</p>
     *
     * @param text recognised text, must not be {@code null}
     * @return the normalised, trimmed text
     */
    public static final String formatText(String text){
        text = text.replaceAll(Constants.LEFT_PARENTHESES, Constants.LEFT_PARENTHESES_EN);
        text = text.replaceAll(Constants.RIGHT_PARENTHESES, Constants.RIGHT_PARENTHESES_EN);
        text = text.replaceAll(Constants.COLON_ZH, Constants.COLON_EN);
        return text.trim();
    }

    /**
     * Tells whether two cells are on the same row.
     *
     * <p>The cells are considered to share a row when either their start
     * y coordinates or their end y coordinates differ by no more than the
     * tolerance.</p>
     *
     * @param source        first cell
     * @param target        second cell
     * @param cellTolerance allowed pixel deviation
     * @return {@code true} if the cells are on the same row
     */
    public static final boolean isInSameRow(CellObject source, CellObject target, int cellTolerance){
        // First compare the start y coordinates.
        float sourceY = source.getSource().getStartPosition()[1];
        float targetY = target.getSource().getStartPosition()[1];
        if(Math.abs(sourceY - targetY) <= cellTolerance){
            return true;
        }
        // Then compare the end y coordinates.
        sourceY = source.getSource().getEndPosition()[1];
        targetY = target.getSource().getEndPosition()[1];
        return Math.abs(sourceY - targetY) <= cellTolerance;
    }

    /**
     * Assigns a 1-based column order to the cells according to ascending
     * start x coordinate and then sorts the list using the cells' natural
     * ordering.
     *
     * <p>Cells whose start x equals that of an already numbered cell are
     * not assigned an order by this method (only the first cell found for
     * each distinct x is numbered).</p>
     *
     * @param list cells of one row; {@code null} or empty lists are left untouched
     */
    public static final void sortColumnCellList(List<CellObject> list){
        if(list == null || list.isEmpty()){
            return;
        }
        int columnSort = 1;
        CellObject currentMin = findMinColumnCellObject(list, null);
        // Walk left to right. Stop as soon as no cell lies further right;
        // continuing with a null marker would restart from the leftmost cell
        // and overwrite its order with a larger number.
        while(currentMin != null){
            currentMin.setOrderColumn(columnSort);
            columnSort++;
            currentMin = findMinColumnCellObject(list, currentMin);
        }
        Collections.sort(list);
    }

    /**
     * Finds the cell with the smallest start x that lies strictly to the
     * right of {@code minCell}.
     *
     * @param list    candidate cells
     * @param minCell cell marking the already processed position, or
     *                {@code null} to search the whole list
     * @return the matching cell, or {@code null} if there is none
     */
    private static CellObject findMinColumnCellObject(List<CellObject> list, CellObject minCell){
        CellObject find = null;
        for(CellObject co : list){
            // Ignore cells at or left of the already processed position.
            if(minCell != null
                    && co.getSource().getStartPosition()[0] <= minCell.getSource().getStartPosition()[0]){
                continue;
            }
            if(find == null){
                find = co;
                continue;
            }
            if(co.getSource().getStartPosition()[0] < find.getSource().getStartPosition()[0]){
                find = co;
            }
        }
        return find;
    }

    /**
     * Collects the cells that belong to the row adjacent to the current cell.
     *
     * <p>A cell qualifies when the absolute distance between its start y and
     * the current cell's start y satisfies
     * {@code distance + cellTolerance >= lineHeight} and
     * {@code distance <= 2 * lineHeight}. Because the absolute distance is
     * used, cells above as well as below the current cell can match.</p>
     *
     * @param current       current cell (already matched to a configuration item)
     * @param others        candidate cells
     * @param lineHeight    configured height of one row
     * @param cellTolerance allowed pixel deviation
     * @return the matching cells (possibly empty), or {@code null} when
     *         {@code others} is {@code null} or empty
     */
    public static List<CellObject> findNextRowInfo(CellObject current
            , List<CellObject> others, int lineHeight, int cellTolerance){
        if(others == null || others.size() == 0){
            return null;
        }
        List<CellObject> resultList = new ArrayList<>(4);
        float currentY = current.getSource().getStartPosition()[1];
        for(CellObject co : others){
            float nextRowY = co.getSource().getStartPosition()[1];
            float distance = Math.abs(currentY - nextRowY);
            if((distance + cellTolerance) >= lineHeight && distance <= 2*lineHeight){
                logger.debug("distance=" + distance + ", lineHeight=" + lineHeight + ", text=" + co.getSource().getText());
                resultList.add(co);
            }
        }
        return resultList;
    }

    /**
     * Searches the adjacent row for one specific cell (column), e.g. for a
     * value that wraps onto a second line such as
     * "五羊-本田WH125T-9A两 / 轮摩托车".
     *
     * <ul>
     *   <li>If {@code current} is a configurable (title) cell, the first
     *       adjacent-row cell that starts at or right of the title's end x
     *       and whose start y is not below the title's end y is returned.</li>
     *   <li>Otherwise the first adjacent-row cell whose start x is within
     *       {@code cellTolerance} of the current cell's start x is returned.</li>
     * </ul>
     *
     * @param current       reference cell; not necessarily a matched title,
     *                      it may simply be a cell holding a value
     * @param others        candidate cells
     * @param lineHeight    configured height of one row
     * @param cellTolerance allowed pixel deviation
     * @return exactly one cell, or {@code null} if none qualifies
     */
    public static final CellObject findNextRowCell(CellObject current
            , List<CellObject> others, int lineHeight, int cellTolerance){
        List<CellObject> rowCellList = findNextRowInfo(current, others, lineHeight, cellTolerance);
        if(rowCellList == null || rowCellList.size() == 0){
            return null;
        }
        if(current.isConfigurable()){
            // Title cell: the value column is normally on the right and may span
            // several lines; return the upper one (the second line is ignored here).
            float currentX = current.getSource().getEndPosition()[0];
            float currentY = current.getSource().getEndPosition()[1];
            for(CellObject co : rowCellList){
                float targetX = co.getSource().getStartPosition()[0];
                if(targetX < currentX){
                    // Cells on the left are not of interest.
                    continue;
                }
                if(co.getSource().getStartPosition()[1] <= currentY){
                    logger.debug("标题格 右侧 找到多行值单元格靠上一个:" + co.getSource().getText());
                    return co;
                }
            }
        } else {
            // Plain value cell: look for the cell in the same column of the next
            // (or previous) row. With multi-line values the title column has a
            // single line, so rows can be offset against each other.
            float currentX = current.getSource().getStartPosition()[0];
            for(CellObject co : rowCellList){
                float targetX = co.getSource().getStartPosition()[0];
                if(Math.abs(currentX - targetX) <= cellTolerance){
                    logger.debug("找到匹配的下一行单元格:" + co.getSource().getText());
                    return co;
                }
            }
        }
        return null;
    }

    /**
     * Extracts the value part from a cell's text.
     *
     * <ol>
     *   <li>If {@code splitTitleValue} is set and the text contains the English
     *       colon after its first character, the text is split on the colon and
     *       the second part is returned. Any split result that does not have
     *       exactly two parts yields an empty string.</li>
     *   <li>Otherwise, for a configurable (title) cell, the configured title is
     *       removed from the text (literally, first occurrence) and the
     *       remainder is returned if it has at least
     *       {@code getMinValueSize()} characters; if the title sits at the end
     *       of the text, or the remainder is too short, an empty string is
     *       returned.</li>
     *   <li>For a non-configurable (plain value) cell the text is returned
     *       unchanged.</li>
     * </ol>
     *
     * @param cellObject      cell to read
     * @param splitTitleValue whether title and value may be separated by a colon
     * @return the extracted value, or an empty string when none could be determined
     */
    public static final String parseSplitTitleAndValue(CellObject cellObject, boolean splitTitleValue){
        String text = cellObject.getSource().getText();
        if( splitTitleValue && text.indexOf(Constants.COLON_EN) > 0){
            // NOTE(review): a value that itself contains the separator (e.g. a
            // time such as 12:30) produces more than two parts and is discarded.
            String[] titleValue = text.split(Constants.COLON_EN);
            if(titleValue.length != 2){
                logger.error("内容通过:分隔为空=" + text);
                return StringUtils.EMPTY_STRING;
            }
            return titleValue[1];
        }

        // No colon separator: the value is what remains after removing the title.
        if(cellObject.isConfigurable()){
            // Configured cells may have title and value glued together.
            String titleName = cellObject.getCellConfigItem().getName();
            // Start index of the matched title inside the text.
            int titleIndexStart = text.indexOf(titleName);
            // Remove the first occurrence of the title literally. The title must
            // not be interpreted as a regular expression: names containing
            // parentheses or other metacharacters would otherwise fail to match
            // or raise a PatternSyntaxException.
            String perhapsValue = text;
            if(titleIndexStart >= 0){
                perhapsValue = text.substring(0, titleIndexStart)
                        + text.substring(titleIndexStart + titleName.length());
            }
            // All remaining text precedes the title, i.e. the title is at the end
            // (e.g. text 车辆信核定载客 leaving 车辆信); the value is considered unreliable.
            if(titleIndexStart > 0 && perhapsValue.length() == titleIndexStart){
                logger.debug("未检索到值:匹配的标题在后面,说明值可能不正确:" + text);
                return StringUtils.EMPTY_STRING;
            }
            if(perhapsValue.length() >= cellObject.getMinValueSize()){
                logger.debug("在标题格找到可能的值:" + perhapsValue + ", title=" + titleName);
                return perhapsValue;
            }
            return StringUtils.EMPTY_STRING;
        }

        // Non-configured (plain value) cell.
        return text;
    }

    /**
     * Builds the value of a full row, which may span two lines. Multi-line
     * data beyond that is not handled here.
     *
     * <p>The result starts with the value extracted from {@code current}
     * (without colon splitting). If {@code rowData} is given, a {@code ";"}
     * follows and then the text of every non-configurable cell with a
     * positive column order, each terminated by {@code ";"}. Finally the
     * text of every non-configurable cell of {@code nextLineCells} is
     * appended, each terminated by {@code ";"}.</p>
     *
     * @param rowData       cells of the title row (multiple columns), may be {@code null}
     * @param current       current title cell
     * @param nextLineCells cells of the following line, may be {@code null}
     * @return the concatenated row value
     */
    public static final String parseFullRowTwoLineValue(List<CellObject> rowData
            , CellObject current, List<CellObject> nextLineCells){
        String result = parseSplitTitleAndValue(current, false);
        StringBuilder sb = new StringBuilder(result);
        if(rowData != null){
            sb.append(";");
            for(CellObject co : rowData){
                if(!co.isConfigurable() && co.getOrderColumn() > 0){
                    sb.append(co.getSource().getText()).append(";");
                }
            }
        }
        // Append the information of the next line.
        if(nextLineCells != null){
            for(CellObject co : nextLineCells){
                if(!co.isConfigurable()){
                    sb.append(co.getSource().getText()).append(";");
                }
            }
        }
        return sb.toString();
    }

    /**
     * Builds the value for a cell whose content may continue in a second
     * text block.
     *
     * <p>When {@code nextLineValue} is {@code null} only the current cell's
     * value is returned. Otherwise both texts are concatenated in vertical
     * order: the block with the smaller start y comes first.</p>
     *
     * @param currentValue    the current value cell
     * @param nextLineValue   the cell found in the same column of the adjacent
     *                        line, or {@code null}
     * @param splitTitleValue whether title and value may be separated by a colon
     * @return the combined value
     */
    public static final String parseCellTwoLineValue(CellObject currentValue, CellObject nextLineValue, boolean splitTitleValue){
        if(nextLineValue == null){
            return parseSplitTitleAndValue(currentValue, splitTitleValue);
        }
        StringBuilder sb = new StringBuilder();
        if(currentValue.getSource().getStartPosition()[1] < nextLineValue.getSource().getStartPosition()[1]){
            sb.append(parseSplitTitleAndValue(currentValue, splitTitleValue));
            sb.append(nextLineValue.getSource().getText());
        } else {
            sb.append(nextLineValue.getSource().getText());
            sb.append(parseSplitTitleAndValue(currentValue, splitTitleValue));
        }
        return sb.toString();
    }
}