package com.walker.openocr.table;
|
|
import com.walker.openocr.RecognizeResult;
|
import com.walker.openocr.TextBlock;
|
import com.walker.openocr.util.TableConfigUtils;
|
import com.walker.openocr.util.TableObjectUtils;
|
|
import java.util.ArrayList;
|
import java.util.HashMap;
|
import java.util.Iterator;
|
import java.util.LinkedList;
|
import java.util.List;
|
import java.util.Map;
|
|
public class TableObject extends RecognizeResult<TableConfig> {
|
|
// protected final transient Logger logger = LoggerFactory.getLogger(this.getClass());
|
|
private TableConfig tableConfig = null;
|
|
// private Map<Integer, List<CellObject>> rowCache = new TreeMap<>();
|
|
// 其他还未排序的单元格集合,包含:需要的值以及无用的单元格
|
protected List<CellObject> otherCellObjectList = new ArrayList<>();
|
|
// 配置定义 与 文本快 对应关系,因为存在多个文本匹配同一个配置情况,所以需要再次比较最匹配的。
|
// 如:配置项(死亡伤残赔偿限额)
|
// “死亡伤残赔偿限额180000元” 和 “无责任死亡伤残赔偿限额18000元” 都匹配该项
|
private Map<String, CellObject> configItemBlockCache = new HashMap<>();
|
|
public TableObject(TableConfig tableConfig){
|
if(tableConfig == null){
|
throw new IllegalArgumentException("TableConfig is required!");
|
}
|
this.tableConfig = tableConfig;
|
}
|
|
@Override
|
public TableConfig getRecognizeConfig(){
|
if(this.tableConfig == null){
|
logger.error("TableConfig 不存在!");
|
return null;
|
}
|
return this.tableConfig;
|
}
|
|
/**
|
* 返回解析的表格数据结果,map中:
|
* key = 业务配置的ID,value = 单元格对象
|
* @return
|
* @date 2022-09-02
|
*/
|
public Map<String, CellObject> getTableDataMap(){
|
Map<String, CellObject> map = new HashMap<>();
|
if(this.rowCache.size() == 0){
|
return map;
|
}
|
for (List<CellObject> list : this.rowCache.values()){
|
for(CellObject co : list){
|
if(co.isConfigurable()){
|
map.put(co.getId(), co);
|
}
|
}
|
}
|
return map;
|
}
|
|
/**
|
* 计算标题对应的值。
|
*/
|
@Override
|
public void calculateValue(){
|
int cellSize = 0;
|
for(List<CellObject> rowData : this.rowCache.values()){
|
// 每行处理
|
cellSize = rowData.size();
|
CellObject current = null;
|
String cellValue = null;
|
for(int i=0; i<cellSize; i++){
|
current = rowData.get(i);
|
if(current.isConfigurable()){
|
|
if(current.isConfigNoneCell()){
|
// 不是连续单元格,只能根据内容拆分出来:标题和值
|
cellValue = TableObjectUtils.parseSplitTitleAndValue(current, true);
|
|
} else if (current.isConfigFullRow()) {
|
// 完整一行单元格
|
if(current.isConfigTwoLine()){
|
List<CellObject> nextLineCells = this.acquireTwoLineCellInfo(current);
|
cellValue = TableObjectUtils.parseFullRowTwoLineValue(rowData, current, nextLineCells);
|
} else if(current.isConfigSingleLine()){
|
cellValue = TableObjectUtils.parseFullRowTwoLineValue(rowData, current, null);
|
} else if(current.isConfigMoreLine()){
|
throw new UnsupportedOperationException("未实现完整行(多行文本)代码");
|
}
|
|
} else if(current.isConfigTwoLine()) {
|
// 普通单元格:确定两行
|
if(this.isExistValueCell(i, rowData)){
|
CellObject nextLineCell = this.acquireTwoLineCell(rowData.get(i+1));
|
if(nextLineCell != null){
|
logger.debug("找到第二行值:" + nextLineCell.getSource().getText());
|
cellValue = TableObjectUtils.parseCellTwoLineValue(rowData.get(i+1), nextLineCell, false);
|
}
|
} else {
|
// 未找到标题后面的值单元格,很可能由于不是同一行而忽略,此时需要从备选格中查找最近的行
|
CellObject nextLineCell = this.acquireTwoLineCell(rowData.get(i));
|
if(nextLineCell != null){
|
logger.debug("找到第二行值2:" + nextLineCell.getSource().getText());
|
cellValue = TableObjectUtils.parseCellTwoLineValue(nextLineCell, null, false);
|
}
|
}
|
|
} else if (current.isConfigSingleLine()) {
|
// 先解析看是否存在标题与值粘连的情况
|
cellValue = TableObjectUtils.parseSplitTitleAndValue(current, true);
|
if(cellValue == null){
|
// 普通单元格:单行
|
if(this.isExistValueCell(i, rowData)){
|
// 存在值单元格
|
cellValue = rowData.get(i+1).getSource().getText();
|
} else {
|
// 没有值单元格
|
cellValue = TableObjectUtils.parseSplitTitleAndValue(current, true);
|
}
|
}
|
|
} else {
|
throw new UnsupportedOperationException("未实现代码,不确定的单元格:" + current.getSource().getText());
|
}
|
current.setValue(cellValue);
|
}
|
}
|
}
|
}
|
|
private boolean isExistValueCell(int currentIndex, List<CellObject> rowData){
|
if(currentIndex + 1 < rowData.size()){
|
return true;
|
}
|
return false;
|
}
|
|
/**
|
* 获取两行单元格(值单元格)
|
* @param cellObject 给定的当前单元格,可以是:配置的标题格,也可以是已确定的第一行值单元格
|
* @return
|
*/
|
protected CellObject acquireTwoLineCell(CellObject cellObject){
|
// if(cellObject.isConfigurable()){
|
// throw new IllegalArgumentException("acquireTwoLineCell:不能是可配置单元格");
|
// }
|
return TableObjectUtils.findNextRowCell(cellObject
|
, this.otherCellObjectList, tableConfig.getMultiLineTolerance(), tableConfig.getCellTolerance());
|
}
|
|
protected List<CellObject> acquireTwoLineCellInfo(CellObject configurableCellObject){
|
if(!configurableCellObject.isConfigTwoLine()){
|
throw new IllegalArgumentException("acquireTwoLineCellInfo:单元格不是两行配置,无法调用该方法");
|
}
|
return TableObjectUtils.findNextRowInfo(configurableCellObject
|
, this.otherCellObjectList, tableConfig.getMultiLineTolerance(), tableConfig.getCellTolerance());
|
}
|
|
/**
|
* 添加完文本块后,排序,把相关行内容放在一起,并对每一行进行列排序。
|
*/
|
@Override
|
public void sortCellObjectList(){
|
CellObject target = null;
|
for(Iterator<CellObject> it = this.otherCellObjectList.iterator(); it.hasNext();){
|
// for(CellObject target : this.otherCellObjectList){
|
target = it.next();
|
List<CellObject> rowSet = null;
|
boolean sameRow = false;
|
for(Map.Entry<Integer, List<CellObject>> entry : this.rowCache.entrySet()){
|
rowSet = entry.getValue();
|
sameRow = TableObjectUtils.isInSameRow(rowSet.get(0), target, tableConfig.getCellTolerance());
|
if(sameRow){
|
logger.debug("找到匹配行:" + entry.getKey() + ", target=" + target.getSource().getText());
|
rowSet.add(target);
|
// 从other集合移除该单元格
|
it.remove();
|
// 退出该循环,从下一个目标元素继续执行
|
break;
|
}
|
}
|
}
|
|
// 每行对列排序
|
// int orderColumn = 0;
|
for(List<CellObject> list : this.rowCache.values()){
|
TableObjectUtils.sortColumnCellList(list);
|
}
|
}
|
|
@Override
|
public void addTextBlock(TextBlock textBlock){
|
CellObject cellObject = new CellObject();
|
cellObject.setSource(textBlock);
|
ConfigItem configItem = this.tableConfig.findConfigItem(textBlock.getText());
|
if(configItem != null){
|
if(configItem instanceof ColumnConfigItem){
|
throw new UnsupportedOperationException("未实现代码:ColumnConfigItem");
|
}
|
if(configItem instanceof CellConfigItem){
|
CellConfigItem cellConfigItem = (CellConfigItem) configItem;
|
cellObject.setCellConfigItem(cellConfigItem);
|
cellObject.setId(cellConfigItem.getId());
|
cellObject.setRowNum(cellConfigItem.getOrderNum());
|
}
|
this.putCellObjectToRowCache(cellObject, configItem);
|
|
} else {
|
cellObject.setId(this.generateCellId());
|
this.otherCellObjectList.add(cellObject);
|
}
|
}
|
|
private void putCellObjectToRowCache(CellObject cellObject, ConfigItem configItem){
|
CellObject textBlockExist = this.configItemBlockCache.get(configItem.getId());
|
if(textBlockExist == null){
|
this.configItemBlockCache.put(configItem.getId(), cellObject);
|
this.putOneRowCache(cellObject);
|
|
} else {
|
// 如果已经存在匹配配置项的文本块,说明有些文本也存在关键词,需要根据评分检查。
|
CellObject matchedCellObject = this.findMaxScoreCellObject(textBlockExist, cellObject, configItem);
|
this.configItemBlockCache.put(configItem.getId(), matchedCellObject);
|
this.putOneRowCache(matchedCellObject);
|
}
|
}
|
|
private CellObject findMaxScoreCellObject(CellObject textBlockExist, CellObject cellObject, ConfigItem configItem){
|
float existScore = TableConfigUtils.getSimpleSimilarScore(configItem.getName(), textBlockExist.getSource().getText());
|
float newScore = TableConfigUtils.getSimpleSimilarScore(configItem.getName(), cellObject.getSource().getText());
|
if(newScore > existScore){
|
logger.debug("新文本快更加匹配配置项:" + configItem.getName() + ", newScore=" + newScore + ", text=" + cellObject.getSource().getText());
|
return cellObject;
|
} else {
|
logger.debug("已有文本快更适合:" + configItem.getName() + ", existScore=" + existScore + ", text=" + textBlockExist.getSource().getText());
|
return textBlockExist;
|
}
|
}
|
|
private void putOneRowCache(CellObject cellObject){
|
List<CellObject> list = this.rowCache.get(cellObject.getRowNum());
|
if(list == null){
|
list = new LinkedList<>();
|
this.rowCache.put(cellObject.getRowNum(), list);
|
}
|
if(!list.contains(cellObject)){
|
list.add(cellObject);
|
}
|
}
|
|
private String generateCellId(){
|
return String.valueOf(System.nanoTime());
|
}
|
|
@Override
|
public void printRowCache(){
|
// List<CellObject> data = null;
|
// for(Map.Entry<Integer, List<CellObject>> entry : this.rowCache.entrySet()){
|
// logger.debug("第 " + entry.getKey() + " 行 ==========================================");
|
// data = entry.getValue();
|
// if(data != null){
|
// for(CellObject co : data){
|
// logger.debug(co.toString());
|
// }
|
// }
|
// }
|
super.printRowCache();
|
logger.debug("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~");
|
for(CellObject other : this.otherCellObjectList){
|
logger.debug(other.toString());
|
}
|
}
|
}
|