shikeying
2024-01-11 3b67e947e36133e2a40eb2737b15ea375e157ea0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
package com.walker.openocr.table;
 
import com.walker.openocr.RecognizeResult;
import com.walker.openocr.TextBlock;
import com.walker.openocr.util.TableConfigUtils;
import com.walker.openocr.util.TableObjectUtils;
 
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
 
public class TableObject extends RecognizeResult<TableConfig> {
 
//    protected final transient Logger logger = LoggerFactory.getLogger(this.getClass());
 
    private TableConfig tableConfig = null;
 
//    private Map<Integer, List<CellObject>> rowCache = new TreeMap<>();
 
    // 其他还未排序的单元格集合,包含:需要的值以及无用的单元格
    protected List<CellObject> otherCellObjectList = new ArrayList<>();
 
    // 配置定义 与 文本快 对应关系,因为存在多个文本匹配同一个配置情况,所以需要再次比较最匹配的。
    // 如:配置项(死亡伤残赔偿限额)
    // “死亡伤残赔偿限额180000元” 和 “无责任死亡伤残赔偿限额18000元” 都匹配该项
    private Map<String, CellObject> configItemBlockCache = new HashMap<>();
 
    public TableObject(TableConfig tableConfig){
        if(tableConfig == null){
            throw new IllegalArgumentException("TableConfig is required!");
        }
        this.tableConfig = tableConfig;
    }
 
    @Override
    public TableConfig getRecognizeConfig(){
        if(this.tableConfig == null){
            logger.error("TableConfig 不存在!");
            return null;
        }
        return this.tableConfig;
    }
 
    /**
     * 返回解析的表格数据结果,map中:
     * key = 业务配置的ID,value = 单元格对象
     * @return
     * @date 2022-09-02
     */
    public Map<String, CellObject> getTableDataMap(){
        Map<String, CellObject> map = new HashMap<>();
        if(this.rowCache.size() == 0){
            return map;
        }
        for (List<CellObject> list : this.rowCache.values()){
            for(CellObject co : list){
                if(co.isConfigurable()){
                    map.put(co.getId(), co);
                }
            }
        }
        return map;
    }
 
    /**
     * 计算标题对应的值。
     */
    @Override
    public void calculateValue(){
        int cellSize = 0;
        for(List<CellObject> rowData : this.rowCache.values()){
            // 每行处理
            cellSize = rowData.size();
            CellObject current = null;
            String cellValue = null;
            for(int i=0; i<cellSize; i++){
                current = rowData.get(i);
                if(current.isConfigurable()){
 
                    if(current.isConfigNoneCell()){
                        // 不是连续单元格,只能根据内容拆分出来:标题和值
                        cellValue = TableObjectUtils.parseSplitTitleAndValue(current, true);
 
                    } else if (current.isConfigFullRow()) {
                        // 完整一行单元格
                        if(current.isConfigTwoLine()){
                            List<CellObject> nextLineCells = this.acquireTwoLineCellInfo(current);
                            cellValue = TableObjectUtils.parseFullRowTwoLineValue(rowData, current, nextLineCells);
                        } else if(current.isConfigSingleLine()){
                            cellValue = TableObjectUtils.parseFullRowTwoLineValue(rowData, current, null);
                        } else if(current.isConfigMoreLine()){
                            throw new UnsupportedOperationException("未实现完整行(多行文本)代码");
                        }
 
                    } else if(current.isConfigTwoLine()) {
                        // 普通单元格:确定两行
                        if(this.isExistValueCell(i, rowData)){
                            CellObject nextLineCell = this.acquireTwoLineCell(rowData.get(i+1));
                            if(nextLineCell != null){
                                logger.debug("找到第二行值:" + nextLineCell.getSource().getText());
                                cellValue = TableObjectUtils.parseCellTwoLineValue(rowData.get(i+1), nextLineCell, false);
                            }
                        } else {
                            // 未找到标题后面的值单元格,很可能由于不是同一行而忽略,此时需要从备选格中查找最近的行
                            CellObject nextLineCell = this.acquireTwoLineCell(rowData.get(i));
                            if(nextLineCell != null){
                                logger.debug("找到第二行值2:" + nextLineCell.getSource().getText());
                                cellValue = TableObjectUtils.parseCellTwoLineValue(nextLineCell, null, false);
                            }
                        }
 
                    } else if (current.isConfigSingleLine()) {
                        // 先解析看是否存在标题与值粘连的情况
                        cellValue = TableObjectUtils.parseSplitTitleAndValue(current, true);
                        if(cellValue == null){
                            // 普通单元格:单行
                            if(this.isExistValueCell(i, rowData)){
                                // 存在值单元格
                                cellValue = rowData.get(i+1).getSource().getText();
                            } else {
                                // 没有值单元格
                                cellValue = TableObjectUtils.parseSplitTitleAndValue(current, true);
                            }
                        }
 
                    } else {
                        throw new UnsupportedOperationException("未实现代码,不确定的单元格:" + current.getSource().getText());
                    }
                    current.setValue(cellValue);
                }
            }
        }
    }
 
    private boolean isExistValueCell(int currentIndex, List<CellObject> rowData){
        if(currentIndex + 1 < rowData.size()){
            return true;
        }
        return false;
    }
 
    /**
     * 获取两行单元格(值单元格)
     * @param cellObject 给定的当前单元格,可以是:配置的标题格,也可以是已确定的第一行值单元格
     * @return
     */
    protected CellObject acquireTwoLineCell(CellObject cellObject){
//        if(cellObject.isConfigurable()){
//            throw new IllegalArgumentException("acquireTwoLineCell:不能是可配置单元格");
//        }
        return TableObjectUtils.findNextRowCell(cellObject
                , this.otherCellObjectList, tableConfig.getMultiLineTolerance(), tableConfig.getCellTolerance());
    }
 
    protected List<CellObject> acquireTwoLineCellInfo(CellObject configurableCellObject){
        if(!configurableCellObject.isConfigTwoLine()){
            throw new IllegalArgumentException("acquireTwoLineCellInfo:单元格不是两行配置,无法调用该方法");
        }
        return TableObjectUtils.findNextRowInfo(configurableCellObject
                , this.otherCellObjectList, tableConfig.getMultiLineTolerance(), tableConfig.getCellTolerance());
    }
 
    /**
     * 添加完文本块后,排序,把相关行内容放在一起,并对每一行进行列排序。
     */
    @Override
    public void sortCellObjectList(){
        CellObject target = null;
        for(Iterator<CellObject> it = this.otherCellObjectList.iterator(); it.hasNext();){
//        for(CellObject target : this.otherCellObjectList){
            target = it.next();
            List<CellObject> rowSet = null;
            boolean sameRow = false;
            for(Map.Entry<Integer, List<CellObject>> entry : this.rowCache.entrySet()){
                rowSet = entry.getValue();
                sameRow = TableObjectUtils.isInSameRow(rowSet.get(0), target, tableConfig.getCellTolerance());
                if(sameRow){
                    logger.debug("找到匹配行:" + entry.getKey() + ", target=" + target.getSource().getText());
                    rowSet.add(target);
                    // 从other集合移除该单元格
                    it.remove();
                    // 退出该循环,从下一个目标元素继续执行
                    break;
                }
            }
        }
 
        // 每行对列排序
//        int orderColumn = 0;
        for(List<CellObject> list : this.rowCache.values()){
            TableObjectUtils.sortColumnCellList(list);
        }
    }
 
    @Override
    public void addTextBlock(TextBlock textBlock){
        CellObject cellObject = new CellObject();
        cellObject.setSource(textBlock);
        ConfigItem configItem = this.tableConfig.findConfigItem(textBlock.getText());
        if(configItem != null){
            if(configItem instanceof ColumnConfigItem){
                throw new UnsupportedOperationException("未实现代码:ColumnConfigItem");
            }
            if(configItem instanceof CellConfigItem){
                CellConfigItem cellConfigItem = (CellConfigItem) configItem;
                cellObject.setCellConfigItem(cellConfigItem);
                cellObject.setId(cellConfigItem.getId());
                cellObject.setRowNum(cellConfigItem.getOrderNum());
            }
            this.putCellObjectToRowCache(cellObject, configItem);
 
        } else {
            cellObject.setId(this.generateCellId());
            this.otherCellObjectList.add(cellObject);
        }
    }
 
    private void putCellObjectToRowCache(CellObject cellObject, ConfigItem configItem){
        CellObject textBlockExist = this.configItemBlockCache.get(configItem.getId());
        if(textBlockExist == null){
            this.configItemBlockCache.put(configItem.getId(), cellObject);
            this.putOneRowCache(cellObject);
 
        } else {
            // 如果已经存在匹配配置项的文本块,说明有些文本也存在关键词,需要根据评分检查。
            CellObject matchedCellObject = this.findMaxScoreCellObject(textBlockExist, cellObject, configItem);
            this.configItemBlockCache.put(configItem.getId(), matchedCellObject);
            this.putOneRowCache(matchedCellObject);
        }
    }
 
    private CellObject findMaxScoreCellObject(CellObject textBlockExist, CellObject cellObject, ConfigItem configItem){
        float existScore = TableConfigUtils.getSimpleSimilarScore(configItem.getName(), textBlockExist.getSource().getText());
        float newScore = TableConfigUtils.getSimpleSimilarScore(configItem.getName(), cellObject.getSource().getText());
        if(newScore > existScore){
            logger.debug("新文本快更加匹配配置项:" + configItem.getName() + ", newScore=" + newScore + ", text=" + cellObject.getSource().getText());
            return cellObject;
        } else {
            logger.debug("已有文本快更适合:" + configItem.getName() + ", existScore=" + existScore + ", text=" + textBlockExist.getSource().getText());
            return textBlockExist;
        }
    }
 
    private void putOneRowCache(CellObject cellObject){
        List<CellObject> list = this.rowCache.get(cellObject.getRowNum());
        if(list == null){
            list = new LinkedList<>();
            this.rowCache.put(cellObject.getRowNum(), list);
        }
        if(!list.contains(cellObject)){
            list.add(cellObject);
        }
    }
 
    private String generateCellId(){
        return String.valueOf(System.nanoTime());
    }
 
    @Override
    public void printRowCache(){
//        List<CellObject> data = null;
//        for(Map.Entry<Integer, List<CellObject>> entry : this.rowCache.entrySet()){
//            logger.debug("第 " + entry.getKey() + " 行 ==========================================");
//            data = entry.getValue();
//            if(data != null){
//                for(CellObject co : data){
//                    logger.debug(co.toString());
//                }
//            }
//        }
        super.printRowCache();
        logger.debug("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~");
        for(CellObject other : this.otherCellObjectList){
            logger.debug(other.toString());
        }
    }
}