package com.walker.semantics.support;
|
|
import com.walker.infrastructure.ApplicationRuntimeException;
import com.walker.infrastructure.utils.StringUtils;
import com.walker.semantics.InputWord;
import com.walker.semantics.OwnerTextItem;
import com.walker.semantics.OwnerTextStore;
import com.walker.semantics.SemanticsManager;
import com.walker.semantics.SpeechPart;
import com.walker.semantics.TextSimilar;
import com.walker.semantics.TextSimilarEngine;
import com.walker.semantics.WordMeta;
import com.walker.semantics.util.TextSimilarComparator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
|
|
public abstract class AbstractTextSimilarEngine implements TextSimilarEngine {
|
|
protected final transient Logger logger = LoggerFactory.getLogger(getClass());
|
|
// 存储 key = 业务ID,value = 词库关键词Hash值(多个)
|
// private final Map<String, Integer[]> idAndWordHashCache = new HashMap<>(256);
|
// key = 业务ID,value = 原始关键词内容,逗号分隔的,回显数据使用
|
// private final Map<String, String> idAndTextCache = new HashMap<>(256);
|
private final TextSimilarComparator comparator = new TextSimilarComparator();
|
|
// 2023-09-12,增加归属人,支持多个用户同时使用
|
private final Map<String, OwnerTextStore> ownerTextStoreMap = new HashMap<>(4);
|
|
@Override
|
public SemanticsManager getSemanticsManager() {
|
return this.semanticsManager;
|
}
|
|
@Override
|
public List<TextSimilar> search(String[] text, String owner) {
|
if(StringUtils.isEmpty(owner)){
|
throw new IllegalArgumentException("owner必须输入");
|
}
|
if(text == null || text.length == 0){
|
throw new IllegalArgumentException("text必须输入");
|
}
|
OwnerTextStore ownerTextStore = this.ownerTextStoreMap.get(owner);
|
if(ownerTextStore == null){
|
// throw new IllegalStateException("没有找到owner关键词配置数据,owner=" + owner);
|
logger.warn("没有找到owner关键词配置数据,owner=" + owner);
|
return null;
|
}
|
|
Integer[] inputHashCode = new Integer[text.length];
|
for(int i=0; i<text.length; i++){
|
inputHashCode[i] = text[i].hashCode();
|
}
|
|
List<TextSimilar> data = new ArrayList<>(8);
|
TextSimilar textSimilar = null;
|
// for(Map.Entry<String, Integer[]> entry : this.idAndWordHashCache.entrySet()){
|
for(Map.Entry<String, Integer[]> entry : ownerTextStore.getIdAndWordHashCache().entrySet()){
|
textSimilar = this.calculateOneSimilar(inputHashCode, entry.getKey(), entry.getValue(), ownerTextStore.getIdAndTextCache());
|
if(textSimilar != null){
|
data.add(textSimilar);
|
// 如果存在完美匹配的,后面就不看了。2023-08-17
|
if(textSimilar.isPerfect()){
|
break;
|
}
|
}
|
}
|
|
// 对集合排序
|
Collections.sort(data, comparator);
|
return data;
|
}
|
|
protected TextSimilar calculateOneSimilar(Integer[] wordsHash
|
, String id, Integer[] userKeywordHash, Map<String, String> idAndTextCache){
|
int wn = wordsHash.length;
|
int rn = userKeywordHash.length;
|
TextSimilar textSimilar = new TextSimilar();
|
textSimilar.setId(id);
|
// textSimilar.setText(this.idAndTextCache.get(id));
|
textSimilar.setText(idAndTextCache.get(id));
|
|
int matchSize = 0;
|
|
if(wn > rn){
|
// 输入多于词库
|
for(Integer userHashCode : userKeywordHash){
|
for(Integer inputHashCode : wordsHash){
|
if(userHashCode.intValue() == inputHashCode.intValue()){
|
matchSize ++;
|
break;
|
}
|
}
|
}
|
|
if(matchSize == 0){
|
return null;
|
}
|
|
if(matchSize == rn){
|
// 输入的多,词库词条全命中,额外+2分
|
textSimilar.setDis(rn + 20);
|
} else {
|
// 输入的多,词库词条半命中,额外+1分
|
textSimilar.setDis(matchSize + 10);
|
}
|
|
} else if(wn == rn){
|
// 输入等于词库
|
for(Integer userHashCode : userKeywordHash){
|
for(Integer inputHashCode : wordsHash){
|
if(userHashCode.intValue() == inputHashCode.intValue()){
|
matchSize ++;
|
break;
|
}
|
}
|
}
|
|
if(matchSize == 0){
|
return null;
|
}
|
if(matchSize == rn){
|
// 输入与词库相等,词库词条全命中,额外+2分
|
textSimilar.setDis(matchSize + 100);
|
textSimilar.setPerfect(true);
|
} else {
|
// 输入与词库相等,词库词条半命中,额外+5分
|
textSimilar.setDis(matchSize + 20);
|
}
|
|
} else if(wn < rn) {
|
// 输入少于词库
|
for(Integer inputHashCode : wordsHash){
|
for(Integer userHashCode : userKeywordHash){
|
if(inputHashCode.intValue() == userHashCode.intValue()){
|
matchSize ++;
|
break;
|
}
|
}
|
}
|
|
if(matchSize == 0){
|
return null;
|
}
|
if(matchSize == wn){
|
// 输入与词库相等,输入全命中,额外+2分
|
textSimilar.setDis(matchSize + 80);
|
} else {
|
// 输入与词库相等,输入半命中,额外+5分
|
textSimilar.setDis(matchSize + 20);
|
}
|
}
|
|
return textSimilar;
|
}
|
|
@Override
|
public void loadLibrary() {
|
// List<Variable> data = this.doLoadUserLibrary();
|
// if(StringUtils.isEmptyList(data)){
|
// logger.warn("未加载到任何用户词库,TextSimilarEngine可能无法正常工作");
|
// return;
|
// }
|
// String id = null;
|
// for(Variable v : data){
|
// id = v.getId();
|
// if(this.idAndWordHashCache.get(id) != null){
|
// throw new IllegalArgumentException("用户词库中已经存在该业务id = " + id);
|
// }
|
// this.idAndWordHashCache.put(id, SemanticsUtils.acquireWordHashCode(v.getStringValue()));
|
// this.idAndTextCache.put(id, v.getStringValue());
|
// }
|
List<OwnerTextItem> data = this.doLoadUserLibrary();
|
if(StringUtils.isEmptyList(data)){
|
logger.warn("未加载到任何用户词库,TextSimilarEngine可能无法正常工作");
|
return;
|
}
|
|
OwnerTextStore ownerTextStore = null;
|
for(OwnerTextItem item : data){
|
ownerTextStore = this.ownerTextStoreMap.get(item.getOwner());
|
if(ownerTextStore == null){
|
ownerTextStore = new OwnerTextStore(item.getOwner());
|
this.ownerTextStoreMap.put(item.getOwner(), ownerTextStore);
|
}
|
if(ownerTextStore.getWordHashCode(item.getId()) != null){
|
throw new IllegalArgumentException("用户词库中已经存在该业务id = " + item.getId() + ", owner =" + item.getOwner());
|
}
|
ownerTextStore.putWord(item.getId(), item.getText());
|
}
|
}
|
|
/**
|
* 业务系统加载自己的词库数据。
|
* <pre>
|
* 1) key = 业务ID
|
* 2) value = 关键词,多个使用英文逗号分隔
|
* </pre>
|
* @return
|
* @date 2023-08-17
|
*/
|
// protected abstract List<Variable> doLoadUserLibrary();
|
protected abstract List<OwnerTextItem> doLoadUserLibrary();
|
|
@Override
|
public void registerKeyword(String[] words, SpeechPart speechPart) {
|
this.checkSemanticsManager();
|
if(words == null || words.length == 0){
|
throw new IllegalArgumentException("请设置words");
|
}
|
if(speechPart == null){
|
speechPart = SpeechPart.MY_N;
|
}
|
try {
|
for(String word : words){
|
this.semanticsManager.registerKeyWord(0, word, speechPart);
|
}
|
} catch (Exception ex){
|
logger.error("向词库注册关键词,出现异常:" + ex.getMessage(), ex);
|
throw new ApplicationRuntimeException("向词库注册关键词,出现异常:" + ex.getMessage(), ex);
|
}
|
}
|
|
@Override
|
public void removeKeyword(String keyword) {
|
this.checkSemanticsManager();
|
this.semanticsManager.removeKeyWord(0, keyword);
|
}
|
|
@Override
|
public List<WordMeta> extract(String input) {
|
if(StringUtils.isEmpty(input)){
|
throw new IllegalArgumentException("input必须输入");
|
}
|
input = input.trim().toLowerCase();
|
InputWord inputWord = new InputWord(input);
|
return inputWord.getWordMetaList();
|
}
|
|
public void setSemanticsManager(SemanticsManager semanticsManager) {
|
this.semanticsManager = semanticsManager;
|
}
|
|
private void checkSemanticsManager(){
|
if(this.semanticsManager == null){
|
throw new IllegalStateException("semanticsManager必须设置");
|
}
|
}
|
|
private SemanticsManager semanticsManager;
|
}
|