package com.walker.semantics.support;
|
|
import com.walker.infrastructure.utils.StringUtils;
|
import com.walker.semantics.ExtractorException;
|
import com.walker.semantics.InputWord;
|
import com.walker.semantics.SemanticsManager;
|
import com.walker.semantics.SummaryExtractor;
|
import com.walker.semantics.SummaryMeta;
|
import com.walker.semantics.SummaryQuery;
|
import com.walker.semantics.WordKey;
|
import org.ansj.app.keyword.KeyWordComputer;
|
import org.ansj.app.keyword.Keyword;
|
import org.ansj.app.summary.SummaryComputer;
|
import org.ansj.app.summary.TagContent;
|
import org.ansj.app.summary.pojo.Summary;
|
import org.slf4j.Logger;
|
import org.slf4j.LoggerFactory;
|
|
import java.util.ArrayList;
|
import java.util.Collection;
|
import java.util.List;
|
|
public abstract class AbstractSummaryExtractor implements SummaryExtractor {
|
|
protected final transient Logger logger = LoggerFactory.getLogger(getClass());
|
|
@Override
|
public SummaryMeta extract(SummaryQuery query) throws ExtractorException {
|
if(query == null){
|
throw new ExtractorException("未提供抽取条件,无法完成摘要抽取");
|
}
|
if(StringUtils.isEmpty(query.getContent())){
|
throw new ExtractorException("原始抽取素材不存在");
|
}
|
if(query.getMinScore() <= 0){
|
throw new ExtractorException("关键词分值必须大于0");
|
}
|
|
String input = query.getContent();
|
if(this.logger.isDebugEnabled()){
|
this.logger.debug(input);
|
}
|
|
InputWord inputWord = new InputWord(input);
|
if(inputWord.getWordMetaList().size() <= 3){
|
throw new ExtractorException("输入内容过少", query.getContent(), true);
|
}
|
logger.debug(inputWord.getWordMetaList().toString());
|
|
String referenceKeywords = null;
|
if(StringUtils.isNotEmpty(query.getKeywords())){
|
referenceKeywords = query.getKeywords();
|
} else {
|
referenceKeywords = this.acquireKeywords(null, input);
|
}
|
logger.debug("referenceKeywords = {}", referenceKeywords);
|
|
SummaryComputer summaryComputer = new SummaryComputer(query.getMaxLength(), null, input);
|
Summary summary = summaryComputer.toSummary(referenceKeywords);
|
List<Keyword> keywordList = summary.getKeyWords();
|
if(keywordList == null || keywordList.size() <= 1){
|
throw new ExtractorException("输入内容过少", query.getContent(), true);
|
}
|
|
double minScore = query.getMinScore();
|
StringBuilder title = new StringBuilder();
|
|
List<WordKey> wordKeyList = new ArrayList<>(keywordList.size());
|
for(Keyword keyword : keywordList){
|
if(keyword.getScore() >= minScore){
|
wordKeyList.add(new WordKey(keyword.getScore(), keyword.getName()));
|
title.append(keyword.getName());
|
}
|
}
|
|
SummaryMeta summaryMeta = new SummaryMeta();
|
summaryMeta.setTitle(title.toString());
|
summaryMeta.setSummary(summary.getSummary());
|
summaryMeta.setWordKeyList(wordKeyList);
|
|
// 如果存在标签,添加标签内容
|
if(StringUtils.isNotEmpty(query.getBeginTag()) && StringUtils.isNotEmpty(query.getEndTag())){
|
TagContent tagContent = new TagContent(query.getBeginTag(), query.getEndTag());
|
summaryMeta.setTagSummary(tagContent.tagContent(summary));
|
}
|
return summaryMeta;
|
}
|
|
/**
|
* 从内容中获取排名靠前的关键词集合,按照分值排名
|
* @param title 给定的参考标题,可选
|
* @param content 给定的原始内容素材
|
* @return 返回关键词字符串结果
|
*/
|
private String acquireKeywords(String title, String content){
|
KeyWordComputer kwc = new KeyWordComputer(20);
|
Collection<Keyword> result = kwc.computeArticleTfidf(title, content);
|
if(result == null || result.size() == 0){
|
return StringUtils.EMPTY_STRING;
|
}
|
|
StringBuilder sb = new StringBuilder();
|
int size = result.size();
|
float currentSize = 0;
|
for(Keyword kw : result){
|
if(currentSize == 0){
|
sb.append(kw.getName());
|
currentSize ++;
|
continue;
|
}
|
if(currentSize/size < keywordPercent){
|
sb.append(kw.getName());
|
currentSize ++;
|
continue;
|
} else {
|
break;
|
}
|
}
|
return sb.toString();
|
}
|
|
@Override
|
public void setSemanticsManager(SemanticsManager semanticsManager) {
|
this.semanticsManager = semanticsManager;
|
}
|
|
public SemanticsManager getSemanticsManager() {
|
return semanticsManager;
|
}
|
|
public float getKeywordPercent() {
|
return keywordPercent;
|
}
|
|
/**
|
* 设置提取关键词中的百分比,不能使用全部关键词,只取前百分比部分的。
|
* @param keywordPercent
|
*/
|
public void setKeywordPercent(float keywordPercent) {
|
this.keywordPercent = keywordPercent;
|
}
|
|
// 取前 60% 的关键词
|
private float keywordPercent = 0.4f;
|
private SemanticsManager semanticsManager;
|
}
|