package com.walker.semantics.support; import com.walker.infrastructure.utils.StringUtils; import com.walker.semantics.ExtractorException; import com.walker.semantics.InputWord; import com.walker.semantics.SemanticsManager; import com.walker.semantics.SummaryExtractor; import com.walker.semantics.SummaryMeta; import com.walker.semantics.SummaryQuery; import com.walker.semantics.WordKey; import org.ansj.app.keyword.KeyWordComputer; import org.ansj.app.keyword.Keyword; import org.ansj.app.summary.SummaryComputer; import org.ansj.app.summary.TagContent; import org.ansj.app.summary.pojo.Summary; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.ArrayList; import java.util.Collection; import java.util.List; public abstract class AbstractSummaryExtractor implements SummaryExtractor { protected final transient Logger logger = LoggerFactory.getLogger(getClass()); @Override public SummaryMeta extract(SummaryQuery query) throws ExtractorException { if(query == null){ throw new ExtractorException("未提供抽取条件,无法完成摘要抽取"); } if(StringUtils.isEmpty(query.getContent())){ throw new ExtractorException("原始抽取素材不存在"); } if(query.getMinScore() <= 0){ throw new ExtractorException("关键词分值必须大于0"); } String input = query.getContent(); if(this.logger.isDebugEnabled()){ this.logger.debug(input); } InputWord inputWord = new InputWord(input); if(inputWord.getWordMetaList().size() <= 3){ throw new ExtractorException("输入内容过少", query.getContent(), true); } logger.debug(inputWord.getWordMetaList().toString()); String referenceKeywords = null; if(StringUtils.isNotEmpty(query.getKeywords())){ referenceKeywords = query.getKeywords(); } else { referenceKeywords = this.acquireKeywords(null, input); } logger.debug("referenceKeywords = {}", referenceKeywords); SummaryComputer summaryComputer = new SummaryComputer(query.getMaxLength(), null, input); Summary summary = summaryComputer.toSummary(referenceKeywords); List keywordList = summary.getKeyWords(); if(keywordList == null || keywordList.size() <= 1){ throw new ExtractorException("输入内容过少", query.getContent(), true); } double minScore = query.getMinScore(); StringBuilder title = new StringBuilder(); List wordKeyList = new ArrayList<>(keywordList.size()); for(Keyword keyword : keywordList){ if(keyword.getScore() >= minScore){ wordKeyList.add(new WordKey(keyword.getScore(), keyword.getName())); title.append(keyword.getName()); } } SummaryMeta summaryMeta = new SummaryMeta(); summaryMeta.setTitle(title.toString()); summaryMeta.setSummary(summary.getSummary()); summaryMeta.setWordKeyList(wordKeyList); // 如果存在标签,添加标签内容 if(StringUtils.isNotEmpty(query.getBeginTag()) && StringUtils.isNotEmpty(query.getEndTag())){ TagContent tagContent = new TagContent(query.getBeginTag(), query.getEndTag()); summaryMeta.setTagSummary(tagContent.tagContent(summary)); } return summaryMeta; } /** * 从内容中获取排名靠前的关键词集合,按照分值排名 * @param title 给定的参考标题,可选 * @param content 给定的原始内容素材 * @return 返回关键词字符串结果 */ private String acquireKeywords(String title, String content){ KeyWordComputer kwc = new KeyWordComputer(20); Collection result = kwc.computeArticleTfidf(title, content); if(result == null || result.size() == 0){ return StringUtils.EMPTY_STRING; } StringBuilder sb = new StringBuilder(); int size = result.size(); float currentSize = 0; for(Keyword kw : result){ if(currentSize == 0){ sb.append(kw.getName()); currentSize ++; continue; } if(currentSize/size < keywordPercent){ sb.append(kw.getName()); currentSize ++; continue; } else { break; } } return sb.toString(); } @Override public void setSemanticsManager(SemanticsManager semanticsManager) { this.semanticsManager = semanticsManager; } public SemanticsManager getSemanticsManager() { return semanticsManager; } public float getKeywordPercent() { return keywordPercent; } /** * 设置提取关键词中的百分比,不能使用全部关键词,只取前百分比部分的。 * @param keywordPercent */ public void setKeywordPercent(float keywordPercent) { this.keywordPercent = keywordPercent; } // 取前 60% 的关键词 private float keywordPercent = 0.4f; private SemanticsManager semanticsManager; }