package com.iplatform.milvus;
|
|
import com.walker.infrastructure.utils.StringUtils;
|
import com.walker.support.milvus.DataSet;
|
import com.walker.support.milvus.DataType;
|
import com.walker.support.milvus.FieldType;
|
import com.walker.support.milvus.MetricType;
|
import com.walker.support.milvus.OperateService;
|
import com.walker.support.milvus.OutData;
|
import com.walker.support.milvus.Query;
|
import com.walker.support.milvus.Table;
|
import com.walker.support.milvus.engine.DefaultOperateService;
|
import org.slf4j.Logger;
|
import org.slf4j.LoggerFactory;
|
|
import java.util.ArrayList;
|
import java.util.Arrays;
|
import java.util.HashMap;
|
import java.util.List;
|
import java.util.Map;
|
|
public class MilvusEngine {
|
|
protected final transient Logger logger = LoggerFactory.getLogger(this.getClass());
|
|
public static final String TABLE_CHAT_SIMILAR = "chat_similar";
|
|
public MilvusEngine(String ip, int port){
|
DefaultOperateService service = new DefaultOperateService();
|
service.connect(ip, port);
|
this.operateService = service;
|
logger.info("connect milvus: {}:{}", ip, port);
|
}
|
|
public void close(){
|
if(this.operateService != null){
|
this.operateService.close();
|
}
|
}
|
|
/**
|
* 创建表:测试从聊天一键提取工单内容使用。
|
* <pre>
|
* 1) 从历史工单数据中,收集用户提问内容,整理到表中
|
* 2) 把这些数据通过向量转化,写入milvus数据库。
|
* </pre>
|
* @date 2024-03-28
|
*/
|
public void createChatSimilarTable(){
|
Table chatSimilarTable = new Table();
|
chatSimilarTable.setCollectionName(TABLE_CHAT_SIMILAR);
|
chatSimilarTable.setDescription("聊天提取工单摘要历史数据");
|
chatSimilarTable.setShardsNum(1);
|
chatSimilarTable.setDimension(768); // 这个是根据使用向量模型维度定的
|
|
// 设置字段
|
FieldType id = FieldType.newBuilder()
|
.withName("id").withPrimaryKey(true).withMaxLength(18).withDataType(DataType.Long).build();
|
FieldType title = FieldType.newBuilder()
|
.withName("title").withPrimaryKey(false).withMaxLength(180).withDataType(DataType.VarChar).build();
|
FieldType content = FieldType.newBuilder()
|
.withName("content").withPrimaryKey(false).withMaxLength(255).withDataType(DataType.VarChar).build();
|
FieldType answer = FieldType.newBuilder()
|
.withName("answer").withPrimaryKey(false).withMaxLength(255).withDataType(DataType.VarChar).build();
|
FieldType embedding = FieldType.newBuilder()
|
.withName("embedding").withPrimaryKey(false).withDataType(DataType.FloatVector).withDimension(768).build();
|
|
List<FieldType> fieldTypeList = new ArrayList<>(8);
|
fieldTypeList.add(id);
|
fieldTypeList.add(title);
|
fieldTypeList.add(content);
|
fieldTypeList.add(answer);
|
fieldTypeList.add(embedding);
|
chatSimilarTable.setFieldTypes(fieldTypeList);
|
|
this.operateService.createTable(chatSimilarTable);
|
logger.info("创建了 table = {}", chatSimilarTable.getCollectionName());
|
|
// 创建索引
|
this.operateService.createIndex(chatSimilarTable.getCollectionName()
|
, "embedding", "HNSW", "{\"nlist\":16384, \"efConstruction\":128, \"M\":8}", MetricType.NLP);
|
logger.info("创建了 index = {}", chatSimilarTable.getCollectionName() + "_index");
|
}
|
|
public void dropChatSimilarTable(){
|
this.operateService.dropTable("chat_similar");
|
}
|
|
public void insertTestData(){
|
DataSet dataSet = new DataSet();
|
dataSet.setTableName(TABLE_CHAT_SIMILAR);
|
|
List<List<Float>> vectorList = new ArrayList<>();
|
vectorList.add(Arrays.asList(mockVector));
|
vectorList.add(Arrays.asList(mockVector));
|
|
Map<String, List<?>> fieldMap = new HashMap();
|
fieldMap.put("id", Arrays.asList(new Long[]{1L, 2L}));
|
fieldMap.put("title", Arrays.asList(new String[]{"第一个标题", "第二个标题"}));
|
fieldMap.put("content", Arrays.asList(new String[]{"第一个内容", "2222"}));
|
fieldMap.put("answer", Arrays.asList(new String[]{"第一个答案", "22222222"}));
|
fieldMap.put("embedding", vectorList);
|
dataSet.setFields(fieldMap);
|
this.operateService.insertDataSet(dataSet);
|
logger.info("写入了测试数据: {}", dataSet);
|
}
|
|
public void insertEventVoList(List<EventVo> batchData){
|
if(StringUtils.isEmptyList(batchData)){
|
return;
|
}
|
List<Long> ids = new ArrayList<>(8);
|
List<String> titles = new ArrayList<>(8);
|
List<String> contents = new ArrayList<>(8);
|
List<String> answers = new ArrayList<>(8);
|
List<List<Float>> vectorSet = new ArrayList<>(8);
|
|
for(EventVo vo : batchData){
|
ids.add(vo.getId());
|
vectorSet.add(vo.getEmbedding());
|
if(StringUtils.isNotEmpty(vo.getTitle())){
|
titles.add(vo.getTitle());
|
} else {
|
titles.add("none");
|
}
|
if(StringUtils.isNotEmpty(vo.getAnswer())){
|
answers.add(vo.getAnswer());
|
} else {
|
answers.add("none");
|
}
|
if(vo.getContent().length() > 200){
|
contents.add(vo.getContent().substring(0, 200));
|
} else {
|
contents.add(vo.getContent());
|
}
|
}
|
|
DataSet dataSet = new DataSet();
|
dataSet.setTableName(TABLE_CHAT_SIMILAR);
|
|
Map<String, List<?>> fieldMap = new HashMap();
|
fieldMap.put("id", ids);
|
fieldMap.put("title", titles);
|
fieldMap.put("content", contents);
|
fieldMap.put("answer", answers);
|
fieldMap.put("embedding", vectorSet);
|
dataSet.setFields(fieldMap);
|
this.operateService.insertDataSet(dataSet);
|
logger.info("写入了: {}", ids);
|
}
|
|
public OutData searchChatSimilar(List<List<Float>> vectors){
|
Query query = new Query();
|
query.setMetricType(MetricType.NLP.getIndex());
|
query.setTableName(TABLE_CHAT_SIMILAR);
|
query.setTopK(4);
|
query.setVectorName("embedding");
|
query.setOutFieldList(Arrays.asList(new String[]{"id","title","content"}));
|
query.setFieldPrimaryKey("id");
|
query.setSearchVectors(vectors);
|
return this.operateService.searchVector(query);
|
}
|
|
/**
|
* 必须在查询之前,加载数据到内存中。
|
* @date 2024-03-31
|
*/
|
public void loadChatSimilar4Search(){
|
this.operateService.prepareSearch(TABLE_CHAT_SIMILAR);
|
}
|
|
private OperateService operateService;
|
|
// private Double[] mockVector = new Double[]{-0.051114246249198914, 0.889954432};
|
private Float[] mockVector = new Float[]{-0.051114246249198914f, 0.889954432f};
|
}
|