时间:2025-02-28 11:10
人气:
作者:admin
机械图纸文章标题搜索增强实现过程
演示方式:在微信小程序“极客共享”中输入搜索内容“有没有土豆分拣机”。

| 维度 | 传统全文检索(Elasticsearch) | 搜索增强(基于语义向量) |
|---|---|---|
| 技术原理 | 基于倒排索引和关键词匹配,依赖分词和词频统计(如 BM25)。 | 基于大模型生成语义嵌入向量,使用向量相似度(如余弦相似度)匹配。 |
| 语义理解 | 仅匹配关键词,缺乏语义理解。 | 理解标题和查询的语义,支持模糊匹配和同义词匹配。 |
| 查询灵活性 | 用户查询需与标题关键词高度一致,否则结果不准确。 | 支持模糊查询和不同表述的匹配(如“土豆分拣机”匹配“马铃薯筛选机”)。 |
| 专业术语处理 | 依赖分词器,专业术语可能被错误切分(如“土豆分拣机”被切为“土豆”和“分拣机”)。 | 通过预训练模型理解专业术语和同义词的语义,减少分词错误。 |
| 结果相关性 | 基于词频和位置排序,可能返回无关结果。 | 基于语义相似度排序,结果更相关。 |
| 实时性与性能 | 倒排索引查询速度快,但语义匹配需额外插件(如 Elasticsearch KNN)。 | 向量搜索需高效索引(如 RediSearch),实时性稍逊但可优化。 |
| 适用场景 | 适合关键词明确、标题格式标准化的场景。 | 适合标题复杂、查询模糊或需语义理解的场景。 |
| 机械图纸标题搜索示例 | 查询“土豆分拣机”,仅匹配标题中包含“土豆分拣机”的图纸,遗漏“马铃薯筛选机”。 | 查询“土豆分拣机”,可匹配语义相似的标题,如“马铃薯筛选机”,因为 AI 模型理解“土豆”和“马铃薯”、“分拣”和“筛选”是同义词。 |

from FlagEmbedding import FlagModel
import pandas as pd
import numpy as np
from datasets import Dataset
from scipy.spatial import distance
import datetime
import configparser
import pymysql
# Module-level cache for the embedding model; filled on the first getModel() call.
model = None


def getModel():
    """Return the shared FlagModel instance, creating it on first use.

    The model is loaded from the local ``./model`` directory with fp16 enabled
    and the retrieval query instruction shown below. Later calls return the
    already-created instance instead of loading it again.
    """
    global model
    if model is None:
        model = FlagModel(
            "./model",
            query_instruction_for_retrieval="Represent this sentence for searching relevant passages:",
            use_fp16=True,
        )
    return model
# Get the embedding vector for a title.
def getFlagEmbedding(title):
    """Encode ``title`` with the shared model and return the resulting embedding.

    The return value is whatever ``model.encode`` produces for ``title``
    (presumably a numpy vector -- TODO confirm against FlagEmbedding).
    """
    # getModel() already maintains the module-level cache, so there is no need
    # to rebind the global here.
    return getModel().encode(title)
/// <summary>
/// Requests the embedding vector for <paramref name="keyword"/> from the embedding service
/// configured at <c>ConfigHelp.FlagSerachUrl</c>.
/// </summary>
/// <param name="keyword">Text to embed; sent as the <c>keyword</c> field of the JSON request body.</param>
/// <returns>
/// The parsed vector, or <c>null</c> when the HTTP status is not successful, the response
/// reports failure (<c>op</c> is false), the payload is missing, or an exception occurs.
/// Exceptions are logged and not rethrown.
/// </returns>
public async Task<double[]> GetFlagEmbedding(string keyword)
{
    try
    {
        var req = new
        {
            action = "getFlagEmbedding",
            keyword
        };

        // Both the request body and the response own disposable resources.
        using (var content = new StringContent(JsonSerializer.Serialize(req), Encoding.UTF8, "application/json"))
        using (var response = await _client.PostAsync(ConfigHelp.FlagSerachUrl, content))
        {
            if (!response.IsSuccessStatusCode)
            {
                return null;
            }

            var result = await response.Content.ReadAsStringAsync();
            var data = JsonSerializer.Deserialize<GetFlagEmbeddingRoot>(result);
            if (data == null || !data.op || string.IsNullOrWhiteSpace(data.msg))
            {
                return null;
            }

            // The service returns a comma-separated list of numbers. Parse with the invariant
            // culture so that "0.12" is not misread on hosts whose current culture uses ','
            // as the decimal separator.
            return data.msg
                .Split(',')
                .Select(s => double.Parse(
                    s,
                    System.Globalization.NumberStyles.Float,
                    System.Globalization.CultureInfo.InvariantCulture))
                .ToArray();
        }
    }
    catch (Exception ex)
    {
        LogUtils.Error("GetFlagEmbedding ", ex);
        return null;
    }
}
/// <summary>
/// Stores embedding vectors in Redis hashes under a common key prefix and runs KNN
/// similarity queries against a search index built over those hashes.
/// </summary>
/// <remarks>
/// Every instance opens its own Redis connection in the constructor, so an instance is
/// presumably meant to be created once and reused -- NOTE(review): confirm with callers.
/// </remarks>
public class RedisVectorHelp
{
    // Dimension declared for the "vector" field of the index. Presumably this must match
    // the output size of the embedding model -- TODO confirm.
    private const int VectorDimension = 1024;

    private readonly IDatabase _db;
    private readonly string _freefix;
    private readonly string _indexName;
    private readonly SearchCommands ft;

    /// <summary>
    /// Connects to Redis and prepares the search command wrapper.
    /// </summary>
    /// <param name="freefix">Key prefix; hashes are stored as <c>{freefix}:{id}</c> and the index is named <c>{freefix}_index</c>.</param>
    /// <param name="redisConnectionString">Connection string passed to <c>ConnectionMultiplexer.Connect</c>.</param>
    /// <param name="dbNum">Redis database number.</param>
    public RedisVectorHelp(string freefix, string redisConnectionString, int dbNum = 0)
    {
        var redis = ConnectionMultiplexer.Connect(redisConnectionString);
        _db = redis.GetDatabase(dbNum);
        _freefix = freefix;
        _indexName = _freefix + "_index";
        ft = new SearchCommands(_db, null);
    }

    /// <summary>
    /// Creates the vector index if an index with the same name does not exist yet.
    /// </summary>
    public void CreateFt()
    {
        // Skip creation when the index is already listed.
        var exists = ft._List().Any(entry => string.Equals(entry.ToString(), _indexName, StringComparison.Ordinal));
        if (exists)
        {
            Console.WriteLine("Index already exists.");
            return;
        }

        ft.Create(_indexName,
            new FTCreateParams()
                .On(IndexDataType.HASH)
                .Prefix(_freefix + ":"),
            new Schema()
                .AddTextField("id")
                .AddVectorField("vector",
                    VectorField.VectorAlgo.FLAT,
                    new Dictionary<string, object>
                    {
                        ["TYPE"] = "FLOAT32",
                        ["DIM"] = VectorDimension,
                        ["DISTANCE_METRIC"] = "COSINE"
                    })
        );
    }

    /// <summary>
    /// Stores a vector as a hash at <c>{freefix}:{id}</c> with the fields <c>id</c> and <c>vector</c>.
    /// </summary>
    /// <param name="id">Identifier stored in the <c>id</c> field and used in the key name.</param>
    /// <param name="vector">Vector components; written as raw 4-byte floats.</param>
    /// <exception cref="ArgumentNullException"><paramref name="vector"/> is null.</exception>
    public void StoreVectorData(string id, float[] vector)
    {
        if (vector == null)
        {
            throw new ArgumentNullException(nameof(vector));
        }

        var key = $"{_freefix}:{id}";

        // Write both fields in a single command so the hash is never visible with an id
        // but no vector, and to save a round trip.
        _db.HashSet(key, new[]
        {
            new HashEntry("id", id),
            new HashEntry("vector", ToBytes(vector))
        });
    }

    /// <summary>
    /// Runs a KNN query for the vectors closest to <paramref name="queryVector"/>.
    /// </summary>
    /// <param name="queryVector">Query vector; sent as raw 4-byte floats.</param>
    /// <param name="topK">Maximum number of neighbours to return.</param>
    /// <returns>The <c>id</c> field of each matching document, sorted by the <c>score</c> alias of the KNN clause.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="queryVector"/> is null.</exception>
    public List<string> SearchSimilarVectors(float[] queryVector, int topK = 50)
    {
        if (queryVector == null)
        {
            throw new ArgumentNullException(nameof(queryVector));
        }

        var list = new List<string>();
        if (topK <= 0)
        {
            // Nothing was asked for; avoid sending an invalid KNN clause to the server.
            return list;
        }

        Query q = new Query($"*=>[KNN {topK} @vector $vec as score]");
        q.SortBy = "score";
        q.AddParam("vec", ToBytes(queryVector));
        // Only the id is consumed below, so do not pull the stored vector back for every hit.
        q.ReturnFields("id");
        q.Limit(0, topK);
        q.Dialect(2);

        var found = ft.Search(_indexName, q);
        foreach (var doc in found.Documents)
        {
            list.Add(doc["id"]);
        }
        return list;
    }

    /// <summary>
    /// Copies the floats into a byte array, 4 bytes per component, in the machine's native
    /// byte order (the same layout <c>BitConverter.GetBytes(float)</c> produces).
    /// </summary>
    private static byte[] ToBytes(float[] values)
    {
        var bytes = new byte[values.Length * sizeof(float)];
        Buffer.BlockCopy(values, 0, bytes, 0, bytes.Length);
        return bytes;
    }
}
/// <summary>
/// Simple data holder pairing an identifier with its vector.
/// </summary>
public class VectorDom
{
    /// <summary>Identifier of the entry.</summary>
    public string id { get; set; }

    /// <summary>Vector components of the entry.</summary>
    public float[] vector { get; set; }
}
// Extend the id list with semantic matches: embed the keyword, then look up the 30 nearest vectors.
var searchVector = await GetFlagEmbedding(keyword);
if (searchVector != null)
{
    // Narrow each component to float before passing the vector to the search helper.
    var narrowed = new float[searchVector.Length];
    for (var i = 0; i < narrowed.Length; i++)
    {
        narrowed[i] = (float)searchVector[i];
    }

    foreach (var matchId in bykcsjRVHelp.SearchSimilarVectors(narrowed, 30))
    {
        // Keep ids unique: skip anything that is already present.
        if (ids.Contains(matchId))
        {
            continue;
        }
        ids.Add(matchId);
    }
}
Microsoft Agent Framework Skills 执行 Scripts(实
EF Core 原生 SQL 实战:FromSql、SqlQuery 与对