Haystack quick start & installation guide: https://haystack.deepset.ai/overview/quick-start#installation
关键词检索(BM25)
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
from haystack import Document

# Keyword (BM25) retrieval demo: index two documents, run one query,
# then decide whether the top hit is confident enough to answer from
# the knowledge base or whether to fall back to an LLM.
document_store = InMemoryDocumentStore()
document_store.write_documents(
    [
        Document(content="Python 是一种编程语言"),
        Document(content="Haystack 支持 RAG 架构"),
    ]
)

retriever = InMemoryBM25Retriever(document_store=document_store)
result = retriever.run(query="什么是 Haystack")
docs = result["documents"]

if not docs:
    # Nothing retrieved at all — go straight to the LLM.
    print("走 LLM")
else:
    best_doc = docs[0]
    if best_doc.score > 0.7:
        # Confident hit — answer directly from the knowledge base.
        print(best_doc.content)
    else:
        # Score too low — fall back to the LLM.
        print("调用 LLM")
向量检索
pip install "sentence-transformers>=4.1.0"
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.embedders import (
    SentenceTransformersDocumentEmbedder,
    SentenceTransformersTextEmbedder,
)
from haystack import Document

# Dense (embedding) retrieval demo.

# 1. Documents to index.
docs = [
    Document(content="LangChain 是一个 LLM 应用开发框架"),
    Document(content="Haystack 支持 RAG 架构"),
]

# 2. Document store.
document_store = InMemoryDocumentStore()

# 3. Embedding model for the documents.
doc_embedder = SentenceTransformersDocumentEmbedder(
    model="sentence-transformers/all-MiniLM-L6-v2"
)
# FIX: SentenceTransformers components must load their model via
# warm_up() before run(), otherwise run() raises a RuntimeError.
doc_embedder.warm_up()

# 4. Compute document embeddings, then 5. write them to the store.
docs_with_embeddings = doc_embedder.run(docs)["documents"]
document_store.write_documents(docs_with_embeddings)

# 6. Retriever over the stored embeddings.
retriever = InMemoryEmbeddingRetriever(document_store=document_store)

# 7. Query-side embedder — must use the SAME model as the documents
#    so query and document vectors live in the same space.
query_embedder = SentenceTransformersTextEmbedder(
    model="sentence-transformers/all-MiniLM-L6-v2"
)
query_embedder.warm_up()
query_embedding = query_embedder.run("什么是框架")["embedding"]

# 8. Retrieve by vector similarity.
result = retriever.run(query_embedding=query_embedding)
print(result)
BM25 + 向量混合检索
用户问题
↓
BM25(关键词)
Embedding(语义)
↓
合并结果(融合 / 排序)
↓
判断是否命中
↓
命中 → 返回
否则 → LLM
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.retrievers.in_memory import (
    InMemoryBM25Retriever,
    InMemoryEmbeddingRetriever,
)
from haystack.components.embedders import (
    SentenceTransformersDocumentEmbedder,
    SentenceTransformersTextEmbedder,
)
from haystack import Document

# =========================
# 1. Sample corpus
# =========================
docs = [
    Document(content="LangChain 是一个 LLM 应用开发框架"),
    Document(content="Haystack 是一个用于构建搜索和问答系统的框架"),
    Document(content="RAG 是检索增强生成"),
    Document(content="Python 是一种编程语言"),
]

# =========================
# 2. Document store (shared by both retrievers)
# =========================
document_store = InMemoryDocumentStore()

# =========================
# 3. Embedding models — the same model is used for documents and
#    queries so both end up in the same vector space.
# =========================
doc_embedder = SentenceTransformersDocumentEmbedder(
    model="sentence-transformers/all-MiniLM-L6-v2"
)
query_embedder = SentenceTransformersTextEmbedder(
    model="sentence-transformers/all-MiniLM-L6-v2"
)
# FIX: warm_up() loads the underlying model; calling run() without it
# raises a RuntimeError in Haystack 2.x.
doc_embedder.warm_up()
query_embedder.warm_up()

# Embed the documents and write them to the store.
docs_with_embeddings = doc_embedder.run(docs)["documents"]
document_store.write_documents(docs_with_embeddings)

# =========================
# 4. Retrievers: dense (embedding) + keyword (BM25) over the same store
# =========================
embedding_retriever = InMemoryEmbeddingRetriever(
    document_store=document_store
)
bm25 = InMemoryBM25Retriever(document_store=document_store)
def hybrid_retrieve(query):
    """Run BM25 and embedding retrieval for *query* and fuse the results.

    Documents returned by both retrievers get a weighted blend of their
    scores (0.6 * BM25 + 0.4 * embedding); documents returned by only
    one retriever keep that retriever's raw score. Returns the merged
    documents sorted by fused score, best first.

    NOTE(review): raw BM25 scores are unbounded while embedding
    similarities are roughly bounded, so this weighted sum mixes
    scales; rank-based fusion (e.g. RRF) is more robust — confirm
    whether that matters for this use case.
    """
    # Keyword candidates.
    bm25_docs = bm25.run(query=query)["documents"]

    # Dense candidates — the query must be embedded with the same
    # model that embedded the documents.
    query_emb = query_embedder.run(query)["embedding"]
    emb_docs = embedding_retriever.run(
        query_embedding=query_emb
    )["documents"]

    # Merge by document id, fusing scores for documents found by both.
    all_docs = {}
    for d in bm25_docs:
        all_docs[d.id] = d
    for d in emb_docs:
        if d.id in all_docs:
            existing = all_docs[d.id]
            # FIX: a retriever may leave score as None — guard before
            # doing arithmetic (weights are tunable).
            existing.score = (
                (existing.score or 0.0) * 0.6 + (d.score or 0.0) * 0.4
            )
        else:
            all_docs[d.id] = d

    # Sort best-first; a missing score sorts as 0.
    return sorted(
        all_docs.values(),
        key=lambda doc: doc.score if doc.score is not None else 0.0,
        reverse=True,
    )
# =========================
# 6. QA逻辑(关键)
# =========================
def answer(query):
    """Answer *query* from the knowledge base, or signal an LLM fallback.

    Prints the retrieved documents for inspection, then returns a
    string: the best document's content when its fused score clears
    the confidence threshold, otherwise a fallback marker (the actual
    LLM call is omitted in this demo).
    """
    docs = hybrid_retrieve(query)

    print("\n=== 检索结果 ===")
    for d in docs:
        # FIX: d.score may be None; formatting None with :.3f raises
        # TypeError, so substitute 0.0 for display.
        score = d.score if d.score is not None else 0.0
        print(f"{d.content} | score={score:.3f}")

    if not docs:
        return "❌ 未命中 → 调用LLM(这里省略)"

    best = docs[0]
    # Confidence threshold for a knowledge-base hit (tunable).
    if best.score and best.score > 0.65:
        return f"✅ 命中知识库:{best.content}"
    return "❌ 置信度低 → 调用LLM(这里省略)"
# =========================
# 7. Interactive test loop
# =========================
if __name__ == "__main__":
    # Simple REPL: keep answering until the user types "exit" or "quit".
    while True:
        question = input("\n请输入问题:")
        if question.lower() in ("exit", "quit"):
            break
        reply = answer(question)
        print("\n=== 最终答案 ===")
        print(reply)