LLM
llama.cpp 是一个用于在 C/C++ 中进行大语言模型(LLM)推理的工具。其主要目标是通过最小化配置,并在各种硬件上实现最先进的性能来进行 LLM 推理。
依赖库:llama-cpp-python
from llama_cpp import Llama
# 1. Initialise the model (do this only once per process).
# n_gpu_layers:
#   - AMD/Intel on Windows: try -1 (offload everything to GPU) or a specific layer count.
#   - If VRAM runs out, llama.cpp falls back to CPU instead of crashing.
llm = Llama(
    model_path=r"C:\Users\wjn\.lmstudio\models\chaparro2001\Qwen3-4B-Instruct-2507-Q4_K_M-GGUF\qwen3-4b-instruct-2507-q4_k_m.gguf",
    n_ctx=4096,        # context window (tokens)
    n_threads=4,       # CPU worker threads
    n_gpu_layers=-1,   # key knob: try to offload all layers to GPU (Vulkan/DirectML)
    verbose=False      # keep logs quiet for production use
)
# 2. RAG prompt template (adjust to the model's chat format)
def create_prompt(context, question):
    """Render the ChatML prompt: system rules plus known facts, then the question."""
    lines = [
        "<|im_start|>system",
        "你是一个机器狗助手。请根据以下已知信息回答用户问题。如果不知道,就说不知道。",
        f"已知信息:{context}",
        "<|im_end|>",
        "<|im_start|>user",
        f"{question}<|im_end|>",
        "<|im_start|>assistant",
        "",
    ]
    return "\n".join(lines)
# 3. Inference entry point (streams tokens as they are generated)
def run_inference(question, context=""):
    """Stream the model's answer to ``question`` grounded on ``context``.

    Tokens are echoed as they arrive; the concatenated text is returned.
    """
    prompt = create_prompt(context, question)
    stream = llm(
        prompt,
        max_tokens=512,
        stop=["<|im_end|>", "user:"],   # cut generation at end-of-turn markers
        stream=True,
        temperature=0.7,
    )
    chunks = []
    for event in stream:
        piece = event['choices'][0]['text']
        chunks.append(piece)
        print(piece, end="", flush=True)  # live echo (or feed straight to TTS)
    return "".join(chunks)
# Smoke test: answer one question against a fixed context snippet.
if __name__ == "__main__":
    print("模型加载完毕,等待指令...")
    run_inference("机器狗怎么充电?", "机器狗通过底部的磁吸触点充电,耗时约2小时。")
from llama_cpp import Llama
# =============================
# 1. Initialise the model (load once per process)
# =============================
print("正在加载模型,请稍候...")
llm = Llama(
    model_path=r"C:\Users\wjn\.lmstudio\models\chaparro2001\Qwen3-4B-Instruct-2507-Q4_K_M-GGUF\qwen3-4b-instruct-2507-q4_k_m.gguf",
    n_ctx=4096,        # context window (tokens)
    n_threads=4,       # CPU worker threads
    n_gpu_layers=-1,   # offload all layers to GPU when the backend allows it
    verbose=False      # suppress llama.cpp logging
)
print("模型加载完毕,可以开始对话。输入 exit 退出。")
# =============================
# 2. Prompt template
# =============================
def create_prompt(context, question):
    """Render the ChatML prompt: system rules plus known facts, then the question."""
    lines = [
        "<|im_start|>system",
        "你是一个机器狗助手。请根据以下已知信息回答用户问题。如果不知道,就说不知道。",
        f"已知信息:{context}",
        "<|im_end|>",
        "<|im_start|>user",
        f"{question}",
        "<|im_end|>",
        "<|im_start|>assistant",
        "",
    ]
    return "\n".join(lines)
# =============================
# 3. Inference (streaming output)
# =============================
def run_inference(question, context=""):
    """Stream an answer for ``question``; echo tokens live and return the full text."""
    prompt = create_prompt(context, question)
    stream = llm(
        prompt,
        max_tokens=512,
        stop=["<|im_end|>", "<|im_start|>"],
        stream=True,
        temperature=0.7,
    )
    chunks = []
    for event in stream:
        piece = event["choices"][0]["text"]
        chunks.append(piece)
        print(piece, end="", flush=True)
    print("\n")  # blank line after the streamed answer
    return "".join(chunks)
# =============================
# 4. Interactive terminal loop
# =============================
if __name__ == "__main__":
    context = "机器狗通过底部的磁吸触点充电,耗时约2小时。"
    while True:
        line = input("你: ")
        if line.lower() in ("exit", "quit"):
            print("已退出。")
            break
        print("助手: ", end="")
        run_inference(line, context)
记忆版(在基础版之上增加多轮对话记忆)
from llama_cpp import Llama
# =============================
# 1. Initialise the model (load once per process)
# =============================
print("正在加载模型,请稍候...")
llm = Llama(
    model_path=r"C:\Users\wjn\.lmstudio\models\chaparro2001\Qwen3-4B-Instruct-2507-Q4_K_M-GGUF\qwen3-4b-instruct-2507-q4_k_m.gguf",
    n_ctx=4096,        # context window (tokens)
    n_threads=4,       # CPU worker threads
    n_gpu_layers=-1,   # offload all layers to GPU when possible
    verbose=False      # suppress llama.cpp logging
)
print("模型加载完毕,可以开始对话。输入 exit 退出。")
# =============================
# 2. Conversation memory container
# =============================
history = []             # list of (user, assistant) turns, oldest first
MAX_HISTORY_TURNS = 10   # keep at most the last 10 turns in the prompt
# =============================
# 3. Build the prompt
# =============================
def build_prompt(history, context, new_question):
    """Render system message, prior turns, and the new question as ChatML."""
    pieces = [
        "<|im_start|>system\n"
        "你是一个智能助手。\n"
        "如果已知信息中包含答案,请优先使用。\n"
        "如果没有,可以使用你的常识回答。\n"
        f"已知信息:{context}\n"
        "<|im_end|>\n"
    ]
    # Replay every remembered turn in order.
    for user_msg, assistant_msg in history:
        pieces.append(f"<|im_start|>user\n{user_msg}<|im_end|>\n")
        pieces.append(f"<|im_start|>assistant\n{assistant_msg}<|im_end|>\n")
    pieces.append(f"<|im_start|>user\n{new_question}<|im_end|>\n")
    pieces.append("<|im_start|>assistant\n")
    return "".join(pieces)
# =============================
# 4. Inference (streaming, with memory)
# =============================
def run_inference(question, context=""):
    """Stream an answer that incorporates prior turns, then record the turn."""
    global history
    prompt = build_prompt(history, context, question)
    stream = llm(
        prompt,
        max_tokens=512,
        stop=["<|im_end|>", "<|im_start|>"],
        stream=True,
        temperature=0.7,
    )
    chunks = []
    for event in stream:
        piece = event["choices"][0]["text"]
        chunks.append(piece)
        print(piece, end="", flush=True)
    print("\n")
    full_response = "".join(chunks)
    # Remember this turn for future prompts.
    history.append((question, full_response.strip()))
    # Cap the history so the prompt cannot grow without bound.
    if len(history) > MAX_HISTORY_TURNS:
        history = history[-MAX_HISTORY_TURNS:]
    return full_response
# =============================
# 5. Interactive terminal loop
# =============================
if __name__ == "__main__":
    context = "机器狗通过底部的磁吸触点充电,耗时约2小时。"
    while True:
        line = input("你: ")
        if line.lower() in ("exit", "quit"):
            print("已退出。")
            break
        print("助手: ", end="")
        run_inference(line, context)
🧠 带向量数据库 ⚡ 支持长期对话记忆 🎯 防幻觉优化版 Prompt
uv add faiss-cpu sentence-transformers
import os
import faiss
import numpy as np
from llama_cpp import Llama
from sentence_transformers import SentenceTransformer
# =========================
# 1️⃣ Vector store
# =========================
class VectorStore:
    """Tiny persistent semantic store: a FAISS L2 index plus a parallel text list.

    The index is persisted to ``index_path`` and the raw texts to
    ``faiss_texts.npy`` so the knowledge base survives restarts.
    """

    def __init__(self, index_path="faiss.index"):
        # all-MiniLM-L6-v2 emits 384-dimensional embeddings.
        self.model = SentenceTransformer("all-MiniLM-L6-v2")
        self.dimension = 384
        self.index_path = index_path
        if os.path.exists(index_path):
            self.index = faiss.read_index(index_path)
            self.texts = np.load("faiss_texts.npy", allow_pickle=True).tolist()
        else:
            self.index = faiss.IndexFlatL2(self.dimension)
            self.texts = []

    def save(self):
        """Persist both the FAISS index and the text list to disk."""
        faiss.write_index(self.index, self.index_path)
        np.save("faiss_texts.npy", np.array(self.texts, dtype=object))

    def add(self, text):
        """Embed one text, append it to the index, and persist immediately."""
        embedding = self.model.encode([text])
        self.index.add(np.array(embedding).astype("float32"))
        self.texts.append(text)
        self.save()

    def search(self, query, k=3):
        """Return up to ``k`` stored texts most similar to ``query``.

        Bug fix: FAISS pads missing results with index -1 when the store
        holds fewer than ``k`` entries; the old ``i < len(self.texts)``
        check let -1 through, spuriously returning ``texts[-1]``. The
        lower bound guard drops those padding slots.
        """
        if not self.texts:
            return []
        embedding = self.model.encode([query])
        _, ids = self.index.search(np.array(embedding).astype("float32"), k)
        return [self.texts[i] for i in ids[0] if 0 <= i < len(self.texts)]
# =========================
# 2️⃣ Memory management
# =========================
class MemoryManager:
    """Two-tier conversation memory: recent turns plus an accumulated archive."""

    def __init__(self, max_short=6):
        self.short_term = []      # recent (user, assistant) tuples
        self.long_term = ""       # flattened transcript of older turns
        self.max_short = max_short

    def add(self, user, assistant):
        """Record one turn; archive the buffer once it exceeds max_short."""
        self.short_term.append((user, assistant))
        if len(self.short_term) > self.max_short:
            self.summarize()

    def summarize(self):
        """Flush short-term turns into long_term (plain concatenation, no LLM)."""
        flushed = "".join(f"用户:{u}\n助手:{a}\n" for u, a in self.short_term)
        self.long_term += flushed
        self.short_term = []

    def get_context(self):
        """Return archive + recent turns as one transcript string."""
        recent = "".join(f"用户:{u}\n助手:{a}\n" for u, a in self.short_term)
        return self.long_term + "\n" + recent
# =========================
# 3️⃣ Anti-hallucination prompt
# =========================
def create_prompt(rag_context, memory_context, question):
    """Build the grounded ChatML prompt from retrieved snippets and memory."""
    lines = [
        "<|im_start|>system",
        "你是一个严谨的智能助手。",
        "规则:",
        "1. 优先使用【知识库内容】回答。",
        "2. 如果知识库没有相关内容,可以使用常识。",
        "3. 不允许编造数据。",
        "4. 不确定就说“我不确定”。",
        "【知识库内容】",
        f"{rag_context}",
        "【历史对话】",
        f"{memory_context}",
        "<|im_end|>",
        "<|im_start|>user",
        f"{question}",
        "<|im_end|>",
        "<|im_start|>assistant",
        "",
    ]
    return "\n".join(lines)
# =========================
# 4️⃣ Load the model (once per process)
# =========================
print("正在加载模型...")
llm = Llama(
    model_path=r"C:\Users\wjn\.lmstudio\models\chaparro2001\Qwen3-4B-Instruct-2507-Q4_K_M-GGUF\qwen3-4b-instruct-2507-q4_k_m.gguf",
    n_ctx=4096,        # context window (tokens)
    n_threads=4,       # CPU worker threads
    n_gpu_layers=-1,   # offload all layers to GPU when possible
    verbose=False      # suppress llama.cpp logging
)
print("模型加载完成!")
# =========================
# 5️⃣ Wire up the modules
# =========================
vector_store = VectorStore()
memory = MemoryManager()
# Seed the knowledge base on the very first run (the store is empty only then).
if len(vector_store.texts) == 0:
    vector_store.add("机器狗通过底部磁吸触点充电,耗时约2小时。")
    vector_store.add("机器狗支持语音控制和自动避障功能。")
# =========================
# 6️⃣ Inference
# =========================
def run_inference(question):
    """Answer ``question`` using RAG retrieval plus conversation memory (streamed)."""
    # Retrieve supporting snippets from the knowledge base.
    rag_context = "\n".join(vector_store.search(question))
    # Conversation memory.
    memory_context = memory.get_context()
    prompt = create_prompt(rag_context, memory_context, question)
    stream = llm(
        prompt,
        max_tokens=512,
        temperature=0.7,
        stop=["<|im_end|>"],
        stream=True,
    )
    chunks = []
    for event in stream:
        piece = event["choices"][0]["text"]
        chunks.append(piece)
        print(piece, end="", flush=True)
    print("\n")
    answer = "".join(chunks)
    memory.add(question, answer.strip())
    return answer.strip()
# =========================
# 7️⃣ Terminal loop
# =========================
if __name__ == "__main__":
    print("本地 RAG Agent 启动成功!输入 exit 退出。\n")
    while True:
        line = input("你: ")
        if line.lower() in ("exit", "quit"):
            print("已退出。")
            break
        print("助手: ", end="")
        run_inference(line)
LLM & TTS(在 RAG 版之上增加语音播报)
import os
import faiss
import numpy as np
from llama_cpp import Llama
from sentence_transformers import SentenceTransformer
import pyttsx3
# # 初始化引擎
# engine = pyttsx3.init()
# voices = engine.getProperty('voices')
# # 尝试设置中文声音,如果没有则默认
# if len(voices) > 0:
# # 这里可以根据实际系统调整索引,Windows 中文通常是 0 或 1
# engine.setProperty('voice', voices[0].id)
# engine.setProperty('rate', 180) # 稍微调快一点,因为是一次性读句子
# engine.setProperty('volume', 1.0)
# =========================
# 1️⃣ Vector store
# =========================
class VectorStore:
    """Tiny persistent semantic store: a FAISS L2 index plus a parallel text list.

    The index is persisted to ``index_path`` and the raw texts to
    ``faiss_texts.npy`` so the knowledge base survives restarts.
    """

    def __init__(self, index_path="faiss.index"):
        # all-MiniLM-L6-v2 emits 384-dimensional embeddings.
        self.model = SentenceTransformer("all-MiniLM-L6-v2")
        self.dimension = 384
        self.index_path = index_path
        if os.path.exists(index_path):
            self.index = faiss.read_index(index_path)
            self.texts = np.load("faiss_texts.npy", allow_pickle=True).tolist()
        else:
            self.index = faiss.IndexFlatL2(self.dimension)
            self.texts = []

    def save(self):
        """Persist both the FAISS index and the text list to disk."""
        faiss.write_index(self.index, self.index_path)
        np.save("faiss_texts.npy", np.array(self.texts, dtype=object))

    def add(self, text):
        """Embed one text, append it to the index, and persist immediately."""
        embedding = self.model.encode([text])
        self.index.add(np.array(embedding).astype("float32"))
        self.texts.append(text)
        self.save()

    def search(self, query, k=3):
        """Return up to ``k`` stored texts most similar to ``query``.

        Bug fix: FAISS pads missing results with index -1 when the store
        holds fewer than ``k`` entries; the old ``i < len(self.texts)``
        check let -1 through, spuriously returning ``texts[-1]``. The
        lower bound guard drops those padding slots.
        """
        if not self.texts:
            return []
        embedding = self.model.encode([query])
        _, ids = self.index.search(np.array(embedding).astype("float32"), k)
        return [self.texts[i] for i in ids[0] if 0 <= i < len(self.texts)]
# =========================
# 2️⃣ Memory management
# =========================
class MemoryManager:
    """Two-tier conversation memory: recent turns plus an accumulated archive."""

    def __init__(self, max_short=6):
        self.short_term = []      # recent (user, assistant) tuples
        self.long_term = ""       # flattened transcript of older turns
        self.max_short = max_short

    def add(self, user, assistant):
        """Record one turn; archive the buffer once it exceeds max_short."""
        self.short_term.append((user, assistant))
        if len(self.short_term) > self.max_short:
            self.summarize()

    def summarize(self):
        """Flush short-term turns into long_term (plain concatenation, no LLM)."""
        flushed = "".join(f"用户:{u}\n助手:{a}\n" for u, a in self.short_term)
        self.long_term += flushed
        self.short_term = []

    def get_context(self):
        """Return archive + recent turns as one transcript string."""
        recent = "".join(f"用户:{u}\n助手:{a}\n" for u, a in self.short_term)
        return self.long_term + "\n" + recent
# =========================
# 3️⃣ Anti-hallucination prompt
# =========================
def create_prompt(rag_context, memory_context, question):
    """Build the grounded ChatML prompt from retrieved snippets and memory."""
    lines = [
        "<|im_start|>system",
        "你是一个严谨的智能助手。",
        "规则:",
        "1. 优先使用【知识库内容】回答。",
        "2. 如果知识库没有相关内容,可以使用常识。",
        "3. 不允许编造数据。",
        "4. 不确定就说“我不确定”。",
        "【知识库内容】",
        f"{rag_context}",
        "【历史对话】",
        f"{memory_context}",
        "<|im_end|>",
        "<|im_start|>user",
        f"{question}",
        "<|im_end|>",
        "<|im_start|>assistant",
        "",
    ]
    return "\n".join(lines)
# =========================
# 4️⃣ Load the model (once per process)
# =========================
print("正在加载模型...")
llm = Llama(
    model_path=r"C:\Users\wjn\.lmstudio\models\chaparro2001\Qwen3-4B-Instruct-2507-Q4_K_M-GGUF\qwen3-4b-instruct-2507-q4_k_m.gguf",
    n_ctx=4096,        # context window (tokens)
    n_threads=4,       # CPU worker threads
    n_gpu_layers=-1,   # offload all layers to GPU when possible
    verbose=False      # suppress llama.cpp logging
)
print("模型加载完成!")
# =========================
# 5️⃣ Wire up the modules
# =========================
vector_store = VectorStore()
memory = MemoryManager()
# Seed the knowledge base on the very first run (the store is empty only then).
if len(vector_store.texts) == 0:
    vector_store.add("机器狗通过底部磁吸触点充电,耗时约2小时。")
    vector_store.add("机器狗支持语音控制和自动避障功能。")
def sepeak(content):
    """Speak ``content`` aloud, blocking until playback finishes.

    NOTE(review): the name is a typo for "speak"; kept because callers in
    this file use it. A fresh pyttsx3 engine is created per call — slower,
    but presumably sidesteps pyttsx3's engine-reuse issues — confirm.
    """
    engine = pyttsx3.init()
    voices = engine.getProperty('voices')
    # Voice 0 is usually the system default; adjust the index for a Chinese voice.
    if len(voices) > 0:
        engine.setProperty('voice', voices[0].id)
    engine.setProperty('rate', 180)    # slightly fast: whole sentences are read at once
    engine.setProperty('volume', 1.0)
    engine.say(content)
    engine.runAndWait()                # blocks until speech completes
    engine.stop()
# =========================
# 6️⃣ Inference
# =========================
def run_inference(question):
    """RAG + memory answer for ``question``; streams text, then speaks it."""
    # Retrieve supporting snippets from the knowledge base.
    rag_context = "\n".join(vector_store.search(question))
    # Conversation memory.
    memory_context = memory.get_context()
    prompt = create_prompt(rag_context, memory_context, question)
    stream = llm(
        prompt,
        max_tokens=512,
        temperature=0.7,
        stop=["<|im_end|>"],
        stream=True,
    )
    chunks = []
    for event in stream:
        piece = event["choices"][0]["text"]
        chunks.append(piece)
        print(piece, end="", flush=True)
    print("\n")
    answer = "".join(chunks)
    # Speak only after generation completes.
    sepeak(answer.strip())
    memory.add(question, answer.strip())
    return answer.strip()
# =========================
# 7️⃣ Terminal loop
# =========================
if __name__ == "__main__":
    print("本地 RAG Agent 启动成功!输入 exit 退出。\n")
    while True:
        line = input("你: ")
        if line.lower() in ("exit", "quit"):
            print("已退出。")
            break
        print("助手: ", end="")
        run_inference(line)
GUI(增加知识库管理图形界面)
import os
import faiss
import numpy as np
from llama_cpp import Llama
from sentence_transformers import SentenceTransformer
import threading
import pyttsx3
# # 初始化引擎
# engine = pyttsx3.init()
# voices = engine.getProperty('voices')
# # 尝试设置中文声音,如果没有则默认
# if len(voices) > 0:
# # 这里可以根据实际系统调整索引,Windows 中文通常是 0 或 1
# engine.setProperty('voice', voices[0].id)
# engine.setProperty('rate', 180) # 稍微调快一点,因为是一次性读句子
# engine.setProperty('volume', 1.0)
# =========================
# 1️⃣ Vector store
# =========================
class VectorStore:
    """Persistent semantic store: a FAISS L2 index plus a parallel text list.

    The index is persisted to ``index_path`` and the raw texts to
    ``faiss_texts.npy`` so the knowledge base survives restarts. Supports
    deletion via a full index rebuild (IndexFlatL2 has no remove API here).
    """

    def __init__(self, index_path="faiss.index"):
        # all-MiniLM-L6-v2 emits 384-dimensional embeddings.
        self.model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
        self.dimension = 384
        self.index_path = index_path
        if os.path.exists(index_path):
            self.index = faiss.read_index(index_path)
            self.texts = np.load("faiss_texts.npy", allow_pickle=True).tolist()
        else:
            self.index = faiss.IndexFlatL2(self.dimension)
            self.texts = []

    def save(self):
        """Persist both the FAISS index and the text list to disk."""
        faiss.write_index(self.index, self.index_path)
        np.save("faiss_texts.npy", np.array(self.texts, dtype=object))

    def add(self, text):
        """Embed one text, append it to the index, and persist immediately."""
        embedding = self.model.encode([text])
        self.index.add(np.array(embedding).astype("float32"))
        self.texts.append(text)
        self.save()

    def search(self, query, k=3):
        """Return up to ``k`` stored texts most similar to ``query``.

        Bug fix: FAISS pads missing results with index -1 when the store
        holds fewer than ``k`` entries; the old ``i < len(self.texts)``
        check let -1 through, spuriously returning ``texts[-1]``. The
        lower bound guard drops those padding slots.
        """
        if not self.texts:
            return []
        embedding = self.model.encode([query])
        _, ids = self.index.search(np.array(embedding).astype("float32"), k)
        return [self.texts[i] for i in ids[0] if 0 <= i < len(self.texts)]

    def rebuild_index(self):
        """Re-embed every stored text into a fresh index (used after delete)."""
        self.index = faiss.IndexFlatL2(self.dimension)
        if self.texts:
            embeddings = self.model.encode(self.texts)
            self.index.add(np.array(embeddings).astype("float32"))
        self.save()

    def delete(self, index):
        """Remove the text at position ``index`` and rebuild the FAISS index."""
        if 0 <= index < len(self.texts):
            del self.texts[index]
            self.rebuild_index()
# =========================
# 2️⃣ Memory management
# =========================
class MemoryManager:
    """Two-tier conversation memory: recent turns plus an accumulated archive."""

    def __init__(self, max_short=6):
        self.short_term = []      # recent (user, assistant) tuples
        self.long_term = ""       # flattened transcript of older turns
        self.max_short = max_short

    def add(self, user, assistant):
        """Record one turn; archive the buffer once it exceeds max_short."""
        self.short_term.append((user, assistant))
        if len(self.short_term) > self.max_short:
            self.summarize()

    def summarize(self):
        """Flush short-term turns into long_term (plain concatenation, no LLM)."""
        flushed = "".join(f"用户:{u}\n助手:{a}\n" for u, a in self.short_term)
        self.long_term += flushed
        self.short_term = []

    def get_context(self):
        """Return archive + recent turns as one transcript string."""
        recent = "".join(f"用户:{u}\n助手:{a}\n" for u, a in self.short_term)
        return self.long_term + "\n" + recent
# =========================
# 3️⃣ Anti-hallucination prompt
# =========================
def create_prompt(rag_context, memory_context, question):
    """Build the grounded ChatML prompt from retrieved snippets and memory."""
    lines = [
        "<|im_start|>system",
        "你是一个严谨的智能助手。",
        "规则:",
        "1. 优先使用【知识库内容】回答。",
        "2. 如果知识库没有相关内容,可以使用常识。",
        "3. 不允许编造数据。",
        "4. 不确定就说“我不确定”。",
        "【知识库内容】",
        f"{rag_context}",
        "【历史对话】",
        f"{memory_context}",
        "<|im_end|>",
        "<|im_start|>user",
        f"{question}",
        "<|im_end|>",
        "<|im_start|>assistant",
        "",
    ]
    return "\n".join(lines)
# =========================
# 4️⃣ Load the model (once per process)
# =========================
print("正在加载模型...")
llm = Llama(
    model_path=r"C:\Users\wjn\.lmstudio\models\chaparro2001\Qwen3-4B-Instruct-2507-Q4_K_M-GGUF\qwen3-4b-instruct-2507-q4_k_m.gguf",
    n_ctx=4096,        # context window (tokens)
    n_threads=4,       # CPU worker threads
    n_gpu_layers=-1,   # offload all layers to GPU when possible
    verbose=False      # suppress llama.cpp logging
)
print("模型加载完成!")
# =========================
# 5️⃣ Wire up the modules
# =========================
vector_store = VectorStore()
memory = MemoryManager()
# Seed the knowledge base on the very first run (the store is empty only then).
if len(vector_store.texts) == 0:
    vector_store.add("机器狗通过底部磁吸触点充电,耗时约2小时。")
    vector_store.add("机器狗支持语音控制和自动避障功能。")
def sepeak(content):
    """Speak ``content`` aloud, blocking until playback finishes.

    NOTE(review): the name is a typo for "speak"; kept because callers in
    this file use it. A fresh pyttsx3 engine is created per call — slower,
    but presumably sidesteps pyttsx3's engine-reuse issues — confirm.
    """
    engine = pyttsx3.init()
    voices = engine.getProperty('voices')
    # Voice 0 is usually the system default; adjust the index for a Chinese voice.
    if len(voices) > 0:
        engine.setProperty('voice', voices[0].id)
    engine.setProperty('rate', 180)    # slightly fast: whole sentences are read at once
    engine.setProperty('volume', 1.0)
    engine.say(content)
    engine.runAndWait()                # blocks until speech completes
    engine.stop()
# =========================
# 6️⃣ Inference
# =========================
def run_inference(question):
    """RAG + memory answer for ``question``; streams text, then speaks it."""
    # Retrieve supporting snippets from the knowledge base.
    rag_context = "\n".join(vector_store.search(question))
    # Conversation memory.
    memory_context = memory.get_context()
    prompt = create_prompt(rag_context, memory_context, question)
    stream = llm(
        prompt,
        max_tokens=512,
        temperature=0.7,
        stop=["<|im_end|>"],
        stream=True,
    )
    chunks = []
    for event in stream:
        piece = event["choices"][0]["text"]
        chunks.append(piece)
        print(piece, end="", flush=True)
    print("\n")
    answer = "".join(chunks)
    # Speak only after generation completes.
    sepeak(answer.strip())
    memory.add(question, answer.strip())
    return answer.strip()
import tkinter as tk
from tkinter import messagebox
class KnowledgeGUI:
    """Minimal Tk window for viewing, adding, and deleting knowledge-base entries.

    Operates directly on the shared ``vector_store``; deleting an entry also
    wipes the module-level ``memory`` so answers derived from removed
    knowledge cannot leak back into the chat context.
    """

    def __init__(self, vector_store):
        self.vector_store = vector_store
        self.root = tk.Tk()
        self.root.title("知识库管理")
        self.root.geometry("600x400")
        # Single-line input for a new knowledge snippet.
        self.entry = tk.Entry(self.root, width=60)
        self.entry.pack(pady=10)
        # "Add knowledge" button.
        tk.Button(self.root, text="添加知识", command=self.add_knowledge).pack()
        # List of all stored texts.
        self.listbox = tk.Listbox(self.root, width=80)
        self.listbox.pack(pady=10, fill=tk.BOTH, expand=True)
        # "Delete selected" button.
        tk.Button(self.root, text="删除选中", command=self.delete_selected).pack()
        self.refresh_list()

    def refresh_list(self):
        """Re-populate the listbox from the vector store's current texts."""
        self.listbox.delete(0, tk.END)
        for text in self.vector_store.texts:
            self.listbox.insert(tk.END, text)

    def add_knowledge(self):
        """Embed and persist the entry-box text, then refresh the view."""
        text = self.entry.get().strip()
        if text:
            self.vector_store.add(text)
            self.entry.delete(0, tk.END)
            self.refresh_list()
            messagebox.showinfo("成功", "知识已添加")

    def delete_selected(self):
        """Delete the selected knowledge entry and clear conversation memory.

        Memory is cleared so answers built on the removed entry do not
        survive in subsequent prompts.
        """
        selected = self.listbox.curselection()
        if selected:
            index = selected[0]
            self.vector_store.delete(index)
            memory.short_term = []  # drop recent turns (module-level global)
            memory.long_term = ""   # drop the archived transcript
            self.refresh_list()
            messagebox.showinfo("成功", "知识已删除")

    def run(self):
        """Enter the Tk main loop (blocks the calling thread)."""
        self.root.mainloop()
# =========================
# 7️⃣ Terminal loop (main thread) + GUI (background thread)
# =========================
if __name__ == "__main__":
    print("本地 RAG Agent 启动成功!输入 exit 退出。\n")
    # NOTE(review): Tk is generally only safe on the main thread (especially
    # on macOS); running it in a daemon thread works on Windows but is fragile.
    gui_thread = threading.Thread(
        target=lambda: KnowledgeGUI(vector_store).run(),
        daemon=True
    )
    gui_thread.start()
    # The main thread keeps running the interactive REPL.
    while True:
        q = input("你: ")
        if q.lower() in ["exit", "quit"]:
            print("已退出。")
            break
        print("助手: ", end="")
        run_inference(q)