Skip to content

Llm

llama.cpp 是一个用 C/C++ 实现的大语言模型(LLM)推理工具。其主要目标是以最少的配置,在各种硬件上实现最先进的 LLM 推理性能。

llama-cpp-python

from llama_cpp import Llama

# 1. Initialize the model (do this only once per process).
# n_gpu_layers:
#   - AMD/Intel on Windows: try -1 (offload everything to GPU) or a specific layer count
#   - if VRAM runs out, llama.cpp falls back to CPU instead of crashing
llm = Llama(
    model_path=r"C:\Users\wjn\.lmstudio\models\chaparro2001\Qwen3-4B-Instruct-2507-Q4_K_M-GGUF\qwen3-4b-instruct-2507-q4_k_m.gguf",
    n_ctx=4096,  # context window size in tokens
    n_threads=4,  # CPU threads used for generation
    n_gpu_layers=-1,  # key setting: try to offload all layers to GPU (Vulkan/DirectML)
    verbose=False  # silence llama.cpp logs for production use
)


# 2. 定义 RAG Prompt 模板 (根据模型调整)
def create_prompt(context, question):
    """Build a ChatML prompt: *context* goes into the system turn as known
    facts, *question* becomes the user turn, and the prompt ends with an
    open assistant turn for the model to complete."""
    system_block = (
        "<|im_start|>system\n"
        "你是一个机器狗助手。请根据以下已知信息回答用户问题。如果不知道,就说不知道。\n"
        f"已知信息:{context}\n"
        "<|im_end|>\n"
    )
    user_block = f"<|im_start|>user\n{question}<|im_end|>\n"
    return system_block + user_block + "<|im_start|>assistant\n"


# 3. 推理函数 (支持流式)
def run_inference(question, context=""):
    """Stream an answer for *question* grounded on *context* from the global
    model, echoing each token to stdout (or a TTS sink) as it arrives, and
    return the full generated text."""
    prompt = create_prompt(context, question)

    # Streaming generation: tokens become available one chunk at a time.
    stream = llm(
        prompt,
        max_tokens=512,
        stop=["<|im_end|>", "user:"],
        stream=True,
        temperature=0.7
    )

    collected = []
    for chunk in stream:
        piece = chunk['choices'][0]['text']
        collected.append(piece)
        print(piece, end="", flush=True)  # live print, or feed straight to TTS

    return "".join(collected)


# Smoke test: one grounded question when run as a script.
if __name__ == "__main__":
    print("模型加载完毕,等待指令...")
    run_inference("机器狗怎么充电?", "机器狗通过底部的磁吸触点充电,耗时约2小时。")
from llama_cpp import Llama

# =============================
# 1. Initialize the model (loaded only once)
# =============================
print("正在加载模型,请稍候...")

llm = Llama(
    model_path=r"C:\Users\wjn\.lmstudio\models\chaparro2001\Qwen3-4B-Instruct-2507-Q4_K_M-GGUF\qwen3-4b-instruct-2507-q4_k_m.gguf",
    n_ctx=4096,  # context window size in tokens
    n_threads=4,  # CPU threads used for generation
    n_gpu_layers=-1,  # try to offload all layers to GPU
    verbose=False  # silence llama.cpp logs
)

print("模型加载完毕,可以开始对话。输入 exit 退出。")


# =============================
# 2. Prompt 模板
# =============================
def create_prompt(context, question):
    """Assemble a ChatML prompt with *context* as system-turn knowledge and
    *question* as the user turn, ending with an open assistant turn."""
    parts = [
        "<|im_start|>system",
        "你是一个机器狗助手。请根据以下已知信息回答用户问题。如果不知道,就说不知道。",
        f"已知信息:{context}",
        "<|im_end|>",
        "<|im_start|>user",
        question,
        "<|im_end|>",
        "<|im_start|>assistant",
        "",  # trailing element yields the final newline after joining
    ]
    return "\n".join(parts)


# =============================
# 3. 推理函数(流式输出)
# =============================
def run_inference(question, context=""):
    """Stream an answer for *question* grounded on *context*, printing each
    token as it arrives, then print a blank line and return the full text."""
    prompt = create_prompt(context, question)

    stream = llm(
        prompt,
        max_tokens=512,
        stop=["<|im_end|>", "<|im_start|>"],
        stream=True,
        temperature=0.7
    )

    collected = []
    for chunk in stream:
        piece = chunk["choices"][0]["text"]
        collected.append(piece)
        print(piece, end="", flush=True)

    print("\n")  # terminate the streamed line
    return "".join(collected)


# =============================
# 4. Terminal REPL loop
# =============================
if __name__ == "__main__":
    context = "机器狗通过底部的磁吸触点充电,耗时约2小时。"  # fixed knowledge snippet passed with every question

    while True:
        user_input = input("你: ")

        if user_input.lower() in ["exit", "quit"]:
            print("已退出。")
            break

        print("助手: ", end="")
        run_inference(user_input, context)

记忆版

from llama_cpp import Llama

# =============================
# 1. Initialize the model (loaded only once)
# =============================
print("正在加载模型,请稍候...")

llm = Llama(
    model_path=r"C:\Users\wjn\.lmstudio\models\chaparro2001\Qwen3-4B-Instruct-2507-Q4_K_M-GGUF\qwen3-4b-instruct-2507-q4_k_m.gguf",
    n_ctx=4096,  # context window size in tokens
    n_threads=4,  # CPU threads used for generation
    n_gpu_layers=-1,  # try to offload all layers to GPU
    verbose=False  # silence llama.cpp logs
)

print("模型加载完毕,可以开始对话。输入 exit 退出。")


# =============================
# 2. Conversation memory containers
# =============================
history = []  # list of (user, assistant) tuples, oldest first
MAX_HISTORY_TURNS = 10   # keep at most 10 turns in the prompt


# =============================
# 3. 构建 Prompt
# =============================
def build_prompt(history, context, new_question):
    """Compose a full ChatML prompt: a system turn carrying *context*, every
    prior (user, assistant) pair from *history*, the *new_question* as the
    final user turn, and an open assistant turn for the model to fill."""
    pieces = [
        "<|im_start|>system\n"
        "你是一个智能助手。\n"
        "如果已知信息中包含答案,请优先使用。\n"
        "如果没有,可以使用你的常识回答。\n"
        f"已知信息:{context}\n"
        "<|im_end|>\n"
    ]

    for user_turn, assistant_turn in history:
        pieces.append(f"<|im_start|>user\n{user_turn}<|im_end|>\n")
        pieces.append(f"<|im_start|>assistant\n{assistant_turn}<|im_end|>\n")

    pieces.append(f"<|im_start|>user\n{new_question}<|im_end|>\n")
    pieces.append("<|im_start|>assistant\n")

    return "".join(pieces)


# =============================
# 4. 推理函数(流式)
# =============================
def run_inference(question, context=""):
    """Stream an answer built from global *history* plus *context*, record the
    completed turn into the global history, and cap the history length."""
    global history

    prompt = build_prompt(history, context, question)

    stream = llm(
        prompt,
        max_tokens=512,
        stop=["<|im_end|>", "<|im_start|>"],
        stream=True,
        temperature=0.7
    )

    collected = []
    for chunk in stream:
        piece = chunk["choices"][0]["text"]
        collected.append(piece)
        print(piece, end="", flush=True)

    print("\n")

    full_response = "".join(collected)

    # Remember this turn for the next prompt.
    history.append((question, full_response.strip()))

    # Keep only the most recent turns so the prompt stays within the context window.
    if len(history) > MAX_HISTORY_TURNS:
        history = history[-MAX_HISTORY_TURNS:]

    return full_response


# =============================
# 5. Terminal REPL loop
# =============================
if __name__ == "__main__":

    context = "机器狗通过底部的磁吸触点充电,耗时约2小时。"  # fixed knowledge snippet passed with every question

    while True:
        user_input = input("你: ")

        if user_input.lower() in ["exit", "quit"]:
            print("已退出。")
            break

        print("助手: ", end="")
        run_inference(user_input, context)

🧠 带向量数据库 ⚡ 支持长期对话记忆 🎯 防幻觉优化版 Prompt

安装依赖:`uv add faiss-cpu sentence-transformers`

import os
import faiss
import numpy as np
from llama_cpp import Llama
from sentence_transformers import SentenceTransformer


# =========================
# 1️⃣ 向量数据库
# =========================
class VectorStore:
    """Persistent FAISS-backed store of text snippets with embedding search.

    The FAISS index and the parallel text list are written to disk on every
    add, so the knowledge base survives process restarts.
    """

    def __init__(self, index_path="faiss.index", texts_path="faiss_texts.npy"):
        # all-MiniLM-L6-v2 emits 384-dim embeddings; `dimension` must match it.
        self.model = SentenceTransformer("all-MiniLM-L6-v2")
        self.dimension = 384
        self.index_path = index_path
        # Previously hard-coded; parameterized (same default) so several
        # stores can coexist without clobbering each other's text file.
        self.texts_path = texts_path

        if os.path.exists(index_path):
            self.index = faiss.read_index(index_path)
            self.texts = np.load(self.texts_path, allow_pickle=True).tolist()
        else:
            self.index = faiss.IndexFlatL2(self.dimension)
            self.texts = []

    def save(self):
        """Persist the FAISS index and the text list to disk."""
        faiss.write_index(self.index, self.index_path)
        np.save(self.texts_path, np.array(self.texts, dtype=object))

    def add(self, text):
        """Embed *text*, append it to the index/list, and persist immediately."""
        embedding = self.model.encode([text])
        self.index.add(np.array(embedding).astype("float32"))
        self.texts.append(text)
        self.save()

    def search(self, query, k=3):
        """Return up to *k* stored texts nearest to *query* by L2 distance."""
        if not self.texts:
            return []

        embedding = self.model.encode([query])
        _, ids = self.index.search(np.array(embedding).astype("float32"), k)
        # FAISS pads the result with -1 ids when fewer than k vectors exist.
        # The previous `i < len(self.texts)` check let -1 through, which
        # silently returned the *last* stored text; restrict to valid range.
        return [self.texts[i] for i in ids[0] if 0 <= i < len(self.texts)]


# =========================
# 2️⃣ 记忆管理
# =========================
class MemoryManager:
    """Two-tier conversation memory.

    Recent (user, assistant) turns live in ``short_term``; once more than
    ``max_short`` turns accumulate, they are flattened into the
    ``long_term`` text blob.
    """

    def __init__(self, max_short=6):
        self.short_term = []  # list of (user, assistant) tuples
        self.long_term = ""   # flattened text of older turns
        self.max_short = max_short

    @staticmethod
    def _render(turns):
        # Flatten (user, assistant) pairs into the prompt's textual form.
        return "".join(f"用户:{u}\n助手:{a}\n" for u, a in turns)

    def add(self, user, assistant):
        """Record one completed turn; fold into long-term memory when over capacity."""
        self.short_term.append((user, assistant))
        if len(self.short_term) > self.max_short:
            self.summarize()

    def summarize(self):
        """Move every short-term turn into the long-term text blob."""
        self.long_term += self._render(self.short_term)
        self.short_term = []

    def get_context(self):
        """Return long-term text plus recent turns, separated by a newline."""
        return self.long_term + "\n" + self._render(self.short_term)


# =========================
# 3️⃣ 防幻觉 Prompt
# =========================
def create_prompt(rag_context, memory_context, question):
    """Build an anti-hallucination ChatML prompt: retrieved knowledge and
    conversation history are embedded in the system turn, *question* is the
    user turn, and the prompt ends with an open assistant turn."""
    system_block = (
        "<|im_start|>system\n"
        "你是一个严谨的智能助手。\n"
        "\n"
        "规则:\n"
        "1. 优先使用【知识库内容】回答。\n"
        "2. 如果知识库没有相关内容,可以使用常识。\n"
        "3. 不允许编造数据。\n"
        "4. 不确定就说“我不确定”。\n"
        "\n"
        "【知识库内容】\n"
        f"{rag_context}\n"
        "\n"
        "【历史对话】\n"
        f"{memory_context}\n"
        "<|im_end|>\n"
    )
    return system_block + f"<|im_start|>user\n{question}\n<|im_end|>\n<|im_start|>assistant\n"


# =========================
# 4️⃣ Load the model (only once)
# =========================
print("正在加载模型...")

llm = Llama(
    model_path=r"C:\Users\wjn\.lmstudio\models\chaparro2001\Qwen3-4B-Instruct-2507-Q4_K_M-GGUF\qwen3-4b-instruct-2507-q4_k_m.gguf",
    n_ctx=4096,  # context window size in tokens
    n_threads=4,  # CPU threads used for generation
    n_gpu_layers=-1,  # try to offload all layers to GPU
    verbose=False  # silence llama.cpp logs
)

print("模型加载完成!")


# =========================
# 5️⃣ Initialize modules
# =========================
vector_store = VectorStore()
memory = MemoryManager()

# Seed the knowledge base on the very first run (nothing persisted yet).
if len(vector_store.texts) == 0:
    vector_store.add("机器狗通过底部磁吸触点充电,耗时约2小时。")
    vector_store.add("机器狗支持语音控制和自动避障功能。")


# =========================
# 6️⃣ 推理函数
# =========================
def run_inference(question):
    """Answer *question* with RAG retrieval plus conversation memory:
    stream tokens to stdout, save the turn into memory, and return the
    stripped answer text."""
    # Retrieve the nearest knowledge-base snippets for grounding.
    knowledge = "\n".join(vector_store.search(question))

    prompt = create_prompt(knowledge, memory.get_context(), question)

    stream = llm(
        prompt,
        max_tokens=512,
        temperature=0.7,
        stop=["<|im_end|>"],
        stream=True
    )

    collected = []
    for chunk in stream:
        piece = chunk["choices"][0]["text"]
        collected.append(piece)
        print(piece, end="", flush=True)

    print("\n")

    answer = "".join(collected).strip()
    memory.add(question, answer)
    return answer


# =========================
# 7️⃣ Terminal loop
# =========================
if __name__ == "__main__":
    print("本地 RAG Agent 启动成功!输入 exit 退出。\n")

    while True:
        q = input("你: ")

        if q.lower() in ["exit", "quit"]:
            print("已退出。")
            break

        print("助手: ", end="")
        run_inference(q)

llm & tts

import os
import faiss
import numpy as np
from llama_cpp import Llama
from sentence_transformers import SentenceTransformer

import pyttsx3

# # 初始化引擎
# engine = pyttsx3.init()
# voices = engine.getProperty('voices')
# # 尝试设置中文声音,如果没有则默认
# if len(voices) > 0:
#     # 这里可以根据实际系统调整索引,Windows 中文通常是 0 或 1
#     engine.setProperty('voice', voices[0].id)
# engine.setProperty('rate', 180) # 稍微调快一点,因为是一次性读句子
# engine.setProperty('volume', 1.0)


# =========================
# 1️⃣ 向量数据库
# =========================
class VectorStore:
    """Persistent FAISS-backed store of text snippets with embedding search.

    The FAISS index and the parallel text list are written to disk on every
    add, so the knowledge base survives process restarts.
    """

    def __init__(self, index_path="faiss.index", texts_path="faiss_texts.npy"):
        # all-MiniLM-L6-v2 emits 384-dim embeddings; `dimension` must match it.
        self.model = SentenceTransformer("all-MiniLM-L6-v2")
        self.dimension = 384
        self.index_path = index_path
        # Previously hard-coded; parameterized (same default) so several
        # stores can coexist without clobbering each other's text file.
        self.texts_path = texts_path

        if os.path.exists(index_path):
            self.index = faiss.read_index(index_path)
            self.texts = np.load(self.texts_path, allow_pickle=True).tolist()
        else:
            self.index = faiss.IndexFlatL2(self.dimension)
            self.texts = []

    def save(self):
        """Persist the FAISS index and the text list to disk."""
        faiss.write_index(self.index, self.index_path)
        np.save(self.texts_path, np.array(self.texts, dtype=object))

    def add(self, text):
        """Embed *text*, append it to the index/list, and persist immediately."""
        embedding = self.model.encode([text])
        self.index.add(np.array(embedding).astype("float32"))
        self.texts.append(text)
        self.save()

    def search(self, query, k=3):
        """Return up to *k* stored texts nearest to *query* by L2 distance."""
        if not self.texts:
            return []

        embedding = self.model.encode([query])
        _, ids = self.index.search(np.array(embedding).astype("float32"), k)
        # FAISS pads the result with -1 ids when fewer than k vectors exist.
        # The previous `i < len(self.texts)` check let -1 through, which
        # silently returned the *last* stored text; restrict to valid range.
        return [self.texts[i] for i in ids[0] if 0 <= i < len(self.texts)]


# =========================
# 2️⃣ 记忆管理
# =========================
class MemoryManager:
    """Two-tier conversation memory.

    Recent (user, assistant) turns live in ``short_term``; once more than
    ``max_short`` turns accumulate, they are flattened into the
    ``long_term`` text blob.
    """

    def __init__(self, max_short=6):
        self.short_term = []  # list of (user, assistant) tuples
        self.long_term = ""   # flattened text of older turns
        self.max_short = max_short

    @staticmethod
    def _render(turns):
        # Flatten (user, assistant) pairs into the prompt's textual form.
        return "".join(f"用户:{u}\n助手:{a}\n" for u, a in turns)

    def add(self, user, assistant):
        """Record one completed turn; fold into long-term memory when over capacity."""
        self.short_term.append((user, assistant))
        if len(self.short_term) > self.max_short:
            self.summarize()

    def summarize(self):
        """Move every short-term turn into the long-term text blob."""
        self.long_term += self._render(self.short_term)
        self.short_term = []

    def get_context(self):
        """Return long-term text plus recent turns, separated by a newline."""
        return self.long_term + "\n" + self._render(self.short_term)


# =========================
# 3️⃣ 防幻觉 Prompt
# =========================
def create_prompt(rag_context, memory_context, question):
    """Build an anti-hallucination ChatML prompt: retrieved knowledge and
    conversation history are embedded in the system turn, *question* is the
    user turn, and the prompt ends with an open assistant turn."""
    system_block = (
        "<|im_start|>system\n"
        "你是一个严谨的智能助手。\n"
        "\n"
        "规则:\n"
        "1. 优先使用【知识库内容】回答。\n"
        "2. 如果知识库没有相关内容,可以使用常识。\n"
        "3. 不允许编造数据。\n"
        "4. 不确定就说“我不确定”。\n"
        "\n"
        "【知识库内容】\n"
        f"{rag_context}\n"
        "\n"
        "【历史对话】\n"
        f"{memory_context}\n"
        "<|im_end|>\n"
    )
    return system_block + f"<|im_start|>user\n{question}\n<|im_end|>\n<|im_start|>assistant\n"


# =========================
# 4️⃣ Load the model (only once)
# =========================
print("正在加载模型...")

llm = Llama(
    model_path=r"C:\Users\wjn\.lmstudio\models\chaparro2001\Qwen3-4B-Instruct-2507-Q4_K_M-GGUF\qwen3-4b-instruct-2507-q4_k_m.gguf",
    n_ctx=4096,  # context window size in tokens
    n_threads=4,  # CPU threads used for generation
    n_gpu_layers=-1,  # try to offload all layers to GPU
    verbose=False  # silence llama.cpp logs
)

print("模型加载完成!")


# =========================
# 5️⃣ Initialize modules
# =========================
vector_store = VectorStore()
memory = MemoryManager()

# Seed the knowledge base on the very first run (nothing persisted yet).
if len(vector_store.texts) == 0:
    vector_store.add("机器狗通过底部磁吸触点充电,耗时约2小时。")
    vector_store.add("机器狗支持语音控制和自动避障功能。")


def sepeak(content):
    """Read *content* aloud via pyttsx3, creating a fresh engine per call.

    NOTE(review): the name looks like a typo of "speak"; kept unchanged
    because callers in this file invoke `sepeak`.
    """
    engine = pyttsx3.init()
    installed = engine.getProperty('voices')

    if installed:
        # First installed voice; presumably the Chinese voice on Windows — TODO confirm.
        engine.setProperty('voice', installed[0].id)

    engine.setProperty('rate', 180)   # slightly faster, whole sentences are read at once
    engine.setProperty('volume', 1.0)

    engine.say(content)
    engine.runAndWait()
    engine.stop()

# =========================
# 6️⃣ 推理函数
# =========================
def run_inference(question):
    """Answer *question* with RAG retrieval plus conversation memory:
    stream tokens to stdout, speak the finished reply aloud, save the turn
    into memory, and return the stripped answer text."""
    # Retrieve the nearest knowledge-base snippets for grounding.
    knowledge = "\n".join(vector_store.search(question))

    prompt = create_prompt(knowledge, memory.get_context(), question)

    stream = llm(
        prompt,
        max_tokens=512,
        temperature=0.7,
        stop=["<|im_end|>"],
        stream=True
    )

    collected = []
    for chunk in stream:
        piece = chunk["choices"][0]["text"]
        collected.append(piece)
        print(piece, end="", flush=True)

    print("\n")

    answer = "".join(collected).strip()

    # Speak only after the full reply has been generated.
    sepeak(answer)

    memory.add(question, answer)
    return answer




# =========================
# 7️⃣ Terminal loop
# =========================
if __name__ == "__main__":
    print("本地 RAG Agent 启动成功!输入 exit 退出。\n")

    while True:
        q = input("你: ")

        if q.lower() in ["exit", "quit"]:
            print("已退出。")
            break

        print("助手: ", end="")
        run_inference(q)

gui

import os
import faiss
import numpy as np
from llama_cpp import Llama
from sentence_transformers import SentenceTransformer
import threading


import pyttsx3

# # 初始化引擎
# engine = pyttsx3.init()
# voices = engine.getProperty('voices')
# # 尝试设置中文声音,如果没有则默认
# if len(voices) > 0:
#     # 这里可以根据实际系统调整索引,Windows 中文通常是 0 或 1
#     engine.setProperty('voice', voices[0].id)
# engine.setProperty('rate', 180) # 稍微调快一点,因为是一次性读句子
# engine.setProperty('volume', 1.0)


# =========================
# 1️⃣ 向量数据库
# =========================
class VectorStore:
    """Persistent FAISS-backed store of text snippets with embedding search,
    plus delete/rebuild support for the management GUI.

    The FAISS index and the parallel text list are written to disk on every
    mutation, so the knowledge base survives process restarts.
    """

    def __init__(self, index_path="faiss.index", texts_path="faiss_texts.npy"):
        # all-MiniLM-L6-v2 emits 384-dim embeddings; `dimension` must match it.
        self.model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
        self.dimension = 384
        self.index_path = index_path
        # Previously hard-coded; parameterized (same default) so several
        # stores can coexist without clobbering each other's text file.
        self.texts_path = texts_path

        if os.path.exists(index_path):
            self.index = faiss.read_index(index_path)
            self.texts = np.load(self.texts_path, allow_pickle=True).tolist()
        else:
            self.index = faiss.IndexFlatL2(self.dimension)
            self.texts = []

    def save(self):
        """Persist the FAISS index and the text list to disk."""
        faiss.write_index(self.index, self.index_path)
        np.save(self.texts_path, np.array(self.texts, dtype=object))

    def add(self, text):
        """Embed *text*, append it to the index/list, and persist immediately."""
        embedding = self.model.encode([text])
        self.index.add(np.array(embedding).astype("float32"))
        self.texts.append(text)
        self.save()

    def search(self, query, k=3):
        """Return up to *k* stored texts nearest to *query* by L2 distance."""
        if not self.texts:
            return []

        embedding = self.model.encode([query])
        _, ids = self.index.search(np.array(embedding).astype("float32"), k)
        # FAISS pads the result with -1 ids when fewer than k vectors exist.
        # The previous `i < len(self.texts)` check let -1 through, which
        # silently returned the *last* stored text; restrict to valid range.
        return [self.texts[i] for i in ids[0] if 0 <= i < len(self.texts)]

    def rebuild_index(self):
        """Re-embed every stored text into a fresh index (used after delete)."""
        self.index = faiss.IndexFlatL2(self.dimension)
        if self.texts:
            embeddings = self.model.encode(self.texts)
            self.index.add(np.array(embeddings).astype("float32"))
        self.save()

    def delete(self, index):
        """Remove the text at *index* (no-op when out of range) and rebuild."""
        if 0 <= index < len(self.texts):
            del self.texts[index]
            self.rebuild_index()


# =========================
# 2️⃣ 记忆管理
# =========================
class MemoryManager:
    """Two-tier conversation memory.

    Recent (user, assistant) turns live in ``short_term``; once more than
    ``max_short`` turns accumulate, they are flattened into the
    ``long_term`` text blob.
    """

    def __init__(self, max_short=6):
        self.short_term = []  # list of (user, assistant) tuples
        self.long_term = ""   # flattened text of older turns
        self.max_short = max_short

    @staticmethod
    def _render(turns):
        # Flatten (user, assistant) pairs into the prompt's textual form.
        return "".join(f"用户:{u}\n助手:{a}\n" for u, a in turns)

    def add(self, user, assistant):
        """Record one completed turn; fold into long-term memory when over capacity."""
        self.short_term.append((user, assistant))
        if len(self.short_term) > self.max_short:
            self.summarize()

    def summarize(self):
        """Move every short-term turn into the long-term text blob."""
        self.long_term += self._render(self.short_term)
        self.short_term = []

    def get_context(self):
        """Return long-term text plus recent turns, separated by a newline."""
        return self.long_term + "\n" + self._render(self.short_term)


# =========================
# 3️⃣ 防幻觉 Prompt
# =========================
def create_prompt(rag_context, memory_context, question):
    """Build an anti-hallucination ChatML prompt: retrieved knowledge and
    conversation history are embedded in the system turn, *question* is the
    user turn, and the prompt ends with an open assistant turn."""
    system_block = (
        "<|im_start|>system\n"
        "你是一个严谨的智能助手。\n"
        "\n"
        "规则:\n"
        "1. 优先使用【知识库内容】回答。\n"
        "2. 如果知识库没有相关内容,可以使用常识。\n"
        "3. 不允许编造数据。\n"
        "4. 不确定就说“我不确定”。\n"
        "\n"
        "【知识库内容】\n"
        f"{rag_context}\n"
        "\n"
        "【历史对话】\n"
        f"{memory_context}\n"
        "<|im_end|>\n"
    )
    return system_block + f"<|im_start|>user\n{question}\n<|im_end|>\n<|im_start|>assistant\n"


# =========================
# 4️⃣ Load the model (only once)
# =========================
print("正在加载模型...")

llm = Llama(
    model_path=r"C:\Users\wjn\.lmstudio\models\chaparro2001\Qwen3-4B-Instruct-2507-Q4_K_M-GGUF\qwen3-4b-instruct-2507-q4_k_m.gguf",
    n_ctx=4096,  # context window size in tokens
    n_threads=4,  # CPU threads used for generation
    n_gpu_layers=-1,  # try to offload all layers to GPU
    verbose=False  # silence llama.cpp logs
)

print("模型加载完成!")


# =========================
# 5️⃣ Initialize modules
# =========================
vector_store = VectorStore()
memory = MemoryManager()

# Seed the knowledge base on the very first run (nothing persisted yet).
if len(vector_store.texts) == 0:
    vector_store.add("机器狗通过底部磁吸触点充电,耗时约2小时。")
    vector_store.add("机器狗支持语音控制和自动避障功能。")


def sepeak(content):
    """Read *content* aloud via pyttsx3, creating a fresh engine per call.

    NOTE(review): the name looks like a typo of "speak"; kept unchanged
    because callers in this file invoke `sepeak`.
    """
    engine = pyttsx3.init()
    installed = engine.getProperty('voices')

    if installed:
        # First installed voice; presumably the Chinese voice on Windows — TODO confirm.
        engine.setProperty('voice', installed[0].id)

    engine.setProperty('rate', 180)   # slightly faster, whole sentences are read at once
    engine.setProperty('volume', 1.0)

    engine.say(content)
    engine.runAndWait()
    engine.stop()

# =========================
# 6️⃣ 推理函数
# =========================
def run_inference(question):
    """Answer *question* with RAG retrieval plus conversation memory:
    stream tokens to stdout, speak the finished reply aloud, save the turn
    into memory, and return the stripped answer text."""
    # Retrieve the nearest knowledge-base snippets for grounding.
    knowledge = "\n".join(vector_store.search(question))

    prompt = create_prompt(knowledge, memory.get_context(), question)

    stream = llm(
        prompt,
        max_tokens=512,
        temperature=0.7,
        stop=["<|im_end|>"],
        stream=True
    )

    collected = []
    for chunk in stream:
        piece = chunk["choices"][0]["text"]
        collected.append(piece)
        print(piece, end="", flush=True)

    print("\n")

    answer = "".join(collected).strip()

    # Speak only after the full reply has been generated.
    sepeak(answer)

    memory.add(question, answer)
    return answer


import tkinter as tk
from tkinter import messagebox


class KnowledgeGUI:
    """Minimal Tk window for browsing, adding, and deleting knowledge-base entries."""

    def __init__(self, vector_store):
        self.vector_store = vector_store

        self.root = tk.Tk()
        self.root.title("知识库管理")
        self.root.geometry("600x400")

        # Single-line input for a new knowledge snippet.
        self.entry = tk.Entry(self.root, width=60)
        self.entry.pack(pady=10)

        tk.Button(self.root, text="添加知识", command=self.add_knowledge).pack()

        # One stored text per row.
        self.listbox = tk.Listbox(self.root, width=80)
        self.listbox.pack(pady=10, fill=tk.BOTH, expand=True)

        tk.Button(self.root, text="删除选中", command=self.delete_selected).pack()

        self.refresh_list()

    def refresh_list(self):
        """Repopulate the listbox from the store's current texts."""
        self.listbox.delete(0, tk.END)
        for stored_text in self.vector_store.texts:
            self.listbox.insert(tk.END, stored_text)

    def add_knowledge(self):
        """Add the entry field's (stripped) text to the store; no-op when blank."""
        new_text = self.entry.get().strip()
        if not new_text:
            return
        self.vector_store.add(new_text)
        self.entry.delete(0, tk.END)
        self.refresh_list()
        messagebox.showinfo("成功", "知识已添加")

    def delete_selected(self):
        """Delete the highlighted entry and wipe conversation memory so
        answers based on the removed knowledge are not replayed."""
        picked = self.listbox.curselection()
        if not picked:
            return
        self.vector_store.delete(picked[0])
        memory.short_term = []  # clear short-term memory (module global)
        memory.long_term = ""   # clear long-term memory (module global)
        self.refresh_list()
        messagebox.showinfo("成功", "知识已删除")

    def run(self):
        """Enter the Tk main loop (blocks the calling thread)."""
        self.root.mainloop()

# =========================
# 7️⃣ Terminal loop
# =========================
if __name__ == "__main__":
    print("本地 RAG Agent 启动成功!输入 exit 退出。\n")

    # Run the knowledge-base GUI on a daemon thread so the REPL keeps the main thread.
    # NOTE(review): Tk is usually expected to run on the main thread — confirm this is stable on the target OS.
    gui_thread = threading.Thread(
        target=lambda: KnowledgeGUI(vector_store).run(),
        daemon=True
    )
    gui_thread.start()

    # Main thread continues with the terminal loop.

    while True:
        q = input("你: ")

        if q.lower() in ["exit", "quit"]:
            print("已退出。")
            break

        print("助手: ", end="")
        run_inference(q)
☁️ 部署建议
如果你打算长期运行项目(博客 / API / 自动化脚本),建议直接用云服务器,会比本地稳定很多。
👉 查看云服务器(新用户优惠)