# Voice-interaction prototypes — three iterations of the same script.
#
# Version 1 — standalone VAD (voice activity detection) demo.

import torch
import numpy as np
import sounddevice as sd
import time  # 确保导入 time 模块

# ================= Configuration =================
SAMPLE_RATE = 16000  # sampling rate (Hz); Silero VAD supports 8/16 kHz
BLOCK_SIZE = 512  # samples per callback: 512 samples = 32 ms @ 16 kHz (Silero's expected streaming chunk)

# Hysteresis (dual-threshold) configuration
THRESHOLD_HIGH = 0.5  # probability at or above this => frame is voiced
THRESHOLD_LOW = 0.2  # probability at or below this => frame is silent; in between, keep previous decision

# Debounce configuration
MIN_SPEECH_FRAMES = 3  # consecutive voiced frames (3 * 32 ms = 96 ms) required to declare speech start

# End-of-utterance configuration
SILENCE_DURATION_MS = 800  # continuous silence (ms) after which the utterance is considered finished

# Local Silero VAD checkout; must contain hubconf.py for torch.hub local loading
MODEL_PATH = r'C:\Users\wjn\github\xiaozhi-esp32-server\main\xiaozhi-server\models\snakers4_silero-vad'

# Load the Silero VAD model from the local checkout via torch.hub.
# source="local" means repo_or_dir must point at a directory containing hubconf.py.
print("正在加载 Silero VAD 模型...")
try:
    model, utils = torch.hub.load(
        repo_or_dir=MODEL_PATH,
        source="local",
        model='silero_vad',
        force_reload=False,
        trust_repo=True  # suppress the interactive "trust this repo?" prompt
    )
    print("模型加载完成!")
except Exception as e:
    # Nothing downstream can work without the model, so abort immediately.
    print(f"模型加载失败: {e}")
    exit(1)

print("-" * 30)
print("请开始说话...")
print("-" * 30)


# ================= State variables =================
class VadState:
    """Shared VAD state: hysteresis decision, debounce counter, silence timer."""

    def __init__(self):
        # Timestamp (ms, time.time()-based) when the current silence run began;
        # None whenever voice is being heard.
        self.silence_start_time = None
        # Debounce counter: voiced frames seen in a row.
        self.consecutive_voice_frames = 0
        # True while an utterance is in progress.
        self.is_speaking = False


state = VadState()


def audio_callback(indata, frames, time_info, status):
    """sounddevice callback: run Silero VAD on each block and drive a
    start/end-of-utterance state machine.

    The third parameter is named ``time_info`` (not ``time``) so it does not
    shadow the ``time`` module; timestamps come from ``time.time()`` rather
    than the stream-provided clock.

    Args:
        indata: float32 ndarray of shape (frames, channels) from the mic.
        frames: number of frames in this block (== BLOCK_SIZE).
        time_info: stream timing info (unused).
        status: CallbackFlags for over-/underruns (deliberately ignored).
    """
    if status:
        # Over-/underrun warnings are non-fatal; ignore so the stream keeps running.
        pass

    try:
        # 1. Preprocessing: always take the first channel. This is correct for
        #    both mono (the only channel) and multi-channel input (left
        #    channel), so the original branch on channel count was redundant —
        #    both arms did the same thing.
        audio_data = indata[:, 0].astype(np.float32)

        audio_tensor = torch.from_numpy(audio_data)

        # 2. Model inference: Silero returns a speech probability in [0, 1].
        with torch.no_grad():
            speech_prob = model(audio_tensor, SAMPLE_RATE).item()

        # 3. Hysteresis: two thresholds avoid flapping around a single cutoff.
        if speech_prob >= THRESHOLD_HIGH:
            current_frame_is_voice = True
        elif speech_prob <= THRESHOLD_LOW:
            current_frame_is_voice = False
        else:
            # Between the thresholds: keep the previous frame's decision.
            current_frame_is_voice = state.is_speaking

        # 4. State machine, timed with wall-clock milliseconds
        #    (do not rely on the callback's time_info).
        now_ms = time.time() * 1000

        if current_frame_is_voice:
            # --- Voiced frame ---
            state.consecutive_voice_frames += 1

            # Debounced "speech started": require MIN_SPEECH_FRAMES in a row.
            if not state.is_speaking and state.consecutive_voice_frames >= MIN_SPEECH_FRAMES:
                state.is_speaking = True
                state.silence_start_time = None  # reset silence timer
                print(f">>> [START] 检测到语音 (概率: {speech_prob:.2f})")

            # Any voiced frame while speaking resets the silence timer.
            if state.is_speaking:
                state.silence_start_time = None

        else:
            # --- Silent frame ---
            state.consecutive_voice_frames = 0  # reset debounce counter

            if state.is_speaking:
                # Transition speaking -> silent: start (or continue) timing.
                if state.silence_start_time is None:
                    state.silence_start_time = now_ms

                silence_duration = now_ms - state.silence_start_time

                if silence_duration >= SILENCE_DURATION_MS:
                    # Long enough silence: the utterance is over.
                    state.is_speaking = False
                    state.silence_start_time = None
                    print(f"<<< [END] 句子结束 (静默 {silence_duration:.0f}ms)")

    except Exception as e:
        # Catch everything so an exception never kills the sounddevice stream.
        print(f"回调内部错误: {e}")


# 5. Open the microphone stream and block until Ctrl+C;
#    all processing happens inside audio_callback.
try:
    with sd.InputStream(
            samplerate=SAMPLE_RATE,
            channels=1,
            blocksize=BLOCK_SIZE,
            dtype=np.float32,
            callback=audio_callback
    ):
        print("监听中... (按 Ctrl+C 停止)")
        while True:
            # Keep the main thread alive; audio is handled in the callback thread.
            sd.sleep(1000)
except KeyboardInterrupt:
    print("\n程序已终止。")
except Exception as e:
    print(f"启动错误: {e}")
    print("提示:请检查麦克风是否被占用或权限设置。")

# Version 2 — VAD + ASR + TTS in one pipeline (recognition and speech run inline in the audio callback).

import torch
import numpy as np
import sounddevice as sd
import time
import sys
import os
import threading
from funasr import AutoModel
import pyttsx3

# ================= 1. Configuration =================

# --- Audio parameters ---
SAMPLE_RATE = 16000  # 16 kHz is the rate both Silero VAD and SenseVoice expect
BLOCK_SIZE = 512  # samples per callback block (~32 ms @ 16 kHz)

# --- VAD parameters (Silero) ---
THRESHOLD_HIGH = 0.5  # probability >= this => voiced frame
THRESHOLD_LOW = 0.2  # probability <= this => silent frame
MIN_SPEECH_FRAMES = 3  # 3 consecutive voiced frames (>96 ms) => speech started
SILENCE_DURATION_MS = 800  # 800 ms of continuous silence => utterance finished

# --- Paths ---
# Local Silero VAD model path (directory must contain hubconf.py / the jit model)
VAD_MODEL_PATH = r'C:\Users\wjn\github\xiaozhi-esp32-server\main\xiaozhi-server\models\snakers4_silero-vad'
# NOTE(review): folder name here is snakers4_silero-vad; verify it matches the
# actual checkout (often silero_vad, or a folder containing model.jit).

# FunASR (SenseVoice) model path
ASR_MODEL_DIR = r"C:\Users\wjn\yx\hello-xiaoxin\models\asr\SenseVoiceSmall"

# ================= 2. Global variables and state =================

# Audio buffer: accumulates the float32 blocks of the current utterance.
audio_buffer = []
is_recording = False  # True while capturing the current utterance

# Model/engine singletons, populated by load_models().
vad_model = None
vad_utils = None
asr_model = None
tts_engine = None


class VadState:
    """Per-stream VAD state (hysteresis decision, debounce counter, silence timer)."""

    def __init__(self):
        # Timestamp (ms) when the current silence run began; None while voiced.
        self.silence_start_time = None
        # Consecutive voiced frames seen so far (debounce).
        self.consecutive_voice_frames = 0
        # Whether an utterance is currently in progress.
        self.is_speaking = False


state = VadState()


# ================= 3. Model initialization =================

def load_models():
    """Load the VAD, ASR and TTS engines into the module-level globals.

    Exits the process if VAD or ASR fail to load, since nothing downstream
    can work without them. TTS setup is best-effort (no error handling).
    """
    global vad_model, vad_utils, asr_model, tts_engine

    print("🚀 正在初始化系统模型...")

    # 1. Silero VAD
    print(f"👂 加载 VAD 模型: {VAD_MODEL_PATH}")
    try:
        # torch.hub with source="local": repo_or_dir must point at a directory
        # containing hubconf.py (the official silero-vad repo layout), which
        # exposes the 'silero_vad' entry point.
        model, utils = torch.hub.load(
            repo_or_dir=VAD_MODEL_PATH,
            source="local",
            model='silero_vad',
            force_reload=False,
            trust_repo=True
        )
        vad_model, vad_utils = model, utils
        print("✅ VAD 模型加载成功")
    except Exception as e:
        print(f"❌ VAD 加载失败: {e}")
        print("💡 请检查路径是否包含 silero_vad 的 jit 模型文件或 hubconf.py")
        sys.exit(1)

    # 2. FunASR (SenseVoice)
    print(f"🧠 加载 ASR 模型: {ASR_MODEL_DIR}")
    try:
        asr_model = AutoModel(
            model=ASR_MODEL_DIR,
            vad_model="fsmn-vad",  # FunASR's internal VAD, kept as a secondary check; Silero does the real gating
            vad_kwargs={"max_single_segment_time": 30000},
            device="cpu",
            disable_update=True
        )
        print("✅ ASR 模型加载成功")
    except Exception as e:
        print(f"❌ ASR 加载失败: {e}")
        sys.exit(1)

    # 3. TTS (pyttsx3, offline)
    print("🗣️  初始化 TTS 引擎...")
    tts_engine = pyttsx3.init()
    tts_engine.setProperty('rate', 150)  # speaking rate (words per minute)
    tts_engine.setProperty('volume', 1.0)

    # Prefer a Chinese voice if one is installed.
    voices = tts_engine.getProperty('voices')
    for voice in voices:
        if 'zh' in voice.id.lower() or 'chinese' in voice.name.lower():
            tts_engine.setProperty('voice', voice.id)
            print(f"✅ 已设置中文语音: {voice.name}")
            break
    print("✅ 系统初始化完成!\n")


# ================= 4. Core business logic =================

def process_sentence(audio_data_np):
    """Handle one finished utterance: run ASR, then speak the result.

    Called from the audio callback once VAD decides the sentence ended.

    Args:
        audio_data_np: 1-D float32 numpy array with the utterance audio,
            sampled at SAMPLE_RATE.
    """
    # (The original declared `global audio_buffer` but never used it — removed.)

    # Nothing to do for an empty utterance (e.g. the buffer was cleared).
    if len(audio_data_np) == 0:
        return

    # Report duration using the configured rate instead of a magic 16000,
    # so a SAMPLE_RATE change cannot silently desynchronize this message.
    print(f"\n🔍 正在识别 ({len(audio_data_np) / SAMPLE_RATE:.2f}秒音频)...")

    try:
        # FunASR accepts a 1-D float32 numpy array per input item.
        res = asr_model.generate(input=[audio_data_np], cache={}, batch_size_s=0)

        if res and len(res) > 0:
            text = res[0].get("text", "").strip()
            if not text:
                print("⚠️ 识别结果为空")
                return

            print(f"✨ 识别结果: {text}")

            # Speak the recognized text (blocks until playback finishes).
            print("🔊 正在朗读...")
            tts_engine.say(text)
            tts_engine.runAndWait()
            print("✅ 朗读结束,继续监听...\n")
        else:
            print("⚠️ ASR 未返回有效结果")

    except Exception as e:
        print(f"❌ 识别或朗读出错: {e}")


# ================= 5. Audio callback =================

def audio_callback(indata, frames, time_info, status):
    """sounddevice real-time callback: VAD gating plus utterance buffering.

    On a debounced speech start it begins buffering blocks; on end-of-
    utterance it concatenates the buffer and hands it to process_sentence().

    Args:
        indata: float32 ndarray (frames, channels) from the microphone.
        frames: frames in this block (== BLOCK_SIZE).
        time_info: stream timing info (unused; we use time.time()).
        status: CallbackFlags (warnings are non-fatal and ignored).
    """
    global audio_buffer, is_recording

    if status:
        # Ignore over-/underrun warnings; the stream keeps running.
        pass

    try:
        # 1. Preprocessing: take the first channel. Correct for both mono and
        #    multi-channel input, so the original identical if/else branch on
        #    channel count was redundant and is removed.
        audio_data = indata[:, 0].astype(np.float32)

        audio_tensor = torch.from_numpy(audio_data)

        # 2. Silero VAD inference: a direct call yields a speech probability.
        with torch.no_grad():
            speech_prob = vad_model(audio_tensor, SAMPLE_RATE).item()

        # 3. Hysteresis: two thresholds avoid flapping around one cutoff.
        if speech_prob >= THRESHOLD_HIGH:
            current_frame_is_voice = True
        elif speech_prob <= THRESHOLD_LOW:
            current_frame_is_voice = False
        else:
            # Between the thresholds: keep the previous decision.
            current_frame_is_voice = state.is_speaking

        # 4. State machine, timed with wall-clock milliseconds.
        now_ms = time.time() * 1000

        if current_frame_is_voice:
            state.consecutive_voice_frames += 1

            # Debounced "speech started" transition.
            if not state.is_speaking and state.consecutive_voice_frames >= MIN_SPEECH_FRAMES:
                state.is_speaking = True
                state.silence_start_time = None
                audio_buffer = []  # drop stale audio, start a new recording
                is_recording = True
                print(f">>> [START] 检测到语音 (Prob: {speech_prob:.2f})")

            # While recording, keep the block for later ASR.
            if is_recording:
                audio_buffer.append(audio_data.copy())

            # Any voiced frame resets the silence timer.
            if state.is_speaking:
                state.silence_start_time = None

        else:
            state.consecutive_voice_frames = 0

            if state.is_speaking:
                # Start timing silence on the speaking -> silent transition.
                if state.silence_start_time is None:
                    state.silence_start_time = now_ms

                silence_duration = now_ms - state.silence_start_time

                # "Utterance finished" transition.
                if silence_duration >= SILENCE_DURATION_MS:
                    state.is_speaking = False
                    state.silence_start_time = None
                    is_recording = False
                    print(f"<<< [END] 句子结束 (Silence: {silence_duration:.0f}ms)")

                    # Hand the finished utterance to ASR+TTS.
                    # NOTE(review): process_sentence blocks this callback until
                    # recognition and playback finish, so the input stream's
                    # internal buffer may overrun during long replies. A
                    # production version should push onto a queue consumed by
                    # a worker thread instead of calling it inline here.
                    if len(audio_buffer) > 0:
                        full_audio = np.concatenate(audio_buffer)
                        process_sentence(full_audio)
                    else:
                        print("⚠️ 缓冲区为空,跳过识别")

    except Exception as e:
        print(f"❌ 回调错误: {e}")


# ================= 6. Main entry point =================

def main():
    """Load models, open the microphone stream, and loop until Ctrl+C."""
    load_models()

    print("-" * 30)
    print("🎙️  系统已就绪,请开始说话...")
    print("   (按 Ctrl+C 退出)")
    print("-" * 30)

    try:
        # Open the input stream; all work happens inside audio_callback.
        with sd.InputStream(
                samplerate=SAMPLE_RATE,
                channels=1,
                blocksize=BLOCK_SIZE,
                dtype=np.float32,
                callback=audio_callback
        ):
            while True:
                sd.sleep(1000)  # keep the main thread alive
    except KeyboardInterrupt:
        print("\n👋 程序已终止。")
    except Exception as e:
        print(f"❌ 启动失败: {e}")
        print("💡 请检查麦克风权限或是否被其他程序占用。")


if __name__ == "__main__":
    main()

# Version 3 — threaded pipeline (queues + worker threads). Known issue: TTS does not work — pyttsx3 fails when driven from a background thread.

import torch
import numpy as np
import sounddevice as sd
import time
import threading
import queue
import re
from funasr import AutoModel
import pyttsx3

# ================= 1. Configuration =================

SAMPLE_RATE = 16000  # Hz; expected by both Silero VAD and SenseVoice
BLOCK_SIZE = 512  # samples per callback block (~32 ms @ 16 kHz)

THRESHOLD_HIGH = 0.5  # VAD probability >= this => voiced frame
THRESHOLD_LOW = 0.2  # VAD probability <= this => silent frame
MIN_SPEECH_FRAMES = 3  # consecutive voiced frames required to start an utterance
SILENCE_DURATION_MS = 600  # continuous silence (ms) that ends an utterance

VAD_MODEL_PATH = r'C:\Users\wjn\github\xiaozhi-esp32-server\main\xiaozhi-server\models\snakers4_silero-vad'
ASR_MODEL_DIR = r"C:\Users\wjn\yx\hello-xiaoxin\models\asr\SenseVoiceSmall"

# ================= 2. Global variables =================

# Hand-off queues between the audio callback and the worker threads.
audio_queue = queue.Queue()  # finished utterances (float32 arrays) -> ASR thread
tts_queue = queue.Queue()  # recognized text -> TTS thread

# Engine singletons, populated by load_models().
vad_model = None
asr_model = None
tts_engine = None

# Blocks of the utterance currently being captured.
audio_buffer = []
is_recording = False  # True while capturing the current utterance


class VadState:
    """Mutable VAD state shared with the audio callback."""

    def __init__(self):
        # ms timestamp of when the current silence began (None while voiced).
        self.silence_start_time = None
        # Debounce counter of consecutive voiced frames.
        self.consecutive_voice_frames = 0
        # True while an utterance is being captured.
        self.is_speaking = False


state = VadState()

# ================= 3. Model initialization =================

def load_models():
    """Load VAD, ASR and TTS into the module-level globals.

    Unlike the earlier variants this does not catch load errors; any failure
    propagates and aborts startup.
    """
    global vad_model, asr_model, tts_engine

    print("🚀 初始化模型...")

    # Silero VAD via torch.hub local loading (directory must contain hubconf.py).
    model, utils = torch.hub.load(
        repo_or_dir=VAD_MODEL_PATH,
        source="local",
        model='silero_vad',
        force_reload=False,
        trust_repo=True
    )
    vad_model = model
    print("✅ VAD 加载完成")

    # FunASR / SenseVoice on CPU.
    asr_model = AutoModel(
        model=ASR_MODEL_DIR,
        vad_model="fsmn-vad",  # FunASR's internal VAD as a secondary pass
        vad_kwargs={"max_single_segment_time": 30000},
        device="cpu",
        disable_update=True
    )
    print("✅ ASR 加载完成")

    # Offline TTS. NOTE(review): this engine is created on the main thread but
    # used from tts_thread; pyttsx3 event loops are thread-bound on some
    # backends (notably Windows SAPI5) — the likely cause of "TTS not working".
    tts_engine = pyttsx3.init()
    tts_engine.setProperty('rate', 150)
    tts_engine.setProperty('volume', 1.0)
    print("✅ TTS 初始化完成")


# ================= 4. Text cleanup =================

def clean_text(text):
    """Strip SenseVoice marker tags such as <|zh|> / <|NEUTRAL|> and trim whitespace."""
    return re.sub(r"<\|.*?\|>", "", text).strip()


# ================= 5. ASR worker thread =================

def asr_thread():
    """Consume utterance audio from audio_queue, recognize it, and push the
    cleaned text onto tts_queue.

    A ``None`` item is the shutdown sentinel.
    """
    print("🧠 ASR线程启动")

    while True:
        audio_data = audio_queue.get()

        if audio_data is None:
            # Account for the sentinel too, so a future queue.join() cannot hang.
            audio_queue.task_done()
            break

        try:
            # Report duration via the configured rate, not a magic 16000.
            print(f"\n🔍 识别中 ({len(audio_data)/SAMPLE_RATE:.2f}秒)...")

            res = asr_model.generate(
                input=[audio_data],
                cache={},
                batch_size_s=0
            )

            if res and len(res) > 0:
                raw_text = res[0].get("text", "").strip()
                text = clean_text(raw_text)

                if text:
                    print(f"✨ 识别结果: {text}")
                    tts_queue.put(text)
                else:
                    print("⚠️ 空识别结果")

        except Exception as e:
            print(f"❌ ASR错误: {e}")

        finally:
            audio_queue.task_done()


# ================= 6. TTS worker thread =================

def tts_thread():
    """Consume recognized text from tts_queue and speak it.

    Fix for "TTS doesn't work": pyttsx3's event loop is bound to the thread
    (and, on Windows SAPI5, the COM apartment) it was initialized in, so the
    engine created on the main thread in load_models() cannot reliably run
    runAndWait() from this worker thread. Instead, create a fresh engine
    inside this thread for each utterance and dispose of it afterwards; this
    also sidesteps the well-known "runAndWait() only works once" pyttsx3 bug.

    A ``None`` item is the shutdown sentinel.
    """
    print("🔊 TTS线程启动")

    while True:
        text = tts_queue.get()

        if text is None:
            # Account for the sentinel so a future queue.join() cannot hang.
            tts_queue.task_done()
            break

        try:
            print("🔊 朗读中...")
            # Per-utterance engine, created in THIS thread.
            engine = pyttsx3.init()
            engine.setProperty('rate', 150)
            engine.setProperty('volume', 1.0)
            engine.say(text)
            engine.runAndWait()
            engine.stop()
            print("✅ 朗读完成")

        except Exception as e:
            print(f"❌ TTS错误: {e}")

        finally:
            tts_queue.task_done()


# ================= 7. Audio callback =================

def audio_callback(indata, frames, time_info, status):
    """sounddevice callback: VAD gating + buffering. Finished utterances are
    pushed onto audio_queue for the ASR worker thread, so this callback never
    blocks on recognition or playback.

    Args:
        indata: float32 ndarray (frames, channels) from the microphone.
        frames: frames in this block (== BLOCK_SIZE).
        time_info: stream timing info (unused; we use time.time()).
        status: CallbackFlags (ignored).
    """
    global audio_buffer, is_recording

    try:
        # First channel only; valid for mono and multi-channel input alike.
        audio_data = indata[:, 0].astype(np.float32)
        audio_tensor = torch.from_numpy(audio_data)

        with torch.no_grad():
            speech_prob = vad_model(audio_tensor, SAMPLE_RATE).item()

        # Hysteresis: between the thresholds, keep the previous decision.
        if speech_prob >= THRESHOLD_HIGH:
            current_voice = True
        elif speech_prob <= THRESHOLD_LOW:
            current_voice = False
        else:
            current_voice = state.is_speaking

        now_ms = time.time() * 1000

        if current_voice:
            state.consecutive_voice_frames += 1

            if not state.is_speaking and state.consecutive_voice_frames >= MIN_SPEECH_FRAMES:
                state.is_speaking = True
                state.silence_start_time = None
                audio_buffer.clear()
                is_recording = True
                print(">>> [START]")

            if is_recording:
                audio_buffer.append(audio_data.copy())

            # BUGFIX: a voiced frame must reset the silence timer (as the
            # earlier script versions do). Without this, a short mid-sentence
            # pause leaves the old timestamp in place, and the NEXT pause is
            # measured from it — cutting the sentence off prematurely.
            if state.is_speaking:
                state.silence_start_time = None

        else:
            state.consecutive_voice_frames = 0

            if state.is_speaking:
                # Start timing silence on the speaking -> silent transition.
                if state.silence_start_time is None:
                    state.silence_start_time = now_ms

                silence_duration = now_ms - state.silence_start_time

                if silence_duration >= SILENCE_DURATION_MS:
                    state.is_speaking = False
                    state.silence_start_time = None
                    is_recording = False
                    print("<<< [END]")

                    # Hand off to the ASR thread; keeps the callback fast.
                    if len(audio_buffer) > 0:
                        full_audio = np.concatenate(audio_buffer)
                        audio_queue.put(full_audio)

    except Exception as e:
        print(f"❌ 回调异常: {e}")


# ================= 8. Main entry =================

def main():
    """Load models, start the ASR/TTS worker threads, and run the mic stream."""
    load_models()

    # Daemon threads: they die together with the main thread on exit.
    threading.Thread(target=asr_thread, daemon=True).start()
    threading.Thread(target=tts_thread, daemon=True).start()

    print("🎙️ 系统就绪,请说话...")

    try:
        with sd.InputStream(
            samplerate=SAMPLE_RATE,
            channels=1,
            blocksize=BLOCK_SIZE,
            dtype=np.float32,
            callback=audio_callback
        ):
            while True:
                sd.sleep(1000)

    except KeyboardInterrupt:
        print("\n👋 已退出")
        # Wake the workers with shutdown sentinels (best-effort; being
        # daemons, they terminate with the process anyway).
        audio_queue.put(None)
        tts_queue.put(None)


if __name__ == "__main__":
    main()
# Deployment note: for long-running use (blog / API / automation scripts), a
# dedicated or cloud server is considerably more stable than a local desktop
# session that may sleep, lose the microphone, or be interrupted.