# ===== Part 1: standalone real-time VAD (voice activity detection) demo =====
import torch
import numpy as np
import sounddevice as sd
import time # 确保导入 time 模块
# ================= Configuration =================
SAMPLE_RATE = 16000 # audio sample rate in Hz
BLOCK_SIZE = 512 # samples per callback block (512 samples = 32 ms @ 16 kHz)
# Hysteresis (double-threshold) configuration
THRESHOLD_HIGH = 0.5 # probability at/above which a frame counts as "voiced"
THRESHOLD_LOW = 0.2 # probability at/below which a frame counts as "silent"
# Debounce configuration
MIN_SPEECH_FRAMES = 3 # consecutive voiced frames (3 * 32 ms = 96 ms) required to enter "speaking"
# End-of-utterance detection
SILENCE_DURATION_MS = 800 # milliseconds of continuous silence that end a sentence
# Local path of the Silero VAD torch.hub checkout
MODEL_PATH = r'C:\Users\wjn\github\xiaozhi-esp32-server\main\xiaozhi-server\models\snakers4_silero-vad'
# Load the Silero VAD model from the local torch.hub repo; the script
# cannot run without it, so exit on any failure.
print("正在加载 Silero VAD 模型...")
try:
    model, utils = torch.hub.load(
        repo_or_dir=MODEL_PATH,
        source="local",  # load from disk only, no network download
        model='silero_vad',
        force_reload=False,
        trust_repo=True
    )
    print("模型加载完成!")
except Exception as e:
    print(f"模型加载失败: {e}")
    exit(1)
print("-" * 30)
print("请开始说话...")
print("-" * 30)
# ================= State variables =================
class VadState:
    """Mutable state for the VAD hysteresis / debounce state machine."""

    def __init__(self):
        # True while we are inside a detected speech segment.
        self.is_speaking = False
        # Debounce counter: consecutive frames classified as voiced.
        self.consecutive_voice_frames = 0
        # Millisecond timestamp (time.time() * 1000) when silence began,
        # or None while not timing silence.
        self.silence_start_time = None


state = VadState()
def audio_callback(indata, frames, time_info, status):
    """sounddevice InputStream callback: run Silero VAD on each audio block.

    The third parameter is named ``time_info`` (not ``time``) so it does not
    shadow the ``time`` module; timestamps come from ``time.time()`` instead.

    Args:
        indata: captured samples, shape (frames, channels) or (frames,);
            mono is extracted either way.
        frames: number of sample frames in this block (unused).
        time_info: sounddevice timing info (unused).
        status: CallbackFlags for over/underruns; deliberately ignored so
            transient warnings never interrupt the stream.
    """
    if status:
        # Deliberately ignore transient status warnings (e.g. input
        # overflow); printing them would only spam the console.
        pass
    try:
        # 1. Preprocessing. The original branched on channel count but both
        #    branches took column 0; collapse that, and accept 1-D mono input
        #    as-is instead of crashing on the 2-D indexing.
        mono = indata[:, 0] if indata.ndim > 1 else indata
        audio_data = mono.astype(np.float32)
        audio_tensor = torch.from_numpy(audio_data)
        # 2. Model inference: probability that this block contains speech.
        with torch.no_grad():
            speech_prob = model(audio_tensor, SAMPLE_RATE).item()
        # 3. Double-threshold (hysteresis) decision.
        if speech_prob >= THRESHOLD_HIGH:
            current_frame_is_voice = True
        elif speech_prob <= THRESHOLD_LOW:
            current_frame_is_voice = False
        else:
            # Between the thresholds: keep the previous state.
            current_frame_is_voice = state.is_speaking
        # 4. State machine. Use time.time() directly rather than the
        #    callback-provided time_info.
        now_ms = time.time() * 1000
        if current_frame_is_voice:
            # --- Current frame is voiced ---
            state.consecutive_voice_frames += 1
            # Enough consecutive voiced frames -> "speech started".
            if not state.is_speaking and state.consecutive_voice_frames >= MIN_SPEECH_FRAMES:
                state.is_speaking = True
                state.silence_start_time = None  # reset the silence timer
                print(f">>> [START] 检测到语音 (概率: {speech_prob:.2f})")
            # Any voiced frame while speaking resets the silence timer.
            if state.is_speaking:
                state.silence_start_time = None
        else:
            # --- Current frame is silent ---
            state.consecutive_voice_frames = 0  # reset the debounce counter
            if state.is_speaking:
                # Transitioning from speech to silence: start timing.
                if state.silence_start_time is None:
                    state.silence_start_time = now_ms
                silence_duration = now_ms - state.silence_start_time
                if silence_duration >= SILENCE_DURATION_MS:
                    # Silence lasted long enough -> "sentence finished".
                    state.is_speaking = False
                    state.silence_start_time = None
                    print(f"<<< [END] 句子结束 (静默 {silence_duration:.0f}ms)")
    except Exception as e:
        # Catch everything so an error never crashes the sounddevice stream.
        print(f"回调内部错误: {e}")
# 5. Start the microphone stream and keep the main thread alive;
#    all detection work happens inside audio_callback.
try:
    with sd.InputStream(
        samplerate=SAMPLE_RATE,
        channels=1,  # mono capture
        blocksize=BLOCK_SIZE,
        dtype=np.float32,
        callback=audio_callback
    ):
        print("监听中... (按 Ctrl+C 停止)")
        while True:
            sd.sleep(1000)  # idle; audio is processed in the callback thread
except KeyboardInterrupt:
    print("\n程序已终止。")
except Exception as e:
    print(f"启动错误: {e}")
    print("提示:请检查麦克风是否被占用或权限设置。")
# ===== Part 2: VAD + ASR + TTS pipeline (inline processing in the callback) =====
import torch
import numpy as np
import sounddevice as sd
import time
import sys
import os
import threading
from funasr import AutoModel
import pyttsx3
# ================= 1. Configuration =================
# --- Audio parameters ---
SAMPLE_RATE = 16000 # Silero and SenseVoice both expect 16 kHz
BLOCK_SIZE = 512 # samples per callback block (~32 ms)
# --- VAD parameters (Silero) ---
THRESHOLD_HIGH = 0.5 # at/above: frame is "voiced"
THRESHOLD_LOW = 0.2 # at/below: frame is "silent"
MIN_SPEECH_FRAMES = 3 # 3 consecutive voiced frames (>96 ms) start an utterance
SILENCE_DURATION_MS = 800 # 800 ms of silence ends an utterance
# --- Paths ---
# Silero VAD model path (local torch.hub checkout)
VAD_MODEL_PATH = r'C:\Users\wjn\github\xiaozhi-esp32-server\main\xiaozhi-server\models\snakers4_silero-vad'
# NOTE: make sure the folder name is correct; it must contain the jit model
# file or hubconf.py (typically a silero_vad checkout).
# FunASR model path
ASR_MODEL_DIR = r"C:\Users\wjn\yx\hello-xiaoxin\models\asr\SenseVoiceSmall"
# ================= 2. Global variables & state =================
# Audio buffer: accumulates the blocks of the sentence currently being spoken
audio_buffer = []
is_recording = False # whether the current sentence is being recorded
# Globals holding the loaded models/engines (populated by load_models)
vad_model = None
vad_utils = None
asr_model = None
tts_engine = None
class VadState:
    """Holds the VAD state machine's mutable fields."""

    def __init__(self):
        self.is_speaking = False  # currently inside a speech segment?
        self.consecutive_voice_frames = 0  # voiced-frame debounce counter
        self.silence_start_time = None  # ms timestamp when silence began


state = VadState()
# ================= 3. Model initialization =================
def load_models():
    """Load the VAD, ASR and TTS components into module-level globals.

    Exits the process if either the Silero VAD or the FunASR model fails
    to load; TTS voice selection is best-effort.
    """
    global vad_model, vad_utils, asr_model, tts_engine
    print("🚀 正在初始化系统模型...")
    # 1. Load Silero VAD
    print(f"👂 加载 VAD 模型: {VAD_MODEL_PATH}")
    try:
        # torch.hub with source="local": repo_or_dir must point at a
        # directory containing hubconf.py (the silero repo layout) or the
        # model file itself.
        model, utils = torch.hub.load(
            repo_or_dir=VAD_MODEL_PATH,
            source="local",
            model='silero_vad',
            force_reload=False,
            trust_repo=True
        )
        vad_model, vad_utils = model, utils
        print("✅ VAD 模型加载成功")
    except Exception as e:
        print(f"❌ VAD 加载失败: {e}")
        print("💡 请检查路径是否包含 silero_vad 的 jit 模型文件或 hubconf.py")
        sys.exit(1)
    # 2. Load FunASR
    print(f"🧠 加载 ASR 模型: {ASR_MODEL_DIR}")
    try:
        asr_model = AutoModel(
            model=ASR_MODEL_DIR,
            vad_model="fsmn-vad",  # FunASR's internal VAD; Silero remains the primary gate
            vad_kwargs={"max_single_segment_time": 30000},
            device="cpu",
            disable_update=True
        )
        print("✅ ASR 模型加载成功")
    except Exception as e:
        print(f"❌ ASR 加载失败: {e}")
        sys.exit(1)
    # 3. Initialize TTS
    print("🗣️ 初始化 TTS 引擎...")
    tts_engine = pyttsx3.init()
    tts_engine.setProperty('rate', 150)  # speaking rate
    tts_engine.setProperty('volume', 1.0)
    # Try to pick a Chinese voice if one is installed on the system.
    voices = tts_engine.getProperty('voices')
    for voice in voices:
        if 'zh' in voice.id.lower() or 'chinese' in voice.name.lower():
            tts_engine.setProperty('voice', voice.id)
            print(f"✅ 已设置中文语音: {voice.name}")
            break
    print("✅ 系统初始化完成!\n")
# ================= 4. Core business logic =================
def process_sentence(audio_data_np):
    """Recognize one finished utterance and speak the result.

    Called by the audio callback when the VAD declares end-of-sentence:
    1. run FunASR (SenseVoice) on the buffered audio;
    2. read the recognized text aloud with pyttsx3 (blocking).

    Args:
        audio_data_np: 1-D float32 numpy array of 16 kHz mono samples.
    """
    # (The original declared `global audio_buffer` but never rebound it;
    # the unused declaration has been removed.)
    if audio_data_np.size == 0:  # nothing buffered, nothing to do
        return
    print(f"\n🔍 正在识别 ({len(audio_data_np) / 16000:.2f}秒音频)...")
    try:
        # FunASR accepts a float32 numpy array; SenseVoice expects 1-D input.
        res = asr_model.generate(input=[audio_data_np], cache={}, batch_size_s=0)
        if res and len(res) > 0:
            text = res[0].get("text", "").strip()
            if not text:
                print("⚠️ 识别结果为空")
                return
            print(f"✨ 识别结果: {text}")
            # Speak the recognized text (blocks until playback finishes).
            print("🔊 正在朗读...")
            tts_engine.say(text)
            tts_engine.runAndWait()
            print("✅ 朗读结束,继续监听...\n")
        else:
            print("⚠️ ASR 未返回有效结果")
    except Exception as e:
        print(f"❌ 识别或朗读出错: {e}")
# ================= 5. Audio callback =================
def audio_callback(indata, frames, time_info, status):
    """Realtime sounddevice callback: VAD gate + sentence buffering.

    Runs Silero VAD on every block, records blocks while speech is active,
    and hands the whole utterance to process_sentence() once enough
    silence has passed.

    NOTE(review): process_sentence() (ASR + TTS) runs inside this callback,
    so the input stream is blocked while a sentence is recognized and
    spoken and sounddevice's buffer may overflow; a production version
    should push the audio onto a queue consumed by a worker thread.
    """
    global audio_buffer, is_recording
    if status:
        # Ignore transient status warnings to keep the stream running.
        pass
    try:
        # 1. Preprocessing. The original branched on channel count but both
        #    branches took column 0; collapse that, and tolerate 1-D input.
        mono = indata[:, 0] if indata.ndim > 1 else indata
        audio_data = mono.astype(np.float32)
        audio_tensor = torch.from_numpy(audio_data)
        # 2. Silero VAD inference: speech probability for this block.
        with torch.no_grad():
            speech_prob = vad_model(audio_tensor, SAMPLE_RATE).item()
        # 3. Hysteresis decision.
        if speech_prob >= THRESHOLD_HIGH:
            current_frame_is_voice = True
        elif speech_prob <= THRESHOLD_LOW:
            current_frame_is_voice = False
        else:
            # Between the thresholds: keep the previous state.
            current_frame_is_voice = state.is_speaking
        # 4. State machine.
        now_ms = time.time() * 1000
        if current_frame_is_voice:
            state.consecutive_voice_frames += 1
            # Debounced "speech started" trigger.
            if not state.is_speaking and state.consecutive_voice_frames >= MIN_SPEECH_FRAMES:
                state.is_speaking = True
                state.silence_start_time = None
                audio_buffer = []  # drop stale audio, start a new recording
                is_recording = True
                print(f">>> [START] 检测到语音 (Prob: {speech_prob:.2f})")
            # While recording, keep every block of the utterance.
            if is_recording:
                audio_buffer.append(audio_data.copy())
            # Any voiced frame resets the silence timer.
            if state.is_speaking:
                state.silence_start_time = None
        else:
            state.consecutive_voice_frames = 0
            if state.is_speaking:
                # Start (or continue) timing the silence.
                if state.silence_start_time is None:
                    state.silence_start_time = now_ms
                silence_duration = now_ms - state.silence_start_time
                # "Sentence finished" trigger.
                if silence_duration >= SILENCE_DURATION_MS:
                    state.is_speaking = False
                    state.silence_start_time = None
                    is_recording = False
                    print(f"<<< [END] 句子结束 (Silence: {silence_duration:.0f}ms)")
                    # --- Key action: process the buffered sentence ---
                    if len(audio_buffer) > 0:
                        full_audio = np.concatenate(audio_buffer)
                        process_sentence(full_audio)
                    else:
                        print("⚠️ 缓冲区为空,跳过识别")
    except Exception as e:
        print(f"❌ 回调错误: {e}")
# ================= 6. Main entry point =================
def main():
    """Initialize all models and run the microphone loop until interrupted."""
    load_models()
    print("-" * 30)
    print("🎙️ 系统已就绪,请开始说话...")
    print(" (按 Ctrl+C 退出)")
    print("-" * 30)
    try:
        # Open the input stream; all work happens in audio_callback.
        with sd.InputStream(
            samplerate=SAMPLE_RATE,
            channels=1,
            blocksize=BLOCK_SIZE,
            dtype=np.float32,
            callback=audio_callback
        ):
            while True:
                sd.sleep(1000)  # keep the main thread alive
    except KeyboardInterrupt:
        print("\n👋 程序已终止。")
    except Exception as e:
        print(f"❌ 启动失败: {e}")
        print("💡 请检查麦克风权限或是否被其他程序占用。")


if __name__ == "__main__":
    main()
# ===== Part 3: threaded VAD + ASR + TTS (known issue: TTS does not work) =====
import torch
import numpy as np
import sounddevice as sd
import time
import threading
import queue
import re
from funasr import AutoModel
import pyttsx3
# ================= 1. Configuration =================
SAMPLE_RATE = 16000  # Hz; expected by both Silero VAD and SenseVoice
BLOCK_SIZE = 512  # samples per callback block (~32 ms)
THRESHOLD_HIGH = 0.5  # voiced threshold (hysteresis upper bound)
THRESHOLD_LOW = 0.2  # silent threshold (hysteresis lower bound)
MIN_SPEECH_FRAMES = 3  # consecutive voiced frames needed to start an utterance
SILENCE_DURATION_MS = 600  # ms of silence that ends an utterance
VAD_MODEL_PATH = r'C:\Users\wjn\github\xiaozhi-esp32-server\main\xiaozhi-server\models\snakers4_silero-vad'
ASR_MODEL_DIR = r"C:\Users\wjn\yx\hello-xiaoxin\models\asr\SenseVoiceSmall"
# ================= 2. Global variables =================
audio_queue = queue.Queue()  # finished utterances: callback -> ASR thread
tts_queue = queue.Queue()  # recognized text: ASR thread -> TTS thread
vad_model = None
asr_model = None
tts_engine = None
audio_buffer = []  # blocks of the sentence currently being recorded
is_recording = False
class VadState:
    """Per-stream voice-activity state (hysteresis + debounce)."""

    def __init__(self):
        # Inside a detected speech segment?
        self.is_speaking = False
        # Consecutive voiced frames seen so far (debounce counter).
        self.consecutive_voice_frames = 0
        # ms timestamp when silence started, or None when not timing.
        self.silence_start_time = None


state = VadState()
# ================= 3. Model initialization =================
def load_models():
    """Load Silero VAD, FunASR and pyttsx3 into module-level globals."""
    global vad_model, asr_model, tts_engine
    print("🚀 初始化模型...")
    # VAD: local torch.hub checkout (no network access)
    model, utils = torch.hub.load(
        repo_or_dir=VAD_MODEL_PATH,
        source="local",
        model='silero_vad',
        force_reload=False,
        trust_repo=True
    )
    vad_model = model  # `utils` is not used in this script
    print("✅ VAD 加载完成")
    # ASR: FunASR SenseVoice with its internal fsmn-vad enabled
    asr_model = AutoModel(
        model=ASR_MODEL_DIR,
        vad_model="fsmn-vad",
        vad_kwargs={"max_single_segment_time": 30000},
        device="cpu",
        disable_update=True
    )
    print("✅ ASR 加载完成")
    # TTS engine.
    # NOTE(review): this engine is created on the main thread but driven
    # from tts_thread via runAndWait() — a known pyttsx3 trouble spot and
    # the likely reason TTS "does not work" here; confirm on your platform.
    tts_engine = pyttsx3.init()
    tts_engine.setProperty('rate', 150)
    tts_engine.setProperty('volume', 1.0)
    print("✅ TTS 初始化完成")
# ================= 4. Text cleanup =================
def clean_text(text):
    """Strip SenseVoice marker tags such as <|zh|> and trim whitespace."""
    untagged = re.sub(r"<\|.*?\|>", "", text)
    return untagged.strip()
# ================= 5. ASR thread =================
def asr_thread():
    """Worker: pull utterance audio from audio_queue, run FunASR, queue text.

    A None item is the shutdown sentinel. Recognized non-empty text is
    forwarded to tts_queue for the TTS worker.
    """
    print("🧠 ASR线程启动")
    while True:
        audio_data = audio_queue.get()  # blocks until an utterance arrives
        if audio_data is None:  # shutdown sentinel from main()
            break
        try:
            print(f"\n🔍 识别中 ({len(audio_data)/16000:.2f}秒)...")
            res = asr_model.generate(
                input=[audio_data],
                cache={},
                batch_size_s=0
            )
            if res and len(res) > 0:
                raw_text = res[0].get("text", "").strip()
                text = clean_text(raw_text)  # strip <|...|> marker tags
                if text:
                    print(f"✨ 识别结果: {text}")
                    tts_queue.put(text)
                else:
                    print("⚠️ 空识别结果")
        except Exception as e:
            print(f"❌ ASR错误: {e}")
        finally:
            audio_queue.task_done()
# ================= 6. TTS thread =================
def tts_thread():
    """Worker: consume recognized text from tts_queue and speak it.

    A fresh pyttsx3 engine is created inside this thread for every
    utterance: driving an engine that was initialized on the main thread
    via runAndWait() from a worker thread is a well-known pyttsx3 failure
    mode (it speaks once or not at all), which is why the original TTS
    stage appeared broken. A None item is the shutdown sentinel.
    """
    print("🔊 TTS线程启动")
    while True:
        text = tts_queue.get()
        if text is None:  # shutdown sentinel from main()
            break
        try:
            print("🔊 朗读中...")
            # Per-utterance engine, created and destroyed in this thread.
            engine = pyttsx3.init()
            engine.setProperty('rate', 150)
            engine.setProperty('volume', 1.0)
            engine.say(text)
            engine.runAndWait()
            engine.stop()
            print("✅ 朗读完成")
        except Exception as e:
            print(f"❌ TTS错误: {e}")
        finally:
            tts_queue.task_done()
# ================= 7. Audio callback =================
def audio_callback(indata, frames, time_info, status):
    """Realtime callback: VAD gate + sentence buffering (non-blocking).

    Unlike the inline variant, finished utterances are pushed onto
    audio_queue so the heavy ASR/TTS work happens in worker threads and
    the audio stream is never blocked.
    """
    global audio_buffer, is_recording
    try:
        # Mono channel as float32 for the model.
        audio_data = indata[:, 0].astype(np.float32)
        audio_tensor = torch.from_numpy(audio_data)
        with torch.no_grad():
            speech_prob = vad_model(audio_tensor, SAMPLE_RATE).item()
        # Hysteresis decision between the two thresholds.
        if speech_prob >= THRESHOLD_HIGH:
            current_voice = True
        elif speech_prob <= THRESHOLD_LOW:
            current_voice = False
        else:
            current_voice = state.is_speaking
        now_ms = time.time() * 1000
        if current_voice:
            state.consecutive_voice_frames += 1
            # Debounced start-of-speech trigger.
            if not state.is_speaking and state.consecutive_voice_frames >= MIN_SPEECH_FRAMES:
                state.is_speaking = True
                state.silence_start_time = None
                audio_buffer.clear()  # start a fresh recording
                is_recording = True
                print(">>> [START]")
            if is_recording:
                audio_buffer.append(audio_data.copy())
            # BUGFIX: resuming speech must reset the silence timer (the other
            # two implementations in this file do this). Without it, a pause
            # followed by more speech keeps the old timestamp and the next
            # silent frame cuts the sentence prematurely.
            if state.is_speaking:
                state.silence_start_time = None
        else:
            state.consecutive_voice_frames = 0
            if state.is_speaking:
                if state.silence_start_time is None:
                    state.silence_start_time = now_ms
                silence_duration = now_ms - state.silence_start_time
                if silence_duration >= SILENCE_DURATION_MS:
                    # End of sentence: hand the buffered audio to the ASR worker.
                    state.is_speaking = False
                    state.silence_start_time = None
                    is_recording = False
                    print("<<< [END]")
                    if len(audio_buffer) > 0:
                        full_audio = np.concatenate(audio_buffer)
                        audio_queue.put(full_audio)
    except Exception as e:
        print(f"❌ 回调异常: {e}")
# ================= 8. Main =================
def main():
    """Load models, start the ASR/TTS workers, and stream the microphone."""
    load_models()
    # Daemon workers: they terminate automatically with the main thread.
    threading.Thread(target=asr_thread, daemon=True).start()
    threading.Thread(target=tts_thread, daemon=True).start()
    print("🎙️ 系统就绪,请说话...")
    try:
        with sd.InputStream(
            samplerate=SAMPLE_RATE,
            channels=1,
            blocksize=BLOCK_SIZE,
            dtype=np.float32,
            callback=audio_callback
        ):
            while True:
                sd.sleep(1000)  # keep the main thread alive
    except KeyboardInterrupt:
        print("\n👋 已退出")
        # Shutdown sentinels so the workers can exit their loops cleanly
        # (only sent on Ctrl+C; other exceptions skip this path).
        audio_queue.put(None)
        tts_queue.put(None)


if __name__ == "__main__":
    main()