feat: upgrade voice pipeline — MLX Whisper STT (20x faster), Qwen3.5 MoE LLM, fix HA tool calling
- Replace faster-whisper with wyoming-mlx-whisper (whisper-large-v3-turbo, MLX Metal GPU);
  STT latency: 8.4s → 400ms for short voice commands
- Add Qwen3.5-35B-A3B (MoE, 3B active params, Q8_0) to Ollama — 26.7 tok/s vs 5.4 tok/s (70B)
- Add model preload launchd service to pin voice model in VRAM permanently
- Fix HA tool calling: set commands.native=true, symlink ha-ctl to PATH
- Add pipeline benchmark script (STT/LLM/TTS latency profiling)
- Add service restart buttons and STT endpoint to dashboard
- Bind Vite dev server to 0.0.0.0 for LAN access

Total estimated pipeline latency: ~27s → ~4s

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -34,6 +34,7 @@ import wave
|
||||
import io
|
||||
from wyoming.client import AsyncTcpClient
|
||||
from wyoming.tts import Synthesize, SynthesizeVoice
|
||||
from wyoming.asr import Transcribe, Transcript
|
||||
from wyoming.audio import AudioStart, AudioChunk, AudioStop
|
||||
from wyoming.info import Info
|
||||
|
||||
@@ -79,6 +80,11 @@ class OpenClawBridgeHandler(BaseHTTPRequestHandler):
|
||||
self._handle_tts_request()
|
||||
return
|
||||
|
||||
# Handle STT requests
|
||||
if parsed_path.path == "/api/stt":
|
||||
self._handle_stt_request()
|
||||
return
|
||||
|
||||
# Only handle the agent message endpoint
|
||||
if parsed_path.path == "/api/agent/message":
|
||||
self._handle_agent_request()
|
||||
@@ -170,6 +176,70 @@ class OpenClawBridgeHandler(BaseHTTPRequestHandler):
|
||||
|
||||
return wav_io.getvalue()
|
||||
|
||||
def _handle_stt_request(self):
    """Handle an STT request: accept WAV audio in the body, reply with JSON text.

    Reads ``Content-Length`` bytes from the request body, parses them as a
    WAV container, passes the PCM frames and their format to
    ``self._transcribe_audio`` and writes ``{"text": ...}`` as the response.

    Responses:
        200 -- ``{"text": <transcript>}`` with a permissive CORS header.
        400 -- invalid ``Content-Length``, empty body, or unparsable WAV.
        500 -- any other failure; the message is returned in ``{"error": ...}``.
    """
    # A malformed header must yield a 400 rather than an unhandled ValueError.
    try:
        content_length = int(self.headers.get("Content-Length", 0))
    except (TypeError, ValueError):
        self._send_json_response(400, {"error": "Invalid Content-Length"})
        return

    # Negative lengths are rejected as well: rfile.read() with a negative
    # size reads until EOF, which can block on a live connection.
    if content_length <= 0:
        self._send_json_response(400, {"error": "Empty body"})
        return

    try:
        audio_bytes = self.rfile.read(content_length)

        # Parse WAV to get PCM data and format
        try:
            with wave.open(io.BytesIO(audio_bytes), "rb") as wav_file:
                rate = wav_file.getframerate()
                width = wav_file.getsampwidth()
                channels = wav_file.getnchannels()
                pcm_data = wav_file.readframes(wav_file.getnframes())
        except EOFError:
            # The wave module signals a truncated header/chunk with EOFError,
            # not wave.Error; it is still a client-side problem.
            self._send_json_response(400, {"error": "Invalid WAV: truncated file"})
            return

        # Run the async Wyoming client to completion on a fresh event loop.
        text = asyncio.run(self._transcribe_audio(pcm_data, rate, width, channels))
        payload = json.dumps({"text": text}).encode()
    except wave.Error as e:
        self._send_json_response(400, {"error": f"Invalid WAV: {e}"})
        return
    except Exception as e:
        self._send_json_response(500, {"error": str(e)})
        return

    # Written outside the try block so a failure while sending cannot
    # trigger a second (500) response on top of a partially sent one.
    self.send_response(200)
    self.send_header("Content-Type", "application/json")
    self.send_header("Access-Control-Allow-Origin", "*")
    self.end_headers()
    self.wfile.write(payload)
|
||||
|
||||
async def _transcribe_audio(self, pcm_data: bytes, rate: int, width: int, channels: int) -> str:
    """Connect to the Wyoming STT server and transcribe raw PCM audio.

    Opens a TCP connection to 127.0.0.1:10300, sends a ``Transcribe``
    request followed by the audio as ``AudioStart`` / ``AudioChunk`` /
    ``AudioStop`` events, then reads events until a ``Transcript`` arrives.

    Args:
        pcm_data: Raw PCM frames (no WAV header).
        rate: Sample rate passed through to the audio events.
        width: Sample width in bytes passed through to the audio events.
        channels: Channel count passed through to the audio events.

    Returns:
        The transcript text, or ``""`` if the server closes the stream
        (``read_event`` returns ``None``) before sending a transcript.

    Raises:
        ValueError: If ``rate * width * channels`` is not positive.
    """
    # Validate before connecting: a zero product would otherwise surface as
    # an opaque "range() arg 3 must not be zero" in the chunking loop.
    bytes_per_second = rate * width * channels
    if bytes_per_second <= 0:
        raise ValueError(
            f"Invalid audio format: rate={rate}, width={width}, channels={channels}"
        )

    client = AsyncTcpClient("127.0.0.1", 10300)
    await client.connect()

    # try/finally guarantees the connection is closed even when a write or
    # read fails part-way through the exchange.
    try:
        # Send Transcribe request (STT server does not send an initial Info event)
        await client.write_event(Transcribe(language="en").event())

        # Send audio
        await client.write_event(AudioStart(rate=rate, width=width, channels=channels).event())

        # Send in chunks (1 second each)
        for offset in range(0, len(pcm_data), bytes_per_second):
            chunk = pcm_data[offset:offset + bytes_per_second]
            await client.write_event(
                AudioChunk(rate=rate, width=width, channels=channels, audio=chunk).event()
            )

        await client.write_event(AudioStop().event())

        # Read events until the transcript arrives or the stream ends.
        while True:
            event = await client.read_event()
            if event is None:
                return ""
            if Transcript.is_type(event.type):
                return Transcript.from_event(event).text
    finally:
        await client.disconnect()
|
||||
|
||||
def _handle_wake_word(self):
|
||||
"""Handle wake word detection notification."""
|
||||
content_length = int(self.headers.get("Content-Length", 0))
|
||||
|
||||
Reference in New Issue
Block a user