feat: upgrade voice pipeline — MLX Whisper STT (20x faster), Qwen3.5 MoE LLM, fix HA tool calling

- Replace faster-whisper with wyoming-mlx-whisper (whisper-large-v3-turbo, MLX Metal GPU)
  STT latency: 8.4s → 400ms for short voice commands
- Add Qwen3.5-35B-A3B (MoE, 3B active params, Q8_0) to Ollama — 26.7 tok/s vs 5.4 tok/s (70B)
- Add model preload launchd service to pin voice model in VRAM permanently
- Fix HA tool calling: set commands.native=true, symlink ha-ctl to PATH
- Add pipeline benchmark script (STT/LLM/TTS latency profiling)
- Add service restart buttons and STT endpoint to dashboard
- Bind Vite dev server to 0.0.0.0 for LAN access

Total estimated pipeline latency: ~27s → ~4s

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Aodhan Collins
2026-03-13 18:03:12 +00:00
parent 1bfd7fbd08
commit af6b7bd945
10 changed files with 721 additions and 27 deletions

View File

@@ -34,6 +34,7 @@ import wave
import io
from wyoming.client import AsyncTcpClient
from wyoming.tts import Synthesize, SynthesizeVoice
from wyoming.asr import Transcribe, Transcript
from wyoming.audio import AudioStart, AudioChunk, AudioStop
from wyoming.info import Info
@@ -79,6 +80,11 @@ class OpenClawBridgeHandler(BaseHTTPRequestHandler):
self._handle_tts_request()
return
# Handle STT requests
if parsed_path.path == "/api/stt":
self._handle_stt_request()
return
# Only handle the agent message endpoint
if parsed_path.path == "/api/agent/message":
self._handle_agent_request()
@@ -170,6 +176,70 @@ class OpenClawBridgeHandler(BaseHTTPRequestHandler):
return wav_io.getvalue()
def _handle_stt_request(self):
"""Handle STT request — accept WAV audio, return transcribed text."""
content_length = int(self.headers.get("Content-Length", 0))
if content_length == 0:
self._send_json_response(400, {"error": "Empty body"})
return
try:
audio_bytes = self.rfile.read(content_length)
# Parse WAV to get PCM data and format
wav_io = io.BytesIO(audio_bytes)
with wave.open(wav_io, 'rb') as wav_file:
rate = wav_file.getframerate()
width = wav_file.getsampwidth()
channels = wav_file.getnchannels()
pcm_data = wav_file.readframes(wav_file.getnframes())
# Run the async Wyoming client
text = asyncio.run(self._transcribe_audio(pcm_data, rate, width, channels))
self.send_response(200)
self.send_header("Content-Type", "application/json")
self.send_header("Access-Control-Allow-Origin", "*")
self.end_headers()
self.wfile.write(json.dumps({"text": text}).encode())
except wave.Error as e:
self._send_json_response(400, {"error": f"Invalid WAV: {e}"})
except Exception as e:
self._send_json_response(500, {"error": str(e)})
async def _transcribe_audio(self, pcm_data: bytes, rate: int, width: int, channels: int) -> str:
"""Connect to Wyoming STT server and transcribe audio."""
client = AsyncTcpClient("127.0.0.1", 10300)
await client.connect()
# Send Transcribe request (STT server does not send an initial Info event)
await client.write_event(Transcribe(language="en").event())
# Send audio
await client.write_event(AudioStart(rate=rate, width=width, channels=channels).event())
# Send in chunks (1 second each)
bytes_per_second = rate * width * channels
for offset in range(0, len(pcm_data), bytes_per_second):
chunk = pcm_data[offset:offset + bytes_per_second]
await client.write_event(AudioChunk(rate=rate, width=width, channels=channels, audio=chunk).event())
await client.write_event(AudioStop().event())
# Read transcript
while True:
event = await client.read_event()
if event is None:
break
if Transcript.is_type(event.type):
transcript = Transcript.from_event(event)
await client.disconnect()
return transcript.text
await client.disconnect()
return ""
def _handle_wake_word(self):
"""Handle wake word detection notification."""
content_length = int(self.headers.get("Content-Length", 0))