feat: upgrade voice pipeline — MLX Whisper STT (20x faster), Qwen3.5 MoE LLM, fix HA tool calling

- Replace faster-whisper with wyoming-mlx-whisper (whisper-large-v3-turbo, MLX Metal GPU)
  STT latency: 8.4s → 400ms for short voice commands
- Add Qwen3.5-35B-A3B (MoE, 3B active params, Q8_0) to Ollama — 26.7 tok/s vs 5.4 tok/s (70B)
- Add model preload launchd service to pin voice model in VRAM permanently
- Fix HA tool calling: set commands.native=true, symlink ha-ctl to PATH
- Add pipeline benchmark script (STT/LLM/TTS latency profiling)
- Add service restart buttons and STT endpoint to dashboard
- Bind Vite dev server to 0.0.0.0 for LAN access

Total estimated pipeline latency: ~27s → ~4s

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-13 18:03:12 +00:00
parent 1bfd7fbd08
commit af6b7bd945
10 changed files with 721 additions and 27 deletions
--- a/homeai-agent/openclaw-http-bridge.py
+++ b/homeai-agent/openclaw-http-bridge.py
@@ -34,6 +34,7 @@ import wave
 import io
 from wyoming.client import AsyncTcpClient
 from wyoming.tts import Synthesize, SynthesizeVoice
+from wyoming.asr import Transcribe, Transcript
 from wyoming.audio import AudioStart, AudioChunk, AudioStop
 from wyoming.info import Info

@@ -79,6 +80,11 @@ class OpenClawBridgeHandler(BaseHTTPRequestHandler):
            self._handle_tts_request()
            return

+        # Handle STT requests
+        if parsed_path.path == "/api/stt":
+            self._handle_stt_request()
+            return
+
        # Only handle the agent message endpoint
        if parsed_path.path == "/api/agent/message":
            self._handle_agent_request()
@@ -170,6 +176,70 @@ class OpenClawBridgeHandler(BaseHTTPRequestHandler):
            
        return wav_io.getvalue()

+    def _handle_stt_request(self):
+        """Handle an STT request: accept WAV audio, return the transcribed text.
+
+        Reads ``Content-Length`` bytes from the request body, parses them as
+        a WAV file, and passes the PCM frames to ``_transcribe_audio``.
+
+        Responses:
+            200: ``{"text": <transcript>}`` with a permissive CORS header.
+            400: missing, empty or malformed ``Content-Length``, or a body
+                that is not a readable WAV file.
+            500: any other failure while reading the body or transcribing.
+        """
+        # A malformed header must produce a 400, not an unhandled ValueError.
+        try:
+            content_length = int(self.headers.get("Content-Length", 0))
+        except ValueError:
+            self._send_json_response(400, {"error": "Invalid Content-Length"})
+            return
+
+        # Reject negative lengths too: rfile.read() with a negative size
+        # would block until the client closes the connection.
+        if content_length <= 0:
+            self._send_json_response(400, {"error": "Empty body"})
+            return
+
+        try:
+            audio_bytes = self.rfile.read(content_length)
+
+            # Parse WAV to get PCM data and format
+            with wave.open(io.BytesIO(audio_bytes), 'rb') as wav_file:
+                rate = wav_file.getframerate()
+                width = wav_file.getsampwidth()
+                channels = wav_file.getnchannels()
+                pcm_data = wav_file.readframes(wav_file.getnframes())
+        except (wave.Error, EOFError) as e:
+            # wave raises EOFError (with an empty message) on truncated input.
+            self._send_json_response(400, {"error": f"Invalid WAV: {str(e) or 'truncated data'}"})
+            return
+        except Exception as e:
+            self._send_json_response(500, {"error": str(e)})
+            return
+
+        # Transcription gets its own try block so that stream errors from the
+        # STT connection are never misreported as an invalid WAV upload.
+        try:
+            text = asyncio.run(self._transcribe_audio(pcm_data, rate, width, channels))
+        except Exception as e:
+            self._send_json_response(500, {"error": str(e)})
+            return
+
+        # Written outside the try blocks: once the 200 status line has been
+        # sent, a write failure must not trigger a second (error) response.
+        self.send_response(200)
+        self.send_header("Content-Type", "application/json")
+        self.send_header("Access-Control-Allow-Origin", "*")
+        self.end_headers()
+        self.wfile.write(json.dumps({"text": text}).encode())
+
+    async def _transcribe_audio(self, pcm_data: bytes, rate: int, width: int, channels: int) -> str:
+        """Send PCM audio to the Wyoming STT server and return the transcript.
+
+        Connects to 127.0.0.1:10300, writes a ``Transcribe`` request followed
+        by ``AudioStart``, one ``AudioChunk`` per second of audio and
+        ``AudioStop``, then reads events until a ``Transcript`` arrives.
+
+        Args:
+            pcm_data: Raw PCM frames to transcribe.
+            rate: Sample rate of ``pcm_data``.
+            width: Sample width in bytes.
+            channels: Number of audio channels.
+
+        Returns:
+            The transcript text, or ``""`` if the event stream ends
+            (``read_event`` returns ``None``) before a transcript is received.
+
+        Raises:
+            ValueError: If ``rate``, ``width`` or ``channels`` is not positive.
+        """
+        # Validate before connecting: a zero chunk size would otherwise fail
+        # inside range() with an obscure message after the socket is open.
+        bytes_per_second = rate * width * channels
+        if bytes_per_second <= 0:
+            raise ValueError(
+                f"Invalid audio format: rate={rate}, width={width}, channels={channels}"
+            )
+
+        client = AsyncTcpClient("127.0.0.1", 10300)
+        await client.connect()
+
+        # try/finally guarantees the connection is closed even when a write
+        # or read fails part-way through the exchange.
+        try:
+            # Send Transcribe request (STT server does not send an initial Info event)
+            await client.write_event(Transcribe(language="en").event())
+
+            # Send audio
+            await client.write_event(AudioStart(rate=rate, width=width, channels=channels).event())
+
+            # Send in chunks (1 second each)
+            for offset in range(0, len(pcm_data), bytes_per_second):
+                chunk = pcm_data[offset:offset + bytes_per_second]
+                await client.write_event(
+                    AudioChunk(rate=rate, width=width, channels=channels, audio=chunk).event()
+                )
+
+            await client.write_event(AudioStop().event())
+
+            # Read events until the transcript arrives; other event types are skipped.
+            while True:
+                event = await client.read_event()
+                if event is None:
+                    return ""
+                if Transcript.is_type(event.type):
+                    return Transcript.from_event(event).text
+        finally:
+            await client.disconnect()
+
    def _handle_wake_word(self):
        """Handle wake word detection notification."""
        content_length = int(self.headers.get("Content-Length", 0))