From af6b7bd94508bfd10d65450957584e99e3b37a77 Mon Sep 17 00:00:00 2001
From: Aodhan Collins <sudosert@prontonmail.com>
Date: Fri, 13 Mar 2026 18:03:12 +0000
Subject: [PATCH] =?UTF-8?q?feat:=20upgrade=20voice=20pipeline=20=E2=80=94?=
 =?UTF-8?q?=20MLX=20Whisper=20STT=20(20x=20faster),=20Qwen3.5=20MoE=20LLM,?=
 =?UTF-8?q?=20fix=20HA=20tool=20calling?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Replace faster-whisper with wyoming-mlx-whisper (whisper-large-v3-turbo, MLX Metal GPU)
  STT latency: 8.4s → 400ms for short voice commands
- Add Qwen3.5-35B-A3B (MoE, 3B active params, Q8_0) to Ollama — 26.7 tok/s vs 5.4 tok/s (70B)
- Add model preload launchd service to pin voice model in VRAM permanently
- Fix HA tool calling: set commands.native=true, symlink ha-ctl to PATH
- Add pipeline benchmark script (STT/LLM/TTS latency profiling)
- Add service restart buttons and STT endpoint to dashboard
- Bind Vite dev server to 0.0.0.0 for LAN access

Total estimated pipeline latency: ~27s → ~4s

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .env.example                                  |   9 +
 TODO.md                                       |  10 +-
 homeai-agent/openclaw-http-bridge.py          |  70 ++++
 homeai-character/src/ServiceStatus.jsx        |  97 ++++-
 homeai-character/vite.config.js               |  67 +++
 .../launchd/com.homeai.preload-models.plist   |  28 ++
 .../modelfiles/Qwen3.5-35B-A3B.Modelfile      |  55 +++
 homeai-llm/scripts/preload-models.sh          |  19 +
 homeai-voice/scripts/benchmark_pipeline.py    | 381 ++++++++++++++++++
 .../launchd/com.homeai.wyoming-stt.plist      |  12 +-
 10 files changed, 721 insertions(+), 27 deletions(-)
 create mode 100644 homeai-llm/launchd/com.homeai.preload-models.plist
 create mode 100644 homeai-llm/modelfiles/Qwen3.5-35B-A3B.Modelfile
 create mode 100755 homeai-llm/scripts/preload-models.sh
 create mode 100644 homeai-voice/scripts/benchmark_pipeline.py

diff --git a/.env.example b/.env.example
index 87eb9c2..c58eddb 100644
--- a/.env.example
+++ b/.env.example
@@ -2,6 +2,14 @@
 # Copy to .env and fill in your values.
 # .env is gitignored — never commit it.
 
+# ─── API Keys ──────────────────────────────────────────────────────────────────
+HUGGING_FACE_API_KEY=
+OPENROUTER_API_KEY=
+OPENAI_API_KEY=
+DEEPSEEK_API_KEY=
+GEMINI_API_KEY=
+ELEVENLABS_API_KEY=
+
 # ─── Data & Paths ──────────────────────────────────────────────────────────────
 DATA_DIR=${HOME}/homeai-data
 REPO_DIR=${HOME}/Projects/HomeAI
@@ -45,3 +53,4 @@ VTUBE_WS_URL=ws://localhost:8001
 
 # ─── P8: Images ────────────────────────────────────────────────────────────────
 COMFYUI_URL=http://localhost:8188
+
diff --git a/TODO.md b/TODO.md
index 95a1f48..e255244 100644
--- a/TODO.md
+++ b/TODO.md
@@ -25,9 +25,11 @@
 - [x] Write and load launchd plist (`com.homeai.ollama.plist`) — `/opt/homebrew/bin/ollama`
 - [x] Register local GGUF models via Modelfiles (no download): llama3.3:70b, qwen3:32b, codestral:22b, qwen2.5:7b
 - [x] Register additional models: EVA-LLaMA-3.33-70B, Midnight-Miqu-70B, QwQ-32B, Qwen3.5-35B, Qwen3-Coder-30B, Qwen3-VL-30B, GLM-4.6V-Flash, DeepSeek-R1-8B, gemma-3-27b
+- [x] Add qwen3.5:35b-a3b (MoE, Q8_0) — 26.7 tok/s, recommended for voice pipeline
+- [x] Write model preload script + launchd service (keeps voice model in VRAM permanently)
 - [x] Deploy Open WebUI via Docker compose (port 3030)
 - [x] Verify Open WebUI connected to Ollama, all models available
-- [ ] Run `scripts/benchmark.sh` — record results in `benchmark-results.md`
+- [x] Run pipeline benchmark (homeai-voice/scripts/benchmark_pipeline.py) — STT/LLM/TTS latency profiled
 - [ ] Add Ollama + Open WebUI to Uptime Kuma monitors
 
 ---
@@ -37,6 +39,7 @@
 ### P3 · homeai-voice
 
 - [x] Install `wyoming-faster-whisper` — model: faster-whisper-large-v3 (auto-downloaded)
+- [x] Upgrade STT to wyoming-mlx-whisper (whisper-large-v3-turbo, MLX Metal GPU) — 20x faster (8s → 400ms)
 - [x] Install Kokoro ONNX TTS — models at `~/models/kokoro/`
 - [x] Write Wyoming-Kokoro adapter server (`homeai-voice/tts/wyoming_kokoro_server.py`)
 - [x] Write + load launchd plists for Wyoming STT (10300) and TTS (10301)
@@ -67,10 +70,11 @@
 - [x] Fix context window: set `contextWindow=32768` for llama3.3:70b in `openclaw.json`
 - [x] Fix Llama 3.3 Modelfile: add tool-calling TEMPLATE block
 - [x] Verify `openclaw agent --message "..." --agent main` → completed
-- [x] Write `skills/home-assistant` SKILL.md — HA REST API control
+- [x] Write `skills/home-assistant` SKILL.md — HA REST API control via ha-ctl CLI
 - [x] Write `skills/voice-assistant` SKILL.md — voice response style guide
 - [x] Wire HASS_TOKEN — create `~/.homeai/hass_token` or set env in launchd plist
-- [x] Test home-assistant skill: "turn on/off the reading lamp"
+- [x] Fix HA tool calling: set commands.native=true, symlink ha-ctl to PATH, update TOOLS.md
+- [x] Test home-assistant skill: "turn on/off the reading lamp" — verified exec→ha-ctl→HA action
 - [x] Set up mem0 with Chroma backend, test semantic recall
 - [x] Write memory backup launchd job
 - [x] Build morning briefing n8n workflow
diff --git a/homeai-agent/openclaw-http-bridge.py b/homeai-agent/openclaw-http-bridge.py
index 1d3e3b2..e3cfbe1 100644
--- a/homeai-agent/openclaw-http-bridge.py
+++ b/homeai-agent/openclaw-http-bridge.py
@@ -34,6 +34,7 @@ import wave
 import io
 from wyoming.client import AsyncTcpClient
 from wyoming.tts import Synthesize, SynthesizeVoice
+from wyoming.asr import Transcribe, Transcript
 from wyoming.audio import AudioStart, AudioChunk, AudioStop
 from wyoming.info import Info
 
@@ -79,6 +80,11 @@ class OpenClawBridgeHandler(BaseHTTPRequestHandler):
             self._handle_tts_request()
             return
 
+        # Handle STT requests
+        if parsed_path.path == "/api/stt":
+            self._handle_stt_request()
+            return
+
         # Only handle the agent message endpoint
         if parsed_path.path == "/api/agent/message":
             self._handle_agent_request()
@@ -170,6 +176,70 @@ class OpenClawBridgeHandler(BaseHTTPRequestHandler):
             
         return wav_io.getvalue()
 
+    def _handle_stt_request(self):
+        """Handle POST /api/stt: read a WAV body, reply with JSON {"text": ...}; 400 on empty/invalid WAV, 500 otherwise."""
+        try:
+            content_length = int(self.headers.get("Content-Length", 0))
+        except ValueError:
+            content_length = 0  # malformed header: treat as an empty body
+        if content_length <= 0:  # a negative length would make rfile.read() block until EOF
+            self._send_json_response(400, {"error": "Empty body"})
+            return
+
+        try:
+            # Parse WAV to get PCM data and format
+            wav_io = io.BytesIO(self.rfile.read(content_length))
+            with wave.open(wav_io, 'rb') as wav_file:
+                rate = wav_file.getframerate()
+                width = wav_file.getsampwidth()
+                channels = wav_file.getnchannels()
+                pcm_data = wav_file.readframes(wav_file.getnframes())
+
+            # Run the async Wyoming client (blocks this handler until the transcript arrives)
+            text = asyncio.run(self._transcribe_audio(pcm_data, rate, width, channels))
+
+            self.send_response(200)
+            self.send_header("Content-Type", "application/json")
+            self.send_header("Access-Control-Allow-Origin", "*")
+            self.end_headers()
+            self.wfile.write(json.dumps({"text": text}).encode())
+        except (wave.Error, EOFError) as e:  # EOFError: truncated/too-short WAV data
+            self._send_json_response(400, {"error": f"Invalid WAV: {e}"})
+        except Exception as e:
+            self._send_json_response(500, {"error": str(e)})
+
+    async def _transcribe_audio(self, pcm_data: bytes, rate: int, width: int, channels: int) -> str:
+        """Connect to Wyoming STT server and transcribe audio.
+
+        Returns the transcript text, or "" if the server closes the
+        connection without sending a Transcript event.
+        """
+        client = AsyncTcpClient("127.0.0.1", 10300)
+        await client.connect()
+        try:
+            # Send Transcribe request (STT server does not send an initial Info event)
+            await client.write_event(Transcribe(language="en").event())
+            await client.write_event(AudioStart(rate=rate, width=width, channels=channels).event())
+
+            # Send in chunks (1 second each)
+            bytes_per_second = rate * width * channels
+            for offset in range(0, len(pcm_data), bytes_per_second):
+                chunk = pcm_data[offset:offset + bytes_per_second]
+                await client.write_event(AudioChunk(rate=rate, width=width, channels=channels, audio=chunk).event())
+
+            await client.write_event(AudioStop().event())
+
+            # Read events until the transcript arrives or the server hangs up
+            while True:
+                event = await client.read_event()
+                if event is None:
+                    return ""
+                if Transcript.is_type(event.type):
+                    return Transcript.from_event(event).text
+        finally:
+            # Always release the socket, even when a write/read above raised
+            await client.disconnect()
+
     def _handle_wake_word(self):
         """Handle wake word detection notification."""
         content_length = int(self.headers.get("Content-Length", 0))
diff --git a/homeai-character/src/ServiceStatus.jsx b/homeai-character/src/ServiceStatus.jsx
index 0eaa861..6393204 100644
--- a/homeai-character/src/ServiceStatus.jsx
+++ b/homeai-character/src/ServiceStatus.jsx
@@ -8,6 +8,7 @@ const SERVICES = [
     uiUrl: null,
     description: 'Local LLM runtime',
     category: 'AI & LLM',
+    restart: { type: 'launchd', id: 'gui/501/com.homeai.ollama' },
   },
   {
     name: 'Open WebUI',
@@ -16,6 +17,7 @@ const SERVICES = [
     uiUrl: 'http://localhost:3030',
     description: 'Chat interface',
     category: 'AI & LLM',
+    restart: { type: 'docker', id: 'homeai-open-webui' },
   },
   {
     name: 'OpenClaw Gateway',
@@ -24,6 +26,7 @@ const SERVICES = [
     uiUrl: null,
     description: 'Agent gateway',
     category: 'Agent',
+    restart: { type: 'launchd', id: 'gui/501/com.homeai.openclaw' },
   },
   {
     name: 'OpenClaw Bridge',
@@ -32,6 +35,7 @@ const SERVICES = [
     uiUrl: null,
     description: 'HTTP-to-CLI bridge',
     category: 'Agent',
+    restart: { type: 'launchd', id: 'gui/501/com.homeai.openclaw-bridge' },
   },
   {
     name: 'Wyoming STT',
@@ -41,6 +45,7 @@ const SERVICES = [
     description: 'Whisper speech-to-text',
     category: 'Voice',
     tcp: true,
+    restart: { type: 'launchd', id: 'gui/501/com.homeai.wyoming-stt' },
   },
   {
     name: 'Wyoming TTS',
@@ -50,6 +55,7 @@ const SERVICES = [
     description: 'Kokoro text-to-speech',
     category: 'Voice',
     tcp: true,
+    restart: { type: 'launchd', id: 'gui/501/com.homeai.wyoming-tts' },
   },
   {
     name: 'Wyoming Satellite',
@@ -59,6 +65,16 @@ const SERVICES = [
     description: 'Mac Mini mic/speaker satellite',
     category: 'Voice',
     tcp: true,
+    restart: { type: 'launchd', id: 'gui/501/com.homeai.wyoming-satellite' },
+  },
+  {
+    name: 'Character Dashboard',
+    url: 'http://localhost:5173',
+    healthPath: '/',
+    uiUrl: 'http://localhost:5173',
+    description: 'Character manager & service status',
+    category: 'Agent',
+    restart: { type: 'launchd', id: 'gui/501/com.homeai.character-dashboard' },
   },
   {
     name: 'Home Assistant',
@@ -75,6 +91,7 @@ const SERVICES = [
     uiUrl: 'http://localhost:3001',
     description: 'Service health monitoring',
     category: 'Infrastructure',
+    restart: { type: 'docker', id: 'homeai-uptime-kuma' },
   },
   {
     name: 'n8n',
@@ -83,6 +100,7 @@ const SERVICES = [
     uiUrl: 'http://localhost:5678',
     description: 'Workflow automation',
     category: 'Infrastructure',
+    restart: { type: 'docker', id: 'homeai-n8n' },
   },
   {
     name: 'code-server',
@@ -91,6 +109,7 @@ const SERVICES = [
     uiUrl: 'http://localhost:8090',
     description: 'Browser-based VS Code',
     category: 'Infrastructure',
+    restart: { type: 'docker', id: 'homeai-code-server' },
   },
   {
     name: 'Portainer',
@@ -155,6 +174,7 @@ export default function ServiceStatus() {
     Object.fromEntries(SERVICES.map(s => [s.name, { status: 'checking', lastCheck: null, responseTime: null }]))
   );
   const [lastRefresh, setLastRefresh] = useState(null);
+  const [restarting, setRestarting] = useState({});
 
   const checkService = useCallback(async (service) => {
     try {
@@ -208,6 +228,31 @@ export default function ServiceStatus() {
     return () => clearInterval(interval);
   }, [refreshAll]);
 
+  const restartService = useCallback(async (service) => {
+    if (!service.restart) return;
+    setRestarting(prev => ({ ...prev, [service.name]: true }));
+    try {
+      const res = await fetch('/api/service/restart', {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify(service.restart),
+      });
+      const data = await res.json();
+      if (!data.ok) {
+        console.error(`Restart failed for ${service.name}:`, data.error);
+      }
+      // Wait a moment for the service to come back, then re-check
+      setTimeout(async () => {
+        const result = await checkService(service);
+        setStatuses(prev => ({ ...prev, [service.name]: result }));
+        setRestarting(prev => ({ ...prev, [service.name]: false }));
+      }, 3000);
+    } catch (err) {
+      console.error(`Restart failed for ${service.name}:`, err);
+      setRestarting(prev => ({ ...prev, [service.name]: false }));
+    }
+  }, [checkService]);
+
   const categories = [...new Set(SERVICES.map(s => s.category))];
   const onlineCount = Object.values(statuses).filter(s => s.status === 'online').length;
   const offlineCount = Object.values(statuses).filter(s => s.status === 'offline').length;
@@ -293,19 +338,45 @@ export default function ServiceStatus() {
                         <p className="text-xs text-gray-600 mt-0.5">{st.responseTime}ms</p>
                       )}
                     </div>
-                    {service.uiUrl && (
-                      <a
-                        href={service.uiUrl}
-                        target="_blank"
-                        rel="noopener noreferrer"
-                        className="text-xs px-2.5 py-1 rounded-md bg-gray-700 hover:bg-gray-600 text-gray-300 transition-colors flex items-center gap-1"
-                      >
-                        Open
-                        <svg className="w-3 h-3" fill="none" viewBox="0 0 24 24" stroke="currentColor" strokeWidth={2}>
-                          <path strokeLinecap="round" strokeLinejoin="round" d="M13.5 6H5.25A2.25 2.25 0 003 8.25v10.5A2.25 2.25 0 005.25 21h10.5A2.25 2.25 0 0018 18.75V10.5m-10.5 6L21 3m0 0h-5.25M21 3v5.25" />
-                        </svg>
-                      </a>
-                    )}
+                    <div className="flex items-center gap-2">
+                      {service.restart && st.status === 'offline' && (
+                        <button
+                          onClick={() => restartService(service)}
+                          disabled={restarting[service.name]}
+                          className="text-xs px-2.5 py-1 rounded-md bg-amber-600/80 hover:bg-amber-500 disabled:bg-gray-700 disabled:text-gray-500 text-white transition-colors flex items-center gap-1"
+                        >
+                          {restarting[service.name] ? (
+                            <>
+                              <svg className="w-3 h-3 animate-spin" fill="none" viewBox="0 0 24 24">
+                                <circle className="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" strokeWidth="4" />
+                                <path className="opacity-75" fill="currentColor" d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4z" />
+                              </svg>
+                              Restarting
+                            </>
+                          ) : (
+                            <>
+                              <svg className="w-3 h-3" fill="none" viewBox="0 0 24 24" stroke="currentColor" strokeWidth={2}>
+                                <path strokeLinecap="round" strokeLinejoin="round" d="M5.636 18.364a9 9 0 010-12.728m12.728 0a9 9 0 010 12.728M12 9v3m0 0v3m0-3h3m-3 0H9" />
+                              </svg>
+                              Restart
+                            </>
+                          )}
+                        </button>
+                      )}
+                      {service.uiUrl && (
+                        <a
+                          href={service.uiUrl}
+                          target="_blank"
+                          rel="noopener noreferrer"
+                          className="text-xs px-2.5 py-1 rounded-md bg-gray-700 hover:bg-gray-600 text-gray-300 transition-colors flex items-center gap-1"
+                        >
+                          Open
+                          <svg className="w-3 h-3" fill="none" viewBox="0 0 24 24" stroke="currentColor" strokeWidth={2}>
+                            <path strokeLinecap="round" strokeLinejoin="round" d="M13.5 6H5.25A2.25 2.25 0 003 8.25v10.5A2.25 2.25 0 005.25 21h10.5A2.25 2.25 0 0018 18.75V10.5m-10.5 6L21 3m0 0h-5.25M21 3v5.25" />
+                          </svg>
+                        </a>
+                      )}
+                    </div>
                   </div>
                 </div>
               );
diff --git a/homeai-character/vite.config.js b/homeai-character/vite.config.js
index f56a79b..7b54513 100644
--- a/homeai-character/vite.config.js
+++ b/homeai-character/vite.config.js
@@ -53,6 +53,70 @@ function healthCheckPlugin() {
           res.end(JSON.stringify({ status: 'offline', responseTime: null }));
         }
       });
+      // Service restart — runs launchctl or docker restart
+      server.middlewares.use('/api/service/restart', async (req, res) => {
+        if (req.method === 'OPTIONS') {
+          res.writeHead(204, { 'Access-Control-Allow-Origin': '*', 'Access-Control-Allow-Methods': 'POST', 'Access-Control-Allow-Headers': 'Content-Type' });
+          res.end();
+          return;
+        }
+        if (req.method !== 'POST') {
+          res.writeHead(405);
+          res.end();
+          return;
+        }
+        try {
+          const chunks = [];
+          for await (const chunk of req) chunks.push(chunk);
+          const { type, id } = JSON.parse(Buffer.concat(chunks).toString());
+
+          if (!type || !id) {
+            res.writeHead(400, { 'Content-Type': 'application/json' });
+            res.end(JSON.stringify({ ok: false, error: 'Missing type or id' }));
+            return;
+          }
+
+          // Whitelist valid service IDs to prevent command injection
+          const ALLOWED_LAUNCHD = [
+            'gui/501/com.homeai.ollama',
+            'gui/501/com.homeai.openclaw',
+            'gui/501/com.homeai.openclaw-bridge',
+            'gui/501/com.homeai.wyoming-stt',
+            'gui/501/com.homeai.wyoming-tts',
+            'gui/501/com.homeai.wyoming-satellite',
+            'gui/501/com.homeai.character-dashboard',
+          ];
+          const ALLOWED_DOCKER = [
+            'homeai-open-webui',
+            'homeai-uptime-kuma',
+            'homeai-n8n',
+            'homeai-code-server',
+          ];
+
+          let cmd;
+          if (type === 'launchd' && ALLOWED_LAUNCHD.includes(id)) {
+            cmd = ['launchctl', 'kickstart', '-k', id];
+          } else if (type === 'docker' && ALLOWED_DOCKER.includes(id)) {
+            cmd = ['docker', 'restart', id];
+          } else {
+            res.writeHead(403, { 'Content-Type': 'application/json' });
+            res.end(JSON.stringify({ ok: false, error: 'Service not in allowed list' }));
+            return;
+          }
+
+          const { execFile } = await import('child_process');
+          const { promisify } = await import('util');
+          const execFileAsync = promisify(execFile);
+          const { stdout, stderr } = await execFileAsync(cmd[0], cmd.slice(1), { timeout: 30000 });
+
+          res.writeHead(200, { 'Content-Type': 'application/json' });
+          res.end(JSON.stringify({ ok: true, stdout: stdout.trim(), stderr: stderr.trim() }));
+        } catch (err) {
+          res.writeHead(500, { 'Content-Type': 'application/json' });
+          res.end(JSON.stringify({ ok: false, error: err.message }));
+        }
+      });
+
       // TTS preview proxy — forwards POST to OpenClaw bridge, returns audio
       server.middlewares.use('/api/tts', async (req, res) => {
         if (req.method !== 'POST') {
@@ -99,4 +163,7 @@ export default defineConfig({
     tailwindcss(),
     react(),
   ],
+  server: {
+    host: '0.0.0.0',
+  },
 })
diff --git a/homeai-llm/launchd/com.homeai.preload-models.plist b/homeai-llm/launchd/com.homeai.preload-models.plist
new file mode 100644
index 0000000..e7b209e
--- /dev/null
+++ b/homeai-llm/launchd/com.homeai.preload-models.plist
@@ -0,0 +1,28 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN"
+  "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+  <key>Label</key>
+  <string>com.homeai.preload-models</string>
+
+  <key>ProgramArguments</key>
+  <array>
+    <string>/bin/bash</string>
+    <string>/Users/aodhan/gitea/homeai/homeai-llm/scripts/preload-models.sh</string>
+  </array>
+
+  <key>RunAtLoad</key>
+  <true/>
+
+  <key>StandardOutPath</key>
+  <string>/tmp/homeai-preload-models.log</string>
+
+  <key>StandardErrorPath</key>
+  <string>/tmp/homeai-preload-models-error.log</string>
+
+  <!-- Minimum 15s between launches of this job (not a start delay); preload-models.sh itself polls until Ollama is up -->
+  <key>ThrottleInterval</key>
+  <integer>15</integer>
+</dict>
+</plist>
diff --git a/homeai-llm/modelfiles/Qwen3.5-35B-A3B.Modelfile b/homeai-llm/modelfiles/Qwen3.5-35B-A3B.Modelfile
new file mode 100644
index 0000000..e53108c
--- /dev/null
+++ b/homeai-llm/modelfiles/Qwen3.5-35B-A3B.Modelfile
@@ -0,0 +1,55 @@
+FROM /Users/aodhan/gitea/homeai/homeai-llm/modelfiles/lmstudio-community/Qwen3.5-35B-A3B-GGUF/Qwen3.5-35B-A3B-Q8_0.gguf
+
+TEMPLATE """{{- if or .System .Tools }}<|im_start|>system
+{{- if .System }}
+{{ .System }}
+{{- end }}
+{{- if .Tools }}
+
+# Tools
+
+You may call one or more functions to assist with the user query.
+
+You are provided with function signatures within <tools></tools> XML tags:
+<tools>
+{{- range .Tools }}
+{"type": "function", "function": {"name": "{{ .Function.Name }}", "description": "{{ .Function.Description }}", "parameters": {{ .Function.Parameters }}}}
+{{- end }}
+</tools>
+
+For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
+<tool_call>
+{"name": <function-name>, "arguments": <args-json-object>}
+</tool_call>
+{{- end }}<|im_end|>
+{{- end }}
+{{- range $i, $_ := .Messages }}
+{{- $last := eq (len (slice $.Messages $i)) 1 }}
+{{- if eq .Role "user" }}<|im_start|>user
+{{ .Content }}<|im_end|>
+{{ else if eq .Role "assistant" }}<|im_start|>assistant
+{{- if .ToolCalls }}
+{{- range .ToolCalls }}
+<tool_call>
+{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}
+</tool_call>
+{{- end }}
+{{- else }}{{ .Content }}
+{{- end }}{{ if not $last }}<|im_end|>
+{{ end }}
+{{- else if eq .Role "tool" }}<|im_start|>user
+<tool_response>
+{{ .Content }}
+</tool_response><|im_end|>
+{{ end }}
+{{- end }}<|im_start|>assistant
+"""
+
+SYSTEM You are a helpful AI assistant.
+PARAMETER num_ctx 32768
+PARAMETER stop <|im_end|>
+PARAMETER stop <|endoftext|>
+PARAMETER temperature 0.6
+PARAMETER top_p 0.95
+PARAMETER presence_penalty 1.5
+PARAMETER top_k 20
diff --git a/homeai-llm/scripts/preload-models.sh b/homeai-llm/scripts/preload-models.sh
new file mode 100755
index 0000000..85e0186
--- /dev/null
+++ b/homeai-llm/scripts/preload-models.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+# Pre-load voice pipeline models into Ollama with infinite keep_alive.
+# Run after Ollama starts (called by launchd or manually).
+# Only pins lightweight/MoE models — large dense models (70B) use default expiry.
+
+OLLAMA_URL="http://localhost:11434"
+
+# Wait for Ollama to be ready (up to 30 attempts, 2s apart)
+for i in $(seq 1 30); do
+    curl -sf "$OLLAMA_URL/api/tags" > /dev/null 2>&1 && break
+    sleep 2
+done
+
+# Pin qwen3.5:35b-a3b (MoE, 38.7GB VRAM, voice pipeline default); fail loudly if the load request fails
+echo "[preload] Loading qwen3.5:35b-a3b with keep_alive=-1..."
+curl -sf "$OLLAMA_URL/api/generate" \
+    -d '{"model":"qwen3.5:35b-a3b","prompt":"ready","stream":false,"keep_alive":-1,"options":{"num_ctx":512}}' \
+    > /dev/null 2>&1 || { echo "[preload] ERROR: failed to load qwen3.5:35b-a3b" >&2; exit 1; }
+echo "[preload] qwen3.5:35b-a3b pinned in memory"
diff --git a/homeai-voice/scripts/benchmark_pipeline.py b/homeai-voice/scripts/benchmark_pipeline.py
new file mode 100644
index 0000000..21296ca
--- /dev/null
+++ b/homeai-voice/scripts/benchmark_pipeline.py
@@ -0,0 +1,381 @@
+#!/usr/bin/env python3
+"""
+Voice Pipeline Benchmark
+
+Measures latency of each stage independently:
+  1. STT  (Wyoming Whisper, port 10300)
+  2. LLM  (Ollama API, port 11434) — multiple models
+  3. TTS  (Wyoming Kokoro, port 10301)
+  4. End-to-end via OpenClaw HTTP Bridge (port 8081)
+
+Usage:
+    python benchmark_pipeline.py [--rounds 3] [--models llama3.3:70b,qwen3:32b]
+"""
+
+import argparse
+import asyncio
+import io
+import json
+import statistics
+import sys
+import time
+import wave
+from urllib.request import Request, urlopen
+from urllib.error import URLError
+
+from wyoming.audio import AudioChunk, AudioStart, AudioStop
+from wyoming.asr import Transcribe, Transcript
+from wyoming.client import AsyncTcpClient
+from wyoming.tts import Synthesize, SynthesizeVoice
+
+# --- Config ---
+STT_HOST, STT_PORT = "127.0.0.1", 10300
+TTS_HOST, TTS_PORT = "127.0.0.1", 10301
+OLLAMA_URL = "http://localhost:11434"
+BRIDGE_URL = "http://localhost:8081"
+
+TEST_PROMPTS = [
+    "What is the capital of France?",
+    "Turn on the living room lights.",
+    "What's the weather like today?",
+]
+
+LONG_PROMPT = "Explain in two sentences how a heat pump works."
+
+
+# --- Helpers ---
+
+def http_post_json(url: str, data: dict, timeout: int = 180) -> tuple[dict, float]:
+    """POST JSON, return (response_dict, elapsed_seconds); elapsed covers the request and the full body read."""
+    body = json.dumps(data).encode()
+    req = Request(url, data=body, headers={"Content-Type": "application/json"})
+    t0 = time.perf_counter()
+    with urlopen(req, timeout=timeout) as resp:  # close the connection deterministically
+        raw = resp.read()
+    elapsed = time.perf_counter() - t0
+    return json.loads(raw), elapsed
+
+
+def http_post_raw(url: str, data: bytes, content_type: str, timeout: int = 180) -> tuple[bytes, float]:
+    """POST raw bytes, return (response_bytes, elapsed_seconds); elapsed covers the request and the full body read."""
+    req = Request(url, data=data, headers={"Content-Type": content_type})
+    t0 = time.perf_counter()
+    with urlopen(req, timeout=timeout) as resp:  # close the connection deterministically
+        raw = resp.read()
+    elapsed = time.perf_counter() - t0
+    return raw, elapsed
+
+
+# --- Stage 1: TTS ---
+
+async def benchmark_tts(text: str) -> tuple[bytes, float, float]:
+    """Synthesize text via Wyoming TTS, return (wav_bytes, elapsed_seconds, audio_duration_seconds)."""
+    t0 = time.perf_counter()
+    client = AsyncTcpClient(TTS_HOST, TTS_PORT)
+    await client.connect()
+    await client.read_event()  # Info
+
+    await client.write_event(
+        Synthesize(text=text, voice=SynthesizeVoice(name="af_heart")).event()
+    )
+
+    audio_data = bytearray()
+    rate, width, channels = 24000, 2, 1  # fallback format, overwritten if an AudioStart event arrives
+
+    while True:
+        event = await client.read_event()
+        if event is None:
+            break
+        if AudioStart.is_type(event.type):
+            start = AudioStart.from_event(event)
+            rate, width, channels = start.rate, start.width, start.channels
+        elif AudioChunk.is_type(event.type):
+            audio_data.extend(AudioChunk.from_event(event).audio)
+        elif AudioStop.is_type(event.type):
+            break
+
+    await client.disconnect()
+    elapsed = time.perf_counter() - t0
+
+    # Package as WAV (done after the timer stops, so it is excluded from elapsed)
+    wav_io = io.BytesIO()
+    with wave.open(wav_io, "wb") as wf:
+        wf.setnchannels(channels)
+        wf.setsampwidth(width)
+        wf.setframerate(rate)
+        wf.writeframes(audio_data)
+
+    duration_s = len(audio_data) / (rate * width * channels)
+    return wav_io.getvalue(), elapsed, duration_s
+
+
+# --- Stage 2: STT ---
+
+async def benchmark_stt(wav_bytes: bytes) -> tuple[str, float]:
+    """Transcribe WAV via Wyoming STT, return (text, elapsed)."""
+    wav_io = io.BytesIO(wav_bytes)
+    with wave.open(wav_io, "rb") as wf:
+        rate = wf.getframerate()
+        width = wf.getsampwidth()
+        channels = wf.getnchannels()
+        pcm = wf.readframes(wf.getnframes())
+
+    t0 = time.perf_counter()
+    client = AsyncTcpClient(STT_HOST, STT_PORT)
+    await client.connect()
+
+    await client.write_event(Transcribe(language="en").event())
+    await client.write_event(AudioStart(rate=rate, width=width, channels=channels).event())
+
+    chunk_size = rate * width * channels  # 1 second
+    for off in range(0, len(pcm), chunk_size):
+        await client.write_event(
+            AudioChunk(rate=rate, width=width, channels=channels, audio=pcm[off:off + chunk_size]).event()
+        )
+    await client.write_event(AudioStop().event())
+
+    text = ""
+    while True:
+        event = await client.read_event()
+        if event is None:
+            break
+        if Transcript.is_type(event.type):
+            text = Transcript.from_event(event).text
+            break
+
+    await client.disconnect()
+    elapsed = time.perf_counter() - t0
+    return text, elapsed
+
+
+# --- Stage 3: LLM ---
+
+def benchmark_llm(model: str, prompt: str, warm: bool = False) -> dict:
+    """
+    Call Ollama /api/generate, return timing breakdown.
+    If warm=True, we assume the model is already loaded.
+    """
+    data = {
+        "model": model,
+        "prompt": prompt,
+        "stream": False,
+        "options": {"num_ctx": 2048},  # small ctx for benchmark speed
+    }
+    body = json.dumps(data).encode()
+    req = Request(
+        f"{OLLAMA_URL}/api/generate",
+        data=body,
+        headers={"Content-Type": "application/json"},
+    )
+
+    t0 = time.perf_counter()
+    resp = urlopen(req, timeout=300)
+    raw = resp.read()
+    wall_time = time.perf_counter() - t0
+
+    result = json.loads(raw)
+
+    # Ollama returns timing in nanoseconds
+    load_ns = result.get("load_duration", 0)
+    prompt_ns = result.get("prompt_eval_duration", 0)
+    eval_ns = result.get("eval_duration", 0)
+    total_ns = result.get("total_duration", 0)
+    prompt_tokens = result.get("prompt_eval_count", 0)
+    eval_tokens = result.get("eval_count", 0)
+
+    return {
+        "model": model,
+        "wall_time_s": wall_time,
+        "load_s": load_ns / 1e9,
+        "prompt_eval_s": prompt_ns / 1e9,
+        "eval_s": eval_ns / 1e9,
+        "total_s": total_ns / 1e9,
+        "prompt_tokens": prompt_tokens,
+        "eval_tokens": eval_tokens,
+        "tokens_per_sec": eval_tokens / (eval_ns / 1e9) if eval_ns > 0 else 0,
+        "prompt_tokens_per_sec": prompt_tokens / (prompt_ns / 1e9) if prompt_ns > 0 else 0,
+        "response": result.get("response", "")[:200],
+    }
+
+
+def warm_model(model: str):
+    """Send a tiny request to load the model into GPU memory."""
+    print(f"  Warming up {model}...", end=" ", flush=True)
+    try:
+        data = json.dumps({"model": model, "prompt": "hi", "stream": False, "options": {"num_ctx": 512}}).encode()
+        req = Request(f"{OLLAMA_URL}/api/generate", data=data, headers={"Content-Type": "application/json"})
+        urlopen(req, timeout=300).read()
+        print("ready")
+    except Exception as e:
+        print(f"warning: {e}")
+
+
+# --- Stage 4: End-to-end via bridge ---
+
+def benchmark_e2e(message: str) -> tuple[str, float]:
+    """Call the OpenClaw HTTP bridge end-to-end."""
+    data = {"message": message, "agent": "main"}
+    resp, elapsed = http_post_json(f"{BRIDGE_URL}/api/agent/message", data, timeout=300)
+    return resp.get("response", ""), elapsed
+
+
+# --- Formatting ---
+
+def fmt_time(seconds: float) -> str:
+    """Render a duration: whole milliseconds when it rounds below 1000ms, otherwise seconds with one decimal."""
+    sub_second = seconds * 1000 < 999.5  # 999.5ms and up would round to "1000ms"; show those as "1.0s" instead
+    return f"{seconds*1000:.0f}ms" if sub_second else f"{seconds:.1f}s"
+
+
+def print_table(rows: list[dict], columns: list[tuple[str, str, int]]):
+    """Print `rows` as " | "-separated, left-justified cells under a dashed rule; columns = [(header, key, width), ...]."""
+    sep = " | "
+    title = sep.join(head.ljust(width) for head, _key, width in columns)
+    print(title)
+    print("-" * len(title))
+    for record in rows:  # keys missing from a row render as blank cells
+        print(sep.join(str(record.get(key, "")).ljust(width) for _head, key, width in columns))
+
+
+# --- Main ---
+
+def main() -> None:
+    """Parse CLI flags, run the TTS, STT, LLM and end-to-end benchmarks that were not skipped, and print the timings."""
+    parser = argparse.ArgumentParser(description="Voice Pipeline Benchmark")
+    parser.add_argument("--rounds", type=int, default=2, help="Rounds per test (default: 2)")
+    parser.add_argument("--models", default="qwen2.5:7b,qwen3:32b,llama3.3:70b", help="Comma-separated Ollama models to test")
+    parser.add_argument("--skip-stt", action="store_true", help="Skip STT benchmark")
+    parser.add_argument("--skip-tts", action="store_true", help="Skip TTS benchmark")
+    parser.add_argument("--skip-llm", action="store_true", help="Skip LLM benchmark")
+    parser.add_argument("--skip-e2e", action="store_true", help="Skip end-to-end benchmark")
+    parser.add_argument("--prompt", default=None, help="Custom prompt for LLM benchmark")
+    args = parser.parse_args()
+    if args.rounds < 1:  # the averages below call statistics.mean(), which raises on an empty list
+        parser.error("--rounds must be at least 1")
+
+    models = [name.strip() for name in args.models.split(",") if name.strip()]  # ignore empty entries from stray commas
+    llm_prompt = args.prompt or LONG_PROMPT
+
+    print("=" * 70)
+    print("  VOICE PIPELINE BENCHMARK")
+    print("=" * 70)
+    print(f"  Rounds: {args.rounds}")
+    print(f"  Models: {', '.join(models)}")
+    print(f"  LLM prompt: {llm_prompt!r}")
+    print()
+
+    # ── TTS Benchmark ──
+    test_wav = None
+    if not args.skip_tts:
+        print("── TTS (Kokoro, Wyoming port 10301) ──")
+        tts_times = []
+        tts_durations = []
+        for i in range(args.rounds):
+            text = TEST_PROMPTS[i % len(TEST_PROMPTS)]
+            wav, elapsed, audio_dur = asyncio.run(benchmark_tts(text))
+            tts_times.append(elapsed)
+            tts_durations.append(audio_dur)
+            test_wav = wav
+            # RTF = synthesis time / audio length; prints nan instead of crashing on zero-length audio
+            print(f"  Round {i+1}: {fmt_time(elapsed)} → {audio_dur:.1f}s audio  (RTF: {elapsed/audio_dur if audio_dur else float('nan'):.2f}x)  text={text!r}")
+
+        avg_tts = statistics.mean(tts_times)
+        avg_dur = statistics.mean(tts_durations)
+        print(f"  Average: {fmt_time(avg_tts)} for {avg_dur:.1f}s audio (RTF: {avg_tts/avg_dur if avg_dur else float('nan'):.2f}x)")
+        print()
+
+    # ── STT Benchmark ──
+    if not args.skip_stt:
+        print("── STT (MLX Whisper large-v3-turbo, Wyoming port 10300) ──")
+        if test_wav is None:
+            # TTS stage was skipped, so synthesize one clip to feed the STT rounds
+            print("  Generating test audio via TTS...")
+            test_wav, _, _ = asyncio.run(benchmark_tts("The quick brown fox jumps over the lazy dog."))
+
+        stt_times = []
+        for i in range(args.rounds):
+            text, elapsed = asyncio.run(benchmark_stt(test_wav))
+            stt_times.append(elapsed)
+            print(f"  Round {i+1}: {fmt_time(elapsed)} → {text!r}")
+
+        print(f"  Average: {fmt_time(statistics.mean(stt_times))}")
+        print()
+
+    # ── LLM Benchmark ──
+    if not args.skip_llm:
+        print("── LLM (Ollama) ──")
+        print(f"  Prompt: {llm_prompt!r}")
+        print()
+
+        all_results = []
+        for model in models:
+            print(f"  Model: {model}")
+            warm_model(model)
+
+            model_runs = []
+            for i in range(args.rounds):
+                result = benchmark_llm(model, llm_prompt, warm=True)
+                model_runs.append(result)
+                print(
+                    f"    Round {i+1}: wall={fmt_time(result['wall_time_s'])}  "
+                    f"load={fmt_time(result['load_s'])}  "
+                    f"prompt_eval={fmt_time(result['prompt_eval_s'])} ({result['prompt_tokens']}tok, {result['prompt_tokens_per_sec']:.0f}t/s)  "
+                    f"gen={fmt_time(result['eval_s'])} ({result['eval_tokens']}tok, {result['tokens_per_sec']:.1f}t/s)"
+                )
+                # Truncate response for display
+                resp_preview = result["response"][:100].replace("\n", " ")
+                print(f"           → {resp_preview}")
+
+            # Per-model averages across all rounds
+            avg_wall = statistics.mean(r["wall_time_s"] for r in model_runs)
+            avg_tps = statistics.mean(r["tokens_per_sec"] for r in model_runs)
+            avg_prompt_tps = statistics.mean(r["prompt_tokens_per_sec"] for r in model_runs)
+            avg_tokens = statistics.mean(r["eval_tokens"] for r in model_runs)
+            all_results.append({
+                "model": model,
+                "avg_wall": fmt_time(avg_wall),
+                "avg_gen_tps": f"{avg_tps:.1f}",
+                "avg_prompt_tps": f"{avg_prompt_tps:.0f}",
+                "avg_tokens": f"{avg_tokens:.0f}",
+            })
+            print()
+
+        # Summary table
+        print("  ┌─ LLM Summary ─────────────────────────────────────────────┐")
+        print(f"  {'Model':<25s} {'Wall time':>10s} {'Gen t/s':>10s} {'Prompt t/s':>11s} {'Avg tokens':>11s}")
+        print(f"  {'─'*25} {'─'*10} {'─'*10} {'─'*11} {'─'*11}")
+        for r in all_results:
+            print(f"  {r['model']:<25s} {r['avg_wall']:>10s} {r['avg_gen_tps']:>10s} {r['avg_prompt_tps']:>11s} {r['avg_tokens']:>11s}")
+        print()
+
+    # ── End-to-end ──
+    if not args.skip_e2e:
+        print("── End-to-End (Bridge → OpenClaw → Ollama → response) ──")
+        print("  (Does not include STT/TTS — just text in → text out via bridge)")
+        e2e_prompt = "What time is it?"
+        for i in range(args.rounds):
+            try:
+                resp, elapsed = benchmark_e2e(e2e_prompt)
+                preview = resp[:100].replace("\n", " ")
+                print(f"  Round {i+1}: {fmt_time(elapsed)} → {preview}")
+            except Exception as e:  # report the failed round and keep going with the next one
+                print(f"  Round {i+1}: ERROR - {e}")
+        print()
+
+    # ── Pipeline estimate ──
+    print("=" * 70)
+    print("  ESTIMATED PIPELINE LATENCY (per voice interaction)")
+    print("=" * 70)
+    print("  wake word detection  ~instant (runs locally)")
+    print("  + STT (Whisper)      see above")
+    print("  + LLM (inference)    see above (dominant cost)")
+    print("  + TTS (Kokoro)       see above")
+    print("  ─────────────────────────────────────")
+    print("  Tip: smaller models (7B, 32B) dramatically reduce LLM latency.")
+    print("  The 70B model at ~12 tok/s needs ~5-8s for a typical reply.")
+    print("  A 7B model at ~80 tok/s would need <1s for the same reply.")
+    print()
+
+
+if __name__ == "__main__":  # run the benchmark only when executed as a script, not on import
+    main()
diff --git a/homeai-voice/scripts/launchd/com.homeai.wyoming-stt.plist b/homeai-voice/scripts/launchd/com.homeai.wyoming-stt.plist
index e7e59b1..94637f2 100644
--- a/homeai-voice/scripts/launchd/com.homeai.wyoming-stt.plist
+++ b/homeai-voice/scripts/launchd/com.homeai.wyoming-stt.plist
@@ -8,21 +8,11 @@
 
   <key>ProgramArguments</key>
   <array>
-    <string>/Users/aodhan/homeai-voice-env/bin/wyoming-faster-whisper</string>
+    <string>/Users/aodhan/homeai-whisper-mlx-env/bin/wyoming-mlx-whisper</string>
     <string>--uri</string>
     <string>tcp://0.0.0.0:10300</string>
-    <string>--model</string>
-    <string>large-v3</string>
     <string>--language</string>
     <string>en</string>
-    <string>--device</string>
-    <string>cpu</string>
-    <string>--compute-type</string>
-    <string>int8</string>
-    <string>--data-dir</string>
-    <string>/Users/aodhan/models/whisper</string>
-    <string>--download-dir</string>
-    <string>/Users/aodhan/models/whisper</string>
   </array>
 
   <key>RunAtLoad</key>