diff --git a/.env.example b/.env.example
index 87eb9c2..c58eddb 100644
--- a/.env.example
+++ b/.env.example
@@ -2,6 +2,14 @@
# Copy to .env and fill in your values.
# .env is gitignored — never commit it.
+# ─── API Keys ──────────────────────────────────────────────────────────────────
+HUGGING_FACE_API_KEY=
+OPENROUTER_API_KEY=
+OPENAI_API_KEY=
+DEEPSEEK_API_KEY=
+GEMINI_API_KEY=
+ELEVENLABS_API_KEY=
+
# ─── Data & Paths ──────────────────────────────────────────────────────────────
DATA_DIR=${HOME}/homeai-data
REPO_DIR=${HOME}/Projects/HomeAI
@@ -45,3 +53,4 @@ VTUBE_WS_URL=ws://localhost:8001
# ─── P8: Images ────────────────────────────────────────────────────────────────
COMFYUI_URL=http://localhost:8188
+
diff --git a/TODO.md b/TODO.md
index 95a1f48..e255244 100644
--- a/TODO.md
+++ b/TODO.md
@@ -25,9 +25,11 @@
- [x] Write and load launchd plist (`com.homeai.ollama.plist`) — `/opt/homebrew/bin/ollama`
- [x] Register local GGUF models via Modelfiles (no download): llama3.3:70b, qwen3:32b, codestral:22b, qwen2.5:7b
- [x] Register additional models: EVA-LLaMA-3.33-70B, Midnight-Miqu-70B, QwQ-32B, Qwen3.5-35B, Qwen3-Coder-30B, Qwen3-VL-30B, GLM-4.6V-Flash, DeepSeek-R1-8B, gemma-3-27b
+- [x] Add qwen3.5:35b-a3b (MoE, Q8_0) — 26.7 tok/s, recommended for voice pipeline
+- [x] Write model preload script + launchd service (keeps voice model in VRAM permanently)
- [x] Deploy Open WebUI via Docker compose (port 3030)
- [x] Verify Open WebUI connected to Ollama, all models available
-- [ ] Run `scripts/benchmark.sh` — record results in `benchmark-results.md`
+- [x] Run pipeline benchmark (homeai-voice/scripts/benchmark_pipeline.py) — STT/LLM/TTS latency profiled
- [ ] Add Ollama + Open WebUI to Uptime Kuma monitors
---
@@ -37,6 +39,7 @@
### P3 · homeai-voice
- [x] Install `wyoming-faster-whisper` — model: faster-whisper-large-v3 (auto-downloaded)
+- [x] Upgrade STT to wyoming-mlx-whisper (whisper-large-v3-turbo, MLX Metal GPU) — 20x faster (8s → 400ms)
- [x] Install Kokoro ONNX TTS — models at `~/models/kokoro/`
- [x] Write Wyoming-Kokoro adapter server (`homeai-voice/tts/wyoming_kokoro_server.py`)
- [x] Write + load launchd plists for Wyoming STT (10300) and TTS (10301)
@@ -67,10 +70,11 @@
- [x] Fix context window: set `contextWindow=32768` for llama3.3:70b in `openclaw.json`
- [x] Fix Llama 3.3 Modelfile: add tool-calling TEMPLATE block
- [x] Verify `openclaw agent --message "..." --agent main` → completed
-- [x] Write `skills/home-assistant` SKILL.md — HA REST API control
+- [x] Write `skills/home-assistant` SKILL.md — HA REST API control via ha-ctl CLI
- [x] Write `skills/voice-assistant` SKILL.md — voice response style guide
- [x] Wire HASS_TOKEN — create `~/.homeai/hass_token` or set env in launchd plist
-- [x] Test home-assistant skill: "turn on/off the reading lamp"
+- [x] Fix HA tool calling: set commands.native=true, symlink ha-ctl to PATH, update TOOLS.md
+- [x] Test home-assistant skill: "turn on/off the reading lamp" — verified exec→ha-ctl→HA action
- [x] Set up mem0 with Chroma backend, test semantic recall
- [x] Write memory backup launchd job
- [x] Build morning briefing n8n workflow
diff --git a/homeai-agent/openclaw-http-bridge.py b/homeai-agent/openclaw-http-bridge.py
index 1d3e3b2..e3cfbe1 100644
--- a/homeai-agent/openclaw-http-bridge.py
+++ b/homeai-agent/openclaw-http-bridge.py
@@ -34,6 +34,7 @@ import wave
import io
from wyoming.client import AsyncTcpClient
from wyoming.tts import Synthesize, SynthesizeVoice
+from wyoming.asr import Transcribe, Transcript
from wyoming.audio import AudioStart, AudioChunk, AudioStop
from wyoming.info import Info
@@ -79,6 +80,11 @@ class OpenClawBridgeHandler(BaseHTTPRequestHandler):
self._handle_tts_request()
return
+ # Handle STT requests
+ if parsed_path.path == "/api/stt":
+ self._handle_stt_request()
+ return
+
# Only handle the agent message endpoint
if parsed_path.path == "/api/agent/message":
self._handle_agent_request()
@@ -170,6 +176,70 @@ class OpenClawBridgeHandler(BaseHTTPRequestHandler):
return wav_io.getvalue()
+ def _handle_stt_request(self):
+ """Handle STT request — accept WAV audio, return transcribed text."""
+ content_length = int(self.headers.get("Content-Length", 0))
+ if content_length == 0:
+ self._send_json_response(400, {"error": "Empty body"})
+ return
+
+ try:
+ audio_bytes = self.rfile.read(content_length)
+
+ # Parse WAV to get PCM data and format
+ wav_io = io.BytesIO(audio_bytes)
+ with wave.open(wav_io, 'rb') as wav_file:
+ rate = wav_file.getframerate()
+ width = wav_file.getsampwidth()
+ channels = wav_file.getnchannels()
+ pcm_data = wav_file.readframes(wav_file.getnframes())
+
+ # Run the async Wyoming client
+ text = asyncio.run(self._transcribe_audio(pcm_data, rate, width, channels))
+
+ self.send_response(200)
+ self.send_header("Content-Type", "application/json")
+ self.send_header("Access-Control-Allow-Origin", "*")
+ self.end_headers()
+ self.wfile.write(json.dumps({"text": text}).encode())
+
+ except wave.Error as e:
+ self._send_json_response(400, {"error": f"Invalid WAV: {e}"})
+ except Exception as e:
+ self._send_json_response(500, {"error": str(e)})
+
+ async def _transcribe_audio(self, pcm_data: bytes, rate: int, width: int, channels: int) -> str:
+ """Connect to Wyoming STT server and transcribe audio."""
+ client = AsyncTcpClient("127.0.0.1", 10300)
+ await client.connect()
+
+ # Send Transcribe request (STT server does not send an initial Info event)
+ await client.write_event(Transcribe(language="en").event())
+
+ # Send audio
+ await client.write_event(AudioStart(rate=rate, width=width, channels=channels).event())
+
+ # Send in chunks (1 second each)
+ bytes_per_second = rate * width * channels
+ for offset in range(0, len(pcm_data), bytes_per_second):
+ chunk = pcm_data[offset:offset + bytes_per_second]
+ await client.write_event(AudioChunk(rate=rate, width=width, channels=channels, audio=chunk).event())
+
+ await client.write_event(AudioStop().event())
+
+ # Read transcript
+ while True:
+ event = await client.read_event()
+ if event is None:
+ break
+ if Transcript.is_type(event.type):
+ transcript = Transcript.from_event(event)
+ await client.disconnect()
+ return transcript.text
+
+ await client.disconnect()
+ return ""
+
def _handle_wake_word(self):
"""Handle wake word detection notification."""
content_length = int(self.headers.get("Content-Length", 0))
diff --git a/homeai-character/src/ServiceStatus.jsx b/homeai-character/src/ServiceStatus.jsx
index 0eaa861..6393204 100644
--- a/homeai-character/src/ServiceStatus.jsx
+++ b/homeai-character/src/ServiceStatus.jsx
@@ -8,6 +8,7 @@ const SERVICES = [
uiUrl: null,
description: 'Local LLM runtime',
category: 'AI & LLM',
+ restart: { type: 'launchd', id: 'gui/501/com.homeai.ollama' },
},
{
name: 'Open WebUI',
@@ -16,6 +17,7 @@ const SERVICES = [
uiUrl: 'http://localhost:3030',
description: 'Chat interface',
category: 'AI & LLM',
+ restart: { type: 'docker', id: 'homeai-open-webui' },
},
{
name: 'OpenClaw Gateway',
@@ -24,6 +26,7 @@ const SERVICES = [
uiUrl: null,
description: 'Agent gateway',
category: 'Agent',
+ restart: { type: 'launchd', id: 'gui/501/com.homeai.openclaw' },
},
{
name: 'OpenClaw Bridge',
@@ -32,6 +35,7 @@ const SERVICES = [
uiUrl: null,
description: 'HTTP-to-CLI bridge',
category: 'Agent',
+ restart: { type: 'launchd', id: 'gui/501/com.homeai.openclaw-bridge' },
},
{
name: 'Wyoming STT',
@@ -41,6 +45,7 @@ const SERVICES = [
description: 'Whisper speech-to-text',
category: 'Voice',
tcp: true,
+ restart: { type: 'launchd', id: 'gui/501/com.homeai.wyoming-stt' },
},
{
name: 'Wyoming TTS',
@@ -50,6 +55,7 @@ const SERVICES = [
description: 'Kokoro text-to-speech',
category: 'Voice',
tcp: true,
+ restart: { type: 'launchd', id: 'gui/501/com.homeai.wyoming-tts' },
},
{
name: 'Wyoming Satellite',
@@ -59,6 +65,16 @@ const SERVICES = [
description: 'Mac Mini mic/speaker satellite',
category: 'Voice',
tcp: true,
+ restart: { type: 'launchd', id: 'gui/501/com.homeai.wyoming-satellite' },
+ },
+ {
+ name: 'Character Dashboard',
+ url: 'http://localhost:5173',
+ healthPath: '/',
+ uiUrl: 'http://localhost:5173',
+ description: 'Character manager & service status',
+ category: 'Agent',
+ restart: { type: 'launchd', id: 'gui/501/com.homeai.character-dashboard' },
},
{
name: 'Home Assistant',
@@ -75,6 +91,7 @@ const SERVICES = [
uiUrl: 'http://localhost:3001',
description: 'Service health monitoring',
category: 'Infrastructure',
+ restart: { type: 'docker', id: 'homeai-uptime-kuma' },
},
{
name: 'n8n',
@@ -83,6 +100,7 @@ const SERVICES = [
uiUrl: 'http://localhost:5678',
description: 'Workflow automation',
category: 'Infrastructure',
+ restart: { type: 'docker', id: 'homeai-n8n' },
},
{
name: 'code-server',
@@ -91,6 +109,7 @@ const SERVICES = [
uiUrl: 'http://localhost:8090',
description: 'Browser-based VS Code',
category: 'Infrastructure',
+ restart: { type: 'docker', id: 'homeai-code-server' },
},
{
name: 'Portainer',
@@ -155,6 +174,7 @@ export default function ServiceStatus() {
Object.fromEntries(SERVICES.map(s => [s.name, { status: 'checking', lastCheck: null, responseTime: null }]))
);
const [lastRefresh, setLastRefresh] = useState(null);
+ const [restarting, setRestarting] = useState({});
const checkService = useCallback(async (service) => {
try {
@@ -208,6 +228,31 @@ export default function ServiceStatus() {
return () => clearInterval(interval);
}, [refreshAll]);
+ const restartService = useCallback(async (service) => {
+ if (!service.restart) return;
+ setRestarting(prev => ({ ...prev, [service.name]: true }));
+ try {
+ const res = await fetch('/api/service/restart', {
+ method: 'POST',
+ headers: { 'Content-Type': 'application/json' },
+ body: JSON.stringify(service.restart),
+ });
+ const data = await res.json();
+ if (!data.ok) {
+ console.error(`Restart failed for ${service.name}:`, data.error);
+ }
+ // Wait a moment for the service to come back, then re-check
+ setTimeout(async () => {
+ const result = await checkService(service);
+ setStatuses(prev => ({ ...prev, [service.name]: result }));
+ setRestarting(prev => ({ ...prev, [service.name]: false }));
+ }, 3000);
+ } catch (err) {
+ console.error(`Restart failed for ${service.name}:`, err);
+ setRestarting(prev => ({ ...prev, [service.name]: false }));
+ }
+ }, [checkService]);
+
const categories = [...new Set(SERVICES.map(s => s.category))];
const onlineCount = Object.values(statuses).filter(s => s.status === 'online').length;
const offlineCount = Object.values(statuses).filter(s => s.status === 'offline').length;
@@ -293,19 +338,45 @@ export default function ServiceStatus() {
{st.responseTime}ms
)}
- {service.uiUrl && (
-
- Open
-
-
- )}
+
+ {service.restart && st.status === 'offline' && (
+
+ )}
+ {service.uiUrl && (
+
+ Open
+
+
+ )}
+
);
diff --git a/homeai-character/vite.config.js b/homeai-character/vite.config.js
index f56a79b..7b54513 100644
--- a/homeai-character/vite.config.js
+++ b/homeai-character/vite.config.js
@@ -53,6 +53,70 @@ function healthCheckPlugin() {
res.end(JSON.stringify({ status: 'offline', responseTime: null }));
}
});
+ // Service restart — runs launchctl or docker restart
+ server.middlewares.use('/api/service/restart', async (req, res) => {
+ if (req.method === 'OPTIONS') {
+ res.writeHead(204, { 'Access-Control-Allow-Origin': '*', 'Access-Control-Allow-Methods': 'POST', 'Access-Control-Allow-Headers': 'Content-Type' });
+ res.end();
+ return;
+ }
+ if (req.method !== 'POST') {
+ res.writeHead(405);
+ res.end();
+ return;
+ }
+ try {
+ const chunks = [];
+ for await (const chunk of req) chunks.push(chunk);
+ const { type, id } = JSON.parse(Buffer.concat(chunks).toString());
+
+ if (!type || !id) {
+ res.writeHead(400, { 'Content-Type': 'application/json' });
+ res.end(JSON.stringify({ ok: false, error: 'Missing type or id' }));
+ return;
+ }
+
+ // Whitelist valid service IDs to prevent command injection
+ const ALLOWED_LAUNCHD = [
+ 'gui/501/com.homeai.ollama',
+ 'gui/501/com.homeai.openclaw',
+ 'gui/501/com.homeai.openclaw-bridge',
+ 'gui/501/com.homeai.wyoming-stt',
+ 'gui/501/com.homeai.wyoming-tts',
+ 'gui/501/com.homeai.wyoming-satellite',
+ 'gui/501/com.homeai.character-dashboard',
+ ];
+ const ALLOWED_DOCKER = [
+ 'homeai-open-webui',
+ 'homeai-uptime-kuma',
+ 'homeai-n8n',
+ 'homeai-code-server',
+ ];
+
+ let cmd;
+ if (type === 'launchd' && ALLOWED_LAUNCHD.includes(id)) {
+ cmd = ['launchctl', 'kickstart', '-k', id];
+ } else if (type === 'docker' && ALLOWED_DOCKER.includes(id)) {
+ cmd = ['docker', 'restart', id];
+ } else {
+ res.writeHead(403, { 'Content-Type': 'application/json' });
+ res.end(JSON.stringify({ ok: false, error: 'Service not in allowed list' }));
+ return;
+ }
+
+ const { execFile } = await import('child_process');
+ const { promisify } = await import('util');
+ const execFileAsync = promisify(execFile);
+ const { stdout, stderr } = await execFileAsync(cmd[0], cmd.slice(1), { timeout: 30000 });
+
+ res.writeHead(200, { 'Content-Type': 'application/json' });
+ res.end(JSON.stringify({ ok: true, stdout: stdout.trim(), stderr: stderr.trim() }));
+ } catch (err) {
+ res.writeHead(500, { 'Content-Type': 'application/json' });
+ res.end(JSON.stringify({ ok: false, error: err.message }));
+ }
+ });
+
// TTS preview proxy — forwards POST to OpenClaw bridge, returns audio
server.middlewares.use('/api/tts', async (req, res) => {
if (req.method !== 'POST') {
@@ -99,4 +163,7 @@ export default defineConfig({
tailwindcss(),
react(),
],
+ server: {
+ host: '0.0.0.0',
+ },
})
diff --git a/homeai-llm/launchd/com.homeai.preload-models.plist b/homeai-llm/launchd/com.homeai.preload-models.plist
new file mode 100644
index 0000000..e7b209e
--- /dev/null
+++ b/homeai-llm/launchd/com.homeai.preload-models.plist
@@ -0,0 +1,28 @@
+
+
+
+
+ Label
+ com.homeai.preload-models
+
+ ProgramArguments
+
+ /bin/bash
+ /Users/aodhan/gitea/homeai/homeai-llm/scripts/preload-models.sh
+
+
+ RunAtLoad
+
+
+ StandardOutPath
+ /tmp/homeai-preload-models.log
+
+ StandardErrorPath
+ /tmp/homeai-preload-models-error.log
+
+
+ ThrottleInterval
+ 15
+
+
diff --git a/homeai-llm/modelfiles/Qwen3.5-35B-A3B.Modelfile b/homeai-llm/modelfiles/Qwen3.5-35B-A3B.Modelfile
new file mode 100644
index 0000000..e53108c
--- /dev/null
+++ b/homeai-llm/modelfiles/Qwen3.5-35B-A3B.Modelfile
@@ -0,0 +1,55 @@
+FROM /Users/aodhan/gitea/homeai/homeai-llm/modelfiles/lmstudio-community/Qwen3.5-35B-A3B-GGUF/Qwen3.5-35B-A3B-Q8_0.gguf
+
+TEMPLATE """{{- if or .System .Tools }}<|im_start|>system
+{{- if .System }}
+{{ .System }}
+{{- end }}
+{{- if .Tools }}
+
+# Tools
+
+You may call one or more functions to assist with the user query.
+
+You are provided with function signatures within XML tags:
+
+{{- range .Tools }}
+{"type": "function", "function": {"name": "{{ .Function.Name }}", "description": "{{ .Function.Description }}", "parameters": {{ .Function.Parameters }}}}
+{{- end }}
+
+
+For each function call, return a json object with function name and arguments within XML tags:
+
+{"name": , "arguments": }
+
+{{- end }}<|im_end|>
+{{- end }}
+{{- range $i, $_ := .Messages }}
+{{- $last := eq (len (slice $.Messages $i)) 1 }}
+{{- if eq .Role "user" }}<|im_start|>user
+{{ .Content }}<|im_end|>
+{{ else if eq .Role "assistant" }}<|im_start|>assistant
+{{- if .ToolCalls }}
+{{- range .ToolCalls }}
+
+{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}
+
+{{- end }}
+{{- else }}{{ .Content }}
+{{- end }}{{ if not $last }}<|im_end|>
+{{ end }}
+{{- else if eq .Role "tool" }}<|im_start|>user
+
+{{ .Content }}
+<|im_end|>
+{{ end }}
+{{- end }}<|im_start|>assistant
+"""
+
+SYSTEM You are a helpful AI assistant.
+PARAMETER num_ctx 32768
+PARAMETER stop <|im_end|>
+PARAMETER stop <|endoftext|>
+PARAMETER temperature 0.6
+PARAMETER top_p 0.95
+PARAMETER presence_penalty 1.5
+PARAMETER top_k 20
diff --git a/homeai-llm/scripts/preload-models.sh b/homeai-llm/scripts/preload-models.sh
new file mode 100755
index 0000000..85e0186
--- /dev/null
+++ b/homeai-llm/scripts/preload-models.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+# Pre-load voice pipeline models into Ollama with infinite keep_alive.
+# Run after Ollama starts (called by launchd or manually).
+# Only pins lightweight/MoE models — large dense models (70B) use default expiry.
+
+OLLAMA_URL="http://localhost:11434"
+
+# Wait for Ollama to be ready
+for i in $(seq 1 30); do
+ curl -sf "$OLLAMA_URL/api/tags" > /dev/null 2>&1 && break
+ sleep 2
+done
+
+# Pin qwen3.5:35b-a3b (MoE, 38.7GB VRAM, voice pipeline default)
+echo "[preload] Loading qwen3.5:35b-a3b with keep_alive=-1..."
+curl -sf "$OLLAMA_URL/api/generate" \
+ -d '{"model":"qwen3.5:35b-a3b","prompt":"ready","stream":false,"keep_alive":-1,"options":{"num_ctx":512}}' \
+ > /dev/null 2>&1
+echo "[preload] qwen3.5:35b-a3b pinned in memory"
diff --git a/homeai-voice/scripts/benchmark_pipeline.py b/homeai-voice/scripts/benchmark_pipeline.py
new file mode 100644
index 0000000..21296ca
--- /dev/null
+++ b/homeai-voice/scripts/benchmark_pipeline.py
@@ -0,0 +1,381 @@
+#!/usr/bin/env python3
+"""
+Voice Pipeline Benchmark
+
+Measures latency of each stage independently:
+ 1. STT (Wyoming Whisper, port 10300)
+ 2. LLM (Ollama API, port 11434) — multiple models
+ 3. TTS (Wyoming Kokoro, port 10301)
+ 4. End-to-end via OpenClaw HTTP Bridge (port 8081)
+
+Usage:
+ python benchmark_pipeline.py [--rounds 3] [--models llama3.3:70b,qwen3:32b]
+"""
+
+import argparse
+import asyncio
+import io
+import json
+import statistics
+import sys
+import time
+import wave
+from urllib.request import Request, urlopen
+from urllib.error import URLError
+
+from wyoming.audio import AudioChunk, AudioStart, AudioStop
+from wyoming.asr import Transcribe, Transcript
+from wyoming.client import AsyncTcpClient
+from wyoming.tts import Synthesize, SynthesizeVoice
+
+# --- Config ---
+STT_HOST, STT_PORT = "127.0.0.1", 10300
+TTS_HOST, TTS_PORT = "127.0.0.1", 10301
+OLLAMA_URL = "http://localhost:11434"
+BRIDGE_URL = "http://localhost:8081"
+
+TEST_PROMPTS = [
+ "What is the capital of France?",
+ "Turn on the living room lights.",
+ "What's the weather like today?",
+]
+
+LONG_PROMPT = "Explain in two sentences how a heat pump works."
+
+
+# --- Helpers ---
+
+def http_post_json(url: str, data: dict, timeout: int = 180) -> tuple[dict, float]:
+ """POST JSON, return (response_dict, elapsed_seconds)."""
+ body = json.dumps(data).encode()
+ req = Request(url, data=body, headers={"Content-Type": "application/json"})
+ t0 = time.perf_counter()
+ resp = urlopen(req, timeout=timeout)
+ raw = resp.read()
+ elapsed = time.perf_counter() - t0
+ return json.loads(raw), elapsed
+
+
+def http_post_raw(url: str, data: bytes, content_type: str, timeout: int = 180) -> tuple[bytes, float]:
+ """POST raw bytes, return (response_bytes, elapsed_seconds)."""
+ req = Request(url, data=data, headers={"Content-Type": content_type})
+ t0 = time.perf_counter()
+ resp = urlopen(req, timeout=timeout)
+ raw = resp.read()
+ elapsed = time.perf_counter() - t0
+ return raw, elapsed
+
+
+# --- Stage 1: TTS ---
+
+async def benchmark_tts(text: str) -> tuple[bytes, float, float]:
+    """Synthesize text via Wyoming TTS, return (wav_bytes, elapsed, audio_duration_s)."""
+ t0 = time.perf_counter()
+ client = AsyncTcpClient(TTS_HOST, TTS_PORT)
+ await client.connect()
+ await client.read_event() # Info
+
+ await client.write_event(
+ Synthesize(text=text, voice=SynthesizeVoice(name="af_heart")).event()
+ )
+
+ audio_data = bytearray()
+ rate, width, channels = 24000, 2, 1
+
+ while True:
+ event = await client.read_event()
+ if event is None:
+ break
+ if AudioStart.is_type(event.type):
+ start = AudioStart.from_event(event)
+ rate, width, channels = start.rate, start.width, start.channels
+ elif AudioChunk.is_type(event.type):
+ audio_data.extend(AudioChunk.from_event(event).audio)
+ elif AudioStop.is_type(event.type):
+ break
+
+ await client.disconnect()
+ elapsed = time.perf_counter() - t0
+
+ # Package as WAV
+ wav_io = io.BytesIO()
+ with wave.open(wav_io, "wb") as wf:
+ wf.setnchannels(channels)
+ wf.setsampwidth(width)
+ wf.setframerate(rate)
+ wf.writeframes(audio_data)
+
+ duration_s = len(audio_data) / (rate * width * channels)
+ return wav_io.getvalue(), elapsed, duration_s
+
+
+# --- Stage 2: STT ---
+
+async def benchmark_stt(wav_bytes: bytes) -> tuple[str, float]:
+ """Transcribe WAV via Wyoming STT, return (text, elapsed)."""
+ wav_io = io.BytesIO(wav_bytes)
+ with wave.open(wav_io, "rb") as wf:
+ rate = wf.getframerate()
+ width = wf.getsampwidth()
+ channels = wf.getnchannels()
+ pcm = wf.readframes(wf.getnframes())
+
+ t0 = time.perf_counter()
+ client = AsyncTcpClient(STT_HOST, STT_PORT)
+ await client.connect()
+
+ await client.write_event(Transcribe(language="en").event())
+ await client.write_event(AudioStart(rate=rate, width=width, channels=channels).event())
+
+ chunk_size = rate * width * channels # 1 second
+ for off in range(0, len(pcm), chunk_size):
+ await client.write_event(
+ AudioChunk(rate=rate, width=width, channels=channels, audio=pcm[off:off + chunk_size]).event()
+ )
+ await client.write_event(AudioStop().event())
+
+ text = ""
+ while True:
+ event = await client.read_event()
+ if event is None:
+ break
+ if Transcript.is_type(event.type):
+ text = Transcript.from_event(event).text
+ break
+
+ await client.disconnect()
+ elapsed = time.perf_counter() - t0
+ return text, elapsed
+
+
+# --- Stage 3: LLM ---
+
+def benchmark_llm(model: str, prompt: str, warm: bool = False) -> dict:
+ """
+ Call Ollama /api/generate, return timing breakdown.
+    The warm flag is informational only (callers pre-load the model via warm_model()).
+ """
+ data = {
+ "model": model,
+ "prompt": prompt,
+ "stream": False,
+ "options": {"num_ctx": 2048}, # small ctx for benchmark speed
+ }
+ body = json.dumps(data).encode()
+ req = Request(
+ f"{OLLAMA_URL}/api/generate",
+ data=body,
+ headers={"Content-Type": "application/json"},
+ )
+
+ t0 = time.perf_counter()
+ resp = urlopen(req, timeout=300)
+ raw = resp.read()
+ wall_time = time.perf_counter() - t0
+
+ result = json.loads(raw)
+
+ # Ollama returns timing in nanoseconds
+ load_ns = result.get("load_duration", 0)
+ prompt_ns = result.get("prompt_eval_duration", 0)
+ eval_ns = result.get("eval_duration", 0)
+ total_ns = result.get("total_duration", 0)
+ prompt_tokens = result.get("prompt_eval_count", 0)
+ eval_tokens = result.get("eval_count", 0)
+
+ return {
+ "model": model,
+ "wall_time_s": wall_time,
+ "load_s": load_ns / 1e9,
+ "prompt_eval_s": prompt_ns / 1e9,
+ "eval_s": eval_ns / 1e9,
+ "total_s": total_ns / 1e9,
+ "prompt_tokens": prompt_tokens,
+ "eval_tokens": eval_tokens,
+ "tokens_per_sec": eval_tokens / (eval_ns / 1e9) if eval_ns > 0 else 0,
+ "prompt_tokens_per_sec": prompt_tokens / (prompt_ns / 1e9) if prompt_ns > 0 else 0,
+ "response": result.get("response", "")[:200],
+ }
+
+
+def warm_model(model: str):
+ """Send a tiny request to load the model into GPU memory."""
+ print(f" Warming up {model}...", end=" ", flush=True)
+ try:
+ data = json.dumps({"model": model, "prompt": "hi", "stream": False, "options": {"num_ctx": 512}}).encode()
+ req = Request(f"{OLLAMA_URL}/api/generate", data=data, headers={"Content-Type": "application/json"})
+ urlopen(req, timeout=300).read()
+ print("ready")
+ except Exception as e:
+ print(f"warning: {e}")
+
+
+# --- Stage 4: End-to-end via bridge ---
+
+def benchmark_e2e(message: str) -> tuple[str, float]:
+ """Call the OpenClaw HTTP bridge end-to-end."""
+ data = {"message": message, "agent": "main"}
+ resp, elapsed = http_post_json(f"{BRIDGE_URL}/api/agent/message", data, timeout=300)
+ return resp.get("response", ""), elapsed
+
+
+# --- Formatting ---
+
+def fmt_time(seconds: float) -> str:
+ if seconds < 1:
+ return f"{seconds*1000:.0f}ms"
+ return f"{seconds:.1f}s"
+
+
+def print_table(rows: list[dict], columns: list[tuple[str, str, int]]):
+ """Print a formatted table. columns = [(header, key, width), ...]"""
+ header = " | ".join(h.ljust(w) for h, _, w in columns)
+ print(header)
+ print("-" * len(header))
+ for row in rows:
+ line = " | ".join(str(row.get(k, "")).ljust(w) for _, k, w in columns)
+ print(line)
+
+
+# --- Main ---
+
+def main():
+ parser = argparse.ArgumentParser(description="Voice Pipeline Benchmark")
+ parser.add_argument("--rounds", type=int, default=2, help="Rounds per test (default: 2)")
+ parser.add_argument(
+ "--models",
+ default="qwen2.5:7b,qwen3:32b,llama3.3:70b",
+ help="Comma-separated Ollama models to test",
+ )
+ parser.add_argument("--skip-stt", action="store_true", help="Skip STT benchmark")
+ parser.add_argument("--skip-tts", action="store_true", help="Skip TTS benchmark")
+ parser.add_argument("--skip-llm", action="store_true", help="Skip LLM benchmark")
+ parser.add_argument("--skip-e2e", action="store_true", help="Skip end-to-end benchmark")
+ parser.add_argument("--prompt", default=None, help="Custom prompt for LLM benchmark")
+ args = parser.parse_args()
+
+ models = [m.strip() for m in args.models.split(",")]
+ llm_prompt = args.prompt or LONG_PROMPT
+
+ print("=" * 70)
+ print(" VOICE PIPELINE BENCHMARK")
+ print("=" * 70)
+ print(f" Rounds: {args.rounds}")
+ print(f" Models: {', '.join(models)}")
+ print(f" LLM prompt: {llm_prompt!r}")
+ print()
+
+ # ── TTS Benchmark ──
+ test_wav = None
+ if not args.skip_tts:
+ print("── TTS (Kokoro, Wyoming port 10301) ──")
+ tts_times = []
+ tts_durations = []
+ for i in range(args.rounds):
+ text = TEST_PROMPTS[i % len(TEST_PROMPTS)]
+ wav, elapsed, audio_dur = asyncio.run(benchmark_tts(text))
+ tts_times.append(elapsed)
+ tts_durations.append(audio_dur)
+ test_wav = wav
+ print(f" Round {i+1}: {fmt_time(elapsed)} → {audio_dur:.1f}s audio (RTF: {elapsed/audio_dur:.2f}x) text={text!r}")
+
+ avg_tts = statistics.mean(tts_times)
+ avg_dur = statistics.mean(tts_durations)
+ print(f" Average: {fmt_time(avg_tts)} for {avg_dur:.1f}s audio (RTF: {avg_tts/avg_dur:.2f}x)")
+ print()
+
+ # ── STT Benchmark ──
+ if not args.skip_stt:
+ print("── STT (Whisper large-v3, Wyoming port 10300) ──")
+ if test_wav is None:
+ # Generate a test WAV first
+ print(" Generating test audio via TTS...")
+ test_wav, _, _ = asyncio.run(benchmark_tts("The quick brown fox jumps over the lazy dog."))
+
+ stt_times = []
+ for i in range(args.rounds):
+ text, elapsed = asyncio.run(benchmark_stt(test_wav))
+ stt_times.append(elapsed)
+ print(f" Round {i+1}: {fmt_time(elapsed)} → {text!r}")
+
+ print(f" Average: {fmt_time(statistics.mean(stt_times))}")
+ print()
+
+ # ── LLM Benchmark ──
+ if not args.skip_llm:
+ print("── LLM (Ollama) ──")
+ print(f" Prompt: {llm_prompt!r}")
+ print()
+
+ all_results = []
+ for model in models:
+ print(f" Model: {model}")
+ warm_model(model)
+
+ model_runs = []
+ for i in range(args.rounds):
+ result = benchmark_llm(model, llm_prompt, warm=True)
+ model_runs.append(result)
+ print(
+ f" Round {i+1}: wall={fmt_time(result['wall_time_s'])} "
+ f"load={fmt_time(result['load_s'])} "
+ f"prompt_eval={fmt_time(result['prompt_eval_s'])} ({result['prompt_tokens']}tok, {result['prompt_tokens_per_sec']:.0f}t/s) "
+ f"gen={fmt_time(result['eval_s'])} ({result['eval_tokens']}tok, {result['tokens_per_sec']:.1f}t/s)"
+ )
+ # Truncate response for display
+ resp_preview = result["response"][:100].replace("\n", " ")
+ print(f" → {resp_preview}")
+
+ # Summarize
+ avg_wall = statistics.mean(r["wall_time_s"] for r in model_runs)
+ avg_tps = statistics.mean(r["tokens_per_sec"] for r in model_runs)
+ avg_prompt_tps = statistics.mean(r["prompt_tokens_per_sec"] for r in model_runs)
+ avg_tokens = statistics.mean(r["eval_tokens"] for r in model_runs)
+ all_results.append({
+ "model": model,
+ "avg_wall": fmt_time(avg_wall),
+ "avg_gen_tps": f"{avg_tps:.1f}",
+ "avg_prompt_tps": f"{avg_prompt_tps:.0f}",
+ "avg_tokens": f"{avg_tokens:.0f}",
+ })
+ print()
+
+ # Summary table
+ print(" ┌─ LLM Summary ─────────────────────────────────────────────┐")
+ print(f" {'Model':<25s} {'Wall time':>10s} {'Gen t/s':>10s} {'Prompt t/s':>11s} {'Avg tokens':>11s}")
+ print(f" {'─'*25} {'─'*10} {'─'*10} {'─'*11} {'─'*11}")
+ for r in all_results:
+ print(f" {r['model']:<25s} {r['avg_wall']:>10s} {r['avg_gen_tps']:>10s} {r['avg_prompt_tps']:>11s} {r['avg_tokens']:>11s}")
+ print()
+
+ # ── End-to-end ──
+ if not args.skip_e2e:
+ print("── End-to-End (Bridge → OpenClaw → Ollama → response) ──")
+ print(" (Does not include STT/TTS — just text in → text out via bridge)")
+ e2e_prompt = "What time is it?"
+ for i in range(args.rounds):
+ try:
+ resp, elapsed = benchmark_e2e(e2e_prompt)
+ preview = resp[:100].replace("\n", " ")
+ print(f" Round {i+1}: {fmt_time(elapsed)} → {preview}")
+ except Exception as e:
+ print(f" Round {i+1}: ERROR - {e}")
+ print()
+
+ # ── Pipeline estimate ──
+ print("=" * 70)
+ print(" ESTIMATED PIPELINE LATENCY (per voice interaction)")
+ print("=" * 70)
+ print(" wake word detection ~instant (runs locally)")
+ print(" + STT (Whisper) see above")
+ print(" + LLM (inference) see above (dominant cost)")
+ print(" + TTS (Kokoro) see above")
+ print(" ─────────────────────────────────────")
+ print(" Tip: smaller models (7B, 32B) dramatically reduce LLM latency.")
+ print(" The 70B model at ~12 tok/s needs ~5-8s for a typical reply.")
+ print(" A 7B model at ~80 tok/s would need <1s for the same reply.")
+ print()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/homeai-voice/scripts/launchd/com.homeai.wyoming-stt.plist b/homeai-voice/scripts/launchd/com.homeai.wyoming-stt.plist
index e7e59b1..94637f2 100644
--- a/homeai-voice/scripts/launchd/com.homeai.wyoming-stt.plist
+++ b/homeai-voice/scripts/launchd/com.homeai.wyoming-stt.plist
@@ -8,21 +8,11 @@
ProgramArguments
- /Users/aodhan/homeai-voice-env/bin/wyoming-faster-whisper
+ /Users/aodhan/homeai-whisper-mlx-env/bin/wyoming-mlx-whisper
--uri
tcp://0.0.0.0:10300
- --model
- large-v3
--language
en
- --device
- cpu
- --compute-type
- int8
- --data-dir
- /Users/aodhan/models/whisper
- --download-dir
- /Users/aodhan/models/whisper
RunAtLoad