From af6b7bd94508bfd10d65450957584e99e3b37a77 Mon Sep 17 00:00:00 2001 From: Aodhan Collins Date: Fri, 13 Mar 2026 18:03:12 +0000 Subject: [PATCH] =?UTF-8?q?feat:=20upgrade=20voice=20pipeline=20=E2=80=94?= =?UTF-8?q?=20MLX=20Whisper=20STT=20(20x=20faster),=20Qwen3.5=20MoE=20LLM,?= =?UTF-8?q?=20fix=20HA=20tool=20calling?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace faster-whisper with wyoming-mlx-whisper (whisper-large-v3-turbo, MLX Metal GPU) STT latency: 8.4s → 400ms for short voice commands - Add Qwen3.5-35B-A3B (MoE, 3B active params, Q8_0) to Ollama — 26.7 tok/s vs 5.4 tok/s (70B) - Add model preload launchd service to pin voice model in VRAM permanently - Fix HA tool calling: set commands.native=true, symlink ha-ctl to PATH - Add pipeline benchmark script (STT/LLM/TTS latency profiling) - Add service restart buttons and STT endpoint to dashboard - Bind Vite dev server to 0.0.0.0 for LAN access Total estimated pipeline latency: ~27s → ~4s Co-Authored-By: Claude Opus 4.6 --- .env.example | 9 + TODO.md | 10 +- homeai-agent/openclaw-http-bridge.py | 70 ++++ homeai-character/src/ServiceStatus.jsx | 97 ++++- homeai-character/vite.config.js | 67 +++ .../launchd/com.homeai.preload-models.plist | 28 ++ .../modelfiles/Qwen3.5-35B-A3B.Modelfile | 55 +++ homeai-llm/scripts/preload-models.sh | 19 + homeai-voice/scripts/benchmark_pipeline.py | 381 ++++++++++++++++++ .../launchd/com.homeai.wyoming-stt.plist | 12 +- 10 files changed, 721 insertions(+), 27 deletions(-) create mode 100644 homeai-llm/launchd/com.homeai.preload-models.plist create mode 100644 homeai-llm/modelfiles/Qwen3.5-35B-A3B.Modelfile create mode 100755 homeai-llm/scripts/preload-models.sh create mode 100644 homeai-voice/scripts/benchmark_pipeline.py diff --git a/.env.example b/.env.example index 87eb9c2..c58eddb 100644 --- a/.env.example +++ b/.env.example @@ -2,6 +2,14 @@ # Copy to .env and fill in your values. # .env is gitignored — never commit it. 
+# ─── API Keys ────────────────────────────────────────────────────────────────── +HUGGING_FACE_API_KEY= +OPENROUTER_API_KEY= +OPENAI_API_KEY= +DEEPSEEK_API_KEY= +GEMINI_API_KEY= +ELEVENLABS_API_KEY= + # ─── Data & Paths ────────────────────────────────────────────────────────────── DATA_DIR=${HOME}/homeai-data REPO_DIR=${HOME}/Projects/HomeAI @@ -45,3 +53,4 @@ VTUBE_WS_URL=ws://localhost:8001 # ─── P8: Images ──────────────────────────────────────────────────────────────── COMFYUI_URL=http://localhost:8188 + diff --git a/TODO.md b/TODO.md index 95a1f48..e255244 100644 --- a/TODO.md +++ b/TODO.md @@ -25,9 +25,11 @@ - [x] Write and load launchd plist (`com.homeai.ollama.plist`) — `/opt/homebrew/bin/ollama` - [x] Register local GGUF models via Modelfiles (no download): llama3.3:70b, qwen3:32b, codestral:22b, qwen2.5:7b - [x] Register additional models: EVA-LLaMA-3.33-70B, Midnight-Miqu-70B, QwQ-32B, Qwen3.5-35B, Qwen3-Coder-30B, Qwen3-VL-30B, GLM-4.6V-Flash, DeepSeek-R1-8B, gemma-3-27b +- [x] Add qwen3.5:35b-a3b (MoE, Q8_0) — 26.7 tok/s, recommended for voice pipeline +- [x] Write model preload script + launchd service (keeps voice model in VRAM permanently) - [x] Deploy Open WebUI via Docker compose (port 3030) - [x] Verify Open WebUI connected to Ollama, all models available -- [ ] Run `scripts/benchmark.sh` — record results in `benchmark-results.md` +- [x] Run pipeline benchmark (homeai-voice/scripts/benchmark_pipeline.py) — STT/LLM/TTS latency profiled - [ ] Add Ollama + Open WebUI to Uptime Kuma monitors --- @@ -37,6 +39,7 @@ ### P3 · homeai-voice - [x] Install `wyoming-faster-whisper` — model: faster-whisper-large-v3 (auto-downloaded) +- [x] Upgrade STT to wyoming-mlx-whisper (whisper-large-v3-turbo, MLX Metal GPU) — 20x faster (8s → 400ms) - [x] Install Kokoro ONNX TTS — models at `~/models/kokoro/` - [x] Write Wyoming-Kokoro adapter server (`homeai-voice/tts/wyoming_kokoro_server.py`) - [x] Write + load launchd plists for Wyoming STT (10300) and TTS (10301) 
@@ -67,10 +70,11 @@ - [x] Fix context window: set `contextWindow=32768` for llama3.3:70b in `openclaw.json` - [x] Fix Llama 3.3 Modelfile: add tool-calling TEMPLATE block - [x] Verify `openclaw agent --message "..." --agent main` → completed -- [x] Write `skills/home-assistant` SKILL.md — HA REST API control +- [x] Write `skills/home-assistant` SKILL.md — HA REST API control via ha-ctl CLI - [x] Write `skills/voice-assistant` SKILL.md — voice response style guide - [x] Wire HASS_TOKEN — create `~/.homeai/hass_token` or set env in launchd plist -- [x] Test home-assistant skill: "turn on/off the reading lamp" +- [x] Fix HA tool calling: set commands.native=true, symlink ha-ctl to PATH, update TOOLS.md +- [x] Test home-assistant skill: "turn on/off the reading lamp" — verified exec→ha-ctl→HA action - [x] Set up mem0 with Chroma backend, test semantic recall - [x] Write memory backup launchd job - [x] Build morning briefing n8n workflow diff --git a/homeai-agent/openclaw-http-bridge.py b/homeai-agent/openclaw-http-bridge.py index 1d3e3b2..e3cfbe1 100644 --- a/homeai-agent/openclaw-http-bridge.py +++ b/homeai-agent/openclaw-http-bridge.py @@ -34,6 +34,7 @@ import wave import io from wyoming.client import AsyncTcpClient from wyoming.tts import Synthesize, SynthesizeVoice +from wyoming.asr import Transcribe, Transcript from wyoming.audio import AudioStart, AudioChunk, AudioStop from wyoming.info import Info @@ -79,6 +80,11 @@ class OpenClawBridgeHandler(BaseHTTPRequestHandler): self._handle_tts_request() return + # Handle STT requests + if parsed_path.path == "/api/stt": + self._handle_stt_request() + return + # Only handle the agent message endpoint if parsed_path.path == "/api/agent/message": self._handle_agent_request() @@ -170,6 +176,70 @@ class OpenClawBridgeHandler(BaseHTTPRequestHandler): return wav_io.getvalue()
+    def _handle_stt_request(self):
+        """Handle POST /api/stt: body is a WAV file, reply is JSON {"text": <transcript>}."""
+        content_length = int(self.headers.get("Content-Length", 0))
+        if content_length <= 0:  # a negative length must never reach rfile.read()
+            self._send_json_response(400, {"error": "Empty body"})
+            return
+
+        try:
+            audio_bytes = self.rfile.read(content_length)
+
+            # Parse WAV to get PCM data and format
+            wav_io = io.BytesIO(audio_bytes)
+            with wave.open(wav_io, 'rb') as wav_file:
+                rate = wav_file.getframerate()
+                width = wav_file.getsampwidth()
+                channels = wav_file.getnchannels()
+                pcm_data = wav_file.readframes(wav_file.getnframes())
+
+            # Run the async Wyoming client (one fresh event loop per request)
+            text = asyncio.run(self._transcribe_audio(pcm_data, rate, width, channels))
+
+            self.send_response(200)
+            self.send_header("Content-Type", "application/json")
+            self.send_header("Access-Control-Allow-Origin", "*")
+            self.end_headers()
+            self.wfile.write(json.dumps({"text": text}).encode())
+
+        except (wave.Error, EOFError) as e:  # EOFError: truncated/empty WAV is a client error too
+            self._send_json_response(400, {"error": f"Invalid WAV: {e}"})
+        except Exception as e:
+            self._send_json_response(500, {"error": str(e)})
+
+    async def _transcribe_audio(self, pcm_data: bytes, rate: int, width: int, channels: int) -> str:
+        """Send raw PCM to the Wyoming STT server on 127.0.0.1:10300 and return the transcript.
+
+        Returns "" if the server closes the connection without sending a
+        Transcript event. Connection and protocol errors propagate to the caller.
+        """
+        # `async with` connects on entry and always disconnects on exit, even on error.
+        async with AsyncTcpClient("127.0.0.1", 10300) as client:
+            # Send Transcribe request (STT server does not send an initial Info event)
+            await client.write_event(Transcribe(language="en").event())
+
+            # Send audio
+            await client.write_event(AudioStart(rate=rate, width=width, channels=channels).event())
+
+            # Send in chunks (1 second each)
+            bytes_per_second = rate * width * channels
+            for offset in range(0, len(pcm_data), bytes_per_second):
+                chunk = pcm_data[offset:offset + bytes_per_second]
+                await client.write_event(AudioChunk(rate=rate, width=width, channels=channels, audio=chunk).event())
+
+            await client.write_event(AudioStop().event())
+
+            # Read transcript
+            while True:
+                event = await client.read_event()
+                if event is None:
+                    break
+                if Transcript.is_type(event.type):
+                    return Transcript.from_event(event).text
+
+        return ""
+
 def _handle_wake_word(self): """Handle wake word detection notification.""" content_length = int(self.headers.get("Content-Length", 0)) diff --git a/homeai-character/src/ServiceStatus.jsx b/homeai-character/src/ServiceStatus.jsx index 0eaa861..6393204 100644 --- a/homeai-character/src/ServiceStatus.jsx +++ b/homeai-character/src/ServiceStatus.jsx @@ -8,6 +8,7 @@ const SERVICES = [ uiUrl: null, description: 'Local LLM runtime', category: 'AI & LLM', + restart: { type: 'launchd', id: 'gui/501/com.homeai.ollama' }, }, { name: 'Open WebUI', @@ -16,6 +17,7 @@ const SERVICES = [ uiUrl: 'http://localhost:3030', description: 'Chat interface', category: 'AI & LLM', + restart: { type: 'docker', id: 'homeai-open-webui' }, }, { name: 'OpenClaw Gateway', @@ -24,6 +26,7 @@ const SERVICES = [ uiUrl: null, description: 'Agent gateway', category: 'Agent', + restart: { type: 'launchd', id: 'gui/501/com.homeai.openclaw' }, }, { name: 'OpenClaw Bridge', @@ -32,6 +35,7 @@ const SERVICES = [ uiUrl: null, description: 'HTTP-to-CLI bridge', category: 'Agent', + restart: { type: 'launchd', id: 'gui/501/com.homeai.openclaw-bridge' }, }, { name: 'Wyoming STT', @@ -41,6 +45,7 @@ const SERVICES = [ description: 'Whisper speech-to-text', category: 'Voice', tcp: true, + restart: { type: 'launchd', id: 'gui/501/com.homeai.wyoming-stt' }, }, { name: 'Wyoming TTS', @@ -50,6 +55,7 @@ const SERVICES = [ description: 'Kokoro text-to-speech', category: 'Voice', tcp: true, + restart: { type: 'launchd', id: 'gui/501/com.homeai.wyoming-tts' }, }, { name: 'Wyoming Satellite', @@ -59,6 +65,16 @@ const SERVICES = [ description: 'Mac Mini mic/speaker satellite', category: 'Voice', tcp: true, + restart: { type: 'launchd', id: 'gui/501/com.homeai.wyoming-satellite' }, + }, + { + name: 'Character Dashboard', + url: 'http://localhost:5173', + healthPath: '/', + uiUrl: 'http://localhost:5173', + description: 'Character manager
& service status', + category: 'Agent', + restart: { type: 'launchd', id: 'gui/501/com.homeai.character-dashboard' }, }, { name: 'Home Assistant', @@ -75,6 +91,7 @@ const SERVICES = [ uiUrl: 'http://localhost:3001', description: 'Service health monitoring', category: 'Infrastructure', + restart: { type: 'docker', id: 'homeai-uptime-kuma' }, }, { name: 'n8n', @@ -83,6 +100,7 @@ const SERVICES = [ uiUrl: 'http://localhost:5678', description: 'Workflow automation', category: 'Infrastructure', + restart: { type: 'docker', id: 'homeai-n8n' }, }, { name: 'code-server', @@ -91,6 +109,7 @@ const SERVICES = [ uiUrl: 'http://localhost:8090', description: 'Browser-based VS Code', category: 'Infrastructure', + restart: { type: 'docker', id: 'homeai-code-server' }, }, { name: 'Portainer', @@ -155,6 +174,7 @@ export default function ServiceStatus() { Object.fromEntries(SERVICES.map(s => [s.name, { status: 'checking', lastCheck: null, responseTime: null }])) ); const [lastRefresh, setLastRefresh] = useState(null); + const [restarting, setRestarting] = useState({}); const checkService = useCallback(async (service) => { try { @@ -208,6 +228,31 @@ export default function ServiceStatus() { return () => clearInterval(interval); }, [refreshAll]); + const restartService = useCallback(async (service) => { + if (!service.restart) return; + setRestarting(prev => ({ ...prev, [service.name]: true })); + try { + const res = await fetch('/api/service/restart', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify(service.restart), + }); + const data = await res.json(); + if (!data.ok) { + console.error(`Restart failed for ${service.name}:`, data.error); + } + // Wait a moment for the service to come back, then re-check + setTimeout(async () => { + const result = await checkService(service); + setStatuses(prev => ({ ...prev, [service.name]: result })); + setRestarting(prev => ({ ...prev, [service.name]: false })); + }, 3000); + } catch (err) { + 
console.error(`Restart failed for ${service.name}:`, err); + setRestarting(prev => ({ ...prev, [service.name]: false })); + } + }, [checkService]); + const categories = [...new Set(SERVICES.map(s => s.category))]; const onlineCount = Object.values(statuses).filter(s => s.status === 'online').length; const offlineCount = Object.values(statuses).filter(s => s.status === 'offline').length; @@ -293,19 +338,45 @@ export default function ServiceStatus() {

{st.responseTime}ms

)} - {service.uiUrl && ( - - Open - - - - - )} +
+ {service.restart && st.status === 'offline' && ( + + )} + {service.uiUrl && ( + + Open + + + + + )} +
); diff --git a/homeai-character/vite.config.js b/homeai-character/vite.config.js index f56a79b..7b54513 100644 --- a/homeai-character/vite.config.js +++ b/homeai-character/vite.config.js @@ -53,6 +53,70 @@ function healthCheckPlugin() { res.end(JSON.stringify({ status: 'offline', responseTime: null })); } }); + // Service restart — runs launchctl or docker restart + server.middlewares.use('/api/service/restart', async (req, res) => { + if (req.method === 'OPTIONS') { + res.writeHead(204, { 'Access-Control-Allow-Origin': '*', 'Access-Control-Allow-Methods': 'POST', 'Access-Control-Allow-Headers': 'Content-Type' }); + res.end(); + return; + } + if (req.method !== 'POST') { + res.writeHead(405); + res.end(); + return; + } + try { + const chunks = []; + for await (const chunk of req) chunks.push(chunk); + const { type, id } = JSON.parse(Buffer.concat(chunks).toString()); + + if (!type || !id) { + res.writeHead(400, { 'Content-Type': 'application/json' }); + res.end(JSON.stringify({ ok: false, error: 'Missing type or id' })); + return; + } + + // Whitelist valid service IDs to prevent command injection + const ALLOWED_LAUNCHD = [ + 'gui/501/com.homeai.ollama', + 'gui/501/com.homeai.openclaw', + 'gui/501/com.homeai.openclaw-bridge', + 'gui/501/com.homeai.wyoming-stt', + 'gui/501/com.homeai.wyoming-tts', + 'gui/501/com.homeai.wyoming-satellite', + 'gui/501/com.homeai.character-dashboard', + ]; + const ALLOWED_DOCKER = [ + 'homeai-open-webui', + 'homeai-uptime-kuma', + 'homeai-n8n', + 'homeai-code-server', + ]; + + let cmd; + if (type === 'launchd' && ALLOWED_LAUNCHD.includes(id)) { + cmd = ['launchctl', 'kickstart', '-k', id]; + } else if (type === 'docker' && ALLOWED_DOCKER.includes(id)) { + cmd = ['docker', 'restart', id]; + } else { + res.writeHead(403, { 'Content-Type': 'application/json' }); + res.end(JSON.stringify({ ok: false, error: 'Service not in allowed list' })); + return; + } + + const { execFile } = await import('child_process'); + const { promisify } = 
await import('util'); + const execFileAsync = promisify(execFile); + const { stdout, stderr } = await execFileAsync(cmd[0], cmd.slice(1), { timeout: 30000 }); + + res.writeHead(200, { 'Content-Type': 'application/json' }); + res.end(JSON.stringify({ ok: true, stdout: stdout.trim(), stderr: stderr.trim() })); + } catch (err) { + res.writeHead(500, { 'Content-Type': 'application/json' }); + res.end(JSON.stringify({ ok: false, error: err.message })); + } + }); + // TTS preview proxy — forwards POST to OpenClaw bridge, returns audio server.middlewares.use('/api/tts', async (req, res) => { if (req.method !== 'POST') { @@ -99,4 +163,7 @@ export default defineConfig({ tailwindcss(), react(), ], + server: { + host: '0.0.0.0', + }, }) diff --git a/homeai-llm/launchd/com.homeai.preload-models.plist b/homeai-llm/launchd/com.homeai.preload-models.plist new file mode 100644 index 0000000..e7b209e --- /dev/null +++ b/homeai-llm/launchd/com.homeai.preload-models.plist @@ -0,0 +1,28 @@ + + + + + Label + com.homeai.preload-models + + ProgramArguments + + /bin/bash + /Users/aodhan/gitea/homeai/homeai-llm/scripts/preload-models.sh + + + RunAtLoad + + + StandardOutPath + /tmp/homeai-preload-models.log + + StandardErrorPath + /tmp/homeai-preload-models-error.log + + + ThrottleInterval + 15 + + diff --git a/homeai-llm/modelfiles/Qwen3.5-35B-A3B.Modelfile b/homeai-llm/modelfiles/Qwen3.5-35B-A3B.Modelfile new file mode 100644 index 0000000..e53108c --- /dev/null +++ b/homeai-llm/modelfiles/Qwen3.5-35B-A3B.Modelfile @@ -0,0 +1,55 @@ +FROM /Users/aodhan/gitea/homeai/homeai-llm/modelfiles/lmstudio-community/Qwen3.5-35B-A3B-GGUF/Qwen3.5-35B-A3B-Q8_0.gguf + +TEMPLATE """{{- if or .System .Tools }}<|im_start|>system +{{- if .System }} +{{ .System }} +{{- end }} +{{- if .Tools }} + +# Tools + +You may call one or more functions to assist with the user query. 
+ +You are provided with function signatures within XML tags: + +{{- range .Tools }} +{"type": "function", "function": {"name": "{{ .Function.Name }}", "description": "{{ .Function.Description }}", "parameters": {{ .Function.Parameters }}}} +{{- end }} + + +For each function call, return a json object with function name and arguments within XML tags: + +{"name": , "arguments": } + +{{- end }}<|im_end|> +{{- end }} +{{- range $i, $_ := .Messages }} +{{- $last := eq (len (slice $.Messages $i)) 1 }} +{{- if eq .Role "user" }}<|im_start|>user +{{ .Content }}<|im_end|> +{{ else if eq .Role "assistant" }}<|im_start|>assistant +{{- if .ToolCalls }} +{{- range .ToolCalls }} + +{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}} + +{{- end }} +{{- else }}{{ .Content }} +{{- end }}{{ if not $last }}<|im_end|> +{{ end }} +{{- else if eq .Role "tool" }}<|im_start|>user + +{{ .Content }} +<|im_end|> +{{ end }} +{{- end }}<|im_start|>assistant +""" + +SYSTEM You are a helpful AI assistant. +PARAMETER num_ctx 32768 +PARAMETER stop <|im_end|> +PARAMETER stop <|endoftext|> +PARAMETER temperature 0.6 +PARAMETER top_p 0.95 +PARAMETER presence_penalty 1.5 +PARAMETER top_k 20 diff --git a/homeai-llm/scripts/preload-models.sh b/homeai-llm/scripts/preload-models.sh new file mode 100755 index 0000000..85e0186 --- /dev/null +++ b/homeai-llm/scripts/preload-models.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# Pre-load voice pipeline models into Ollama with infinite keep_alive. +# Run after Ollama starts (called by launchd or manually). +# Only pins lightweight/MoE models — large dense models (70B) use default expiry. + +OLLAMA_URL="http://localhost:11434" + +# Wait for Ollama to be ready +for i in $(seq 1 30); do + curl -sf "$OLLAMA_URL/api/tags" > /dev/null 2>&1 && break + sleep 2 +done + +# Pin qwen3.5:35b-a3b (MoE, 38.7GB VRAM, voice pipeline default) +echo "[preload] Loading qwen3.5:35b-a3b with keep_alive=-1..." 
+curl -sf "$OLLAMA_URL/api/generate" \
+  -d '{"model":"qwen3.5:35b-a3b","prompt":"ready","stream":false,"keep_alive":-1,"options":{"num_ctx":512}}' \
+  > /dev/null 2>&1 || { echo "[preload] ERROR: could not load qwen3.5:35b-a3b" >&2; exit 1; }
+echo "[preload] qwen3.5:35b-a3b pinned in memory"
diff --git a/homeai-voice/scripts/benchmark_pipeline.py b/homeai-voice/scripts/benchmark_pipeline.py
new file mode 100644
index 0000000..21296ca
--- /dev/null
+++ b/homeai-voice/scripts/benchmark_pipeline.py
@@ -0,0 +1,381 @@
+#!/usr/bin/env python3
+"""
+Voice Pipeline Benchmark
+
+Measures latency of each stage independently:
+  1. STT (Wyoming Whisper, port 10300)
+  2. LLM (Ollama API, port 11434) — multiple models
+  3. TTS (Wyoming Kokoro, port 10301)
+  4. End-to-end via OpenClaw HTTP Bridge (port 8081)
+
+Usage:
+  python benchmark_pipeline.py [--rounds 3] [--models llama3.3:70b,qwen3:32b]
+"""
+
+import argparse
+import asyncio
+import io
+import json
+import statistics
+import sys
+import time
+import wave
+from urllib.request import Request, urlopen
+from urllib.error import URLError
+
+from wyoming.audio import AudioChunk, AudioStart, AudioStop
+from wyoming.asr import Transcribe, Transcript
+from wyoming.client import AsyncTcpClient
+from wyoming.tts import Synthesize, SynthesizeVoice
+
+# --- Config ---
+STT_HOST, STT_PORT = "127.0.0.1", 10300
+TTS_HOST, TTS_PORT = "127.0.0.1", 10301
+OLLAMA_URL = "http://localhost:11434"
+BRIDGE_URL = "http://localhost:8081"
+
+TEST_PROMPTS = [
+    "What is the capital of France?",
+    "Turn on the living room lights.",
+    "What's the weather like today?",
+]
+
+LONG_PROMPT = "Explain in two sentences how a heat pump works."
+
+
+# --- Helpers ---
+
+def http_post_json(url: str, data: dict, timeout: int = 180) -> tuple[dict, float]:
+    """POST JSON, return (response_dict, elapsed_seconds)."""
+    body = json.dumps(data).encode()
+    req = Request(url, data=body, headers={"Content-Type": "application/json"})
+    t0 = time.perf_counter()
+    with urlopen(req, timeout=timeout) as resp:  # close the HTTP connection deterministically
+        raw = resp.read()
+    elapsed = time.perf_counter() - t0
+    return json.loads(raw), elapsed
+
+
+def http_post_raw(url: str, data: bytes, content_type: str, timeout: int = 180) -> tuple[bytes, float]:
+    """POST raw bytes, return (response_bytes, elapsed_seconds)."""
+    req = Request(url, data=data, headers={"Content-Type": content_type})
+    t0 = time.perf_counter()
+    with urlopen(req, timeout=timeout) as resp:  # close the HTTP connection deterministically
+        raw = resp.read()
+    elapsed = time.perf_counter() - t0
+    return raw, elapsed
+
+
+# --- Stage 1: TTS ---
+
+async def benchmark_tts(text: str) -> tuple[bytes, float, float]:
+    """Synthesize text via Wyoming TTS, return (wav_bytes, elapsed_seconds, audio_seconds)."""
+    t0 = time.perf_counter()
+    # `async with` connects on entry and always disconnects on exit, so a
+    # failed round cannot leak its socket into the next one.
+    async with AsyncTcpClient(TTS_HOST, TTS_PORT) as client:
+        await client.read_event()  # Info
+
+        await client.write_event(
+            Synthesize(text=text, voice=SynthesizeVoice(name="af_heart")).event()
+        )
+
+        audio_data = bytearray()
+        rate, width, channels = 24000, 2, 1  # fallback format until AudioStart arrives
+
+        while True:
+            event = await client.read_event()
+            if event is None:
+                break
+            if AudioStart.is_type(event.type):
+                start = AudioStart.from_event(event)
+                rate, width, channels = start.rate, start.width, start.channels
+            elif AudioChunk.is_type(event.type):
+                audio_data.extend(AudioChunk.from_event(event).audio)
+            elif AudioStop.is_type(event.type):
+                break
+
+    elapsed = time.perf_counter() - t0
+
+    # Package as WAV
+    wav_io = io.BytesIO()
+    with wave.open(wav_io, "wb") as wf:
+        wf.setnchannels(channels)
+        wf.setsampwidth(width)
+        wf.setframerate(rate)
+        wf.writeframes(audio_data)
+
+    duration_s = len(audio_data) / (rate * width * channels)
+    return wav_io.getvalue(), elapsed, duration_s
+
+
+# --- Stage 2: STT ---
+
+async def benchmark_stt(wav_bytes: bytes) -> tuple[str, float]:
+    """Transcribe WAV via Wyoming STT, return (text, elapsed_seconds)."""
+    wav_io = io.BytesIO(wav_bytes)
+    with wave.open(wav_io, "rb") as wf:
+        rate = wf.getframerate()
+        width = wf.getsampwidth()
+        channels = wf.getnchannels()
+        pcm = wf.readframes(wf.getnframes())
+
+    t0 = time.perf_counter()
+    # No initial Info event is read here, unlike the TTS stage.
+    # `async with` connects on entry and always disconnects on exit.
+    async with AsyncTcpClient(STT_HOST, STT_PORT) as client:
+        await client.write_event(Transcribe(language="en").event())
+        await client.write_event(AudioStart(rate=rate, width=width, channels=channels).event())
+
+        chunk_size = rate * width * channels  # 1 second
+        for off in range(0, len(pcm), chunk_size):
+            await client.write_event(
+                AudioChunk(rate=rate, width=width, channels=channels, audio=pcm[off:off + chunk_size]).event()
+            )
+        await client.write_event(AudioStop().event())
+
+        # First Transcript wins; a None event means the server closed the connection.
+        text = ""
+        while True:
+            event = await client.read_event()
+            if event is None:
+                break
+            if Transcript.is_type(event.type):
+                text = Transcript.from_event(event).text
+                break
+
+    elapsed = time.perf_counter() - t0
+    return text, elapsed
+
+
+# --- Stage 3: LLM ---
+
+def benchmark_llm(model: str, prompt: str, warm: bool = False) -> dict:
+    """
+    Call Ollama /api/generate, return timing breakdown.
+    `warm` is informational only (never sent to Ollama); call warm_model() first for warm numbers.
+    """
+    data = {
+        "model": model,
+        "prompt": prompt,
+        "stream": False,
+        "options": {"num_ctx": 2048},  # small ctx for benchmark speed
+    }
+    body = json.dumps(data).encode()
+    req = Request(
+        f"{OLLAMA_URL}/api/generate",
+        data=body,
+        headers={"Content-Type": "application/json"},
+    )
+
+    t0 = time.perf_counter()
+    with urlopen(req, timeout=300) as resp:  # close the HTTP connection deterministically
+        raw = resp.read()
+    wall_time = time.perf_counter() - t0
+
+    result = json.loads(raw)
+
+    # Ollama returns timing in nanoseconds
+    load_ns = result.get("load_duration", 0)
+    prompt_ns = result.get("prompt_eval_duration", 0)
+    eval_ns = result.get("eval_duration", 0)
+    total_ns = result.get("total_duration", 0)
+    prompt_tokens = result.get("prompt_eval_count", 0)
+    eval_tokens = result.get("eval_count", 0)
+
+    return {
+        "model": model,
+        "wall_time_s": wall_time,
+        "load_s": load_ns / 1e9,
+        "prompt_eval_s": prompt_ns / 1e9,
+        "eval_s": eval_ns / 1e9,
+        "total_s": total_ns / 1e9,
+        "prompt_tokens": prompt_tokens,
+        "eval_tokens": eval_tokens,
+        "tokens_per_sec": eval_tokens / (eval_ns / 1e9) if eval_ns > 0 else 0,
+        "prompt_tokens_per_sec": prompt_tokens / (prompt_ns / 1e9) if prompt_ns > 0 else 0,
+        "response": result.get("response", "")[:200],
+    }
+
+
+def warm_model(model: str):
+    """Send a tiny request to load the model into GPU memory."""
+    print(f" Warming up {model}...", end=" ", flush=True)
+    try:
+        data = json.dumps({"model": model, "prompt": "hi", "stream": False, "options": {"num_ctx": 512}}).encode()
+        req = Request(f"{OLLAMA_URL}/api/generate", data=data, headers={"Content-Type": "application/json"})
+        urlopen(req, timeout=300).read()
+        print("ready")
+    except Exception as e:
+        print(f"warning: {e}")
+
+
+# --- Stage 4: End-to-end via bridge ---
+
+def benchmark_e2e(message: str) -> tuple[str, float]:
+    """Call the OpenClaw HTTP bridge end-to-end."""
+    data = {"message": message, "agent": "main"}
+    resp, elapsed = http_post_json(f"{BRIDGE_URL}/api/agent/message", data, timeout=300)
+    return resp.get("response", ""), elapsed
+
+
+# --- Formatting ---
+
+def fmt_time(seconds: float) -> str:
+    """Format a duration: whole milliseconds below one second, else seconds with one decimal."""
+    return f"{seconds*1000:.0f}ms" if seconds < 1 else f"{seconds:.1f}s"
+
+
+def print_table(rows: list[dict], columns: list[tuple[str, str, int]]):
+    """Print a formatted table. columns = [(header, key, width), ...]"""
+    header = " | ".join(h.ljust(w) for h, _, w in columns)
+    print(header)
+    print("-" * len(header))
+    for row in rows:
+        line = " | ".join(str(row.get(k, "")).ljust(w) for _, k, w in columns)
+        print(line)
+
+
+# --- Main ---
+
+def main():
+    """Parse CLI flags, then run the TTS, STT, LLM and end-to-end stages that were not skipped."""
+    parser = argparse.ArgumentParser(description="Voice Pipeline Benchmark")
+    parser.add_argument("--rounds", type=int, default=2, help="Rounds per test (default: 2)")
+    parser.add_argument(
+        "--models",
+        default="qwen2.5:7b,qwen3:32b,llama3.3:70b",
+        help="Comma-separated Ollama models to test",
+    )
+    parser.add_argument("--skip-stt", action="store_true", help="Skip STT benchmark")
+    parser.add_argument("--skip-tts", action="store_true", help="Skip TTS benchmark")
+    parser.add_argument("--skip-llm", action="store_true", help="Skip LLM benchmark")
+    parser.add_argument("--skip-e2e", action="store_true", help="Skip end-to-end benchmark")
+    parser.add_argument("--prompt", default=None, help="Custom prompt for LLM benchmark")
+    args = parser.parse_args()
+
+    models = [m.strip() for m in args.models.split(",")]
+    llm_prompt = args.prompt or LONG_PROMPT
+
+    print("=" * 70)
+    print(" VOICE PIPELINE BENCHMARK")
+    print("=" * 70)
+    print(f" Rounds: {args.rounds}")
+    print(f" Models: {', '.join(models)}")
+    print(f" LLM prompt: {llm_prompt!r}")
+    print()
+
+    # ── TTS Benchmark ──
+    test_wav = None
+    if not args.skip_tts:
+        print("── TTS (Kokoro, Wyoming port 10301) ──")
+        tts_times = []
+        tts_durations = []
+        for i in range(args.rounds):
+            text = TEST_PROMPTS[i % len(TEST_PROMPTS)]
+            wav, elapsed, audio_dur = asyncio.run(benchmark_tts(text))
+            tts_times.append(elapsed)
+            tts_durations.append(audio_dur)
+            test_wav = wav
+            print(f" Round {i+1}: {fmt_time(elapsed)} → {audio_dur:.1f}s audio (RTF: {(elapsed / audio_dur if audio_dur else float('nan')):.2f}x) text={text!r}")
+
+        avg_tts = statistics.mean(tts_times)
+        avg_dur = statistics.mean(tts_durations)
+        print(f" Average: {fmt_time(avg_tts)} for {avg_dur:.1f}s audio (RTF: {(avg_tts / avg_dur if avg_dur else float('nan')):.2f}x)")
+        print()
+
+    # ── STT Benchmark ──
+    if not args.skip_stt:
+        print("── STT (Whisper large-v3-turbo MLX, Wyoming port 10300) ──")
+        if test_wav is None:
+            # Generate a test WAV first
+            print(" Generating test audio via TTS...")
+            test_wav, _, _ = asyncio.run(benchmark_tts("The quick brown fox jumps over the lazy dog."))
+
+        stt_times = []
+        for i in range(args.rounds):
+            text, elapsed = asyncio.run(benchmark_stt(test_wav))
+            stt_times.append(elapsed)
+            print(f" Round {i+1}: {fmt_time(elapsed)} → {text!r}")
+
+        print(f" Average: {fmt_time(statistics.mean(stt_times))}")
+        print()
+
+    # ── LLM Benchmark ──
+    if not args.skip_llm:
+        print("── LLM (Ollama) ──")
+        print(f" Prompt: {llm_prompt!r}")
+        print()
+
+        all_results = []
+        for model in models:
+            print(f" Model: {model}")
+            warm_model(model)
+
+            model_runs = []
+            for i in range(args.rounds):
+                result = benchmark_llm(model, llm_prompt, warm=True)
+                model_runs.append(result)
+                print(
+                    f" Round {i+1}: wall={fmt_time(result['wall_time_s'])} "
+                    f"load={fmt_time(result['load_s'])} "
+                    f"prompt_eval={fmt_time(result['prompt_eval_s'])} ({result['prompt_tokens']}tok, {result['prompt_tokens_per_sec']:.0f}t/s) "
+                    f"gen={fmt_time(result['eval_s'])} ({result['eval_tokens']}tok, {result['tokens_per_sec']:.1f}t/s)"
+                )
+                # Truncate response for display
+                resp_preview = result["response"][:100].replace("\n", " ")
+                print(f" → {resp_preview}")
+
+            # Summarize
+            avg_wall = statistics.mean(r["wall_time_s"] for r in model_runs)
+            avg_tps = statistics.mean(r["tokens_per_sec"] for r in model_runs)
+            avg_prompt_tps = statistics.mean(r["prompt_tokens_per_sec"] for r in model_runs)
+            avg_tokens = statistics.mean(r["eval_tokens"] for r in model_runs)
+            all_results.append({
+                "model": model,
+                "avg_wall": fmt_time(avg_wall),
+                "avg_gen_tps": f"{avg_tps:.1f}",
+                "avg_prompt_tps": f"{avg_prompt_tps:.0f}",
+                "avg_tokens": f"{avg_tokens:.0f}",
+            })
+            print()
+
+        # Summary table
+        print(" ┌─ LLM Summary ─────────────────────────────────────────────┐")
+        print(f" {'Model':<25s} {'Wall time':>10s} {'Gen t/s':>10s} {'Prompt t/s':>11s} {'Avg tokens':>11s}")
+        print(f" {'─'*25} {'─'*10} {'─'*10} {'─'*11} {'─'*11}")
+        for r in all_results:
+            print(f" {r['model']:<25s} {r['avg_wall']:>10s} {r['avg_gen_tps']:>10s} {r['avg_prompt_tps']:>11s} {r['avg_tokens']:>11s}")
+        print()
+
+    # ── End-to-end ──
+    if not args.skip_e2e:
+        print("── End-to-End (Bridge → OpenClaw → Ollama → response) ──")
+        print(" (Does not include STT/TTS — just text in → text out via bridge)")
+        e2e_prompt = "What time is it?"
+        for i in range(args.rounds):
+            try:
+                resp, elapsed = benchmark_e2e(e2e_prompt)
+                preview = resp[:100].replace("\n", " ")
+                print(f" Round {i+1}: {fmt_time(elapsed)} → {preview}")
+            except Exception as e:
+                print(f" Round {i+1}: ERROR - {e}")
+        print()
+
+    # ── Pipeline estimate ──
+    print("=" * 70)
+    print(" ESTIMATED PIPELINE LATENCY (per voice interaction)")
+    print("=" * 70)
+    print(" wake word detection ~instant (runs locally)")
+    print(" + STT (Whisper) see above")
+    print(" + LLM (inference) see above (dominant cost)")
+    print(" + TTS (Kokoro) see above")
+    print(" ─────────────────────────────────────")
+    print(" Tip: smaller models (7B, 32B) dramatically reduce LLM latency.")
+    print(" The 70B model at ~12 tok/s needs ~5-8s for a typical reply.")
+    print(" A 7B model at ~80 tok/s would need <1s for the same reply.")
+    print()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/homeai-voice/scripts/launchd/com.homeai.wyoming-stt.plist b/homeai-voice/scripts/launchd/com.homeai.wyoming-stt.plist
index e7e59b1..94637f2 100644
--- a/homeai-voice/scripts/launchd/com.homeai.wyoming-stt.plist
+++
b/homeai-voice/scripts/launchd/com.homeai.wyoming-stt.plist @@ -8,21 +8,11 @@ ProgramArguments - /Users/aodhan/homeai-voice-env/bin/wyoming-faster-whisper + /Users/aodhan/homeai-whisper-mlx-env/bin/wyoming-mlx-whisper --uri tcp://0.0.0.0:10300 - --model - large-v3 --language en - --device - cpu - --compute-type - int8 - --data-dir - /Users/aodhan/models/whisper - --download-dir - /Users/aodhan/models/whisper RunAtLoad