feat: upgrade voice pipeline — MLX Whisper STT (20x faster), Qwen3.5 MoE LLM, fix HA tool calling

- Replace faster-whisper with wyoming-mlx-whisper (whisper-large-v3-turbo, MLX Metal GPU)
  STT latency: 8.4s → 400ms for short voice commands
- Add Qwen3.5-35B-A3B (MoE, 3B active params, Q8_0) to Ollama — 26.7 tok/s vs 5.4 tok/s (70B)
- Add model preload launchd service to pin voice model in VRAM permanently
- Fix HA tool calling: set commands.native=true, symlink ha-ctl to PATH
- Add pipeline benchmark script (STT/LLM/TTS latency profiling)
- Add service restart buttons and STT endpoint to dashboard
- Bind Vite dev server to 0.0.0.0 for LAN access

Total estimated pipeline latency: ~27s → ~4s
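
The tok/s figures above can be spot-checked straight from Ollama's /api/generate
response, which reports eval_count and eval_duration (nanoseconds). A minimal
sketch, assuming only the endpoint and model name used below:

    curl -s http://localhost:11434/api/generate \
      -d '{"model":"qwen3.5:35b-a3b","prompt":"Turn on the kitchen lights","stream":false}' \
      | python3 -c 'import json,sys; r=json.load(sys.stdin); print(round(r["eval_count"]/r["eval_duration"]*1e9,1), "tok/s")'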

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
commit af6b7bd945 (parent 1bfd7fbd08)
Author: Aodhan Collins
Date: 2026-03-13 18:03:12 +00:00
10 changed files with 721 additions and 27 deletions


@@ -0,0 +1,28 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN"
  "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
    <key>Label</key>
    <string>com.homeai.preload-models</string>
    <key>ProgramArguments</key>
    <array>
        <string>/bin/bash</string>
        <string>/Users/aodhan/gitea/homeai/homeai-llm/scripts/preload-models.sh</string>
    </array>
    <key>RunAtLoad</key>
    <true/>
    <key>StandardOutPath</key>
    <string>/tmp/homeai-preload-models.log</string>
    <key>StandardErrorPath</key>
    <string>/tmp/homeai-preload-models-error.log</string>
    <!-- ThrottleInterval does not delay the first launch; it rate-limits
         respawns to once per 15s. Startup ordering relative to Ollama is
         handled by the readiness wait loop inside the script. -->
    <key>ThrottleInterval</key>
    <integer>15</integer>
</dict>
</plist>
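
To install the agent, copy the plist into ~/Library/LaunchAgents and load it
(a minimal sketch; the plist filename is assumed from the Label above):

    cp com.homeai.preload-models.plist ~/Library/LaunchAgents/
    launchctl load ~/Library/LaunchAgents/com.homeai.preload-models.plist
    tail /tmp/homeai-preload-models.log   # confirm the preload script ran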


@@ -0,0 +1,55 @@
FROM /Users/aodhan/gitea/homeai/homeai-llm/modelfiles/lmstudio-community/Qwen3.5-35B-A3B-GGUF/Qwen3.5-35B-A3B-Q8_0.gguf
TEMPLATE """{{- if or .System .Tools }}<|im_start|>system
{{- if .System }}
{{ .System }}
{{- end }}
{{- if .Tools }}
# Tools
You may call one or more functions to assist with the user query.
You are provided with function signatures within <tools></tools> XML tags:
<tools>
{{- range .Tools }}
{"type": "function", "function": {"name": "{{ .Function.Name }}", "description": "{{ .Function.Description }}", "parameters": {{ .Function.Parameters }}}}
{{- end }}
</tools>
For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
<tool_call>
{"name": <function-name>, "arguments": <args-json-object>}
</tool_call>
{{- end }}<|im_end|>
{{- end }}
{{- range $i, $_ := .Messages }}
{{- $last := eq (len (slice $.Messages $i)) 1 }}
{{- if eq .Role "user" }}<|im_start|>user
{{ .Content }}<|im_end|>
{{ else if eq .Role "assistant" }}<|im_start|>assistant
{{- if .ToolCalls }}
{{- range .ToolCalls }}
<tool_call>
{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}
</tool_call>
{{- end }}
{{- else }}{{ .Content }}
{{- end }}{{ if not $last }}<|im_end|>
{{ end }}
{{- else if eq .Role "tool" }}<|im_start|>user
<tool_response>
{{ .Content }}
</tool_response><|im_end|>
{{ end }}
{{- end }}<|im_start|>assistant
"""
SYSTEM You are a helpful AI assistant.
PARAMETER num_ctx 32768
PARAMETER stop <|im_end|>
PARAMETER stop <|endoftext|>
PARAMETER temperature 0.6
PARAMETER top_p 0.95
PARAMETER presence_penalty 1.5
PARAMETER top_k 20
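
Once registered with Ollama, the tool-calling template can be smoke-tested
against /api/chat. A rough sketch: the Modelfile path and the light_on tool
are hypothetical stand-ins for whatever the pipeline actually registers:

    # Assumed Modelfile path; use wherever this file lives in the repo
    ollama create qwen3.5:35b-a3b -f ./Modelfile
    curl -s http://localhost:11434/api/chat -d '{
      "model": "qwen3.5:35b-a3b",
      "messages": [{"role": "user", "content": "Turn on the kitchen lights"}],
      "tools": [{"type": "function", "function": {"name": "light_on",
        "description": "Turn on a named light",
        "parameters": {"type": "object",
          "properties": {"name": {"type": "string"}},
          "required": ["name"]}}}],
      "stream": false
    }'

A working setup should come back with a message.tool_calls entry naming
light_on rather than free-text prose.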


@@ -0,0 +1,19 @@
#!/bin/bash
# Pre-load voice pipeline models into Ollama with infinite keep_alive.
# Run after Ollama starts (called by launchd or manually).
# Only pins lightweight/MoE models — large dense models (70B) use default expiry.

OLLAMA_URL="http://localhost:11434"

# Wait up to 60s for Ollama to be ready
for _ in $(seq 1 30); do
    curl -sf "$OLLAMA_URL/api/tags" > /dev/null 2>&1 && break
    sleep 2
done

# Bail out loudly if Ollama never came up, rather than silently doing nothing
if ! curl -sf "$OLLAMA_URL/api/tags" > /dev/null 2>&1; then
    echo "[preload] Ollama not reachable after 60s, giving up" >&2
    exit 1
fi

# Pin qwen3.5:35b-a3b (MoE, 38.7GB VRAM, voice pipeline default).
# num_ctx matches the Modelfile's 32768: Ollama reloads a model when the
# requested num_ctx changes, so preloading with a smaller context would be
# undone by the first real request.
echo "[preload] Loading qwen3.5:35b-a3b with keep_alive=-1..."
curl -sf "$OLLAMA_URL/api/generate" \
    -d '{"model":"qwen3.5:35b-a3b","prompt":"ready","stream":false,"keep_alive":-1,"options":{"num_ctx":32768}}' \
    > /dev/null 2>&1
echo "[preload] qwen3.5:35b-a3b pinned in memory"