feat: upgrade voice pipeline — MLX Whisper STT (20x faster), Qwen3.5 MoE LLM, fix HA tool calling
- Replace faster-whisper with wyoming-mlx-whisper (whisper-large-v3-turbo, MLX Metal GPU)
  STT latency: 8.4s → 400ms for short voice commands
- Add Qwen3.5-35B-A3B (MoE, 3B active params, Q8_0) to Ollama — 26.7 tok/s vs 5.4 tok/s (70B)
- Add model preload launchd service to pin voice model in VRAM permanently
- Fix HA tool calling: set commands.native=true, symlink ha-ctl to PATH
- Add pipeline benchmark script (STT/LLM/TTS latency profiling)
- Add service restart buttons and STT endpoint to dashboard
- Bind Vite dev server to 0.0.0.0 for LAN access

Total estimated pipeline latency: ~27s → ~4s

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
28
homeai-llm/launchd/com.homeai.preload-models.plist
Normal file
28
homeai-llm/launchd/com.homeai.preload-models.plist
Normal file
@@ -0,0 +1,28 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN"
 "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
    <!-- launchd job: warm the voice-pipeline model into Ollama's memory.
         Install: launchctl load ~/Library/LaunchAgents/com.homeai.preload-models.plist -->
    <key>Label</key>
    <string>com.homeai.preload-models</string>

    <key>ProgramArguments</key>
    <array>
        <string>/bin/bash</string>
        <string>/Users/aodhan/gitea/homeai/homeai-llm/scripts/preload-models.sh</string>
    </array>

    <!-- Run once when the agent is loaded (login/boot). -->
    <key>RunAtLoad</key>
    <true/>

    <key>StandardOutPath</key>
    <string>/tmp/homeai-preload-models.log</string>

    <key>StandardErrorPath</key>
    <string>/tmp/homeai-preload-models-error.log</string>

    <!-- NOTE: ThrottleInterval does NOT delay the first launch — it only
         prevents launchd from respawning the job more than once per 15s.
         Waiting for Ollama to come up is handled by the retry loop inside
         preload-models.sh itself. -->
    <key>ThrottleInterval</key>
    <integer>15</integer>
</dict>
</plist>
55
homeai-llm/modelfiles/Qwen3.5-35B-A3B.Modelfile
Normal file
55
homeai-llm/modelfiles/Qwen3.5-35B-A3B.Modelfile
Normal file
@@ -0,0 +1,55 @@
# Ollama Modelfile for Qwen3.5-35B-A3B (MoE, Q8_0 GGUF) with tool-calling support.
# Build with: ollama create qwen3.5:35b-a3b -f Qwen3.5-35B-A3B.Modelfile
FROM /Users/aodhan/gitea/homeai/homeai-llm/modelfiles/lmstudio-community/Qwen3.5-35B-A3B-GGUF/Qwen3.5-35B-A3B-Q8_0.gguf

# ChatML-style template. Tool definitions are injected into the system turn as
# <tools>...</tools>; tool results come back as a "tool" role rendered inside a
# user turn as <tool_response>...</tool_response>. Do not edit casually — token
# markers (<|im_start|>/<|im_end|>) must match the model's training format.
TEMPLATE """{{- if or .System .Tools }}<|im_start|>system
{{- if .System }}
{{ .System }}
{{- end }}
{{- if .Tools }}

# Tools

You may call one or more functions to assist with the user query.

You are provided with function signatures within <tools></tools> XML tags:
<tools>
{{- range .Tools }}
{"type": "function", "function": {"name": "{{ .Function.Name }}", "description": "{{ .Function.Description }}", "parameters": {{ .Function.Parameters }}}}
{{- end }}
</tools>

For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
<tool_call>
{"name": <function-name>, "arguments": <args-json-object>}
</tool_call>
{{- end }}<|im_end|>
{{- end }}
{{- range $i, $_ := .Messages }}
{{- $last := eq (len (slice $.Messages $i)) 1 }}
{{- if eq .Role "user" }}<|im_start|>user
{{ .Content }}<|im_end|>
{{ else if eq .Role "assistant" }}<|im_start|>assistant
{{- if .ToolCalls }}
{{- range .ToolCalls }}
<tool_call>
{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}
</tool_call>
{{- end }}
{{- else }}{{ .Content }}
{{- end }}{{ if not $last }}<|im_end|>
{{ end }}
{{- else if eq .Role "tool" }}<|im_start|>user
<tool_response>
{{ .Content }}
</tool_response><|im_end|>
{{ end }}
{{- end }}<|im_start|>assistant
"""

SYSTEM You are a helpful AI assistant.
# 32K context window; stop on ChatML end-of-turn and end-of-text markers.
PARAMETER num_ctx 32768
PARAMETER stop <|im_end|>
PARAMETER stop <|endoftext|>
# Sampling defaults (temperature/top_p/top_k per model card); presence_penalty
# raised to discourage repetition — NOTE(review): 1.5 is aggressive, confirm
# against the model's recommended settings.
PARAMETER temperature 0.6
PARAMETER top_p 0.95
PARAMETER presence_penalty 1.5
PARAMETER top_k 20
19
homeai-llm/scripts/preload-models.sh
Executable file
19
homeai-llm/scripts/preload-models.sh
Executable file
@@ -0,0 +1,19 @@
#!/bin/bash
# Pre-load voice pipeline models into Ollama with infinite keep_alive.
# Run after Ollama starts (called by launchd or manually).
# Only pins lightweight/MoE models — large dense models (70B) use default expiry.
# Exits non-zero if Ollama is unreachable or the warm-up request fails, so
# failures show up in the launchd error log instead of a false "pinned" message.

set -u

OLLAMA_URL="http://localhost:11434"

# Wait up to 60s (30 tries x 2s) for the Ollama API to answer.
ready=0
for _ in $(seq 1 30); do
    if curl -sf "$OLLAMA_URL/api/tags" > /dev/null 2>&1; then
        ready=1
        break
    fi
    sleep 2
done

if [ "$ready" -ne 1 ]; then
    echo "[preload] ERROR: Ollama not reachable at $OLLAMA_URL after 60s" >&2
    exit 1
fi

# Pin qwen3.5:35b-a3b (MoE, 38.7GB VRAM, voice pipeline default).
# keep_alive=-1 keeps the model resident indefinitely. Deliberately no num_ctx
# override: Ollama spawns a new runner when the context size changes, so warming
# with a small num_ctx would force a full reload on the first real request that
# uses the Modelfile default (32768) — defeating the preload.
echo "[preload] Loading qwen3.5:35b-a3b with keep_alive=-1..."
if curl -sf "$OLLAMA_URL/api/generate" \
    -d '{"model":"qwen3.5:35b-a3b","prompt":"ready","stream":false,"keep_alive":-1}' \
    > /dev/null 2>&1; then
    echo "[preload] qwen3.5:35b-a3b pinned in memory"
else
    echo "[preload] ERROR: warm-up request for qwen3.5:35b-a3b failed" >&2
    exit 1
fi
Reference in New Issue
Block a user