Complete P2 (LLM) and P3 (voice pipeline) implementation

P2 — homeai-llm:
- Fix ollama launchd plist path for Apple Silicon (/opt/homebrew/bin/ollama)
- Add Modelfiles for local GGUF models: llama3.3:70b, qwen3:32b, codestral:22b
  (registered via `ollama create` — no re-download needed)

P3 — homeai-voice:
- Wyoming STT: wyoming-faster-whisper, large-v3 model, port 10300
- Wyoming TTS: custom Kokoro ONNX server (wyoming_kokoro_server.py), port 10301
  Voice af_heart; models at ~/models/kokoro/
- Wake word: openWakeWord daemon (hey_jarvis), notifies OpenClaw at /wake
- launchd plists for all three services + load-all-launchd.sh helper
- Smoke test: wyoming/test-pipeline.sh — 3/3 passing

HA Wyoming integration pending manual UI config (STT 10.0.0.200:10300,
TTS 10.0.0.200:10301).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Aodhan Collins
2026-03-04 23:28:22 +00:00
parent 858d7be33c
commit c31724c92b
12 changed files with 534 additions and 27 deletions

View File

@@ -0,0 +1,34 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN"
"http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<!-- launchd agent: always-on wake word listener.
     Runs wakeword_daemon.py from the homeai-voice venv; on detecting
     "hey_jarvis" the daemon POSTs to http://localhost:8080/wake.
     Install to ~/Library/LaunchAgents (see load-all-launchd.sh). -->
<plist version="1.0">
<dict>
<key>Label</key>
<string>com.homeai.wakeword</string>
<!-- Full interpreter + script paths: launchd provides no shell PATH. -->
<key>ProgramArguments</key>
<array>
<string>/Users/aodhan/homeai-voice-env/bin/python3</string>
<string>/Users/aodhan/gitea/homeai/homeai-voice/wyoming/wakeword_daemon.py</string>
<string>--wake-word</string>
<string>hey_jarvis</string>
<string>--notify-url</string>
<string>http://localhost:8080/wake</string>
</array>
<!-- Start at login and restart whenever the daemon exits. -->
<key>RunAtLoad</key>
<true/>
<key>KeepAlive</key>
<true/>
<key>StandardOutPath</key>
<string>/tmp/homeai-wakeword.log</string>
<key>StandardErrorPath</key>
<string>/tmp/homeai-wakeword-error.log</string>
<!-- Wait 10s between respawns to avoid a tight crash loop. -->
<key>ThrottleInterval</key>
<integer>10</integer>
</dict>
</plist>

View File

@@ -0,0 +1,43 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN"
"http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<!-- launchd agent: Wyoming speech-to-text server.
     wyoming-faster-whisper with the large-v3 model, serving on TCP 10300
     (the port Home Assistant's Wyoming integration is pointed at). -->
<plist version="1.0">
<dict>
<key>Label</key>
<string>com.homeai.wyoming-stt</string>
<key>ProgramArguments</key>
<array>
<string>/Users/aodhan/homeai-voice-env/bin/wyoming-faster-whisper</string>
<string>--uri</string>
<string>tcp://0.0.0.0:10300</string>
<string>--model</string>
<string>large-v3</string>
<string>--language</string>
<string>en</string>
<!-- CPU + int8 quantization: keeps memory/latency reasonable without GPU. -->
<string>--device</string>
<string>cpu</string>
<string>--compute-type</string>
<string>int8</string>
<!-- Model weights cached under ~/models/whisper (same dir for data/download). -->
<string>--data-dir</string>
<string>/Users/aodhan/models/whisper</string>
<string>--download-dir</string>
<string>/Users/aodhan/models/whisper</string>
</array>
<!-- Start at login and restart whenever the server exits. -->
<key>RunAtLoad</key>
<true/>
<key>KeepAlive</key>
<true/>
<key>StandardOutPath</key>
<string>/tmp/homeai-wyoming-stt.log</string>
<key>StandardErrorPath</key>
<string>/tmp/homeai-wyoming-stt-error.log</string>
<!-- Wait 10s between respawns to avoid a tight crash loop. -->
<key>ThrottleInterval</key>
<integer>10</integer>
</dict>
</plist>

View File

@@ -0,0 +1,36 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN"
"http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<!-- launchd agent: Wyoming text-to-speech server.
     Custom Kokoro ONNX server (wyoming_kokoro_server.py) on TCP 10301,
     default voice af_heart at 1.0x speed. -->
<plist version="1.0">
<dict>
<key>Label</key>
<string>com.homeai.wyoming-tts</string>
<!-- Full interpreter + script paths: launchd provides no shell PATH. -->
<key>ProgramArguments</key>
<array>
<string>/Users/aodhan/homeai-voice-env/bin/python3</string>
<string>/Users/aodhan/gitea/homeai/homeai-voice/tts/wyoming_kokoro_server.py</string>
<string>--uri</string>
<string>tcp://0.0.0.0:10301</string>
<string>--voice</string>
<string>af_heart</string>
<string>--speed</string>
<string>1.0</string>
</array>
<!-- Start at login and restart whenever the server exits. -->
<key>RunAtLoad</key>
<true/>
<key>KeepAlive</key>
<true/>
<key>StandardOutPath</key>
<string>/tmp/homeai-wyoming-tts.log</string>
<key>StandardErrorPath</key>
<string>/tmp/homeai-wyoming-tts-error.log</string>
<!-- Wait 10s between respawns to avoid a tight crash loop. -->
<key>ThrottleInterval</key>
<integer>10</integer>
</dict>
</plist>

View File

@@ -0,0 +1,35 @@
#!/usr/bin/env bash
# Load (or reload) all homeai-voice launchd services.
#
# Copies each plist from ./launchd into ~/Library/LaunchAgents, unloads any
# already-loaded copy (so launchd picks up plist edits), loads the fresh one,
# then prints a PID summary for each service.
set -euo pipefail

LAUNCHD_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/launchd" && pwd)"
LAUNCH_AGENTS=~/Library/LaunchAgents

PLISTS=(
  com.homeai.wyoming-stt.plist
  com.homeai.wyoming-tts.plist
  com.homeai.wakeword.plist
)

for plist in "${PLISTS[@]}"; do
  src="${LAUNCHD_DIR}/${plist}"
  dst="${LAUNCH_AGENTS}/${plist}"
  label="${plist%.plist}"
  cp "$src" "$dst"
  # `load` is a no-op for an already-loaded job, so unload first when present.
  if launchctl list "$label" &>/dev/null; then
    launchctl unload "$dst" 2>/dev/null || true
  fi
  launchctl load "$dst"
  echo "Loaded: $label"
done

echo ""
echo "Status:"
for plist in "${PLISTS[@]}"; do
  label="${plist%.plist}"
  # `launchctl list <label>` prints a plist-style dict ("PID" = 123;), not the
  # 3-column table of the argument-less form — parse the PID line explicitly.
  # The trailing `|| true` keeps `set -e -o pipefail` from aborting the whole
  # script when a job failed to load and launchctl exits non-zero.
  pid=$(launchctl list "$label" 2>/dev/null \
    | awk -F' = ' '/"PID"/ { gsub(/;/, "", $2); print $2 }' || true)
  echo "  $label — PID: ${pid:--}"
done

View File

@@ -0,0 +1,145 @@
#!/usr/bin/env python3
"""Wyoming TTS server backed by Kokoro ONNX.
Usage:
python wyoming_kokoro_server.py --uri tcp://0.0.0.0:10301 --voice af_heart
"""
import argparse
import asyncio
import logging
import os
import numpy as np
from wyoming.audio import AudioChunk, AudioStart, AudioStop
from wyoming.event import Event
from wyoming.info import Attribution, Info, TtsProgram, TtsVoice, TtsVoiceSpeaker
from wyoming.server import AsyncEventHandler, AsyncServer
from wyoming.tts import Synthesize
_LOGGER = logging.getLogger(__name__)

# PCM format advertised in Wyoming AudioStart/AudioChunk events.
SAMPLE_RATE = 24000
SAMPLE_WIDTH = 2  # int16
CHANNELS = 1
CHUNK_SECONDS = 1  # stream in 1-second chunks
def _load_kokoro():
    """Build a Kokoro ONNX engine from the model files under ~/models/kokoro."""
    # Imported lazily so argument parsing / --help work without kokoro_onnx.
    from kokoro_onnx import Kokoro

    base = os.path.expanduser("~/models/kokoro")
    model_path = os.path.join(base, "kokoro-v1.0.onnx")
    voices_path = os.path.join(base, "voices-v1.0.bin")
    return Kokoro(model_path, voices_path)
class KokoroEventHandler(AsyncEventHandler):
    """Wyoming event handler that synthesizes text with a shared Kokoro engine.

    One handler is created per client connection; the Kokoro model itself is
    loaded once in main() and shared by every handler.
    """

    def __init__(self, tts, default_voice: str, speed: float, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._tts = tts
        self._default_voice = default_voice
        self._speed = speed
        # Send info immediately on connect so clients (e.g. Home Assistant)
        # can discover the program/voice without sending a Describe first.
        asyncio.ensure_future(self._send_info())

    async def _send_info(self):
        """Advertise the TTS program and its single configured voice."""
        info = Info(
            tts=[
                TtsProgram(
                    name="kokoro",
                    description="Kokoro ONNX TTS",
                    attribution=Attribution(
                        name="thewh1teagle/kokoro-onnx",
                        url="https://github.com/thewh1teagle/kokoro-onnx",
                    ),
                    installed=True,
                    voices=[
                        TtsVoice(
                            name=self._default_voice,
                            description="Kokoro voice",
                            attribution=Attribution(name="kokoro", url=""),
                            installed=True,
                            languages=["en-us"],
                            speakers=[TtsVoiceSpeaker(name=self._default_voice)],
                        )
                    ],
                )
            ]
        )
        await self.write_event(info.event())

    async def handle_event(self, event: Event) -> bool:
        """Handle one Wyoming event; only Synthesize is acted on.

        Streams AudioStart / AudioChunk* / AudioStop for the synthesized text.
        Always returns True so the connection stays open for further requests.
        """
        if not Synthesize.is_type(event.type):
            return True

        synthesize = Synthesize.from_event(event)
        text = synthesize.text
        voice = self._default_voice
        if synthesize.voice and synthesize.voice.name:
            voice = synthesize.voice.name
        _LOGGER.debug("Synthesizing %r with voice=%s speed=%.1f", text, voice, self._speed)

        started = False  # whether AudioStart was already emitted
        try:
            # Kokoro inference is blocking — run it off the event loop.
            # get_running_loop() replaces the deprecated get_event_loop()
            # pattern inside coroutines.
            loop = asyncio.get_running_loop()
            samples, sample_rate = await loop.run_in_executor(
                None, lambda: self._tts.create(text, voice=voice, speed=self._speed)
            )
            # float in [-1, 1] -> int16 PCM.
            samples_int16 = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
            audio_bytes = samples_int16.tobytes()

            # Advertise the rate Kokoro actually produced (24 kHz for v1.0)
            # instead of blindly assuming SAMPLE_RATE, so a future model
            # change cannot mislabel the stream.
            rate = int(sample_rate) if sample_rate else SAMPLE_RATE
            await self.write_event(
                AudioStart(rate=rate, width=SAMPLE_WIDTH, channels=CHANNELS).event()
            )
            started = True

            chunk_size = rate * SAMPLE_WIDTH * CHANNELS * CHUNK_SECONDS
            for i in range(0, len(audio_bytes), chunk_size):
                await self.write_event(
                    AudioChunk(
                        rate=rate,
                        width=SAMPLE_WIDTH,
                        channels=CHANNELS,
                        audio=audio_bytes[i : i + chunk_size],
                    ).event()
                )
            await self.write_event(AudioStop().event())
            _LOGGER.info("Synthesized %.1fs of audio", len(samples) / sample_rate)
        except Exception:
            _LOGGER.exception("Synthesis error")
            # Only close the stream if one was actually opened: an AudioStop
            # without a matching AudioStart is a malformed Wyoming stream.
            if started:
                await self.write_event(AudioStop().event())
        return True  # keep connection open
async def main():
    """CLI entry point: parse args, load Kokoro once, then serve forever."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--uri", default="tcp://0.0.0.0:10301")
    parser.add_argument("--voice", default="af_heart")
    parser.add_argument("--speed", type=float, default=1.0)
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()

    log_level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
    )

    _LOGGER.info("Loading Kokoro ONNX model...")
    engine = _load_kokoro()
    _LOGGER.info("Kokoro loaded. Starting Wyoming TTS on %s (voice=%s)", args.uri, args.voice)

    server = AsyncServer.from_uri(args.uri)

    def handler_factory(reader, writer):
        # Every connection gets its own handler sharing the one loaded engine.
        return KokoroEventHandler(engine, args.voice, args.speed, reader, writer)

    await server.run(handler_factory)


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,88 @@
#!/usr/bin/env bash
# Smoke test for the Wyoming voice pipeline.
# Tests: STT server alive | TTS server alive | TTS generates audio
#
# Usage: ./test-pipeline.sh
# Exit status is non-zero if any check fails (CI-friendly).
set -euo pipefail

STT_HOST="${STT_HOST:-localhost}"
STT_PORT="${STT_PORT:-10300}"
TTS_HOST="${TTS_HOST:-localhost}"
TTS_PORT="${TTS_PORT:-10301}"
VENV="${VENV:-$HOME/homeai-voice-env}"

PASS=0
FAIL=0

# check NAME RESULT — a RESULT starting with "ok" counts as a pass (the rest
# of the string is appended as detail); anything else is the failure reason.
check() {
  local name="$1"; local result="$2"
  if [[ "$result" == ok* ]]; then
    echo " [PASS] $name${result#ok}"; PASS=$((PASS + 1))
  else
    echo " [FAIL] $name$result"; FAIL=$((FAIL + 1))
  fi
}

echo "=== HomeAI Voice Pipeline Smoke Test ==="
echo ""

echo "1. STT Wyoming server (port $STT_PORT)"
if nc -z -w2 "$STT_HOST" "$STT_PORT" 2>/dev/null; then
  check "STT port open" "ok"
else
  # Leading " — " keeps the name and the reason from running together.
  check "STT port open" " — port $STT_PORT not reachable — is wyoming-stt running?"
fi
echo ""

echo "2. TTS Wyoming server (port $TTS_PORT)"
if nc -z -w2 "$TTS_HOST" "$TTS_PORT" 2>/dev/null; then
  check "TTS port open" "ok"
else
  check "TTS port open" " — port $TTS_PORT not reachable — is wyoming-tts running?"
fi
echo ""

echo "3. Kokoro TTS synthesis test"
TTS_OUTPUT="/tmp/homeai-tts-test.wav"
# Remove any stale output so a leftover file from an earlier run can't make a
# failed synthesis look like a pass.
rm -f "$TTS_OUTPUT"

# Run the python heredoc inside `if` — a bare command followed by `$?` would
# abort the whole script under `set -e` before the result check ever ran.
synth_ok=0
if "$VENV/bin/python3" - <<'PYEOF'
import sys, os
import numpy as np
model_dir = os.path.expanduser("~/models/kokoro")
model_path = os.path.join(model_dir, "kokoro-v1.0.onnx")
voices_path = os.path.join(model_dir, "voices-v1.0.bin")
if not os.path.exists(model_path):
    print(f"Model not found: {model_path}")
    sys.exit(1)
from kokoro_onnx import Kokoro
tts = Kokoro(model_path, voices_path)
samples, sr = tts.create("Hello, I am your home assistant. The voice pipeline is working.", voice="af_heart", speed=1.0)
# Write WAV: mono, 16-bit PCM at Kokoro's native rate.
import wave
samples_int16 = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
with wave.open("/tmp/homeai-tts-test.wav", "w") as wf:
    wf.setnchannels(1)
    wf.setsampwidth(2)
    wf.setframerate(sr)
    wf.writeframes(samples_int16.tobytes())
print(f"Generated {len(samples)/sr:.1f}s of audio at {sr}Hz")
sys.exit(0)
PYEOF
then
  synth_ok=1
fi

if [[ $synth_ok -eq 1 && -f "$TTS_OUTPUT" ]]; then
  size=$(wc -c < "$TTS_OUTPUT")
  check "Kokoro synthesis" "ok — ${size} bytes written to $TTS_OUTPUT"
  echo ""
  echo " To play: afplay $TTS_OUTPUT"
else
  check "Kokoro synthesis" " — synthesis failed"
fi

echo ""
echo "─────────────────────────────────"
echo "Results: $PASS passed, $FAIL failed"
# Final exit status mirrors the failure count.
[[ $FAIL -eq 0 ]]

View File

@@ -0,0 +1,110 @@
#!/usr/bin/env python3
"""Always-on wake word detection daemon using openWakeWord.
Listens on the default microphone, fires an HTTP POST to --notify-url
when the wake word is detected.
Usage:
python wakeword_daemon.py --wake-word hey_jarvis --notify-url http://localhost:8080/wake
"""
import argparse
import logging
import time
import urllib.request
import json
import numpy as np
_LOGGER = logging.getLogger(__name__)

# Capture format fed to openWakeWord: 16 kHz mono int16.
SAMPLE_RATE = 16000
CHUNK_SIZE = 1280  # ~80ms at 16kHz — recommended by openWakeWord
def notify(url: str, wake_word: str, score: float):
payload = json.dumps({"wake_word": wake_word, "score": float(score)}).encode()
try:
req = urllib.request.Request(
url,
data=payload,
headers={"Content-Type": "application/json"},
method="POST",
)
with urllib.request.urlopen(req, timeout=2):
pass
_LOGGER.info("Wake word '%s' detected (score=%.3f) — notified %s", wake_word, score, url)
except Exception as e:
_LOGGER.warning("Failed to notify %s: %s", url, e)
def main():
    """Run the wake-word loop: microphone -> openWakeWord -> HTTP notify.

    Blocks until KeyboardInterrupt; the PyAudio stream and device handle are
    released in the finally block.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--wake-word", default="hey_jarvis")
    parser.add_argument("--notify-url", default="http://localhost:8080/wake")
    parser.add_argument("--threshold", type=float, default=0.5)
    parser.add_argument("--cooldown", type=float, default=3.0, help="Seconds between triggers")
    parser.add_argument("--model-dir", default=None, help="Path to custom .onnx wake word model")
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()
    logging.basicConfig(
        level=logging.DEBUG if args.debug else logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
    )
    # pyaudio is imported lazily with a clear error, so argument parsing and
    # --help still work on a machine without PortAudio installed.
    try:
        import pyaudio
    except ImportError:
        _LOGGER.error("pyaudio not installed. Run: pip install pyaudio")
        raise SystemExit(1)
    import openwakeword
    from openwakeword.model import Model
    _LOGGER.info("Loading wake word model: %s", args.wake_word)
    # With --model-dir, load every .onnx model found there; otherwise fall
    # back to the built-in openWakeWord model named by --wake-word.
    model_paths = []
    if args.model_dir:
        import os, glob
        model_paths = glob.glob(os.path.join(args.model_dir, "*.onnx"))
    oww = Model(
        wakeword_models=model_paths if model_paths else [args.wake_word],
        inference_framework="onnx",
    )
    audio = pyaudio.PyAudio()
    # Default input device: 16 kHz mono int16 frames of CHUNK_SIZE samples.
    stream = audio.open(
        rate=SAMPLE_RATE,
        channels=1,
        format=pyaudio.paInt16,
        input=True,
        frames_per_buffer=CHUNK_SIZE,
    )
    _LOGGER.info("Listening for wake word '%s'...", args.wake_word)
    last_trigger = 0.0  # wall-clock time of the most recent notification
    try:
        while True:
            raw = stream.read(CHUNK_SIZE, exception_on_overflow=False)
            chunk = np.frombuffer(raw, dtype=np.int16)
            oww.predict(chunk)
            # prediction_buffer maps model name -> rolling score history; the
            # last entry is the score for the chunk just fed in.
            for ww, scores in oww.prediction_buffer.items():
                score = scores[-1] if scores else 0.0
                if score >= args.threshold:
                    now = time.time()
                    # Cooldown suppresses repeat triggers from one utterance.
                    if now - last_trigger >= args.cooldown:
                        last_trigger = now
                        notify(args.notify_url, ww, score)
    except KeyboardInterrupt:
        pass
    finally:
        stream.stop_stream()
        stream.close()
        audio.terminate()


if __name__ == "__main__":
    main()