feat: complete voice pipeline — fix wake word crash, bridge timeout, HA conversation agent
- Fix Wyoming satellite crash on wake word: convert macOS .aiff chimes to .wav (Python wave module only reads RIFF format, not AIFF)
- Fix OpenClaw HTTP bridge: increase subprocess timeout 30s → 120s, add SO_REUSEADDR
- Fix HA conversation component: use HTTP agent (not CLI) since HA runs in Docker on a different machine; update default host to Mac Mini IP, timeout to 120s
- Rewrite character manager as Vite+React app with schema validation
- Add Wyoming satellite wake word command, ElevenLabs TTS server, wakeword monitor
- Add Phase 5 development plan
- Update TODO.md: mark voice pipeline and agent tasks complete

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -13,7 +13,7 @@
|
||||
<string>--wake-word</string>
|
||||
<string>hey_jarvis</string>
|
||||
<string>--notify-url</string>
|
||||
<string>http://localhost:8080/wake</string>
|
||||
<string>http://localhost:8081/wake</string>
|
||||
</array>
|
||||
|
||||
<key>RunAtLoad</key>
|
||||
|
||||
@@ -0,0 +1,28 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<!-- launchd job: runs the ElevenLabs-backed Wyoming TTS server at login and
     restarts it if it exits (KeepAlive). stdout and stderr share one log file. -->
<plist version="1.0">
<dict>
    <key>Label</key>
    <string>com.homeai.wyoming-elevenlabs</string>
    <key>ProgramArguments</key>
    <array>
        <!-- Use the voice venv's interpreter so the wyoming package resolves. -->
        <string>/Users/aodhan/homeai-voice-env/bin/python3</string>
        <string>/Users/aodhan/gitea/homeai/homeai-voice/tts/wyoming_elevenlabs_server.py</string>
        <!-- Listen on all interfaces; 10302 is the Wyoming TTS port HA connects to. -->
        <string>--uri</string>
        <string>tcp://0.0.0.0:10302</string>
    </array>
    <key>RunAtLoad</key>
    <true/>
    <key>KeepAlive</key>
    <true/>
    <key>StandardOutPath</key>
    <string>/tmp/homeai-wyoming-elevenlabs.log</string>
    <key>StandardErrorPath</key>
    <string>/tmp/homeai-wyoming-elevenlabs.log</string>
    <key>EnvironmentVariables</key>
    <dict>
        <!-- launchd jobs get a minimal PATH; include Homebrew locations explicitly. -->
        <key>PATH</key>
        <string>/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin</string>
    </dict>
</dict>
</plist>
|
||||
@@ -18,9 +18,9 @@
|
||||
<string>--area</string>
|
||||
<string>Living Room</string>
|
||||
<string>--mic-command</string>
|
||||
<string>rec -q -r 16000 -c 1 -b 16 -t raw -</string>
|
||||
<string>/opt/homebrew/bin/rec -q -r 16000 -c 1 -b 16 -t raw -</string>
|
||||
<string>--snd-command</string>
|
||||
<string>play -q -r 24000 -c 1 -b 16 -t raw -</string>
|
||||
<string>/opt/homebrew/bin/play -q -t raw -r 24000 -c 1 -b 16 -e signed-integer -</string>
|
||||
<string>--mic-command-rate</string>
|
||||
<string>16000</string>
|
||||
<string>--mic-command-width</string>
|
||||
@@ -33,10 +33,18 @@
|
||||
<string>2</string>
|
||||
<string>--snd-command-channels</string>
|
||||
<string>1</string>
|
||||
<string>--wake-command</string>
|
||||
<string>/Users/aodhan/homeai-voice-env/bin/python3 /Users/aodhan/gitea/homeai/homeai-voice/wyoming/wakeword_command.py --wake-word hey_jarvis --threshold 0.5</string>
|
||||
<string>--wake-command-rate</string>
|
||||
<string>16000</string>
|
||||
<string>--wake-command-width</string>
|
||||
<string>2</string>
|
||||
<string>--wake-command-channels</string>
|
||||
<string>1</string>
|
||||
<string>--awake-wav</string>
|
||||
<string>/System/Library/Sounds/Glass.aiff</string>
|
||||
<string>/Users/aodhan/homeai-data/sounds/awake.wav</string>
|
||||
<string>--done-wav</string>
|
||||
<string>/System/Library/Sounds/Blow.aiff</string>
|
||||
<string>/Users/aodhan/homeai-data/sounds/done.wav</string>
|
||||
<string>--no-zeroconf</string>
|
||||
</array>
|
||||
|
||||
|
||||
10
homeai-voice/scripts/monitor-wakeword.sh
Normal file
10
homeai-voice/scripts/monitor-wakeword.sh
Normal file
@@ -0,0 +1,10 @@
|
||||
#!/bin/bash
# Monitor wake word detection in real-time.
# Tails the satellite's wake word error log and the OpenClaw bridge log,
# keeping only the lines that indicate detection activity or notify failures.

echo "Monitoring wake word detection..."
echo "Say 'Hey Jarvis' to test"
echo "Press Ctrl+C to stop"
echo ""

# Watch both the wake word log and bridge log.
# 2>/dev/null hides tail's complaint if one of the log files doesn't exist yet.
tail -f /tmp/homeai-wakeword-error.log /tmp/homeai-openclaw-bridge.log 2>/dev/null | grep -E "(Wake word detected|Listening|Failed to notify)"
|
||||
186
homeai-voice/tts/wyoming_elevenlabs_server.py
Normal file
186
homeai-voice/tts/wyoming_elevenlabs_server.py
Normal file
@@ -0,0 +1,186 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Wyoming TTS server backed by ElevenLabs.
|
||||
|
||||
Usage:
|
||||
python wyoming_elevenlabs_server.py --uri tcp://0.0.0.0:10302 --voice-id 21m00Tcm4TlvDq8ikWAM
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import wave
|
||||
import io
|
||||
from urllib import request, error
|
||||
|
||||
from wyoming.audio import AudioChunk, AudioStart, AudioStop
|
||||
from wyoming.event import Event
|
||||
from wyoming.info import Attribution, Info, TtsProgram, TtsVoice, TtsVoiceSpeaker
|
||||
from wyoming.server import AsyncEventHandler, AsyncServer
|
||||
from wyoming.tts import Synthesize
|
||||
|
||||
_LOGGER = logging.getLogger(__name__)
|
||||
|
||||
SAMPLE_RATE = 24000
|
||||
SAMPLE_WIDTH = 2 # int16
|
||||
CHANNELS = 1
|
||||
CHUNK_SECONDS = 1 # stream in 1-second chunks
|
||||
|
||||
|
||||
class ElevenLabsEventHandler(AsyncEventHandler):
    """Wyoming event handler that synthesizes speech via the ElevenLabs HTTP API.

    On ``Synthesize`` events it requests raw PCM (24 kHz, 16-bit mono) from
    ElevenLabs and streams it back to the client in 1-second AudioChunk events.
    """

    def __init__(self, default_voice_id: str, default_model: str, api_key: str, speed: float, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._default_voice_id = default_voice_id
        self._default_model = default_model
        self._api_key = api_key
        # NOTE(review): speed is stored but not yet forwarded to the API — confirm intended.
        self._speed = speed

        # Send info immediately on connect
        asyncio.ensure_future(self._send_info())

    async def _send_info(self):
        """Advertise this TTS program and its default voice to the client."""
        info = Info(
            tts=[
                TtsProgram(
                    name="elevenlabs",
                    description="ElevenLabs API TTS",
                    attribution=Attribution(
                        name="ElevenLabs",
                        url="https://elevenlabs.io/",
                    ),
                    installed=True,
                    version="1.0.0",
                    voices=[
                        TtsVoice(
                            name=self._default_voice_id,
                            description="ElevenLabs Voice",
                            attribution=Attribution(name="elevenlabs", url=""),
                            installed=True,
                            languages=["en-us"],
                            version="1.0",
                            speakers=[TtsVoiceSpeaker(name=self._default_voice_id)],
                        )
                    ],
                )
            ]
        )
        await self.write_event(info.event())

    async def handle_event(self, event: Event) -> bool:
        """Handle one Wyoming event; synthesize on ``Synthesize``, ignore others.

        Always returns True so the connection stays open.
        """
        if Synthesize.is_type(event.type):
            synthesize = Synthesize.from_event(event)
            text = synthesize.text
            voice_id = self._default_voice_id

            # A voice requested by the client overrides the configured default.
            if synthesize.voice and synthesize.voice.name:
                voice_id = synthesize.voice.name

            _LOGGER.debug("Synthesizing %r with voice_id=%s model=%s", text, voice_id, self._default_model)

            try:
                # Run the blocking HTTP request off the event loop.
                loop = asyncio.get_running_loop()
                audio_bytes = await loop.run_in_executor(
                    None, lambda: self._call_elevenlabs_api(text, voice_id)
                )

                if audio_bytes is None:
                    raise RuntimeError("Failed to generate audio from ElevenLabs")

                await self.write_event(
                    AudioStart(rate=SAMPLE_RATE, width=SAMPLE_WIDTH, channels=CHANNELS).event()
                )

                chunk_size = SAMPLE_RATE * SAMPLE_WIDTH * CHANNELS * CHUNK_SECONDS
                for i in range(0, len(audio_bytes), chunk_size):
                    await self.write_event(
                        AudioChunk(
                            rate=SAMPLE_RATE,
                            width=SAMPLE_WIDTH,
                            channels=CHANNELS,
                            audio=audio_bytes[i : i + chunk_size],
                        ).event()
                    )

                await self.write_event(AudioStop().event())
                _LOGGER.info("Synthesized audio completed")

            except Exception:
                _LOGGER.exception("Synthesis error")
                # Still send AudioStop so the client doesn't wait forever for audio.
                await self.write_event(AudioStop().event())

        return True  # keep connection open

    def _call_elevenlabs_api(self, text: str, voice_id: str) -> "bytes | None":
        """POST to the ElevenLabs TTS endpoint; return raw PCM bytes, or None on failure.

        Runs in a thread-pool executor, so blocking I/O is acceptable here.
        """
        import json

        # pcm_24000 matches SAMPLE_RATE/SAMPLE_WIDTH/CHANNELS advertised in AudioStart.
        url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}?output_format=pcm_24000"

        headers = {
            "Accept": "audio/pcm",
            "Content-Type": "application/json",
            "xi-api-key": self._api_key
        }

        data = {
            "text": text,
            "model_id": self._default_model,
        }

        req = request.Request(url, data=json.dumps(data).encode('utf-8'), headers=headers, method='POST')
        try:
            # Bound the request so a stalled connection can't hang the executor thread.
            with request.urlopen(req, timeout=60) as response:
                if response.status == 200:
                    return response.read()
                _LOGGER.error("ElevenLabs API Error: %s", response.status)
                return None
        except error.HTTPError as e:
            _LOGGER.error("ElevenLabs HTTP Error: %s - %s", e.code, e.read().decode('utf-8'))
            return None
        except Exception as e:
            _LOGGER.error("ElevenLabs Request Error: %s", e)
            return None
|
||||
|
||||
|
||||
async def main():
    """Parse CLI arguments, resolve the ElevenLabs API key, and run the server.

    The API key comes from the ELEVENLABS_API_KEY environment variable, with a
    fallback to the repo-root .env file (three directories above this script).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--uri", default="tcp://0.0.0.0:10302")
    parser.add_argument("--voice-id", default="21m00Tcm4TlvDq8ikWAM", help="Default ElevenLabs Voice ID")
    parser.add_argument("--model", default="eleven_monolingual_v1", help="ElevenLabs Model ID")
    parser.add_argument("--speed", type=float, default=1.0)
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.DEBUG if args.debug else logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
    )

    api_key = os.environ.get("ELEVENLABS_API_KEY")
    if not api_key:
        # Try to read from .env file directly if not exported in shell
        try:
            env_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), '.env')
            if os.path.exists(env_path):
                with open(env_path, 'r') as f:
                    for line in f:
                        if line.startswith('ELEVENLABS_API_KEY='):
                            api_key = line.split('=', 1)[1].strip()
                            # .env files commonly quote values; strip one matching
                            # pair of quotes so the key isn't sent with literal quotes.
                            if len(api_key) >= 2 and api_key[0] == api_key[-1] and api_key[0] in ('"', "'"):
                                api_key = api_key[1:-1]
                            break
        except Exception:
            # Best-effort fallback only; a missing key is warned about below.
            pass

    if not api_key:
        _LOGGER.warning("ELEVENLABS_API_KEY environment variable not set. API calls will fail.")

    _LOGGER.info("Starting ElevenLabs Wyoming TTS on %s (voice-id=%s, model=%s)", args.uri, args.voice_id, args.model)

    server = AsyncServer.from_uri(args.uri)

    def handler_factory(reader, writer):
        # Bind the CLI settings; AsyncServer supplies the stream pair per connection.
        return ElevenLabsEventHandler(args.voice_id, args.model, api_key, args.speed, reader, writer)

    await server.run(handler_factory)


if __name__ == "__main__":
    asyncio.run(main())
|
||||
77
homeai-voice/wyoming/wakeword_command.py
Normal file
77
homeai-voice/wyoming/wakeword_command.py
Normal file
@@ -0,0 +1,77 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Wake word detection command for Wyoming Satellite.
|
||||
|
||||
The satellite feeds raw 16kHz 16-bit mono audio via stdin.
|
||||
This script reads that audio, runs openWakeWord, and prints
|
||||
the wake word name to stdout when detected.
|
||||
|
||||
Usage (called by wyoming-satellite --wake-command):
|
||||
python wakeword_command.py [--wake-word hey_jarvis] [--threshold 0.5]
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
import numpy as np
|
||||
import logging
|
||||
|
||||
_LOGGER = logging.getLogger(__name__)
|
||||
|
||||
SAMPLE_RATE = 16000
|
||||
CHUNK_SIZE = 1280 # ~80ms at 16kHz — recommended by openWakeWord
|
||||
|
||||
|
||||
def main():
    """Read raw 16 kHz 16-bit mono audio from stdin; print wake word names to stdout.

    Called by wyoming-satellite as its --wake-command. Detections are
    rate-limited by --cooldown seconds; all logging goes to stderr because
    stdout is the detection channel the satellite reads.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--wake-word", default="hey_jarvis")
    parser.add_argument("--threshold", type=float, default=0.5)
    parser.add_argument("--cooldown", type=float, default=3.0)
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.DEBUG if args.debug else logging.WARNING,
        format="%(asctime)s %(levelname)s %(message)s",
        stream=sys.stderr,  # keep stdout clean for wake word names
    )

    import time
    # Imported lazily so --help stays fast; importing the model module also
    # imports the openwakeword package itself.
    from openwakeword.model import Model

    oww = Model(
        wakeword_models=[args.wake_word],
        inference_framework="onnx",
    )

    last_trigger = 0.0
    bytes_per_chunk = CHUNK_SIZE * 2  # 16-bit = 2 bytes per sample

    _LOGGER.debug("Wake word command ready, reading audio from stdin")

    try:
        while True:
            raw = sys.stdin.buffer.read(bytes_per_chunk)
            if not raw:
                break  # EOF: the satellite closed the pipe
            if len(raw) < bytes_per_chunk:
                # Pad with zeros if short read
                raw = raw + b'\x00' * (bytes_per_chunk - len(raw))

            chunk = np.frombuffer(raw, dtype=np.int16)
            oww.predict(chunk)

            for ww, scores in oww.prediction_buffer.items():
                score = scores[-1] if scores else 0.0
                if score >= args.threshold:
                    now = time.time()
                    # Cooldown prevents one utterance from triggering repeatedly.
                    if now - last_trigger >= args.cooldown:
                        last_trigger = now
                        # Print wake word name to stdout — satellite reads this
                        print(ww, flush=True)
                        _LOGGER.debug("Wake word detected: %s (score=%.3f)", ww, score)
    except (KeyboardInterrupt, BrokenPipeError):
        pass  # both are normal shutdown paths


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user