feat: complete voice pipeline — fix wake word crash, bridge timeout, HA conversation agent

- Fix Wyoming satellite crash on wake word: convert macOS .aiff chimes to .wav
  (Python wave module only reads RIFF format, not AIFF)
- Fix OpenClaw HTTP bridge: increase subprocess timeout 30s → 120s, add SO_REUSEADDR
- Fix HA conversation component: use HTTP agent (not CLI) since HA runs in Docker
  on a different machine; update default host to Mac Mini IP, timeout to 120s
- Rewrite character manager as Vite+React app with schema validation
- Add Wyoming satellite wake word command, ElevenLabs TTS server, wakeword monitor
- Add Phase 5 development plan
- Update TODO.md: mark voice pipeline and agent tasks complete

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Aodhan Collins
2026-03-11 00:15:55 +00:00
parent 664bb6d275
commit 6db8ae4492
34 changed files with 4649 additions and 1083 deletions

View File

@@ -0,0 +1,186 @@
#!/usr/bin/env python3
"""Wyoming TTS server backed by ElevenLabs.
Usage:
python wyoming_elevenlabs_server.py --uri tcp://0.0.0.0:10302 --voice-id 21m00Tcm4TlvDq8ikWAM
"""
import argparse
import asyncio
import logging
import os
import wave
import io
from urllib import request, error
from wyoming.audio import AudioChunk, AudioStart, AudioStop
from wyoming.event import Event
from wyoming.info import Attribution, Info, TtsProgram, TtsVoice, TtsVoiceSpeaker
from wyoming.server import AsyncEventHandler, AsyncServer
from wyoming.tts import Synthesize
# Module-level logger; level and format are configured in main() via logging.basicConfig.
_LOGGER = logging.getLogger(__name__)
# Audio format advertised to Wyoming clients in the AudioStart/AudioChunk events.
# 24000 Hz matches the "pcm_24000" output format requested from ElevenLabs.
SAMPLE_RATE = 24000
SAMPLE_WIDTH = 2 # bytes per sample (int16)
CHANNELS = 1 # mono
CHUNK_SECONDS = 1 # audio is sent to the client in chunks of this many seconds (1-second chunks)
class ElevenLabsEventHandler(AsyncEventHandler):
    """Wyoming event handler that synthesizes speech via the ElevenLabs HTTP API.

    Instances are built by the server's handler factory (presumably one per
    client connection). On construction the handler schedules an ``Info``
    event describing the TTS program; afterwards every ``Synthesize`` event is
    answered with ``AudioStart``, a sequence of ``AudioChunk`` events and a
    final ``AudioStop``.
    """

    # Socket timeout (seconds) for the ElevenLabs request. Without it a stalled
    # connection would block an executor thread indefinitely.
    _REQUEST_TIMEOUT_SECONDS = 120

    def __init__(self, default_voice_id: str, default_model: str, api_key: str, speed: float, *args, **kwargs):
        """Store the synthesis defaults and announce the service to the client.

        Args:
            default_voice_id: ElevenLabs voice ID used when a request names no voice.
            default_model: ElevenLabs model ID sent with every request.
            api_key: Value for the ``xi-api-key`` header; may be empty/None, in
                which case synthesis requests fail with a logged error.
            speed: Requested speaking speed.
            *args, **kwargs: Forwarded unchanged to ``AsyncEventHandler``.
        """
        super().__init__(*args, **kwargs)
        self._default_voice_id = default_voice_id
        self._default_model = default_model
        self._api_key = api_key
        # NOTE(review): stored but not currently included in the API request.
        self._speed = speed
        # Send info immediately on connect. Keep a reference to the task: the
        # event loop only holds weak references, so an unreferenced task can be
        # garbage-collected before it finishes.
        self._info_task = asyncio.ensure_future(self._send_info())

    async def _send_info(self):
        """Write an ``Info`` event advertising the single configured voice."""
        info = Info(
            tts=[
                TtsProgram(
                    name="elevenlabs",
                    description="ElevenLabs API TTS",
                    attribution=Attribution(
                        name="ElevenLabs",
                        url="https://elevenlabs.io/",
                    ),
                    installed=True,
                    version="1.0.0",
                    voices=[
                        TtsVoice(
                            name=self._default_voice_id,
                            description="ElevenLabs Voice",
                            attribution=Attribution(name="elevenlabs", url=""),
                            installed=True,
                            languages=["en-us"],
                            version="1.0",
                            speakers=[TtsVoiceSpeaker(name=self._default_voice_id)],
                        )
                    ],
                )
            ]
        )
        await self.write_event(info.event())

    async def handle_event(self, event: Event) -> bool:
        """Handle one incoming Wyoming event.

        ``Synthesize`` events trigger a blocking ElevenLabs request (run in the
        default executor so the event loop stays responsive) whose PCM result
        is streamed back in ``CHUNK_SECONDS``-sized chunks. All other event
        types are ignored.

        Returns:
            Always ``True`` so the connection is kept open.
        """
        if not Synthesize.is_type(event.type):
            return True  # keep connection open

        synthesize = Synthesize.from_event(event)
        text = synthesize.text
        voice_id = self._default_voice_id
        if synthesize.voice and synthesize.voice.name:
            # A voice named in the request overrides the configured default.
            voice_id = synthesize.voice.name
        _LOGGER.debug("Synthesizing %r with voice_id=%s model=%s", text, voice_id, self._default_model)

        try:
            loop = asyncio.get_running_loop()
            audio_bytes = await loop.run_in_executor(
                None, self._call_elevenlabs_api, text, voice_id
            )
            if audio_bytes is None:
                raise RuntimeError("Failed to generate audio from ElevenLabs")

            await self.write_event(
                AudioStart(rate=SAMPLE_RATE, width=SAMPLE_WIDTH, channels=CHANNELS).event()
            )
            # Number of bytes in CHUNK_SECONDS of audio at the advertised format.
            chunk_size = SAMPLE_RATE * SAMPLE_WIDTH * CHANNELS * CHUNK_SECONDS
            for offset in range(0, len(audio_bytes), chunk_size):
                await self.write_event(
                    AudioChunk(
                        rate=SAMPLE_RATE,
                        width=SAMPLE_WIDTH,
                        channels=CHANNELS,
                        audio=audio_bytes[offset : offset + chunk_size],
                    ).event()
                )
            await self.write_event(AudioStop().event())
            _LOGGER.info("Synthesized audio completed")
        except Exception:
            # Boundary handler: log the failure and still emit AudioStop,
            # presumably so the client is not left waiting for audio.
            _LOGGER.exception("Synthesis error")
            await self.write_event(AudioStop().event())

        return True  # keep connection open

    def _call_elevenlabs_api(self, text: str, voice_id: str) -> "bytes | None":
        """Request raw PCM audio for *text* from ElevenLabs (blocking).

        Args:
            text: Text to synthesize.
            voice_id: ElevenLabs voice ID; percent-encoded before being placed
                in the URL path because it can come from the client request.

        Returns:
            The response body (PCM bytes in the ``pcm_<SAMPLE_RATE>`` format)
            on HTTP 200, or ``None`` on any failure. Failures are logged, never
            raised.
        """
        import json
        from urllib.parse import quote

        if not self._api_key:
            _LOGGER.error("ElevenLabs Request Error: no API key configured")
            return None

        # safe="" also escapes "/" so a crafted voice name cannot change the path.
        quoted_voice_id = quote(voice_id, safe="")
        url = (
            f"https://api.elevenlabs.io/v1/text-to-speech/{quoted_voice_id}"
            f"?output_format=pcm_{SAMPLE_RATE}"
        )
        headers = {
            "Accept": "audio/pcm",
            "Content-Type": "application/json",
            "xi-api-key": self._api_key,
        }
        data = {
            "text": text,
            "model_id": self._default_model,
        }
        req = request.Request(url, data=json.dumps(data).encode("utf-8"), headers=headers, method="POST")
        try:
            with request.urlopen(req, timeout=self._REQUEST_TIMEOUT_SECONDS) as response:
                if response.status == 200:
                    return response.read()
                _LOGGER.error("ElevenLabs API Error: %s", response.status)
                return None
        except error.HTTPError as e:
            # errors="replace" keeps a non-UTF-8 error body from raising here.
            body = e.read().decode("utf-8", errors="replace")
            _LOGGER.error("ElevenLabs HTTP Error: %s - %s", e.code, body)
            return None
        except Exception as e:
            # Covers URLError, socket timeouts and anything else from the request.
            _LOGGER.error("ElevenLabs Request Error: %s", e)
            return None
def _read_api_key_from_env_file(env_path):
    """Return the ELEVENLABS_API_KEY value from a dotenv-style file.

    Only the first line starting with ``ELEVENLABS_API_KEY=`` is used. One
    pair of matching surrounding quotes is removed from the value.

    Returns:
        The value as a string (possibly empty), or ``None`` when the file is
        missing, unreadable, or has no such line. Read errors other than a
        missing file are logged as warnings, never raised.
    """
    prefix = "ELEVENLABS_API_KEY="
    try:
        with open(env_path, "r", encoding="utf-8") as env_file:
            for line in env_file:
                if not line.startswith(prefix):
                    continue
                value = line[len(prefix):].strip()
                if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
                    value = value[1:-1]
                return value
    except FileNotFoundError:
        # A missing .env file is the normal case when the key is exported.
        return None
    except (OSError, UnicodeDecodeError) as err:
        _LOGGER.warning("Could not read API key from %s: %s", env_path, err)
    return None


async def main():
    """Parse command-line options and run the ElevenLabs Wyoming TTS server.

    The API key is taken from the ``ELEVENLABS_API_KEY`` environment variable;
    if that is unset or empty, a ``.env`` file two directories above the one
    containing this script is consulted. A missing key only produces a
    warning: the server still starts, but synthesis requests will fail.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--uri", default="tcp://0.0.0.0:10302")
    parser.add_argument("--voice-id", default="21m00Tcm4TlvDq8ikWAM", help="Default ElevenLabs Voice ID")
    parser.add_argument("--model", default="eleven_monolingual_v1", help="ElevenLabs Model ID")
    parser.add_argument("--speed", type=float, default=1.0)
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.DEBUG if args.debug else logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
    )

    api_key = os.environ.get("ELEVENLABS_API_KEY")
    if not api_key:
        # Try to read from .env file directly if not exported in shell.
        # abspath() keeps the dirname chain correct when __file__ is relative.
        script_dir = os.path.dirname(os.path.abspath(__file__))
        env_path = os.path.join(os.path.dirname(os.path.dirname(script_dir)), ".env")
        api_key = _read_api_key_from_env_file(env_path)
    if not api_key:
        _LOGGER.warning("ELEVENLABS_API_KEY environment variable not set. API calls will fail.")

    _LOGGER.info("Starting ElevenLabs Wyoming TTS on %s (voice-id=%s, model=%s)", args.uri, args.voice_id, args.model)
    server = AsyncServer.from_uri(args.uri)

    def handler_factory(reader, writer):
        # Called by the server with the connection's stream reader/writer.
        return ElevenLabsEventHandler(args.voice_id, args.model, api_key, args.speed, reader, writer)

    await server.run(handler_factory)


if __name__ == "__main__":
    asyncio.run(main())