feat: complete voice pipeline — fix wake word crash, bridge timeout, HA conversation agent
- Fix Wyoming satellite crash on wake word: convert macOS .aiff chimes to .wav (Python wave module only reads RIFF format, not AIFF)
- Fix OpenClaw HTTP bridge: increase subprocess timeout 30s → 120s, add SO_REUSEADDR
- Fix HA conversation component: use HTTP agent (not CLI) since HA runs in Docker on a different machine; update default host to Mac Mini IP, timeout to 120s
- Rewrite character manager as Vite+React app with schema validation
- Add Wyoming satellite wake word command, ElevenLabs TTS server, wakeword monitor
- Add Phase 5 development plan
- Update TODO.md: mark voice pipeline and agent tasks complete

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
186
homeai-voice/tts/wyoming_elevenlabs_server.py
Normal file
186
homeai-voice/tts/wyoming_elevenlabs_server.py
Normal file
@@ -0,0 +1,186 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Wyoming TTS server backed by ElevenLabs.
|
||||
|
||||
Usage:
|
||||
python wyoming_elevenlabs_server.py --uri tcp://0.0.0.0:10302 --voice-id 21m00Tcm4TlvDq8ikWAM
|
||||
"""
|
||||
|
||||
import argparse
import asyncio
import io
import json
import logging
import os
import wave
from typing import Optional
from urllib import request, error
from urllib.parse import quote

from wyoming.audio import AudioChunk, AudioStart, AudioStop
from wyoming.event import Event
from wyoming.info import Attribution, Describe, Info, TtsProgram, TtsVoice, TtsVoiceSpeaker
from wyoming.server import AsyncEventHandler, AsyncServer
from wyoming.tts import Synthesize
|
||||
|
||||
# Module-level logger, named after this module.
_LOGGER = logging.getLogger(__name__)

# PCM format announced in every AudioStart/AudioChunk event. The ElevenLabs
# request asks for 24 kHz PCM, so SAMPLE_RATE must stay in step with it.
SAMPLE_RATE: int = 24000  # samples per second
SAMPLE_WIDTH: int = 2  # bytes per sample (int16)
CHANNELS: int = 1  # mono
CHUNK_SECONDS: int = 1  # stream in 1-second chunks
|
||||
|
||||
|
||||
class ElevenLabsEventHandler(AsyncEventHandler):
    """Wyoming event handler that synthesizes speech via the ElevenLabs HTTP API.

    The handler announces its TTS capabilities (an ``Info`` event) as soon as
    it is constructed and again whenever the client sends ``Describe``. Each
    ``Synthesize`` event is answered with ``AudioStart``, a series of
    ``AudioChunk`` events and a final ``AudioStop``.
    """

    # Socket timeout for the ElevenLabs request. Without one, a stalled
    # connection would block an executor thread indefinitely.
    _REQUEST_TIMEOUT_SECONDS = 60

    def __init__(self, default_voice_id: str, default_model: str, api_key: str, speed: float, *args, **kwargs):
        """Store the synthesis defaults and schedule the initial ``Info`` event.

        Args:
            default_voice_id: Voice used when a request names no voice.
            default_model: ElevenLabs model id sent with every request.
            api_key: Value for the ``xi-api-key`` header; may be empty/None,
                in which case every synthesis attempt fails with a log message.
            speed: Stored on the instance. NOTE(review): it is not currently
                forwarded to ElevenLabs, so ``--speed`` has no audible effect.
            *args, **kwargs: Passed through to ``AsyncEventHandler``.
        """
        super().__init__(*args, **kwargs)
        self._default_voice_id = default_voice_id
        self._default_model = default_model
        self._api_key = api_key
        self._speed = speed

        # Send info immediately on connect. Keep a reference to the task so it
        # cannot be garbage-collected mid-flight, and consume its exception so
        # an early client disconnect does not surface as an unretrieved error.
        self._info_task = asyncio.ensure_future(self._send_info())
        self._info_task.add_done_callback(self._log_info_task_result)

    @staticmethod
    def _log_info_task_result(task) -> None:
        """Log (at debug level) a failure of the connect-time ``Info`` send."""
        if task.cancelled():
            return
        exc = task.exception()
        if exc is not None:
            _LOGGER.debug("Could not send initial info event: %s", exc)

    async def _send_info(self):
        """Write an ``Info`` event describing one TTS program with one voice."""
        voice = TtsVoice(
            name=self._default_voice_id,
            description="ElevenLabs Voice",
            attribution=Attribution(name="elevenlabs", url=""),
            installed=True,
            languages=["en-us"],
            version="1.0",
            speakers=[TtsVoiceSpeaker(name=self._default_voice_id)],
        )
        program = TtsProgram(
            name="elevenlabs",
            description="ElevenLabs API TTS",
            attribution=Attribution(
                name="ElevenLabs",
                url="https://elevenlabs.io/",
            ),
            installed=True,
            version="1.0.0",
            voices=[voice],
        )
        await self.write_event(Info(tts=[program]).event())

    async def handle_event(self, event: Event) -> bool:
        """Handle one client event.

        ``Describe`` is answered with ``Info``; ``Synthesize`` is answered
        with the generated audio. Any other event type is ignored.

        Returns:
            Always ``True`` so the connection stays open.
        """
        if Describe.is_type(event.type):
            await self._send_info()
            return True

        if not Synthesize.is_type(event.type):
            return True

        synthesize = Synthesize.from_event(event)
        text = synthesize.text
        voice_id = self._default_voice_id
        if synthesize.voice and synthesize.voice.name:
            voice_id = synthesize.voice.name

        if not text or not text.strip():
            # Nothing to say: skip the API call but still terminate the
            # exchange so the client is not left waiting for audio.
            _LOGGER.warning("Ignoring synthesize request with empty text")
            await self.write_event(AudioStop().event())
            return True

        _LOGGER.debug("Synthesizing %r with voice_id=%s model=%s", text, voice_id, self._default_model)

        try:
            # The HTTP call is blocking, so run it in the default executor.
            loop = asyncio.get_running_loop()
            audio_bytes = await loop.run_in_executor(
                None, self._call_elevenlabs_api, text, voice_id
            )
            if audio_bytes is None:
                raise RuntimeError("Failed to generate audio from ElevenLabs")

            await self._write_audio(audio_bytes)
            _LOGGER.info("Synthesized audio completed")
        except Exception:
            _LOGGER.exception("Synthesis error")
            # Always close the exchange so the client stops waiting.
            await self.write_event(AudioStop().event())

        return True  # keep connection open

    async def _write_audio(self, audio_bytes: bytes) -> None:
        """Stream raw PCM to the client as AudioStart / AudioChunk* / AudioStop."""
        await self.write_event(
            AudioStart(rate=SAMPLE_RATE, width=SAMPLE_WIDTH, channels=CHANNELS).event()
        )

        chunk_size = SAMPLE_RATE * SAMPLE_WIDTH * CHANNELS * CHUNK_SECONDS
        for offset in range(0, len(audio_bytes), chunk_size):
            await self.write_event(
                AudioChunk(
                    rate=SAMPLE_RATE,
                    width=SAMPLE_WIDTH,
                    channels=CHANNELS,
                    audio=audio_bytes[offset : offset + chunk_size],
                ).event()
            )

        await self.write_event(AudioStop().event())

    def _call_elevenlabs_api(self, text: str, voice_id: str) -> Optional[bytes]:
        """Request PCM audio for ``text`` from ElevenLabs (blocking).

        Args:
            text: Text to synthesize.
            voice_id: ElevenLabs voice id; may come straight from the client.

        Returns:
            The raw response body on HTTP 200, or ``None`` on any failure
            (the failure is logged, never raised).
        """
        if not self._api_key:
            _LOGGER.error("ElevenLabs Request Error: ELEVENLABS_API_KEY is not set")
            return None

        # voice_id can be client-supplied; escape it so it cannot alter the
        # request path or query string.
        url = (
            "https://api.elevenlabs.io/v1/text-to-speech/"
            f"{quote(voice_id, safe='')}?output_format=pcm_24000"
        )
        headers = {
            "Accept": "audio/pcm",
            "Content-Type": "application/json",
            "xi-api-key": self._api_key,
        }
        payload = json.dumps(
            {
                "text": text,
                "model_id": self._default_model,
            }
        ).encode("utf-8")

        req = request.Request(url, data=payload, headers=headers, method="POST")
        try:
            with request.urlopen(req, timeout=self._REQUEST_TIMEOUT_SECONDS) as response:
                if response.status == 200:
                    return response.read()
                _LOGGER.error("ElevenLabs API Error: %s", response.status)
                return None
        except error.HTTPError as e:
            body = e.read().decode("utf-8", errors="replace")
            _LOGGER.error("ElevenLabs HTTP Error: %s - %s", e.code, body)
            return None
        except Exception as e:
            _LOGGER.error("ElevenLabs Request Error: %s", e)
            return None
|
||||
|
||||
|
||||
def _read_api_key_from_dotenv(env_path: str) -> Optional[str]:
    """Return the ELEVENLABS_API_KEY value from a ``.env`` file, if present.

    Accepts ``KEY=value``, ``export KEY=value`` and values wrapped in single
    or double quotes. Returns ``None`` when the file is missing or unreadable,
    or when the key is absent or empty.
    """
    prefix = "ELEVENLABS_API_KEY="
    try:
        with open(env_path, "r", encoding="utf-8") as f:
            for line in f:
                entry = line.strip()
                if entry.startswith("export "):
                    entry = entry[len("export "):].lstrip()
                if not entry.startswith(prefix):
                    continue
                value = entry[len(prefix):].strip()
                # Strip one pair of matching surrounding quotes.
                if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
                    value = value[1:-1]
                return value or None
    except FileNotFoundError:
        return None
    except (OSError, UnicodeDecodeError) as err:
        # Best effort only: report why the fallback failed, then carry on.
        _LOGGER.warning("Could not read API key from %s: %s", env_path, err)
    return None


async def main():
    """Parse CLI arguments, resolve the API key and run the Wyoming TTS server.

    The API key is taken from the ``ELEVENLABS_API_KEY`` environment variable,
    falling back to a ``.env`` file three directory levels above this script.
    If no key is found the server still starts, but a warning is logged.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--uri", default="tcp://0.0.0.0:10302")
    parser.add_argument("--voice-id", default="21m00Tcm4TlvDq8ikWAM", help="Default ElevenLabs Voice ID")
    parser.add_argument("--model", default="eleven_monolingual_v1", help="ElevenLabs Model ID")
    parser.add_argument("--speed", type=float, default=1.0)
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.DEBUG if args.debug else logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
    )

    api_key = os.environ.get("ELEVENLABS_API_KEY")
    if not api_key:
        # Try to read from .env file directly if not exported in shell.
        # abspath() keeps the lookup correct when __file__ is a relative path.
        script_path = os.path.abspath(__file__)
        root_dir = os.path.dirname(os.path.dirname(os.path.dirname(script_path)))
        api_key = _read_api_key_from_dotenv(os.path.join(root_dir, ".env"))

    if not api_key:
        _LOGGER.warning("ELEVENLABS_API_KEY environment variable not set. API calls will fail.")

    _LOGGER.info("Starting ElevenLabs Wyoming TTS on %s (voice-id=%s, model=%s)", args.uri, args.voice_id, args.model)

    server = AsyncServer.from_uri(args.uri)

    def handler_factory(reader, writer):
        # One handler per client connection.
        return ElevenLabsEventHandler(args.voice_id, args.model, api_key, args.speed, reader, writer)

    await server.run(handler_factory)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point. Ctrl-C is the normal way to stop the server, so
    # exit quietly instead of printing a KeyboardInterrupt traceback.
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        pass
|
||||
Reference in New Issue
Block a user