#!/usr/bin/env python3
"""Wyoming TTS server backed by ElevenLabs.

Usage:
    python wyoming_elevenlabs_server.py --uri tcp://0.0.0.0:10302 --voice-id 21m00Tcm4TlvDq8ikWAM
"""

import argparse
import asyncio
import io
import json
import logging
import os
import wave
from typing import Optional
from urllib import error, parse, request

from wyoming.audio import AudioChunk, AudioStart, AudioStop
from wyoming.event import Event
from wyoming.info import Attribution, Info, TtsProgram, TtsVoice, TtsVoiceSpeaker
from wyoming.server import AsyncEventHandler, AsyncServer
from wyoming.tts import Synthesize

_LOGGER = logging.getLogger(__name__)

SAMPLE_RATE = 24000
SAMPLE_WIDTH = 2  # int16
CHANNELS = 1
CHUNK_SECONDS = 1  # stream in 1-second chunks

# Upper bound for one ElevenLabs HTTP request; without it a stalled
# connection would occupy an executor thread indefinitely.
API_TIMEOUT_SECONDS = 60

_API_KEY_ENV_VAR = "ELEVENLABS_API_KEY"


class ElevenLabsEventHandler(AsyncEventHandler):
    """Per-connection Wyoming handler that synthesizes speech via ElevenLabs."""

    def __init__(
        self,
        default_voice_id: str,
        default_model: str,
        api_key: str,
        speed: float,
        *args,
        **kwargs,
    ):
        """Store the synthesis settings and schedule the initial info event.

        Args:
            default_voice_id: Voice used when a request does not name one.
            default_model: ElevenLabs model id sent with every request.
            api_key: Value sent in the ``xi-api-key`` header.
            speed: Stored on the handler; not currently forwarded to the API.
            *args: Passed through to ``AsyncEventHandler``.
            **kwargs: Passed through to ``AsyncEventHandler``.
        """
        super().__init__(*args, **kwargs)
        self._default_voice_id = default_voice_id
        self._default_model = default_model
        self._api_key = api_key
        self._speed = speed

        # Send info immediately on connect. Keep a reference to the task so
        # it cannot be garbage-collected before it finishes, and surface any
        # failure instead of leaving the exception unretrieved.
        self._info_task = asyncio.ensure_future(self._send_info())
        self._info_task.add_done_callback(self._log_info_task_result)

    @staticmethod
    def _log_info_task_result(task: "asyncio.Future") -> None:
        """Log a failure of the background info-sending task, if any."""
        if task.cancelled():
            return
        exc = task.exception()
        if exc is not None:
            _LOGGER.warning("Failed to send info event: %s", exc)

    async def _send_info(self):
        """Write an ``Info`` event describing the single configured voice."""
        info = Info(
            tts=[
                TtsProgram(
                    name="elevenlabs",
                    description="ElevenLabs API TTS",
                    attribution=Attribution(
                        name="ElevenLabs",
                        url="https://elevenlabs.io/",
                    ),
                    installed=True,
                    version="1.0.0",
                    voices=[
                        TtsVoice(
                            name=self._default_voice_id,
                            description="ElevenLabs Voice",
                            attribution=Attribution(name="elevenlabs", url=""),
                            installed=True,
                            languages=["en-us"],
                            version="1.0",
                            speakers=[TtsVoiceSpeaker(name=self._default_voice_id)],
                        )
                    ],
                )
            ]
        )
        await self.write_event(info.event())

    async def handle_event(self, event: Event) -> bool:
        """Handle one client event; only ``Synthesize`` events are acted on.

        On success the audio is streamed as ``AudioStart``, a sequence of
        ``AudioChunk`` events of at most ``CHUNK_SECONDS`` each, and
        ``AudioStop``. On failure the error is logged and a lone
        ``AudioStop`` is written.

        Returns:
            Always ``True`` so the connection stays open.
        """
        if not Synthesize.is_type(event.type):
            return True  # keep connection open

        synthesize = Synthesize.from_event(event)
        text = synthesize.text
        voice_id = self._default_voice_id
        if synthesize.voice and synthesize.voice.name:
            voice_id = synthesize.voice.name

        _LOGGER.debug(
            "Synthesizing %r with voice_id=%s model=%s",
            text,
            voice_id,
            self._default_model,
        )

        try:
            # The HTTP call is blocking, so run it in the default executor.
            loop = asyncio.get_running_loop()
            audio_bytes = await loop.run_in_executor(
                None, self._call_elevenlabs_api, text, voice_id
            )
            if audio_bytes is None:
                raise Exception("Failed to generate audio from ElevenLabs")

            await self.write_event(
                AudioStart(rate=SAMPLE_RATE, width=SAMPLE_WIDTH, channels=CHANNELS).event()
            )

            chunk_size = SAMPLE_RATE * SAMPLE_WIDTH * CHANNELS * CHUNK_SECONDS
            for offset in range(0, len(audio_bytes), chunk_size):
                await self.write_event(
                    AudioChunk(
                        rate=SAMPLE_RATE,
                        width=SAMPLE_WIDTH,
                        channels=CHANNELS,
                        audio=audio_bytes[offset : offset + chunk_size],
                    ).event()
                )

            await self.write_event(AudioStop().event())
            _LOGGER.info("Synthesized audio completed")
        except Exception:
            _LOGGER.exception("Synthesis error")
            await self.write_event(AudioStop().event())

        return True  # keep connection open

    def _call_elevenlabs_api(self, text: str, voice_id: str) -> Optional[bytes]:
        """Request raw PCM audio for *text* from the ElevenLabs HTTP API.

        This method blocks on network I/O.

        Args:
            text: Text to synthesize.
            voice_id: ElevenLabs voice id; may originate from the client.

        Returns:
            The response body on HTTP 200, otherwise ``None`` (the failure
            is logged).
        """
        if not self._api_key:
            _LOGGER.error("ElevenLabs Request Error: no API key configured")
            return None

        # voice_id can come straight from the client request, so escape it
        # before placing it in the URL path.
        safe_voice_id = parse.quote(voice_id, safe="")
        url = (
            f"https://api.elevenlabs.io/v1/text-to-speech/{safe_voice_id}"
            f"?output_format=pcm_{SAMPLE_RATE}"
        )
        headers = {
            "Accept": "audio/pcm",
            "Content-Type": "application/json",
            "xi-api-key": self._api_key,
        }
        payload = {
            "text": text,
            "model_id": self._default_model,
        }
        req = request.Request(
            url,
            data=json.dumps(payload).encode("utf-8"),
            headers=headers,
            method="POST",
        )

        try:
            with request.urlopen(req, timeout=API_TIMEOUT_SECONDS) as response:
                if response.status == 200:
                    return response.read()
                _LOGGER.error("ElevenLabs API Error: %s", response.status)
                return None
        except error.HTTPError as e:
            # errors="replace" keeps a non-UTF-8 error body from masking
            # the HTTP failure with a decode error.
            body = e.read().decode("utf-8", errors="replace")
            _LOGGER.error("ElevenLabs HTTP Error: %s - %s", e.code, body)
            return None
        except Exception as e:
            _LOGGER.error("ElevenLabs Request Error: %s", e)
            return None


def _read_api_key_from_dotenv(env_path: str) -> Optional[str]:
    """Return the ``ELEVENLABS_API_KEY`` value from *env_path*, if readable.

    Best effort: a missing or unreadable file yields ``None``. Matching
    surrounding single or double quotes around the value are removed.
    """
    prefix = _API_KEY_ENV_VAR + "="
    try:
        with open(env_path, "r") as env_file:
            for line in env_file:
                if not line.startswith(prefix):
                    continue
                value = line.split("=", 1)[1].strip()
                if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
                    value = value[1:-1]
                return value
    except FileNotFoundError:
        return None
    except (OSError, UnicodeDecodeError) as err:
        _LOGGER.debug("Could not read %s: %s", env_path, err)
    return None


async def main():
    """Parse command-line arguments and run the Wyoming server."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--uri", default="tcp://0.0.0.0:10302")
    parser.add_argument(
        "--voice-id",
        default="21m00Tcm4TlvDq8ikWAM",
        help="Default ElevenLabs Voice ID",
    )
    parser.add_argument(
        "--model",
        default="eleven_monolingual_v1",
        help="ElevenLabs Model ID",
    )
    parser.add_argument("--speed", type=float, default=1.0)
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.DEBUG if args.debug else logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
    )

    api_key = os.environ.get(_API_KEY_ENV_VAR)
    if not api_key:
        # Try to read from .env file directly if not exported in shell.
        # The file is looked up three directory levels above this script;
        # abspath keeps that lookup independent of the working directory.
        script_path = os.path.abspath(__file__)
        base_dir = os.path.dirname(os.path.dirname(os.path.dirname(script_path)))
        api_key = _read_api_key_from_dotenv(os.path.join(base_dir, ".env"))

    if not api_key:
        _LOGGER.warning(
            "ELEVENLABS_API_KEY environment variable not set. API calls will fail."
        )

    _LOGGER.info(
        "Starting ElevenLabs Wyoming TTS on %s (voice-id=%s, model=%s)",
        args.uri,
        args.voice_id,
        args.model,
    )

    server = AsyncServer.from_uri(args.uri)

    def handler_factory(reader, writer):
        return ElevenLabsEventHandler(
            args.voice_id, args.model, api_key, args.speed, reader, writer
        )

    await server.run(handler_factory)


if __name__ == "__main__":
    asyncio.run(main())