# homeai/homeai-voice/tts/wyoming_elevenlabs_server.py

#!/usr/bin/env python3
"""Wyoming TTS server backed by ElevenLabs.

Usage:
    python wyoming_elevenlabs_server.py --uri tcp://0.0.0.0:10302 --voice-id 21m00Tcm4TlvDq8ikWAM
"""

import argparse
import asyncio
import logging
import os
import wave
import io
from urllib import request, error

from wyoming.audio import AudioChunk, AudioStart, AudioStop
from wyoming.event import Event
from wyoming.info import Attribution, Info, TtsProgram, TtsVoice, TtsVoiceSpeaker
from wyoming.server import AsyncEventHandler, AsyncServer
from wyoming.tts import Synthesize

# Module-level logger, named after this module.
_LOGGER = logging.getLogger(__name__)

# PCM format advertised to Wyoming clients in AudioStart / AudioChunk events.
# These values must agree with the PCM output format requested from ElevenLabs.
SAMPLE_RATE = 24000  # samples per second
SAMPLE_WIDTH = 2  # bytes per sample (int16)
CHANNELS = 1  # single channel (mono)
CHUNK_SECONDS = 1  # stream in 1-second chunks


class ElevenLabsEventHandler(AsyncEventHandler):
    """Wyoming event handler that synthesizes speech via the ElevenLabs REST API.

    For every ``Synthesize`` event the handler performs one blocking HTTP
    request (run in the default executor so the event loop stays responsive)
    and replies with ``AudioStart``, one or more ``AudioChunk`` events and a
    final ``AudioStop``.

    Args:
        default_voice_id: Voice used when the request does not name one.
        default_model: ElevenLabs model id sent with every request.
        api_key: Value for the ``xi-api-key`` header.
        speed: Stored on the instance; it is not currently forwarded to the
            ElevenLabs API.
        *args, **kwargs: Passed through to ``AsyncEventHandler``.
    """

    # Timeout (seconds) applied to the blocking socket operations of the HTTP
    # request, so a stalled connection cannot pin an executor thread forever.
    _REQUEST_TIMEOUT_SECONDS = 60

    def __init__(self, default_voice_id: str, default_model: str, api_key: str, speed: float, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._default_voice_id = default_voice_id
        self._default_model = default_model
        self._api_key = api_key
        self._speed = speed

        # Send info immediately on connect. The task is kept on the instance
        # because the event loop only holds weak references to tasks; an
        # unreferenced task may be garbage-collected before it finishes.
        self._info_task = asyncio.ensure_future(self._send_info())

    async def _send_info(self):
        """Write an ``Info`` event describing this TTS program and its default voice."""
        info = Info(
            tts=[
                TtsProgram(
                    name="elevenlabs",
                    description="ElevenLabs API TTS",
                    attribution=Attribution(
                        name="ElevenLabs",
                        url="https://elevenlabs.io/",
                    ),
                    installed=True,
                    version="1.0.0",
                    voices=[
                        TtsVoice(
                            name=self._default_voice_id,
                            description="ElevenLabs Voice",
                            attribution=Attribution(name="elevenlabs", url=""),
                            installed=True,
                            languages=["en-us"],
                            version="1.0",
                            speakers=[TtsVoiceSpeaker(name=self._default_voice_id)],
                        )
                    ],
                )
            ]
        )
        await self.write_event(info.event())

    async def handle_event(self, event: Event) -> bool:
        """Handle one incoming Wyoming event.

        Only ``Synthesize`` events are acted upon; everything else is ignored.

        Returns:
            Always ``True`` so the connection stays open.
        """
        if not Synthesize.is_type(event.type):
            return True  # keep connection open

        synthesize = Synthesize.from_event(event)
        text = synthesize.text
        voice_id = self._default_voice_id

        # A voice named in the request overrides the configured default.
        if synthesize.voice and synthesize.voice.name:
            voice_id = synthesize.voice.name

        _LOGGER.debug("Synthesizing %r with voice_id=%s model=%s", text, voice_id, self._default_model)

        try:
            # The HTTP call is blocking, so run it off the event loop.
            loop = asyncio.get_running_loop()
            audio_bytes = await loop.run_in_executor(
                None, self._call_elevenlabs_api, text, voice_id
            )

            if audio_bytes is None:
                raise RuntimeError("Failed to generate audio from ElevenLabs")

            await self.write_event(
                AudioStart(rate=SAMPLE_RATE, width=SAMPLE_WIDTH, channels=CHANNELS).event()
            )

            # chunk_size is a whole number of frames, so no sample is split
            # across two chunks.
            chunk_size = SAMPLE_RATE * SAMPLE_WIDTH * CHANNELS * CHUNK_SECONDS
            for offset in range(0, len(audio_bytes), chunk_size):
                await self.write_event(
                    AudioChunk(
                        rate=SAMPLE_RATE,
                        width=SAMPLE_WIDTH,
                        channels=CHANNELS,
                        audio=audio_bytes[offset : offset + chunk_size],
                    ).event()
                )

            await self.write_event(AudioStop().event())
            _LOGGER.info("Synthesized audio completed")

        except Exception:
            # Always terminate the audio stream so the client is not left
            # waiting for data that will never arrive.
            _LOGGER.exception("Synthesis error")
            await self.write_event(AudioStop().event())

        return True  # keep connection open

    def _call_elevenlabs_api(self, text: str, voice_id: str) -> "bytes | None":
        """Request raw PCM audio for *text* from ElevenLabs (blocking).

        Args:
            text: Text to synthesize.
            voice_id: ElevenLabs voice id; it may originate from the client,
                so it is percent-encoded before being placed in the URL path.

        Returns:
            The response body on HTTP 200, otherwise ``None``. Failures are
            logged rather than raised.
        """
        import json
        from urllib.parse import quote

        if not self._api_key:
            _LOGGER.error("ElevenLabs API key is not configured; cannot synthesize")
            return None

        # safe="" also escapes "/", "?" and "#", so a crafted voice name cannot
        # change the request path or inject query parameters.
        quoted_voice_id = quote(voice_id, safe="")
        # Derive the requested PCM rate from SAMPLE_RATE so the audio returned
        # always matches what is announced in AudioStart.
        url = (
            f"https://api.elevenlabs.io/v1/text-to-speech/{quoted_voice_id}"
            f"?output_format=pcm_{SAMPLE_RATE}"
        )

        headers = {
            "Accept": "audio/pcm",
            "Content-Type": "application/json",
            "xi-api-key": self._api_key,
        }

        data = {
            "text": text,
            "model_id": self._default_model,
        }

        req = request.Request(url, data=json.dumps(data).encode("utf-8"), headers=headers, method="POST")
        try:
            with request.urlopen(req, timeout=self._REQUEST_TIMEOUT_SECONDS) as response:
                if response.status == 200:
                    return response.read()
                _LOGGER.error("ElevenLabs API Error: %s", response.status)
                return None
        except error.HTTPError as e:
            # errors="replace": a non-UTF-8 error body must not raise from
            # inside this handler and hide the real HTTP failure.
            body = e.read().decode("utf-8", errors="replace")
            _LOGGER.error("ElevenLabs HTTP Error: %s - %s", e.code, body)
            return None
        except Exception as e:
            # Covers timeouts, DNS/connection failures and similar.
            _LOGGER.error("ElevenLabs Request Error: %s", e)
            return None


def _read_api_key_from_env_file(env_path: str) -> "str | None":
    """Return the ``ELEVENLABS_API_KEY`` value found in a dotenv-style file.

    The first matching line wins. An optional leading ``export`` and one pair
    of matching surrounding quotes are removed from the value.

    Returns:
        The key, or ``None`` when the file is missing, unreadable, or holds no
        non-empty ``ELEVENLABS_API_KEY`` entry.
    """
    prefix = "ELEVENLABS_API_KEY="
    try:
        with open(env_path, "r", encoding="utf-8") as f:
            for line in f:
                entry = line.strip()
                if entry.startswith("export "):
                    entry = entry[len("export "):].lstrip()
                if not entry.startswith(prefix):
                    continue
                value = entry[len(prefix):].strip()
                # KEY="abc" and KEY='abc' are common dotenv spellings; the
                # quotes are not part of the secret.
                if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
                    value = value[1:-1]
                return value or None
    except FileNotFoundError:
        # The .env file is optional.
        return None
    except (OSError, UnicodeDecodeError) as err:
        # Best-effort fallback: report the problem instead of hiding it.
        _LOGGER.warning("Could not read API key from %s: %s", env_path, err)
    return None


async def main():
    """Parse command-line options, resolve the API key and run the Wyoming server.

    The API key is taken from the ``ELEVENLABS_API_KEY`` environment variable,
    falling back to a ``.env`` file located three directories above this
    script. A missing key only produces a warning; the server still starts.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--uri", default="tcp://0.0.0.0:10302")
    parser.add_argument("--voice-id", default="21m00Tcm4TlvDq8ikWAM", help="Default ElevenLabs Voice ID")
    parser.add_argument("--model", default="eleven_monolingual_v1", help="ElevenLabs Model ID")
    parser.add_argument("--speed", type=float, default=1.0)
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.DEBUG if args.debug else logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
    )

    api_key = os.environ.get("ELEVENLABS_API_KEY")
    if not api_key:
        # Try to read from .env file directly if not exported in shell.
        # abspath() matters: __file__ can be a bare relative name, in which
        # case repeated dirname() calls would collapse to "" and the lookup
        # would silently use the current working directory instead.
        script_path = os.path.abspath(__file__)
        env_dir = os.path.dirname(os.path.dirname(os.path.dirname(script_path)))
        api_key = _read_api_key_from_env_file(os.path.join(env_dir, ".env"))

    if not api_key:
        _LOGGER.warning("ELEVENLABS_API_KEY environment variable not set. API calls will fail.")

    _LOGGER.info("Starting ElevenLabs Wyoming TTS on %s (voice-id=%s, model=%s)", args.uri, args.voice_id, args.model)

    server = AsyncServer.from_uri(args.uri)

    def handler_factory(reader, writer):
        # Builds the event handler for one client connection.
        return ElevenLabsEventHandler(args.voice_id, args.model, api_key, args.speed, reader, writer)

    await server.run(handler_factory)


# Start the server only when this file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())