#!/usr/bin/env python3
"""Wyoming TTS server backed by Kokoro ONNX.

Usage:
    python wyoming_kokoro_server.py --uri tcp://0.0.0.0:10301 --voice af_heart
"""

import argparse
import asyncio
import logging
import os

import numpy as np
from wyoming.audio import AudioChunk, AudioStart, AudioStop
from wyoming.event import Event
from wyoming.info import (
    Attribution,
    Describe,
    Info,
    TtsProgram,
    TtsVoice,
    TtsVoiceSpeaker,
)
from wyoming.server import AsyncEventHandler, AsyncServer
from wyoming.tts import Synthesize

_LOGGER = logging.getLogger(__name__)

SAMPLE_RATE = 24000  # fallback rate, used only if the engine reports none
SAMPLE_WIDTH = 2  # int16
CHANNELS = 1
CHUNK_SECONDS = 1  # stream in 1-second chunks
DEFAULT_MODEL_DIR = "~/models/kokoro"


def _load_kokoro(model_dir: str = DEFAULT_MODEL_DIR):
    """Create the Kokoro ONNX engine from the files in *model_dir*.

    Args:
        model_dir: Directory containing ``kokoro-v1.0.onnx`` and
            ``voices-v1.0.bin``. A leading ``~`` is expanded.

    Returns:
        A ``kokoro_onnx.Kokoro`` instance.
    """
    # Imported lazily so argument parsing / --help work without the package.
    from kokoro_onnx import Kokoro

    model_dir = os.path.expanduser(model_dir)
    return Kokoro(
        os.path.join(model_dir, "kokoro-v1.0.onnx"),
        os.path.join(model_dir, "voices-v1.0.bin"),
    )


class KokoroEventHandler(AsyncEventHandler):
    """Per-connection Wyoming handler that serves Kokoro speech synthesis.

    Handles ``Describe`` (replies with ``Info``) and ``Synthesize`` (replies
    with ``AudioStart``, one or more ``AudioChunk`` and ``AudioStop``). All
    other events are ignored and the connection is kept open.
    """

    def __init__(self, tts, default_voice: str, speed: float, *args, **kwargs):
        """Store the synthesis settings and push service info to the client.

        Args:
            tts: Engine exposing ``create(text, voice=..., speed=...)`` that
                returns ``(samples, sample_rate)``.
            default_voice: Voice used when a request does not name one.
            speed: Speed factor passed to the engine.
            *args: Forwarded to ``AsyncEventHandler`` (reader, writer).
            **kwargs: Forwarded to ``AsyncEventHandler``.
        """
        super().__init__(*args, **kwargs)
        self._tts = tts
        self._default_voice = default_voice
        self._speed = speed
        # Send info immediately on connect. The task is stored on the instance
        # because the event loop only holds weak references to tasks; an
        # unreferenced task may be garbage-collected before it finishes.
        self._info_task = asyncio.ensure_future(self._send_info())
        self._info_task.add_done_callback(self._on_info_sent)

    @staticmethod
    def _on_info_sent(task) -> None:
        """Retrieve the on-connect task result so failures are logged."""
        if task.cancelled():
            return
        error = task.exception()
        if error is not None:
            _LOGGER.debug("Could not send initial info: %s", error)

    async def _send_info(self):
        """Write an ``Info`` event describing this TTS program and its voice."""
        # NOTE(review): newer wyoming releases appear to add a `version` field
        # to TtsProgram/TtsVoice -- confirm against the installed version.
        info = Info(
            tts=[
                TtsProgram(
                    name="kokoro",
                    description="Kokoro ONNX TTS",
                    attribution=Attribution(
                        name="thewh1teagle/kokoro-onnx",
                        url="https://github.com/thewh1teagle/kokoro-onnx",
                    ),
                    installed=True,
                    voices=[
                        TtsVoice(
                            name=self._default_voice,
                            description="Kokoro voice",
                            attribution=Attribution(name="kokoro", url=""),
                            installed=True,
                            languages=["en-us"],
                            speakers=[TtsVoiceSpeaker(name=self._default_voice)],
                        )
                    ],
                )
            ]
        )
        await self.write_event(info.event())

    async def _synthesize_pcm(self, text: str, voice: str):
        """Run the engine off the event loop and return ``(pcm_bytes, rate)``.

        The float samples are clipped to [-1.0, 1.0] and scaled to int16.
        """
        loop = asyncio.get_running_loop()
        samples, sample_rate = await loop.run_in_executor(
            None, lambda: self._tts.create(text, voice=voice, speed=self._speed)
        )
        samples_int16 = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
        # Advertise the rate the engine actually produced; fall back to the
        # module constant only if it reports nothing usable.
        rate = int(sample_rate) if sample_rate else SAMPLE_RATE
        return samples_int16.tobytes(), rate

    async def _stream_audio(self, audio_bytes: bytes, rate: int) -> None:
        """Send ``AudioStart``, the PCM data in chunks, then ``AudioStop``."""
        await self.write_event(
            AudioStart(rate=rate, width=SAMPLE_WIDTH, channels=CHANNELS).event()
        )
        chunk_size = rate * SAMPLE_WIDTH * CHANNELS * CHUNK_SECONDS
        for offset in range(0, len(audio_bytes), chunk_size):
            await self.write_event(
                AudioChunk(
                    rate=rate,
                    width=SAMPLE_WIDTH,
                    channels=CHANNELS,
                    audio=audio_bytes[offset : offset + chunk_size],
                ).event()
            )
        await self.write_event(AudioStop().event())

    async def handle_event(self, event: Event) -> bool:
        """Dispatch one incoming Wyoming event.

        Returns:
            ``True`` to keep the connection open, ``False`` to close it (only
            when the client can no longer be written to).
        """
        if Describe.is_type(event.type):
            await self._send_info()
            return True

        if not Synthesize.is_type(event.type):
            return True  # keep connection open

        synthesize = Synthesize.from_event(event)
        text = synthesize.text
        voice = self._default_voice
        if synthesize.voice and synthesize.voice.name:
            voice = synthesize.voice.name

        _LOGGER.debug(
            "Synthesizing %r with voice=%s speed=%.1f", text, voice, self._speed
        )
        try:
            audio_bytes, rate = await self._synthesize_pcm(text, voice)
            await self._stream_audio(audio_bytes, rate)
        except Exception:
            _LOGGER.exception("Synthesis error")
            # Best effort: terminate the audio stream so the client does not
            # wait forever. If the socket itself is gone, close the connection.
            try:
                await self.write_event(AudioStop().event())
            except OSError:
                _LOGGER.debug("Client unreachable after error", exc_info=True)
                return False
            return True

        duration = len(audio_bytes) / (rate * SAMPLE_WIDTH * CHANNELS)
        _LOGGER.info("Synthesized %.1fs of audio", duration)
        return True  # keep connection open


async def main():
    """Parse CLI arguments, load the model and serve until interrupted."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--uri", default="tcp://0.0.0.0:10301")
    parser.add_argument("--voice", default="af_heart")
    parser.add_argument("--speed", type=float, default=1.0)
    parser.add_argument(
        "--model-dir",
        default=DEFAULT_MODEL_DIR,
        help="directory containing kokoro-v1.0.onnx and voices-v1.0.bin",
    )
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.DEBUG if args.debug else logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
    )

    _LOGGER.info("Loading Kokoro ONNX model...")
    tts = _load_kokoro(args.model_dir)
    _LOGGER.info(
        "Kokoro loaded. Starting Wyoming TTS on %s (voice=%s)", args.uri, args.voice
    )

    server = AsyncServer.from_uri(args.uri)

    def handler_factory(reader, writer):
        return KokoroEventHandler(tts, args.voice, args.speed, reader, writer)

    await server.run(handler_factory)


if __name__ == "__main__":
    asyncio.run(main())