#!/usr/bin/env python3
"""Wyoming TTS server backed by Kokoro ONNX.

Usage:
    python wyoming_kokoro_server.py --uri tcp://0.0.0.0:10301 --voice af_heart
"""

import argparse
import asyncio
import json
import logging
import os
import urllib.parse
import urllib.request

import numpy as np

from wyoming.audio import AudioChunk, AudioStart, AudioStop
from wyoming.event import Event
from wyoming.info import Attribution, Info, TtsProgram, TtsVoice, TtsVoiceSpeaker
from wyoming.server import AsyncEventHandler, AsyncServer
from wyoming.tts import Synthesize

_LOGGER = logging.getLogger(__name__)

ACTIVE_TTS_VOICE_PATH = os.path.expanduser("~/homeai-data/active-tts-voice.json")
SAMPLE_RATE = 24000
SAMPLE_WIDTH = 2  # int16
CHANNELS = 1
CHUNK_SECONDS = 1  # stream in 1-second chunks
VTUBE_BRIDGE_URL = "http://localhost:8002"
LIPSYNC_ENABLED = True
LIPSYNC_FRAME_SAMPLES = 1200  # 50ms frames at 24kHz → 20 updates/sec
LIPSYNC_SCALE = 10.0  # amplitude multiplier (tuned for Kokoro output levels)


def _send_lipsync(value: float) -> None:
    """Best-effort POST of a mouth-open value to the vtube bridge.

    The request blocks for up to 0.5 s, so coroutines should run this in an
    executor. Every failure is swallowed on purpose: lip sync is optional and
    must never break speech synthesis.

    Args:
        value: Mouth openness to send as the ``MouthOpen`` parameter.
    """
    try:
        # float() because NumPy scalars (e.g. np.float32) are not JSON
        # serializable and would otherwise fail silently in this try block.
        body = json.dumps({"name": "MouthOpen", "value": float(value)}).encode()
        req = urllib.request.Request(
            f"{VTUBE_BRIDGE_URL}/parameter",
            data=body,
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        # Context manager closes the HTTP response instead of leaking it.
        with urllib.request.urlopen(req, timeout=0.5):
            pass
    except Exception:
        pass  # bridge may not be running


def _compute_lipsync_frames(
    samples_int16: np.ndarray,
    frame_samples: int = LIPSYNC_FRAME_SAMPLES,
) -> list[float]:
    """Compute per-frame RMS amplitude scaled to 0–1 for lip sync.

    Args:
        samples_int16: Mono int16 PCM samples.
        frame_samples: Number of samples per lip sync frame.

    Returns:
        One built-in ``float`` per frame, rounded to 3 decimals and capped at
        1.0. Empty input yields an empty list.

    Raises:
        ValueError: If ``frame_samples`` is not positive.
    """
    if frame_samples <= 0:
        raise ValueError("frame_samples must be positive")

    frames: list[float] = []
    for start in range(0, len(samples_int16), frame_samples):
        frame = samples_int16[start : start + frame_samples].astype(np.float32)
        # Convert to a built-in float so the values stay JSON serializable
        # regardless of NumPy's scalar promotion rules.
        rms = float(np.sqrt(np.mean(frame ** 2))) / 32768.0
        frames.append(round(min(rms * LIPSYNC_SCALE, 1.0), 3))
    return frames


def _get_active_tts_config() -> dict | None:
    """Read the active TTS config set by the OpenClaw bridge.

    Returns:
        The parsed JSON object, or ``None`` when the file is missing,
        unreadable, not valid JSON, or does not contain a JSON object.
    """
    try:
        with open(ACTIVE_TTS_VOICE_PATH, encoding="utf-8") as f:
            config = json.load(f)
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return None
    # Callers use .get(); anything other than a dict would crash them.
    return config if isinstance(config, dict) else None


def _synthesize_elevenlabs(
    text: str, voice_id: str, model: str = "eleven_multilingual_v2"
) -> bytes:
    """Call ElevenLabs TTS API and return raw PCM audio bytes (24kHz 16-bit mono).

    Args:
        text: Text to synthesize.
        voice_id: ElevenLabs voice identifier; percent-encoded into the URL.
        model: ElevenLabs model id sent as ``model_id``.

    Returns:
        The raw response body.

    Raises:
        RuntimeError: If ``ELEVENLABS_API_KEY`` is not set.
        urllib.error.URLError: If the HTTP request fails.
    """
    api_key = os.environ.get("ELEVENLABS_API_KEY", "")
    if not api_key:
        raise RuntimeError("ELEVENLABS_API_KEY not set")

    # Quote the id so unexpected characters cannot alter the request path.
    quoted_voice = urllib.parse.quote(voice_id, safe="")
    url = (
        f"https://api.elevenlabs.io/v1/text-to-speech/{quoted_voice}"
        "?output_format=pcm_24000"
    )
    payload = json.dumps({
        "text": text,
        "model_id": model,
        "voice_settings": {"stability": 0.5, "similarity_boost": 0.75},
    }).encode()
    req = urllib.request.Request(
        url,
        data=payload,
        headers={
            "Content-Type": "application/json",
            "xi-api-key": api_key,
        },
        method="POST",
    )
    with urllib.request.urlopen(req, timeout=30) as resp:
        return resp.read()


def _load_kokoro():
    """Create the Kokoro ONNX engine from the files in ``~/models/kokoro``."""
    # Imported lazily so the dependency is only needed when the model loads.
    from kokoro_onnx import Kokoro

    model_dir = os.path.expanduser("~/models/kokoro")
    return Kokoro(
        os.path.join(model_dir, "kokoro-v1.0.onnx"),
        os.path.join(model_dir, "voices-v1.0.bin"),
    )


class KokoroEventHandler(AsyncEventHandler):
    """Per-connection Wyoming handler that answers ``Synthesize`` events.

    Audio comes from Kokoro by default, or from ElevenLabs when the bridge
    state file selects that engine. Optional lip sync values are posted to the
    vtube bridge while audio chunks are written.
    """

    def __init__(self, tts, default_voice: str, speed: float, *args, **kwargs):
        """Store the engine settings and schedule the initial ``Info`` event.

        Args:
            tts: Engine object exposing ``create(text, voice=..., speed=...)``.
            default_voice: Voice used when nothing else selects one.
            speed: Speed value passed to the Kokoro engine.
            *args: Forwarded to ``AsyncEventHandler``.
            **kwargs: Forwarded to ``AsyncEventHandler``.
        """
        super().__init__(*args, **kwargs)
        self._tts = tts
        self._default_voice = default_voice
        self._speed = speed
        # Send info immediately on connect. The task is stored because the
        # event loop keeps only weak references to tasks; an unreferenced
        # task may be garbage-collected before it finishes.
        self._info_task = asyncio.ensure_future(self._send_info())

    async def _send_info(self):
        """Write an ``Info`` event advertising the default Kokoro voice."""
        info = Info(
            tts=[
                TtsProgram(
                    name="kokoro",
                    description="Kokoro ONNX TTS",
                    attribution=Attribution(
                        name="thewh1teagle/kokoro-onnx",
                        url="https://github.com/thewh1teagle/kokoro-onnx",
                    ),
                    installed=True,
                    voices=[
                        TtsVoice(
                            name=self._default_voice,
                            description="Kokoro voice",
                            attribution=Attribution(name="kokoro", url=""),
                            installed=True,
                            languages=["en-us"],
                            speakers=[TtsVoiceSpeaker(name=self._default_voice)],
                        )
                    ],
                )
            ]
        )
        await self.write_event(info.event())

    def _select_voice(self, synthesize, tts_config: dict | None) -> tuple[bool, str]:
        """Pick the engine and voice for one request.

        Priority: ElevenLabs voice from the bridge state file, Kokoro voice
        from the state file, voice named in the request, then the default.

        Returns:
            ``(use_elevenlabs, voice)``; ``voice`` is never empty when
            ``use_elevenlabs`` is true.
        """
        if tts_config:
            if tts_config.get("engine") == "elevenlabs":
                voice_id = tts_config.get("elevenlabs_voice_id")
                if voice_id:
                    return True, voice_id
                # Without a voice id ElevenLabs cannot be called; use Kokoro
                # with a real voice rather than passing it an empty string.
                _LOGGER.warning(
                    "ElevenLabs selected but no elevenlabs_voice_id set; "
                    "falling back to Kokoro"
                )
            if tts_config.get("kokoro_voice"):
                return False, tts_config["kokoro_voice"]
        if synthesize.voice and synthesize.voice.name:
            return False, synthesize.voice.name
        return False, self._default_voice

    async def _render(
        self, text: str, voice: str, use_elevenlabs: bool, tts_config: dict | None
    ) -> tuple[np.ndarray, bytes, int]:
        """Synthesize ``text`` in a worker thread.

        Returns:
            ``(samples_int16, audio_bytes, sample_rate)``.
        """
        loop = asyncio.get_running_loop()

        if use_elevenlabs:
            # ElevenLabs returns PCM 24kHz 16-bit mono
            _LOGGER.debug("Synthesizing %r with ElevenLabs voice=%s", text, voice)
            model = tts_config.get("elevenlabs_model", "eleven_multilingual_v2")
            _LOGGER.info("Using ElevenLabs TTS (model=%s, voice=%s)", model, voice)
            pcm_bytes = await loop.run_in_executor(
                None, _synthesize_elevenlabs, text, voice, model
            )
            # Drop a trailing partial sample; np.frombuffer rejects buffers
            # whose size is not a multiple of the element size.
            usable = len(pcm_bytes) - len(pcm_bytes) % SAMPLE_WIDTH
            pcm_bytes = pcm_bytes[:usable]
            return np.frombuffer(pcm_bytes, dtype=np.int16), pcm_bytes, SAMPLE_RATE

        _LOGGER.debug(
            "Synthesizing %r with Kokoro voice=%s speed=%.1f", text, voice, self._speed
        )
        samples, sample_rate = await loop.run_in_executor(
            None, lambda: self._tts.create(text, voice=voice, speed=self._speed)
        )
        samples_int16 = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
        # Report the rate the engine actually produced, not an assumed one.
        return samples_int16, samples_int16.tobytes(), int(sample_rate)

    async def _stream(
        self, samples_int16: np.ndarray, audio_bytes: bytes, rate: int
    ) -> int:
        """Write AudioStart, the audio chunks, and AudioStop for one utterance.

        After each audio chunk the lip sync values covering that chunk are
        posted one by one, so a slow bridge delays the following chunk.

        Returns:
            The number of lip sync frames computed for the utterance.
        """
        loop = asyncio.get_running_loop()

        # Keep frames at 50 ms even if the engine's rate differs from 24 kHz.
        frame_samples = max(1, rate * LIPSYNC_FRAME_SAMPLES // SAMPLE_RATE)
        frames_per_chunk = (rate * CHUNK_SECONDS) // frame_samples

        # Pre-compute lip sync frames for the entire utterance
        lipsync_frames: list[float] = []
        if LIPSYNC_ENABLED:
            lipsync_frames = _compute_lipsync_frames(samples_int16, frame_samples)

        await self.write_event(
            AudioStart(rate=rate, width=SAMPLE_WIDTH, channels=CHANNELS).event()
        )

        chunk_size = rate * SAMPLE_WIDTH * CHANNELS * CHUNK_SECONDS
        offsets = range(0, len(audio_bytes), chunk_size)
        for chunk_idx, offset in enumerate(offsets):
            await self.write_event(
                AudioChunk(
                    rate=rate,
                    width=SAMPLE_WIDTH,
                    channels=CHANNELS,
                    audio=audio_bytes[offset : offset + chunk_size],
                ).event()
            )

            # Send lip sync frames for this audio chunk
            first = chunk_idx * frames_per_chunk
            for mouth_val in lipsync_frames[first : first + frames_per_chunk]:
                await loop.run_in_executor(None, _send_lipsync, mouth_val)

        # Close mouth after speech
        if LIPSYNC_ENABLED:
            await loop.run_in_executor(None, _send_lipsync, 0.0)

        await self.write_event(AudioStop().event())
        return len(lipsync_frames)

    async def handle_event(self, event: Event) -> bool:
        """Handle one Wyoming event; only ``Synthesize`` events are acted on.

        Synthesis and streaming errors are logged and answered with an
        ``AudioStop`` so the client is not left waiting.

        Returns:
            Always ``True``, which keeps the connection open.
        """
        if not Synthesize.is_type(event.type):
            return True  # keep connection open

        synthesize = Synthesize.from_event(event)
        text = synthesize.text

        # Bridge state file takes priority (set per-request by OpenClaw bridge)
        tts_config = _get_active_tts_config()
        use_elevenlabs, voice = self._select_voice(synthesize, tts_config)

        try:
            samples_int16, audio_bytes, rate = await self._render(
                text, voice, use_elevenlabs, tts_config
            )
            frame_count = await self._stream(samples_int16, audio_bytes, rate)
            duration = len(samples_int16) / rate if rate else 0.0
            _LOGGER.info(
                "Synthesized %.1fs of audio (%d lipsync frames)", duration, frame_count
            )
        except Exception:
            _LOGGER.exception("Synthesis error")
            if LIPSYNC_ENABLED:
                # Do not leave the avatar's mouth open after a failed stream;
                # _send_lipsync never raises, so this cannot mask the error.
                await asyncio.get_running_loop().run_in_executor(
                    None, _send_lipsync, 0.0
                )
            await self.write_event(AudioStop().event())

        return True  # keep connection open


async def main():
    """Parse CLI options, load Kokoro, and serve Wyoming TTS until stopped."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--uri", default="tcp://0.0.0.0:10301")
    parser.add_argument("--voice", default="af_heart")
    parser.add_argument("--speed", type=float, default=1.0)
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.DEBUG if args.debug else logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
    )

    _LOGGER.info("Loading Kokoro ONNX model...")
    tts = _load_kokoro()
    _LOGGER.info(
        "Kokoro loaded. Starting Wyoming TTS on %s (voice=%s)", args.uri, args.voice
    )

    server = AsyncServer.from_uri(args.uri)

    def handler_factory(reader, writer):
        # One handler per connection; all handlers share the loaded engine.
        return KokoroEventHandler(tts, args.voice, args.speed, reader, writer)

    await server.run(handler_factory)


if __name__ == "__main__":
    asyncio.run(main())