Character schema v2: background, dialogue_style, appearance, skills, gaze_presets with automatic v1→v2 migration. LLM-assisted character creation via Character MCP server. Two-tier memory system (personal per-character + general shared) with budget-based injection into LLM system prompt. Per-character TTS voice routing via state file — Wyoming TTS server reads active config to route between Kokoro (local) and ElevenLabs (cloud PCM 24kHz). Dashboard: memories page, conversation history, character profile on cards, auto-TTS engine selection from character config. Also includes VTube Studio expression bridge and ComfyUI API guide. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
255 lines
9.5 KiB
Python
255 lines
9.5 KiB
Python
#!/usr/bin/env python3
"""Wyoming TTS server backed by Kokoro ONNX.

Usage:
    python wyoming_kokoro_server.py --uri tcp://0.0.0.0:10301 --voice af_heart
"""
|
||
|
||
import argparse
|
||
import asyncio
|
||
import json
|
||
import logging
|
||
import os
|
||
import urllib.request
|
||
|
||
import numpy as np
|
||
|
||
from wyoming.audio import AudioChunk, AudioStart, AudioStop
|
||
from wyoming.event import Event
|
||
from wyoming.info import Attribution, Info, TtsProgram, TtsVoice, TtsVoiceSpeaker
|
||
from wyoming.server import AsyncEventHandler, AsyncServer
|
||
from wyoming.tts import Synthesize
|
||
|
||
_LOGGER = logging.getLogger(__name__)

# State file written by the OpenClaw bridge; read per-request to route
# between the local Kokoro engine and ElevenLabs cloud TTS.
ACTIVE_TTS_VOICE_PATH = os.path.expanduser("~/homeai-data/active-tts-voice.json")

# Output audio format advertised to Wyoming clients: 24 kHz, 16-bit, mono.
SAMPLE_RATE = 24000
SAMPLE_WIDTH = 2  # int16
CHANNELS = 1
CHUNK_SECONDS = 1  # stream in 1-second chunks

# Lip-sync: mouth-open values are POSTed to the VTube Studio bridge
# while audio chunks are streamed.
VTUBE_BRIDGE_URL = "http://localhost:8002"
LIPSYNC_ENABLED = True
LIPSYNC_FRAME_SAMPLES = 1200  # 50ms frames at 24kHz → 20 updates/sec
LIPSYNC_SCALE = 10.0  # amplitude multiplier (tuned for Kokoro output levels)
|
||
|
||
|
||
def _send_lipsync(value: float):
|
||
"""Fire-and-forget POST to vtube-bridge with mouth open value."""
|
||
try:
|
||
body = json.dumps({"name": "MouthOpen", "value": value}).encode()
|
||
req = urllib.request.Request(
|
||
f"{VTUBE_BRIDGE_URL}/parameter",
|
||
data=body,
|
||
headers={"Content-Type": "application/json"},
|
||
method="POST",
|
||
)
|
||
urllib.request.urlopen(req, timeout=0.5)
|
||
except Exception:
|
||
pass # bridge may not be running
|
||
|
||
|
||
def _compute_lipsync_frames(samples_int16: np.ndarray) -> list[float]:
|
||
"""Compute per-frame RMS amplitude scaled to 0–1 for lip sync."""
|
||
frames = []
|
||
for i in range(0, len(samples_int16), LIPSYNC_FRAME_SAMPLES):
|
||
frame = samples_int16[i : i + LIPSYNC_FRAME_SAMPLES].astype(np.float32)
|
||
rms = np.sqrt(np.mean(frame ** 2)) / 32768.0
|
||
mouth = min(rms * LIPSYNC_SCALE, 1.0)
|
||
frames.append(round(mouth, 3))
|
||
return frames
|
||
|
||
|
||
def _get_active_tts_config() -> dict | None:
|
||
"""Read the active TTS config set by the OpenClaw bridge."""
|
||
try:
|
||
with open(ACTIVE_TTS_VOICE_PATH) as f:
|
||
return json.load(f)
|
||
except Exception:
|
||
return None
|
||
|
||
|
||
def _synthesize_elevenlabs(text: str, voice_id: str, model: str = "eleven_multilingual_v2") -> bytes:
|
||
"""Call ElevenLabs TTS API and return raw PCM audio bytes (24kHz 16-bit mono)."""
|
||
api_key = os.environ.get("ELEVENLABS_API_KEY", "")
|
||
if not api_key:
|
||
raise RuntimeError("ELEVENLABS_API_KEY not set")
|
||
|
||
url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}?output_format=pcm_24000"
|
||
payload = json.dumps({
|
||
"text": text,
|
||
"model_id": model,
|
||
"voice_settings": {"stability": 0.5, "similarity_boost": 0.75},
|
||
}).encode()
|
||
|
||
req = urllib.request.Request(
|
||
url,
|
||
data=payload,
|
||
headers={
|
||
"Content-Type": "application/json",
|
||
"xi-api-key": api_key,
|
||
},
|
||
method="POST",
|
||
)
|
||
with urllib.request.urlopen(req, timeout=30) as resp:
|
||
return resp.read()
|
||
|
||
|
||
def _load_kokoro():
    """Instantiate the Kokoro ONNX engine from the local model directory.

    Import is deferred so the module can be inspected without the
    kokoro_onnx package installed.
    """
    from kokoro_onnx import Kokoro

    base = os.path.expanduser("~/models/kokoro")
    model_path = os.path.join(base, "kokoro-v1.0.onnx")
    voices_path = os.path.join(base, "voices-v1.0.bin")
    return Kokoro(model_path, voices_path)
|
||
|
||
|
||
class KokoroEventHandler(AsyncEventHandler):
    """Wyoming event handler: synthesizes speech with Kokoro or ElevenLabs.

    Engine/voice selection order for each Synthesize request:
      1. The bridge state file (ACTIVE_TTS_VOICE_PATH) — per-character routing.
      2. The voice named in the Synthesize event.
      3. The server-wide default voice.

    Audio is streamed as 24kHz/16-bit/mono chunks; lip-sync mouth values are
    POSTed to the VTube Studio bridge alongside each chunk.
    """

    def __init__(self, tts, default_voice: str, speed: float, *args, **kwargs):
        """Store the shared Kokoro engine and per-server defaults.

        Args:
            tts: Loaded Kokoro engine (shared across connections).
            default_voice: Kokoro voice name used when nothing overrides it.
            speed: Kokoro speaking-rate multiplier.
        """
        super().__init__(*args, **kwargs)
        self._tts = tts
        self._default_voice = default_voice
        self._speed = speed

        # Send info immediately on connect (fire-and-forget task).
        asyncio.ensure_future(self._send_info())

    async def _send_info(self):
        """Advertise this server's single TTS program/voice to the client."""
        info = Info(
            tts=[
                TtsProgram(
                    name="kokoro",
                    description="Kokoro ONNX TTS",
                    attribution=Attribution(
                        name="thewh1teagle/kokoro-onnx",
                        url="https://github.com/thewh1teagle/kokoro-onnx",
                    ),
                    installed=True,
                    voices=[
                        TtsVoice(
                            name=self._default_voice,
                            description="Kokoro voice",
                            attribution=Attribution(name="kokoro", url=""),
                            installed=True,
                            languages=["en-us"],
                            speakers=[TtsVoiceSpeaker(name=self._default_voice)],
                        )
                    ],
                )
            ]
        )
        await self.write_event(info.event())

    async def handle_event(self, event: Event) -> bool:
        """Handle one Wyoming event; synthesize audio on Synthesize requests.

        Always returns True so the connection stays open. On synthesis
        failure an AudioStop is still written so the client is not left
        waiting mid-stream.
        """
        if Synthesize.is_type(event.type):
            synthesize = Synthesize.from_event(event)
            text = synthesize.text
            voice = self._default_voice
            use_elevenlabs = False

            # Bridge state file takes priority (set per-request by OpenClaw bridge)
            tts_config = _get_active_tts_config()
            if tts_config and tts_config.get("engine") == "elevenlabs":
                voice = tts_config.get("elevenlabs_voice_id", "")
                if voice:
                    use_elevenlabs = True
                    _LOGGER.debug("Synthesizing %r with ElevenLabs voice=%s", text, voice)
                else:
                    # Bug fix: an empty elevenlabs_voice_id previously fell
                    # through to Kokoro with voice="" and failed synthesis.
                    # Fall back to the default Kokoro voice instead.
                    _LOGGER.warning(
                        "ElevenLabs engine selected but no elevenlabs_voice_id; "
                        "falling back to Kokoro default voice"
                    )
                    voice = self._default_voice
            elif tts_config and tts_config.get("kokoro_voice"):
                voice = tts_config["kokoro_voice"]
            elif synthesize.voice and synthesize.voice.name:
                voice = synthesize.voice.name

            try:
                # get_running_loop() — we are inside a coroutine, and
                # get_event_loop() is deprecated here since Python 3.10.
                loop = asyncio.get_running_loop()

                if use_elevenlabs:
                    # ElevenLabs returns PCM 24kHz 16-bit mono
                    model = tts_config.get("elevenlabs_model", "eleven_multilingual_v2")
                    _LOGGER.info("Using ElevenLabs TTS (model=%s, voice=%s)", model, voice)
                    pcm_bytes = await loop.run_in_executor(
                        None, lambda: _synthesize_elevenlabs(text, voice, model)
                    )
                    samples_int16 = np.frombuffer(pcm_bytes, dtype=np.int16)
                    audio_bytes = pcm_bytes
                else:
                    _LOGGER.debug("Synthesizing %r with Kokoro voice=%s speed=%.1f", text, voice, self._speed)
                    samples, sample_rate = await loop.run_in_executor(
                        None, lambda: self._tts.create(text, voice=voice, speed=self._speed)
                    )
                    # NOTE(review): the sample_rate Kokoro returns is assumed to
                    # equal SAMPLE_RATE (24000) — AudioStart below always
                    # advertises 24kHz. Confirm against the kokoro_onnx docs.
                    samples_int16 = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
                    audio_bytes = samples_int16.tobytes()

                # Pre-compute lip sync frames for the entire utterance
                lipsync_frames = []
                if LIPSYNC_ENABLED:
                    lipsync_frames = _compute_lipsync_frames(samples_int16)

                await self.write_event(
                    AudioStart(rate=SAMPLE_RATE, width=SAMPLE_WIDTH, channels=CHANNELS).event()
                )

                chunk_size = SAMPLE_RATE * SAMPLE_WIDTH * CHANNELS * CHUNK_SECONDS
                lipsync_idx = 0
                samples_per_chunk = SAMPLE_RATE * CHUNK_SECONDS
                frames_per_chunk = samples_per_chunk // LIPSYNC_FRAME_SAMPLES

                for i in range(0, len(audio_bytes), chunk_size):
                    await self.write_event(
                        AudioChunk(
                            rate=SAMPLE_RATE,
                            width=SAMPLE_WIDTH,
                            channels=CHANNELS,
                            audio=audio_bytes[i : i + chunk_size],
                        ).event()
                    )

                    # Send lip sync frames for this audio chunk; the blocking
                    # HTTP POSTs run in the default executor so streaming
                    # isn't stalled by a slow/absent bridge.
                    if LIPSYNC_ENABLED and lipsync_frames:
                        chunk_frames = lipsync_frames[lipsync_idx : lipsync_idx + frames_per_chunk]
                        for mouth_val in chunk_frames:
                            await loop.run_in_executor(None, _send_lipsync, mouth_val)
                        lipsync_idx += frames_per_chunk

                # Close mouth after speech
                if LIPSYNC_ENABLED:
                    await loop.run_in_executor(None, _send_lipsync, 0.0)

                await self.write_event(AudioStop().event())
                duration = len(samples_int16) / SAMPLE_RATE
                _LOGGER.info("Synthesized %.1fs of audio (%d lipsync frames)", duration, len(lipsync_frames))

            except Exception:
                _LOGGER.exception("Synthesis error")
                # Still terminate the audio stream cleanly for the client.
                await self.write_event(AudioStop().event())

        return True  # keep connection open
|
||
|
||
|
||
async def main():
    """Parse CLI options, load the Kokoro model, and serve Wyoming TTS forever."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--uri", default="tcp://0.0.0.0:10301")
    parser.add_argument("--voice", default="af_heart")
    parser.add_argument("--speed", type=float, default=1.0)
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()

    log_level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
    )

    _LOGGER.info("Loading Kokoro ONNX model...")
    tts = _load_kokoro()
    _LOGGER.info("Kokoro loaded. Starting Wyoming TTS on %s (voice=%s)", args.uri, args.voice)

    server = AsyncServer.from_uri(args.uri)

    def handler_factory(reader, writer):
        # Each client connection gets its own handler sharing the one loaded model.
        return KokoroEventHandler(tts, args.voice, args.speed, reader, writer)

    await server.run(handler_factory)
|
||
|
||
|
||
if __name__ == "__main__":
    # Script entry point: run the async server until interrupted.
    asyncio.run(main())
|