Character schema v2: background, dialogue_style, appearance, skills, gaze_presets with automatic v1→v2 migration. LLM-assisted character creation via Character MCP server. Two-tier memory system (personal per-character + general shared) with budget-based injection into LLM system prompt. Per-character TTS voice routing via state file — Wyoming TTS server reads active config to route between Kokoro (local) and ElevenLabs (cloud PCM 24kHz). Dashboard: memories page, conversation history, character profile on cards, auto-TTS engine selection from character config. Also includes VTube Studio expression bridge and ComfyUI API guide. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
255 lines
9.5 KiB
Python
255 lines
9.5 KiB
Python
#!/usr/bin/env python3
"""Wyoming TTS server backed by Kokoro ONNX.

Usage:
    python wyoming_kokoro_server.py --uri tcp://0.0.0.0:10301 --voice af_heart
"""
|
||
|
||
import argparse
|
||
import asyncio
|
||
import json
|
||
import logging
|
||
import os
|
||
import urllib.request
|
||
|
||
import numpy as np
|
||
|
||
from wyoming.audio import AudioChunk, AudioStart, AudioStop
|
||
from wyoming.event import Event
|
||
from wyoming.info import Attribution, Info, TtsProgram, TtsVoice, TtsVoiceSpeaker
|
||
from wyoming.server import AsyncEventHandler, AsyncServer
|
||
from wyoming.tts import Synthesize
|
||
|
||
_LOGGER = logging.getLogger(__name__)

# State file written by the OpenClaw bridge; read per-request to route
# between the local Kokoro engine and ElevenLabs cloud TTS.
ACTIVE_TTS_VOICE_PATH = os.path.expanduser("~/homeai-data/active-tts-voice.json")

# Output audio format advertised to Wyoming clients: 24 kHz, 16-bit, mono.
SAMPLE_RATE = 24000
SAMPLE_WIDTH = 2  # int16
CHANNELS = 1
CHUNK_SECONDS = 1  # stream in 1-second chunks

# Lip-sync: mouth-open values are POSTed to the VTube Studio bridge
# while audio chunks are streamed.
VTUBE_BRIDGE_URL = "http://localhost:8002"
LIPSYNC_ENABLED = True
LIPSYNC_FRAME_SAMPLES = 1200  # 50ms frames at 24kHz → 20 updates/sec
LIPSYNC_SCALE = 10.0  # amplitude multiplier (tuned for Kokoro output levels)
|
||
|
||
|
||
def _send_lipsync(value: float):
|
||
"""Fire-and-forget POST to vtube-bridge with mouth open value."""
|
||
try:
|
||
body = json.dumps({"name": "MouthOpen", "value": value}).encode()
|
||
req = urllib.request.Request(
|
||
f"{VTUBE_BRIDGE_URL}/parameter",
|
||
data=body,
|
||
headers={"Content-Type": "application/json"},
|
||
method="POST",
|
||
)
|
||
urllib.request.urlopen(req, timeout=0.5)
|
||
except Exception:
|
||
pass # bridge may not be running
|
||
|
||
|
||
def _compute_lipsync_frames(samples_int16: np.ndarray) -> list[float]:
|
||
"""Compute per-frame RMS amplitude scaled to 0–1 for lip sync."""
|
||
frames = []
|
||
for i in range(0, len(samples_int16), LIPSYNC_FRAME_SAMPLES):
|
||
frame = samples_int16[i : i + LIPSYNC_FRAME_SAMPLES].astype(np.float32)
|
||
rms = np.sqrt(np.mean(frame ** 2)) / 32768.0
|
||
mouth = min(rms * LIPSYNC_SCALE, 1.0)
|
||
frames.append(round(mouth, 3))
|
||
return frames
|
||
|
||
|
||
def _get_active_tts_config() -> dict | None:
|
||
"""Read the active TTS config set by the OpenClaw bridge."""
|
||
try:
|
||
with open(ACTIVE_TTS_VOICE_PATH) as f:
|
||
return json.load(f)
|
||
except Exception:
|
||
return None
|
||
|
||
|
||
def _synthesize_elevenlabs(text: str, voice_id: str, model: str = "eleven_multilingual_v2") -> bytes:
|
||
"""Call ElevenLabs TTS API and return raw PCM audio bytes (24kHz 16-bit mono)."""
|
||
api_key = os.environ.get("ELEVENLABS_API_KEY", "")
|
||
if not api_key:
|
||
raise RuntimeError("ELEVENLABS_API_KEY not set")
|
||
|
||
url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}?output_format=pcm_24000"
|
||
payload = json.dumps({
|
||
"text": text,
|
||
"model_id": model,
|
||
"voice_settings": {"stability": 0.5, "similarity_boost": 0.75},
|
||
}).encode()
|
||
|
||
req = urllib.request.Request(
|
||
url,
|
||
data=payload,
|
||
headers={
|
||
"Content-Type": "application/json",
|
||
"xi-api-key": api_key,
|
||
},
|
||
method="POST",
|
||
)
|
||
with urllib.request.urlopen(req, timeout=30) as resp:
|
||
return resp.read()
|
||
|
||
|
||
def _load_kokoro():
    """Instantiate the Kokoro ONNX engine from the local model directory.

    Import is deferred so the module can be inspected without the
    kokoro_onnx package installed.
    """
    from kokoro_onnx import Kokoro

    base = os.path.expanduser("~/models/kokoro")
    model_path = os.path.join(base, "kokoro-v1.0.onnx")
    voices_path = os.path.join(base, "voices-v1.0.bin")
    return Kokoro(model_path, voices_path)
|
||
|
||
|
||
class KokoroEventHandler(AsyncEventHandler):
    """Wyoming event handler: synthesizes speech with Kokoro or ElevenLabs.

    Engine/voice selection order for each Synthesize request:
      1. The bridge state file (ACTIVE_TTS_VOICE_PATH) — per-character routing.
      2. The voice named in the Synthesize event.
      3. The server-wide default voice.

    Audio is streamed as 24kHz/16-bit/mono chunks; lip-sync mouth values are
    POSTed to the VTube Studio bridge alongside each chunk.
    """

    def __init__(self, tts, default_voice: str, speed: float, *args, **kwargs):
        """Store the shared Kokoro engine and per-server defaults.

        Args:
            tts: Loaded Kokoro engine (shared across connections).
            default_voice: Kokoro voice name used when nothing overrides it.
            speed: Kokoro speaking-rate multiplier.
        """
        super().__init__(*args, **kwargs)
        self._tts = tts
        self._default_voice = default_voice
        self._speed = speed

        # Send info immediately on connect (fire-and-forget task).
        asyncio.ensure_future(self._send_info())

    async def _send_info(self):
        """Advertise this server's single TTS program/voice to the client."""
        info = Info(
            tts=[
                TtsProgram(
                    name="kokoro",
                    description="Kokoro ONNX TTS",
                    attribution=Attribution(
                        name="thewh1teagle/kokoro-onnx",
                        url="https://github.com/thewh1teagle/kokoro-onnx",
                    ),
                    installed=True,
                    voices=[
                        TtsVoice(
                            name=self._default_voice,
                            description="Kokoro voice",
                            attribution=Attribution(name="kokoro", url=""),
                            installed=True,
                            languages=["en-us"],
                            speakers=[TtsVoiceSpeaker(name=self._default_voice)],
                        )
                    ],
                )
            ]
        )
        await self.write_event(info.event())

    async def handle_event(self, event: Event) -> bool:
        """Handle one Wyoming event; synthesize audio on Synthesize requests.

        Always returns True so the connection stays open. On synthesis
        failure an AudioStop is still written so the client is not left
        waiting mid-stream.
        """
        if Synthesize.is_type(event.type):
            synthesize = Synthesize.from_event(event)
            text = synthesize.text
            voice = self._default_voice
            use_elevenlabs = False

            # Bridge state file takes priority (set per-request by OpenClaw bridge)
            tts_config = _get_active_tts_config()
            if tts_config and tts_config.get("engine") == "elevenlabs":
                voice = tts_config.get("elevenlabs_voice_id", "")
                if voice:
                    use_elevenlabs = True
                    _LOGGER.debug("Synthesizing %r with ElevenLabs voice=%s", text, voice)
                else:
                    # Bug fix: an empty elevenlabs_voice_id previously fell
                    # through to Kokoro with voice="" and failed synthesis.
                    # Fall back to the default Kokoro voice instead.
                    _LOGGER.warning(
                        "ElevenLabs engine selected but no elevenlabs_voice_id; "
                        "falling back to Kokoro default voice"
                    )
                    voice = self._default_voice
            elif tts_config and tts_config.get("kokoro_voice"):
                voice = tts_config["kokoro_voice"]
            elif synthesize.voice and synthesize.voice.name:
                voice = synthesize.voice.name

            try:
                # get_running_loop() — we are inside a coroutine, and
                # get_event_loop() is deprecated here since Python 3.10.
                loop = asyncio.get_running_loop()

                if use_elevenlabs:
                    # ElevenLabs returns PCM 24kHz 16-bit mono
                    model = tts_config.get("elevenlabs_model", "eleven_multilingual_v2")
                    _LOGGER.info("Using ElevenLabs TTS (model=%s, voice=%s)", model, voice)
                    pcm_bytes = await loop.run_in_executor(
                        None, lambda: _synthesize_elevenlabs(text, voice, model)
                    )
                    samples_int16 = np.frombuffer(pcm_bytes, dtype=np.int16)
                    audio_bytes = pcm_bytes
                else:
                    _LOGGER.debug("Synthesizing %r with Kokoro voice=%s speed=%.1f", text, voice, self._speed)
                    samples, sample_rate = await loop.run_in_executor(
                        None, lambda: self._tts.create(text, voice=voice, speed=self._speed)
                    )
                    # NOTE(review): the sample_rate Kokoro returns is assumed to
                    # equal SAMPLE_RATE (24000) — AudioStart below always
                    # advertises 24kHz. Confirm against the kokoro_onnx docs.
                    samples_int16 = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
                    audio_bytes = samples_int16.tobytes()

                # Pre-compute lip sync frames for the entire utterance
                lipsync_frames = []
                if LIPSYNC_ENABLED:
                    lipsync_frames = _compute_lipsync_frames(samples_int16)

                await self.write_event(
                    AudioStart(rate=SAMPLE_RATE, width=SAMPLE_WIDTH, channels=CHANNELS).event()
                )

                chunk_size = SAMPLE_RATE * SAMPLE_WIDTH * CHANNELS * CHUNK_SECONDS
                lipsync_idx = 0
                samples_per_chunk = SAMPLE_RATE * CHUNK_SECONDS
                frames_per_chunk = samples_per_chunk // LIPSYNC_FRAME_SAMPLES

                for i in range(0, len(audio_bytes), chunk_size):
                    await self.write_event(
                        AudioChunk(
                            rate=SAMPLE_RATE,
                            width=SAMPLE_WIDTH,
                            channels=CHANNELS,
                            audio=audio_bytes[i : i + chunk_size],
                        ).event()
                    )

                    # Send lip sync frames for this audio chunk; the blocking
                    # HTTP POSTs run in the default executor so streaming
                    # isn't stalled by a slow/absent bridge.
                    if LIPSYNC_ENABLED and lipsync_frames:
                        chunk_frames = lipsync_frames[lipsync_idx : lipsync_idx + frames_per_chunk]
                        for mouth_val in chunk_frames:
                            await loop.run_in_executor(None, _send_lipsync, mouth_val)
                        lipsync_idx += frames_per_chunk

                # Close mouth after speech
                if LIPSYNC_ENABLED:
                    await loop.run_in_executor(None, _send_lipsync, 0.0)

                await self.write_event(AudioStop().event())
                duration = len(samples_int16) / SAMPLE_RATE
                _LOGGER.info("Synthesized %.1fs of audio (%d lipsync frames)", duration, len(lipsync_frames))

            except Exception:
                _LOGGER.exception("Synthesis error")
                # Still terminate the audio stream cleanly for the client.
                await self.write_event(AudioStop().event())

        return True  # keep connection open
|
||
|
||
|
||
async def main():
    """Parse CLI options, load the Kokoro model, and serve Wyoming TTS forever."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--uri", default="tcp://0.0.0.0:10301")
    parser.add_argument("--voice", default="af_heart")
    parser.add_argument("--speed", type=float, default=1.0)
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()

    log_level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
    )

    _LOGGER.info("Loading Kokoro ONNX model...")
    tts = _load_kokoro()
    _LOGGER.info("Kokoro loaded. Starting Wyoming TTS on %s (voice=%s)", args.uri, args.voice)

    server = AsyncServer.from_uri(args.uri)

    def handler_factory(reader, writer):
        # Each client connection gets its own handler sharing the one loaded model.
        return KokoroEventHandler(tts, args.voice, args.speed, reader, writer)

    await server.run(handler_factory)
|
||
|
||
|
||
if __name__ == "__main__":
    # Script entry point: run the async server until interrupted.
    asyncio.run(main())
|