feat: character system v2 — schema upgrade, memory system, per-character TTS routing
Character schema v2: background, dialogue_style, appearance, skills, gaze_presets with automatic v1→v2 migration. LLM-assisted character creation via Character MCP server. Two-tier memory system (personal per-character + general shared) with budget-based injection into LLM system prompt. Per-character TTS voice routing via state file — Wyoming TTS server reads active config to route between Kokoro (local) and ElevenLabs (cloud PCM 24kHz). Dashboard: memories page, conversation history, character profile on cards, auto-TTS engine selection from character config. Also includes VTube Studio expression bridge and ComfyUI API guide. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -7,8 +7,10 @@ Usage:
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import urllib.request
|
||||
|
||||
import numpy as np
|
||||
|
||||
@@ -20,10 +22,76 @@ from wyoming.tts import Synthesize
|
||||
|
||||
_LOGGER = logging.getLogger(__name__)
|
||||
|
||||
# State file written per-request by the OpenClaw bridge to select the active
# TTS engine/voice; read back in _get_active_tts_config().
ACTIVE_TTS_VOICE_PATH = os.path.expanduser("~/homeai-data/active-tts-voice.json")
# Audio format shared by both engines: Kokoro output and the ElevenLabs
# pcm_24000 endpoint are 24 kHz, 16-bit, mono.
SAMPLE_RATE = 24000
SAMPLE_WIDTH = 2  # int16
CHANNELS = 1
CHUNK_SECONDS = 1  # stream in 1-second chunks
# Best-effort lip-sync bridge for VTube Studio; POSTs are fire-and-forget
# and failures are swallowed (the bridge may not be running).
VTUBE_BRIDGE_URL = "http://localhost:8002"
LIPSYNC_ENABLED = True
LIPSYNC_FRAME_SAMPLES = 1200  # 50ms frames at 24kHz → 20 updates/sec
LIPSYNC_SCALE = 10.0  # amplitude multiplier (tuned for Kokoro output levels)
|
||||
|
||||
|
||||
def _send_lipsync(value: float):
|
||||
"""Fire-and-forget POST to vtube-bridge with mouth open value."""
|
||||
try:
|
||||
body = json.dumps({"name": "MouthOpen", "value": value}).encode()
|
||||
req = urllib.request.Request(
|
||||
f"{VTUBE_BRIDGE_URL}/parameter",
|
||||
data=body,
|
||||
headers={"Content-Type": "application/json"},
|
||||
method="POST",
|
||||
)
|
||||
urllib.request.urlopen(req, timeout=0.5)
|
||||
except Exception:
|
||||
pass # bridge may not be running
|
||||
|
||||
|
||||
def _compute_lipsync_frames(samples_int16: np.ndarray) -> list[float]:
    """Compute per-frame RMS amplitude scaled to 0–1 for lip sync.

    Args:
        samples_int16: Mono int16 PCM samples at SAMPLE_RATE.

    Returns:
        One mouth-open value per LIPSYNC_FRAME_SAMPLES-sample frame
        (last frame may be shorter), each a plain Python float in
        [0, 1] rounded to 3 decimals. Empty input yields [].
    """
    frames: list[float] = []
    for i in range(0, len(samples_int16), LIPSYNC_FRAME_SAMPLES):
        frame = samples_int16[i : i + LIPSYNC_FRAME_SAMPLES].astype(np.float32)
        # float() matters here: np.sqrt(np.mean(...)) is a numpy scalar,
        # and appending one makes json.dumps() in _send_lipsync raise
        # TypeError — which that function swallows, silently killing
        # lip sync for every frame below the 1.0 clamp.
        rms = float(np.sqrt(np.mean(frame ** 2))) / 32768.0
        mouth = min(rms * LIPSYNC_SCALE, 1.0)
        frames.append(round(mouth, 3))
    return frames
|
||||
|
||||
|
||||
def _get_active_tts_config() -> dict | None:
|
||||
"""Read the active TTS config set by the OpenClaw bridge."""
|
||||
try:
|
||||
with open(ACTIVE_TTS_VOICE_PATH) as f:
|
||||
return json.load(f)
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def _synthesize_elevenlabs(text: str, voice_id: str, model: str = "eleven_multilingual_v2") -> bytes:
|
||||
"""Call ElevenLabs TTS API and return raw PCM audio bytes (24kHz 16-bit mono)."""
|
||||
api_key = os.environ.get("ELEVENLABS_API_KEY", "")
|
||||
if not api_key:
|
||||
raise RuntimeError("ELEVENLABS_API_KEY not set")
|
||||
|
||||
url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}?output_format=pcm_24000"
|
||||
payload = json.dumps({
|
||||
"text": text,
|
||||
"model_id": model,
|
||||
"voice_settings": {"stability": 0.5, "similarity_boost": 0.75},
|
||||
}).encode()
|
||||
|
||||
req = urllib.request.Request(
|
||||
url,
|
||||
data=payload,
|
||||
headers={
|
||||
"Content-Type": "application/json",
|
||||
"xi-api-key": api_key,
|
||||
},
|
||||
method="POST",
|
||||
)
|
||||
with urllib.request.urlopen(req, timeout=30) as resp:
|
||||
return resp.read()
|
||||
|
||||
|
||||
def _load_kokoro():
|
||||
@@ -76,26 +144,53 @@ class KokoroEventHandler(AsyncEventHandler):
|
||||
synthesize = Synthesize.from_event(event)
|
||||
text = synthesize.text
|
||||
voice = self._default_voice
|
||||
use_elevenlabs = False
|
||||
|
||||
if synthesize.voice and synthesize.voice.name:
|
||||
# Bridge state file takes priority (set per-request by OpenClaw bridge)
|
||||
tts_config = _get_active_tts_config()
|
||||
if tts_config and tts_config.get("engine") == "elevenlabs":
|
||||
use_elevenlabs = True
|
||||
voice = tts_config.get("elevenlabs_voice_id", "")
|
||||
_LOGGER.debug("Synthesizing %r with ElevenLabs voice=%s", text, voice)
|
||||
elif tts_config and tts_config.get("kokoro_voice"):
|
||||
voice = tts_config["kokoro_voice"]
|
||||
elif synthesize.voice and synthesize.voice.name:
|
||||
voice = synthesize.voice.name
|
||||
|
||||
_LOGGER.debug("Synthesizing %r with voice=%s speed=%.1f", text, voice, self._speed)
|
||||
|
||||
try:
|
||||
loop = asyncio.get_event_loop()
|
||||
samples, sample_rate = await loop.run_in_executor(
|
||||
None, lambda: self._tts.create(text, voice=voice, speed=self._speed)
|
||||
)
|
||||
|
||||
samples_int16 = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
|
||||
audio_bytes = samples_int16.tobytes()
|
||||
if use_elevenlabs and voice:
|
||||
# ElevenLabs returns PCM 24kHz 16-bit mono
|
||||
model = tts_config.get("elevenlabs_model", "eleven_multilingual_v2")
|
||||
_LOGGER.info("Using ElevenLabs TTS (model=%s, voice=%s)", model, voice)
|
||||
pcm_bytes = await loop.run_in_executor(
|
||||
None, lambda: _synthesize_elevenlabs(text, voice, model)
|
||||
)
|
||||
samples_int16 = np.frombuffer(pcm_bytes, dtype=np.int16)
|
||||
audio_bytes = pcm_bytes
|
||||
else:
|
||||
_LOGGER.debug("Synthesizing %r with Kokoro voice=%s speed=%.1f", text, voice, self._speed)
|
||||
samples, sample_rate = await loop.run_in_executor(
|
||||
None, lambda: self._tts.create(text, voice=voice, speed=self._speed)
|
||||
)
|
||||
samples_int16 = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
|
||||
audio_bytes = samples_int16.tobytes()
|
||||
|
||||
# Pre-compute lip sync frames for the entire utterance
|
||||
lipsync_frames = []
|
||||
if LIPSYNC_ENABLED:
|
||||
lipsync_frames = _compute_lipsync_frames(samples_int16)
|
||||
|
||||
await self.write_event(
|
||||
AudioStart(rate=SAMPLE_RATE, width=SAMPLE_WIDTH, channels=CHANNELS).event()
|
||||
)
|
||||
|
||||
chunk_size = SAMPLE_RATE * SAMPLE_WIDTH * CHANNELS * CHUNK_SECONDS
|
||||
lipsync_idx = 0
|
||||
samples_per_chunk = SAMPLE_RATE * CHUNK_SECONDS
|
||||
frames_per_chunk = samples_per_chunk // LIPSYNC_FRAME_SAMPLES
|
||||
|
||||
for i in range(0, len(audio_bytes), chunk_size):
|
||||
await self.write_event(
|
||||
AudioChunk(
|
||||
@@ -106,8 +201,22 @@ class KokoroEventHandler(AsyncEventHandler):
|
||||
).event()
|
||||
)
|
||||
|
||||
# Send lip sync frames for this audio chunk
|
||||
if LIPSYNC_ENABLED and lipsync_frames:
|
||||
chunk_frames = lipsync_frames[lipsync_idx : lipsync_idx + frames_per_chunk]
|
||||
for mouth_val in chunk_frames:
|
||||
await asyncio.get_event_loop().run_in_executor(
|
||||
None, _send_lipsync, mouth_val
|
||||
)
|
||||
lipsync_idx += frames_per_chunk
|
||||
|
||||
# Close mouth after speech
|
||||
if LIPSYNC_ENABLED:
|
||||
await asyncio.get_event_loop().run_in_executor(None, _send_lipsync, 0.0)
|
||||
|
||||
await self.write_event(AudioStop().event())
|
||||
_LOGGER.info("Synthesized %.1fs of audio", len(samples) / sample_rate)
|
||||
duration = len(samples_int16) / SAMPLE_RATE
|
||||
_LOGGER.info("Synthesized %.1fs of audio (%d lipsync frames)", duration, len(lipsync_frames))
|
||||
|
||||
except Exception:
|
||||
_LOGGER.exception("Synthesis error")
|
||||
|
||||
Reference in New Issue
Block a user