feat: character system v2 — schema upgrade, memory system, per-character TTS routing

Character schema v2: background, dialogue_style, appearance, skills, gaze_presets
with automatic v1→v2 migration. LLM-assisted character creation via Character MCP
server. Two-tier memory system (personal per-character + general shared) with
budget-based injection into LLM system prompt. Per-character TTS voice routing via
state file — Wyoming TTS server reads active config to route between Kokoro (local)
and ElevenLabs (cloud PCM 24kHz). Dashboard: memories page, conversation history,
character profile on cards, auto-TTS engine selection from character config.
Also includes VTube Studio expression bridge and ComfyUI API guide.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Aodhan Collins
2026-03-17 19:15:46 +00:00
parent 1e52c002c2
commit 60eb89ea42
39 changed files with 3846 additions and 409 deletions

View File

@@ -18,6 +18,12 @@
<string>1.0</string>
</array>
<key>EnvironmentVariables</key>
<dict>
<key>ELEVENLABS_API_KEY</key>
<string>REDACTED_ROTATE_THIS_KEY</string> <!-- SECURITY: a live ElevenLabs API key was committed on this line; the key must be rotated immediately and injected from a secrets store or untracked env file, never checked into git -->
</dict>
<key>RunAtLoad</key>
<true/>

View File

@@ -7,8 +7,10 @@ Usage:
import argparse
import asyncio
import json
import logging
import os
import urllib.request
import numpy as np
@@ -20,10 +22,76 @@ from wyoming.tts import Synthesize
# Module-level logger for the Wyoming TTS server.
_LOGGER = logging.getLogger(__name__)
# State file written by the OpenClaw bridge; selects the active TTS engine/voice per request.
ACTIVE_TTS_VOICE_PATH = os.path.expanduser("~/homeai-data/active-tts-voice.json")
# Output audio format: 24 kHz, 16-bit signed, mono (matches both Kokoro and
# ElevenLabs pcm_24000 output).
SAMPLE_RATE = 24000
SAMPLE_WIDTH = 2  # int16
CHANNELS = 1
CHUNK_SECONDS = 1  # stream in 1-second chunks
# vtube-bridge endpoint used for mouth-parameter (lip sync) updates.
VTUBE_BRIDGE_URL = "http://localhost:8002"
LIPSYNC_ENABLED = True
LIPSYNC_FRAME_SAMPLES = 1200  # 50ms frames at 24kHz → 20 updates/sec
LIPSYNC_SCALE = 10.0  # amplitude multiplier (tuned for Kokoro output levels)
def _send_lipsync(value: float):
"""Fire-and-forget POST to vtube-bridge with mouth open value."""
try:
body = json.dumps({"name": "MouthOpen", "value": value}).encode()
req = urllib.request.Request(
f"{VTUBE_BRIDGE_URL}/parameter",
data=body,
headers={"Content-Type": "application/json"},
method="POST",
)
urllib.request.urlopen(req, timeout=0.5)
except Exception:
pass # bridge may not be running
def _compute_lipsync_frames(samples_int16: np.ndarray) -> list[float]:
"""Compute per-frame RMS amplitude scaled to 01 for lip sync."""
frames = []
for i in range(0, len(samples_int16), LIPSYNC_FRAME_SAMPLES):
frame = samples_int16[i : i + LIPSYNC_FRAME_SAMPLES].astype(np.float32)
rms = np.sqrt(np.mean(frame ** 2)) / 32768.0
mouth = min(rms * LIPSYNC_SCALE, 1.0)
frames.append(round(mouth, 3))
return frames
def _get_active_tts_config() -> dict | None:
"""Read the active TTS config set by the OpenClaw bridge."""
try:
with open(ACTIVE_TTS_VOICE_PATH) as f:
return json.load(f)
except Exception:
return None
def _synthesize_elevenlabs(text: str, voice_id: str, model: str = "eleven_multilingual_v2") -> bytes:
"""Call ElevenLabs TTS API and return raw PCM audio bytes (24kHz 16-bit mono)."""
api_key = os.environ.get("ELEVENLABS_API_KEY", "")
if not api_key:
raise RuntimeError("ELEVENLABS_API_KEY not set")
url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}?output_format=pcm_24000"
payload = json.dumps({
"text": text,
"model_id": model,
"voice_settings": {"stability": 0.5, "similarity_boost": 0.75},
}).encode()
req = urllib.request.Request(
url,
data=payload,
headers={
"Content-Type": "application/json",
"xi-api-key": api_key,
},
method="POST",
)
with urllib.request.urlopen(req, timeout=30) as resp:
return resp.read()
def _load_kokoro():
@@ -76,26 +144,53 @@ class KokoroEventHandler(AsyncEventHandler):
synthesize = Synthesize.from_event(event)
text = synthesize.text
voice = self._default_voice
use_elevenlabs = False
if synthesize.voice and synthesize.voice.name:
# Bridge state file takes priority (set per-request by OpenClaw bridge)
tts_config = _get_active_tts_config()
if tts_config and tts_config.get("engine") == "elevenlabs":
use_elevenlabs = True
voice = tts_config.get("elevenlabs_voice_id", "")
_LOGGER.debug("Synthesizing %r with ElevenLabs voice=%s", text, voice)
elif tts_config and tts_config.get("kokoro_voice"):
voice = tts_config["kokoro_voice"]
elif synthesize.voice and synthesize.voice.name:
voice = synthesize.voice.name
_LOGGER.debug("Synthesizing %r with voice=%s speed=%.1f", text, voice, self._speed)
try:
loop = asyncio.get_event_loop()
samples, sample_rate = await loop.run_in_executor(
None, lambda: self._tts.create(text, voice=voice, speed=self._speed)
)
samples_int16 = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
audio_bytes = samples_int16.tobytes()
if use_elevenlabs and voice:
# ElevenLabs returns PCM 24kHz 16-bit mono
model = tts_config.get("elevenlabs_model", "eleven_multilingual_v2")
_LOGGER.info("Using ElevenLabs TTS (model=%s, voice=%s)", model, voice)
pcm_bytes = await loop.run_in_executor(
None, lambda: _synthesize_elevenlabs(text, voice, model)
)
samples_int16 = np.frombuffer(pcm_bytes, dtype=np.int16)
audio_bytes = pcm_bytes
else:
_LOGGER.debug("Synthesizing %r with Kokoro voice=%s speed=%.1f", text, voice, self._speed)
samples, sample_rate = await loop.run_in_executor(
None, lambda: self._tts.create(text, voice=voice, speed=self._speed)
)
samples_int16 = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
audio_bytes = samples_int16.tobytes()
# Pre-compute lip sync frames for the entire utterance
lipsync_frames = []
if LIPSYNC_ENABLED:
lipsync_frames = _compute_lipsync_frames(samples_int16)
await self.write_event(
AudioStart(rate=SAMPLE_RATE, width=SAMPLE_WIDTH, channels=CHANNELS).event()
)
chunk_size = SAMPLE_RATE * SAMPLE_WIDTH * CHANNELS * CHUNK_SECONDS
lipsync_idx = 0
samples_per_chunk = SAMPLE_RATE * CHUNK_SECONDS
frames_per_chunk = samples_per_chunk // LIPSYNC_FRAME_SAMPLES
for i in range(0, len(audio_bytes), chunk_size):
await self.write_event(
AudioChunk(
@@ -106,8 +201,22 @@ class KokoroEventHandler(AsyncEventHandler):
).event()
)
# Send lip sync frames for this audio chunk
if LIPSYNC_ENABLED and lipsync_frames:
chunk_frames = lipsync_frames[lipsync_idx : lipsync_idx + frames_per_chunk]
for mouth_val in chunk_frames:
await asyncio.get_event_loop().run_in_executor(
None, _send_lipsync, mouth_val
)
lipsync_idx += frames_per_chunk
# Close mouth after speech
if LIPSYNC_ENABLED:
await asyncio.get_event_loop().run_in_executor(None, _send_lipsync, 0.0)
await self.write_event(AudioStop().event())
_LOGGER.info("Synthesized %.1fs of audio", len(samples) / sample_rate)
duration = len(samples_int16) / SAMPLE_RATE
_LOGGER.info("Synthesized %.1fs of audio (%d lipsync frames)", duration, len(lipsync_frames))
except Exception:
_LOGGER.exception("Synthesis error")