feat: character system v2 — schema upgrade, memory system, per-character TTS routing

Character schema v2: background, dialogue_style, appearance, skills, gaze_presets
with automatic v1→v2 migration. LLM-assisted character creation via Character MCP
server. Two-tier memory system (personal per-character + general shared) with
budget-based injection into LLM system prompt. Per-character TTS voice routing via
state file — Wyoming TTS server reads active config to route between Kokoro (local)
and ElevenLabs (cloud PCM 24kHz). Dashboard: memories page, conversation history,
character profile on cards, auto-TTS engine selection from character config.
Also includes VTube Studio expression bridge and ComfyUI API guide.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Aodhan Collins
2026-03-17 19:15:46 +00:00
parent 1e52c002c2
commit 60eb89ea42
39 changed files with 3846 additions and 409 deletions

View File

@@ -18,6 +18,12 @@
<string>1.0</string>
</array>
<key>EnvironmentVariables</key>
<dict>
<key>ELEVENLABS_API_KEY</key>
<string>REDACTED_ROTATE_THIS_KEY</string> <!-- SECURITY: a live ElevenLabs API key was committed on this line; the key must be rotated immediately and injected from a secrets store or untracked env file, never checked into git -->
</dict>
<key>RunAtLoad</key>
<true/>

View File

@@ -7,8 +7,10 @@ Usage:
import argparse
import asyncio
import json
import logging
import os
import urllib.request
import numpy as np
@@ -20,10 +22,76 @@ from wyoming.tts import Synthesize
# Module-level logger for the Wyoming TTS server.
_LOGGER = logging.getLogger(__name__)
# State file written by the OpenClaw bridge; selects the active TTS engine/voice per request.
ACTIVE_TTS_VOICE_PATH = os.path.expanduser("~/homeai-data/active-tts-voice.json")
# Output audio format: 24 kHz, 16-bit signed, mono (matches both Kokoro and
# ElevenLabs pcm_24000 output).
SAMPLE_RATE = 24000
SAMPLE_WIDTH = 2  # int16
CHANNELS = 1
CHUNK_SECONDS = 1  # stream in 1-second chunks
# vtube-bridge endpoint used for mouth-parameter (lip sync) updates.
VTUBE_BRIDGE_URL = "http://localhost:8002"
LIPSYNC_ENABLED = True
LIPSYNC_FRAME_SAMPLES = 1200  # 50ms frames at 24kHz → 20 updates/sec
LIPSYNC_SCALE = 10.0  # amplitude multiplier (tuned for Kokoro output levels)
def _send_lipsync(value: float):
"""Fire-and-forget POST to vtube-bridge with mouth open value."""
try:
body = json.dumps({"name": "MouthOpen", "value": value}).encode()
req = urllib.request.Request(
f"{VTUBE_BRIDGE_URL}/parameter",
data=body,
headers={"Content-Type": "application/json"},
method="POST",
)
urllib.request.urlopen(req, timeout=0.5)
except Exception:
pass # bridge may not be running
def _compute_lipsync_frames(samples_int16: np.ndarray) -> list[float]:
"""Compute per-frame RMS amplitude scaled to 01 for lip sync."""
frames = []
for i in range(0, len(samples_int16), LIPSYNC_FRAME_SAMPLES):
frame = samples_int16[i : i + LIPSYNC_FRAME_SAMPLES].astype(np.float32)
rms = np.sqrt(np.mean(frame ** 2)) / 32768.0
mouth = min(rms * LIPSYNC_SCALE, 1.0)
frames.append(round(mouth, 3))
return frames
def _get_active_tts_config() -> dict | None:
"""Read the active TTS config set by the OpenClaw bridge."""
try:
with open(ACTIVE_TTS_VOICE_PATH) as f:
return json.load(f)
except Exception:
return None
def _synthesize_elevenlabs(text: str, voice_id: str, model: str = "eleven_multilingual_v2") -> bytes:
"""Call ElevenLabs TTS API and return raw PCM audio bytes (24kHz 16-bit mono)."""
api_key = os.environ.get("ELEVENLABS_API_KEY", "")
if not api_key:
raise RuntimeError("ELEVENLABS_API_KEY not set")
url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}?output_format=pcm_24000"
payload = json.dumps({
"text": text,
"model_id": model,
"voice_settings": {"stability": 0.5, "similarity_boost": 0.75},
}).encode()
req = urllib.request.Request(
url,
data=payload,
headers={
"Content-Type": "application/json",
"xi-api-key": api_key,
},
method="POST",
)
with urllib.request.urlopen(req, timeout=30) as resp:
return resp.read()
def _load_kokoro():
@@ -76,26 +144,53 @@ class KokoroEventHandler(AsyncEventHandler):
synthesize = Synthesize.from_event(event)
text = synthesize.text
voice = self._default_voice
use_elevenlabs = False
if synthesize.voice and synthesize.voice.name:
# Bridge state file takes priority (set per-request by OpenClaw bridge)
tts_config = _get_active_tts_config()
if tts_config and tts_config.get("engine") == "elevenlabs":
use_elevenlabs = True
voice = tts_config.get("elevenlabs_voice_id", "")
_LOGGER.debug("Synthesizing %r with ElevenLabs voice=%s", text, voice)
elif tts_config and tts_config.get("kokoro_voice"):
voice = tts_config["kokoro_voice"]
elif synthesize.voice and synthesize.voice.name:
voice = synthesize.voice.name
_LOGGER.debug("Synthesizing %r with voice=%s speed=%.1f", text, voice, self._speed)
try:
loop = asyncio.get_event_loop()
samples, sample_rate = await loop.run_in_executor(
None, lambda: self._tts.create(text, voice=voice, speed=self._speed)
)
samples_int16 = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
audio_bytes = samples_int16.tobytes()
if use_elevenlabs and voice:
# ElevenLabs returns PCM 24kHz 16-bit mono
model = tts_config.get("elevenlabs_model", "eleven_multilingual_v2")
_LOGGER.info("Using ElevenLabs TTS (model=%s, voice=%s)", model, voice)
pcm_bytes = await loop.run_in_executor(
None, lambda: _synthesize_elevenlabs(text, voice, model)
)
samples_int16 = np.frombuffer(pcm_bytes, dtype=np.int16)
audio_bytes = pcm_bytes
else:
_LOGGER.debug("Synthesizing %r with Kokoro voice=%s speed=%.1f", text, voice, self._speed)
samples, sample_rate = await loop.run_in_executor(
None, lambda: self._tts.create(text, voice=voice, speed=self._speed)
)
samples_int16 = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
audio_bytes = samples_int16.tobytes()
# Pre-compute lip sync frames for the entire utterance
lipsync_frames = []
if LIPSYNC_ENABLED:
lipsync_frames = _compute_lipsync_frames(samples_int16)
await self.write_event(
AudioStart(rate=SAMPLE_RATE, width=SAMPLE_WIDTH, channels=CHANNELS).event()
)
chunk_size = SAMPLE_RATE * SAMPLE_WIDTH * CHANNELS * CHUNK_SECONDS
lipsync_idx = 0
samples_per_chunk = SAMPLE_RATE * CHUNK_SECONDS
frames_per_chunk = samples_per_chunk // LIPSYNC_FRAME_SAMPLES
for i in range(0, len(audio_bytes), chunk_size):
await self.write_event(
AudioChunk(
@@ -106,8 +201,22 @@ class KokoroEventHandler(AsyncEventHandler):
).event()
)
# Send lip sync frames for this audio chunk
if LIPSYNC_ENABLED and lipsync_frames:
chunk_frames = lipsync_frames[lipsync_idx : lipsync_idx + frames_per_chunk]
for mouth_val in chunk_frames:
await asyncio.get_event_loop().run_in_executor(
None, _send_lipsync, mouth_val
)
lipsync_idx += frames_per_chunk
# Close mouth after speech
if LIPSYNC_ENABLED:
await asyncio.get_event_loop().run_in_executor(None, _send_lipsync, 0.0)
await self.write_event(AudioStop().event())
_LOGGER.info("Synthesized %.1fs of audio", len(samples) / sample_rate)
duration = len(samples_int16) / SAMPLE_RATE
_LOGGER.info("Synthesized %.1fs of audio (%d lipsync frames)", duration, len(lipsync_frames))
except Exception:
_LOGGER.exception("Synthesis error")