feat: character system v2 — schema upgrade, memory system, per-character TTS routing
Character schema v2: background, dialogue_style, appearance, skills, gaze_presets with automatic v1→v2 migration. LLM-assisted character creation via Character MCP server. Two-tier memory system (personal per-character + general shared) with budget-based injection into LLM system prompt. Per-character TTS voice routing via state file — Wyoming TTS server reads active config to route between Kokoro (local) and ElevenLabs (cloud PCM 24kHz). Dashboard: memories page, conversation history, character profile on cards, auto-TTS engine selection from character config. Also includes VTube Studio expression bridge and ComfyUI API guide. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -24,9 +24,12 @@ Endpoints:
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import asyncio
|
||||
import urllib.request
|
||||
import threading
|
||||
from http.server import HTTPServer, BaseHTTPRequestHandler
|
||||
from socketserver import ThreadingMixIn
|
||||
from urllib.parse import urlparse
|
||||
@@ -40,19 +43,222 @@ from wyoming.asr import Transcribe, Transcript
|
||||
from wyoming.audio import AudioStart, AudioChunk, AudioStop
|
||||
from wyoming.info import Info
|
||||
|
||||
# Timeout settings (seconds)
TIMEOUT_WARM = 120  # Model already loaded in VRAM
TIMEOUT_COLD = 180  # Model needs loading first (~10-20s load + inference)
# Ollama "loaded models" endpoint — used to decide warm vs. cold timeout.
OLLAMA_PS_URL = "http://localhost:11434/api/ps"
# Local VTube Studio expression bridge — avatar state signals are POSTed here.
VTUBE_BRIDGE_URL = "http://localhost:8002"
||||
|
||||
def load_character_prompt() -> str:
|
||||
"""Load the active character system prompt."""
|
||||
character_path = Path.home() / ".openclaw" / "characters" / "aria.json"
|
||||
|
||||
def _vtube_fire_and_forget(path: str, data: dict):
|
||||
"""Send a non-blocking POST to the VTube Studio bridge. Failures are silent."""
|
||||
def _post():
|
||||
try:
|
||||
body = json.dumps(data).encode()
|
||||
req = urllib.request.Request(
|
||||
f"{VTUBE_BRIDGE_URL}{path}",
|
||||
data=body,
|
||||
headers={"Content-Type": "application/json"},
|
||||
method="POST",
|
||||
)
|
||||
urllib.request.urlopen(req, timeout=2)
|
||||
except Exception:
|
||||
pass # bridge may not be running — that's fine
|
||||
threading.Thread(target=_post, daemon=True).start()
|
||||
|
||||
|
||||
def is_model_warm() -> bool:
    """Return True when the default Ollama model is already resident in VRAM.

    Queries Ollama's /api/ps endpoint; any failure to reach Ollama is
    treated as "cold" so callers fall back to the longer timeout.
    """
    try:
        request = urllib.request.Request(OLLAMA_PS_URL)
        with urllib.request.urlopen(request, timeout=2) as resp:
            loaded = json.loads(resp.read()).get("models", [])
        return bool(loaded)
    except Exception:
        # Unreachable Ollama → assume cold (safer, longer timeout).
        return False
|
||||
|
||||
|
||||
# On-disk state shared with the dashboard and the Wyoming TTS server.
CHARACTERS_DIR = Path("/Users/aodhan/homeai-data/characters")  # one JSON profile per character
SATELLITE_MAP_PATH = Path("/Users/aodhan/homeai-data/satellite-map.json")  # satellite-id → character-id map
MEMORIES_DIR = Path("/Users/aodhan/homeai-data/memories")  # personal/<id>.json + general.json
ACTIVE_TTS_VOICE_PATH = Path("/Users/aodhan/homeai-data/active-tts-voice.json")  # active TTS routing state
|
||||
|
||||
|
||||
def clean_text_for_tts(text: str) -> str:
    """Strip content that shouldn't be spoken: tags, asterisks, emojis, markdown.

    Applies a fixed sequence of regex substitutions, then collapses leftover
    whitespace. Order matters: [text](url) links are unwrapped before bare
    URLs are dropped, and fenced code blocks go before inline code.
    """
    emoji_class = (
        r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF'
        r'\U0001F1E0-\U0001F1FF\U0001F900-\U0001F9FF\U0001FA00-\U0001FAFF'
        r'\U00002702-\U000027B0\U0000FE00-\U0000FE0F\U0000200D'
        r'\U00002600-\U000026FF\U00002300-\U000023FF]+'
    )
    substitutions = (
        (r'<[^>]+>', '', 0),                   # HTML/XML tags
        (r'\*[^*]+\*', '', 0),                 # *action/emphasis* spans like *sighs*
        (r'[*_]{1,3}', '', 0),                 # stray bold/italic markers
        (r'^#{1,6}\s+', '', re.MULTILINE),     # markdown headers
        (r'\[([^\]]+)\]\([^)]+\)', r'\1', 0),  # [text](url) → text
        (r'https?://\S+', '', 0),              # bare URLs
        (r'```[\s\S]*?```', '', 0),            # fenced code blocks
        (r'`[^`]+`', '', 0),                   # inline code
        (emoji_class, '', 0),                  # emoji ranges
        (r'\n{2,}', '\n', 0),                  # collapse blank lines
        (r'[ \t]{2,}', ' ', 0),                # collapse runs of spaces/tabs
    )
    for pattern, replacement, flags in substitutions:
        text = re.sub(pattern, replacement, text, flags=flags)
    return text.strip()
|
||||
|
||||
|
||||
def load_satellite_map() -> dict:
    """Return the satellite→character mapping, or a safe default.

    Any failure (missing file, unreadable JSON) yields a map with no
    satellite overrides and the stock default character.
    """
    fallback = {"default": "aria_default", "satellites": {}}
    try:
        with open(SATELLITE_MAP_PATH) as fh:
            return json.load(fh)
    except Exception:
        return fallback
|
||||
|
||||
|
||||
def set_active_tts_voice(character_id: str, tts_config: dict):
    """Persist the active TTS config for the Wyoming TTS server to read.

    Best-effort: a failed write only prints a warning, it never raises.

    Args:
        character_id: ID of the character whose voice is now active.
        tts_config: Character "tts" section; missing keys get defaults.
    """
    defaults = (
        ("engine", "kokoro"),
        ("kokoro_voice", ""),
        ("elevenlabs_voice_id", ""),
        ("elevenlabs_model", "eleven_multilingual_v2"),
        ("speed", 1),
    )
    try:
        ACTIVE_TTS_VOICE_PATH.parent.mkdir(parents=True, exist_ok=True)
        state = {"character_id": character_id}
        for key, default in defaults:
            state[key] = tts_config.get(key, default)
        with open(ACTIVE_TTS_VOICE_PATH, "w") as fh:
            json.dump(state, fh)
    except Exception as e:
        print(f"[OpenClaw Bridge] Warning: could not write active TTS config: {e}")
|
||||
|
||||
|
||||
def resolve_character_id(satellite_id: str = None) -> str:
    """Map a satellite ID to its character profile ID.

    Falls back to the map's "default" entry (or "aria_default") when the
    satellite is unknown or no satellite ID was supplied.
    """
    sat_map = load_satellite_map()
    overrides = sat_map.get("satellites", {})
    if satellite_id and satellite_id in overrides:
        return overrides[satellite_id]
    return sat_map.get("default", "aria_default")
|
||||
|
||||
|
||||
def load_character(character_id: str = None) -> dict:
    """Load a character profile by ID. Returns the full character data dict.

    Resolves the default character when no ID is given. IDs are sanitized
    before being used as a filename. Returns an empty dict when the profile
    is missing or unreadable.
    """
    # Fix: this block contained merged old/new diff lines (dead
    # `return ""` / `system_prompt` branches); this is the v2 behavior.
    if not character_id:
        character_id = resolve_character_id()
    # Prevent path traversal via a crafted character_id.
    safe_id = character_id.replace("/", "_")
    character_path = CHARACTERS_DIR / f"{safe_id}.json"
    if not character_path.exists():
        return {}
    try:
        with open(character_path) as f:
            profile = json.load(f)
        # Profiles are stored on disk as {"data": {...}}.
        return profile.get("data", {})
    except Exception:
        return {}
|
||||
|
||||
|
||||
def load_character_prompt(satellite_id: str = None, character_id: str = None) -> str:
    """Build the full LLM system prompt for a character.

    Resolution order: explicit character_id > satellite mapping > default.
    The prompt is assembled from the core system_prompt plus profile
    sections (background, appearance, dialogue style, skills), character
    metadata (display name, gaze presets), and injected memories.

    Returns:
        The assembled prompt, or "" when the character cannot be loaded.
    """
    if not character_id:
        character_id = resolve_character_id(satellite_id)
    char = load_character(character_id)
    if not char:
        return ""

    sections = []

    # Core system prompt
    prompt = char.get("system_prompt", "")
    if prompt:
        sections.append(prompt)

    # Character profile fields
    profile_parts = []
    if char.get("background"):
        profile_parts.append(f"## Background\n{char['background']}")
    if char.get("appearance"):
        profile_parts.append(f"## Appearance\n{char['appearance']}")
    if char.get("dialogue_style"):
        profile_parts.append(f"## Dialogue Style\n{char['dialogue_style']}")
    if char.get("skills"):
        skills = char["skills"]
        # Cap the list so a sprawling skills array can't blow up the prompt.
        if isinstance(skills, list):
            skills_text = ", ".join(skills[:15])
        else:
            skills_text = str(skills)
        profile_parts.append(f"## Skills & Interests\n{skills_text}")
    if profile_parts:
        sections.append("[Character Profile]\n" + "\n\n".join(profile_parts))

    # Character metadata
    meta_lines = []
    if char.get("display_name"):
        meta_lines.append(f"Your name is: {char['display_name']}")
    # Support both v1 (gaze_preset string) and v2 (gaze_presets array)
    gaze_presets = char.get("gaze_presets", [])
    if gaze_presets and isinstance(gaze_presets, list):
        for gp in gaze_presets:
            # Fix: guard against non-dict entries — the original crashed
            # with AttributeError on e.g. a bare preset string.
            if not isinstance(gp, dict):
                continue
            preset = gp.get("preset", "")
            trigger = gp.get("trigger", "self-portrait")
            if preset:
                meta_lines.append(f"GAZE preset '{preset}' — use for: {trigger}")
    elif char.get("gaze_preset"):
        meta_lines.append(f"Your gaze_preset for self-portraits is: {char['gaze_preset']}")
    if meta_lines:
        sections.append("[Character Metadata]\n" + "\n".join(meta_lines))

    # Memories (personal + general)
    personal, general = load_memories(character_id)
    if personal:
        sections.append("[Personal Memories]\n" + "\n".join(f"- {m}" for m in personal))
    if general:
        sections.append("[General Knowledge]\n" + "\n".join(f"- {m}" for m in general))

    return "\n\n".join(sections)
|
||||
|
||||
|
||||
def load_memories(character_id: str) -> tuple[list[str], list[str]]:
    """Load personal (per-character) and general memories.

    Returns:
        (personal_contents, general_contents): newest-first memory strings,
        truncated so each tier fits its character budget in the LLM prompt.
    """
    PERSONAL_BUDGET = 4000  # max chars for personal memories in prompt
    GENERAL_BUDGET = 3000  # max chars for general memories in prompt

    def _read_memories(path: Path, budget: int) -> list[str]:
        # Read one memory file and return its contents, newest first,
        # stopping before the cumulative length exceeds `budget`.
        try:
            with open(path) as f:
                data = json.load(f)
        except Exception:
            return []
        # Fix: the original called data.get() outside the try, so a file
        # whose JSON root is not an object crashed with AttributeError.
        if not isinstance(data, dict):
            return []
        memories = data.get("memories", [])
        if not isinstance(memories, list):
            return []
        # Sort newest first
        memories.sort(key=lambda m: m.get("createdAt", ""), reverse=True)
        result = []
        used = 0
        for m in memories:
            content = m.get("content", "").strip()
            if not content:
                continue
            # Stop at the first entry that would overflow the budget;
            # entries are already ordered newest-first.
            if used + len(content) > budget:
                break
            result.append(content)
            used += len(content)
        return result

    safe_id = character_id.replace("/", "_")  # avoid path separators in IDs
    personal = _read_memories(MEMORIES_DIR / "personal" / f"{safe_id}.json", PERSONAL_BUDGET)
    general = _read_memories(MEMORIES_DIR / "general.json", GENERAL_BUDGET)
    return personal, general
|
||||
|
||||
|
||||
class OpenClawBridgeHandler(BaseHTTPRequestHandler):
|
||||
"""HTTP request handler for OpenClaw bridge."""
|
||||
@@ -95,44 +301,78 @@ class OpenClawBridgeHandler(BaseHTTPRequestHandler):
|
||||
self._send_json_response(404, {"error": "Not found"})
|
||||
|
||||
def _handle_tts_request(self):
    """Handle TTS request and return audio. Routes to Kokoro or ElevenLabs based on engine.

    Request JSON: {"text", "voice", "engine", "model"}. Responds with raw
    audio (WAV for Kokoro, MPEG for ElevenLabs) and a CORS header for the
    local dashboard. Avatar state events are fired best-effort.
    """
    # Fix: this block contained merged old/new diff lines (duplicate
    # docstring, dead emoji-strip pass, duplicate Content-Type header,
    # duplicate synth call); this is the v2 behavior.
    content_length = int(self.headers.get("Content-Length", 0))
    if content_length == 0:
        self._send_json_response(400, {"error": "Empty body"})
        return

    try:
        body = self.rfile.read(content_length).decode()
        data = json.loads(body)
    except json.JSONDecodeError:
        self._send_json_response(400, {"error": "Invalid JSON"})
        return

    text = data.get("text", "Hello, this is a test.")
    # Strip tags/markdown/emojis so the TTS engine doesn't read them out.
    text = clean_text_for_tts(text)
    voice = data.get("voice", "af_heart")
    engine = data.get("engine", "kokoro")

    try:
        # Signal avatar: speaking
        _vtube_fire_and_forget("/expression", {"event": "speaking"})

        if engine == "elevenlabs":
            audio_bytes, content_type = self._synthesize_elevenlabs(text, voice, data.get("model"))
        else:
            # Default: local Kokoro via Wyoming (async client)
            audio_bytes = asyncio.run(self._synthesize_audio(text, voice))
            content_type = "audio/wav"

        # Signal avatar: idle
        _vtube_fire_and_forget("/expression", {"event": "idle"})

        self.send_response(200)
        self.send_header("Content-Type", content_type)
        # Allow CORS for local testing from Vite
        self.send_header("Access-Control-Allow-Origin", "*")
        self.end_headers()
        self.wfile.write(audio_bytes)

    except Exception as e:
        _vtube_fire_and_forget("/expression", {"event": "error"})
        self._send_json_response(500, {"error": str(e)})
|
||||
|
||||
def _synthesize_elevenlabs(self, text: str, voice_id: str, model: str = None) -> tuple[bytes, str]:
|
||||
"""Call ElevenLabs TTS API and return (audio_bytes, content_type)."""
|
||||
api_key = os.environ.get("ELEVENLABS_API_KEY", "")
|
||||
if not api_key:
|
||||
raise RuntimeError("ELEVENLABS_API_KEY not set in environment")
|
||||
if not voice_id:
|
||||
raise RuntimeError("No ElevenLabs voice ID provided")
|
||||
|
||||
model = model or "eleven_multilingual_v2"
|
||||
url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"
|
||||
payload = json.dumps({
|
||||
"text": text,
|
||||
"model_id": model,
|
||||
"voice_settings": {"stability": 0.5, "similarity_boost": 0.75},
|
||||
}).encode()
|
||||
|
||||
req = urllib.request.Request(
|
||||
url,
|
||||
data=payload,
|
||||
headers={
|
||||
"Content-Type": "application/json",
|
||||
"xi-api-key": api_key,
|
||||
"Accept": "audio/mpeg",
|
||||
},
|
||||
method="POST",
|
||||
)
|
||||
with urllib.request.urlopen(req, timeout=30) as resp:
|
||||
audio_bytes = resp.read()
|
||||
return audio_bytes, "audio/mpeg"
|
||||
|
||||
def do_OPTIONS(self):
|
||||
"""Handle CORS preflight requests."""
|
||||
self.send_response(204)
|
||||
@@ -264,6 +504,43 @@ class OpenClawBridgeHandler(BaseHTTPRequestHandler):
|
||||
print(f"[OpenClaw Bridge] Wake word detected: {wake_word_data.get('wake_word', 'unknown')}")
|
||||
self._send_json_response(200, {"status": "ok", "message": "Wake word received"})
|
||||
|
||||
@staticmethod
def _call_openclaw(message: str, agent: str, timeout: int) -> str:
    """Run the OpenClaw CLI for one agent message and return its stdout.

    Raises subprocess.CalledProcessError on a non-zero exit and
    subprocess.TimeoutExpired when the CLI exceeds `timeout` seconds.
    """
    cmd = [
        "/opt/homebrew/bin/openclaw",
        "agent",
        "--message",
        message,
        "--agent",
        agent,
    ]
    completed = subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        timeout=timeout,
        check=True,
    )
    return completed.stdout.strip()
|
||||
|
||||
@staticmethod
|
||||
def _needs_followup(response: str) -> bool:
|
||||
"""Detect if the model promised to act but didn't actually do it.
|
||||
Returns True if the response looks like a 'will do' without a result."""
|
||||
if not response:
|
||||
return False
|
||||
resp_lower = response.lower()
|
||||
# If the response contains a URL or JSON-like output, it probably completed
|
||||
if "http://" in response or "https://" in response or '"status"' in response:
|
||||
return False
|
||||
# If it contains a tool result indicator (ha-ctl output, gaze-ctl output)
|
||||
if any(kw in resp_lower for kw in ["image_url", "seed", "entity_id", "state:", "turned on", "turned off"]):
|
||||
return False
|
||||
# Detect promise-like language without substance
|
||||
promise_phrases = [
|
||||
"let me", "i'll ", "i will ", "sure thing", "sure,", "right away",
|
||||
"generating", "one moment", "working on", "hang on", "just a moment",
|
||||
"on it", "let me generate", "let me create",
|
||||
]
|
||||
has_promise = any(phrase in resp_lower for phrase in promise_phrases)
|
||||
# Short responses with promise language are likely incomplete
|
||||
if has_promise and len(response) < 200:
|
||||
return True
|
||||
return False
|
||||
|
||||
def _handle_agent_request(self):
    """Handle an agent message request end to end.

    Resolves the character (explicit ID > satellite mapping > default),
    injects the character system prompt and memories, publishes the active
    TTS config for the Wyoming server, then calls the OpenClaw CLI —
    re-prompting once when the model merely promises to act.
    """
    # Fix: this block contained merged old/new diff lines (old inline
    # subprocess.run call, old timeout message, old prompt load); this is
    # the v2 behavior.
    content_length = int(self.headers.get("Content-Length", 0))
    # NOTE(review): the body-read/JSON-parse section fell inside a diff
    # gap; reconstructed to mirror _handle_tts_request — confirm upstream.
    if content_length == 0:
        self._send_json_response(400, {"error": "Empty body"})
        return
    try:
        body = self.rfile.read(content_length).decode()
        data = json.loads(body)
    except json.JSONDecodeError:
        self._send_json_response(400, {"error": "Invalid JSON"})
        return

    message = data.get("message")
    agent = data.get("agent", "main")
    satellite_id = data.get("satellite_id")
    explicit_character_id = data.get("character_id")

    if not message:
        self._send_json_response(400, {"error": "Message is required"})
        return

    # Resolve character: explicit ID > satellite mapping > default
    if explicit_character_id:
        character_id = explicit_character_id
    else:
        character_id = resolve_character_id(satellite_id)
    system_prompt = load_character_prompt(character_id=character_id)

    # Set the active TTS config for the Wyoming server to pick up
    char = load_character(character_id)
    tts_config = char.get("tts", {})
    if tts_config:
        set_active_tts_voice(character_id, tts_config)
        engine = tts_config.get("engine", "kokoro")
        voice_label = tts_config.get("kokoro_voice", "") if engine == "kokoro" else tts_config.get("elevenlabs_voice_id", "")
        print(f"[OpenClaw Bridge] Active TTS: {engine} / {voice_label}")

    if satellite_id:
        print(f"[OpenClaw Bridge] Satellite: {satellite_id} → character: {character_id}")
    elif explicit_character_id:
        print(f"[OpenClaw Bridge] Character: {character_id}")
    if system_prompt:
        message = f"System Context: {system_prompt}\n\nUser Request: {message}"

    # Check if model is warm to set appropriate timeout
    warm = is_model_warm()
    timeout = TIMEOUT_WARM if warm else TIMEOUT_COLD
    print(f"[OpenClaw Bridge] Model {'warm' if warm else 'cold'}, timeout={timeout}s")

    # Signal avatar: thinking
    _vtube_fire_and_forget("/expression", {"event": "thinking"})

    # Call OpenClaw CLI (use full path for launchd compatibility)
    try:
        response_text = self._call_openclaw(message, agent, timeout)

        # Re-prompt if the model promised to act but didn't call a tool.
        # Detect "I'll do X" / "Let me X" responses that lack any result.
        if self._needs_followup(response_text):
            print("[OpenClaw Bridge] Response looks like a promise without action, re-prompting")
            followup = (
                "You just said you would do something but didn't actually call the exec tool. "
                "Do NOT explain what you will do — call the tool NOW using exec and return the result."
            )
            response_text = self._call_openclaw(followup, agent, timeout)

        # Signal avatar: idle (TTS handler will override to 'speaking' if voice is used)
        _vtube_fire_and_forget("/expression", {"event": "idle"})
        self._send_json_response(200, {"response": response_text})
    except subprocess.TimeoutExpired:
        self._send_json_response(504, {"error": f"OpenClaw command timed out after {timeout}s (model was {'warm' if warm else 'cold'})"})
    except subprocess.CalledProcessError as e:
        error_msg = e.stderr.strip() if e.stderr else "OpenClaw command failed"
        self._send_json_response(500, {"error": error_msg})
|
||||
|
||||
Reference in New Issue
Block a user