feat: character system v2 — schema upgrade, memory system, per-character TTS routing
Character schema v2: background, dialogue_style, appearance, skills, gaze_presets with automatic v1→v2 migration. LLM-assisted character creation via Character MCP server. Two-tier memory system (personal per-character + general shared) with budget-based injection into LLM system prompt. Per-character TTS voice routing via state file — Wyoming TTS server reads active config to route between Kokoro (local) and ElevenLabs (cloud PCM 24kHz). Dashboard: memories page, conversation history, character profile on cards, auto-TTS engine selection from character config. Also includes VTube Studio expression bridge and ComfyUI API guide. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,19 +1,73 @@
|
||||
#!/bin/bash
|
||||
# Keep voice pipeline models warm in Ollama VRAM with infinite keep_alive.
# Runs as a daemon loop (started by launchd after Ollama, or manually):
# checks every 5 minutes and re-pins any model that was evicted.
# Only pins lightweight/MoE models — large dense models (70B) use default expiry.
|
||||
# --- Configuration (constants; marked readonly to catch accidental writes) ---
readonly OLLAMA_URL="http://localhost:11434"  # local Ollama API endpoint
readonly CHECK_INTERVAL=300                   # seconds between re-pin checks
|
||||
# Wait for Ollama to be ready before doing anything else.
# Fix: the original loop was missing its closing 'done' (merge remnant),
# which made the entire script a syntax error.
for _ in $(seq 1 30); do
  curl -sf "$OLLAMA_URL/api/tags" > /dev/null 2>&1 && break
  sleep 2
done
# Medium model can be overridden via env var (e.g. by persona config);
# after the default is resolved it is treated as a constant.
HOMEAI_MEDIUM_MODEL="${HOMEAI_MEDIUM_MODEL:-qwen3.5:35b-a3b}"
readonly HOMEAI_MEDIUM_MODEL

# Models to keep warm: "name|description" (split on '|' in the main loop)
readonly MODELS=(
  "qwen2.5:7b|small (4.7GB) — fast fallback"
  "${HOMEAI_MEDIUM_MODEL}|medium — persona default"
)
|
||||
#######################################
# Poll the Ollama API until it answers, for up to ~60s (30 tries x 2s).
# Globals:   OLLAMA_URL (read)
# Arguments: none
# Returns:   0 once reachable, 1 on timeout
#######################################
wait_for_ollama() {
  local attempt
  # {1..30} instead of $(seq 1 30): no external process, same range.
  for attempt in {1..30}; do
    curl -sf "$OLLAMA_URL/api/tags" > /dev/null 2>&1 && return 0
    sleep 2
  done
  return 1
}
|
||||
#######################################
# Check whether a model is currently resident in Ollama (via /api/ps).
# Globals:   OLLAMA_URL (read)
# Arguments: $1 - model name as reported by Ollama
# Returns:   0 if loaded, 1 if not loaded or the API call fails
#######################################
is_model_loaded() {
  local model="$1"
  # Pass the model name as argv rather than interpolating it into the
  # Python source: a quote or backslash in the name can no longer break
  # (or inject into) the inline script.
  curl -sf "$OLLAMA_URL/api/ps" 2>/dev/null \
    | python3 -c "
import json, sys
data = json.load(sys.stdin)
names = [m['name'] for m in data.get('models', [])]
sys.exit(0 if sys.argv[1] in names else 1)
" "$model" 2>/dev/null
}
|
||||
#######################################
# Load a model into VRAM with keep_alive=-1 (never evict), unless it is
# already resident.
# Globals:   OLLAMA_URL (read)
# Arguments: $1 - model name; $2 - human-readable description (log only)
# Outputs:   progress to stdout, failures to stderr
# Returns:   0 on success or skip; non-zero only via early return paths
#######################################
pin_model() {
  local model="$1"
  local desc="$2"

  if is_model_loaded "$model"; then
    echo "[keepwarm] $model already loaded — skipping"
    return 0
  fi

  echo "[keepwarm] Loading $model ($desc) with keep_alive=-1..."
  # num_ctx=512 keeps the warm-up generation cheap; keep_alive=-1 pins it.
  # Test the curl command directly instead of the '[ $? -eq 0 ]' anti-pattern.
  if curl -sf "$OLLAMA_URL/api/generate" \
      -d "{\"model\":\"$model\",\"prompt\":\"ready\",\"stream\":false,\"keep_alive\":-1,\"options\":{\"num_ctx\":512}}" \
      > /dev/null 2>&1; then
    echo "[keepwarm] $model pinned in VRAM"
  else
    # Diagnostics belong on stderr.
    echo "[keepwarm] ERROR: failed to load $model" >&2
  fi
}
|
||||
# --- Main loop ---

echo "[keepwarm] Starting model keep-warm daemon (interval: ${CHECK_INTERVAL}s)"

# Initial wait for Ollama; bail out if it never comes up.
if ! wait_for_ollama; then
  echo "[keepwarm] ERROR: Ollama not reachable after 60s, exiting"
  exit 1
fi
echo "[keepwarm] Ollama is online"

# Re-check every model on every pass; pin_model skips those still resident.
while :; do
  for entry in "${MODELS[@]}"; do
    IFS='|' read -r model desc <<< "$entry"
    pin_model "$model" "$desc"
  done

  sleep "$CHECK_INTERVAL"
done
|
||||
# NOTE(review): removed an unreachable one-shot pin of qwen3.5:35b-a3b that
# followed the infinite loop above — control can never reach it, and that
# model is already kept warm via MODELS + pin_model in the main loop.
|
||||
Reference in New Issue
Block a user