#!/bin/bash
# Keep voice pipeline models warm in Ollama VRAM.
# Runs as a loop — checks every 5 minutes, re-pins any model that got evicted.
# Only pins lightweight/MoE models — large dense models (70B) use default expiry.

OLLAMA_URL="http://localhost:11434"
CHECK_INTERVAL=300   # seconds between checks

# Medium model can be overridden via env var (e.g. by persona config)
HOMEAI_MEDIUM_MODEL="${HOMEAI_MEDIUM_MODEL:-qwen3.5:35b-a3b}"

# Models to keep warm: "name|description"
MODELS=(
  "qwen2.5:7b|small (4.7GB) — fast fallback"
  "${HOMEAI_MEDIUM_MODEL}|medium — persona default"
)

# Poll /api/tags until Ollama responds (30 tries x 2s = 60s max).
wait_for_ollama() {
  for _ in $(seq 1 30); do
    curl -sf "$OLLAMA_URL/api/tags" > /dev/null 2>&1 && return 0
    sleep 2
  done
  return 1
}

# Exit 0 if the model appears in /api/ps (i.e. is currently resident).
# The model name is passed to Python via argv rather than interpolated
# into the source, so names containing quotes can't break the check.
is_model_loaded() {
  local model="$1"
  curl -sf "$OLLAMA_URL/api/ps" 2>/dev/null \
    | python3 -c '
import json, sys
data = json.load(sys.stdin)
names = [m.get("name") for m in data.get("models", [])]
sys.exit(0 if sys.argv[1] in names else 1)
' "$model" 2>/dev/null
}

# Load a model with keep_alive=-1 (pin indefinitely) unless it is
# already resident. The tiny prompt and num_ctx keep the warmup cheap.
pin_model() {
  local model="$1"
  local desc="$2"
  if is_model_loaded "$model"; then
    echo "[keepwarm] $model already loaded — skipping"
    return 0
  fi
  echo "[keepwarm] Loading $model ($desc) with keep_alive=-1..."
  if curl -sf "$OLLAMA_URL/api/generate" \
       -d "{\"model\":\"$model\",\"prompt\":\"ready\",\"stream\":false,\"keep_alive\":-1,\"options\":{\"num_ctx\":512}}" \
       > /dev/null 2>&1; then
    echo "[keepwarm] $model pinned in VRAM"
  else
    echo "[keepwarm] ERROR: failed to load $model"
  fi
}

# --- Main loop ---
echo "[keepwarm] Starting model keep-warm daemon (interval: ${CHECK_INTERVAL}s)"

# Initial wait for Ollama
if ! wait_for_ollama; then
  echo "[keepwarm] ERROR: Ollama not reachable after 60s, exiting"
  exit 1
fi
echo "[keepwarm] Ollama is online"

while true; do
  for entry in "${MODELS[@]}"; do
    IFS='|' read -r model desc <<< "$entry"
    pin_model "$model" "$desc"
  done
  sleep "$CHECK_INTERVAL"
done
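
# --- Manual check ---
# To see which models are currently resident (and when they expire),
# query the ps endpoint directly:
#
#   curl -s http://localhost:11434/api/ps | python3 -m json.tool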
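
# --- Optional extension (sketch, commented out) ---
# Ollama unloads a model immediately when it receives keep_alive=0 on
# /api/generate, so a shutdown hook could release the pinned models
# when the daemon stops. Note the function and trap would need to be
# defined *before* the main loop above, and whether eviction-on-stop
# is desirable at all is an assumption about your setup:
#
# unpin_all() {
#   local entry model
#   for entry in "${MODELS[@]}"; do
#     IFS='|' read -r model _ <<< "$entry"
#     curl -sf "$OLLAMA_URL/api/generate" \
#       -d "{\"model\":\"$model\",\"keep_alive\":0}" \
#       > /dev/null 2>&1
#   done
# }
# trap unpin_all EXIT TERM INT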
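
# --- Deployment note (sketch) ---
# One way to run this as a daemon is a systemd service. The unit below
# is an illustrative example only; the unit name, install path, and
# ordering against ollama.service are assumptions, so adjust them for
# your setup:
#
#   [Unit]
#   Description=Keep Ollama voice-pipeline models warm
#   After=network-online.target ollama.service
#   Wants=ollama.service
#
#   [Service]
#   Type=simple
#   ExecStart=/usr/local/bin/ollama-keepwarm.sh
#   Restart=on-failure
#   RestartSec=10
#
#   [Install]
#   WantedBy=multi-user.target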