#!/bin/bash
# Pre-load voice pipeline models into Ollama with infinite keep_alive.
# Run after Ollama starts (called by launchd or manually).
# Only pins lightweight/MoE models — large dense models (70B) use default expiry.
#
# Env:
#   OLLAMA_URL  Base URL of the Ollama server (default http://localhost:11434).
# Exit status:
#   0 on success; 1 if Ollama is unreachable or the preload request fails.

set -euo pipefail

OLLAMA_URL="${OLLAMA_URL:-http://localhost:11434}"
readonly OLLAMA_URL

# Wait for Ollama to be ready (30 attempts x 2s ≈ 60s), then fail loudly
# instead of silently falling through to the preload call.
ready=0
for _ in {1..30}; do
  if curl -sf "$OLLAMA_URL/api/tags" > /dev/null 2>&1; then
    ready=1
    break
  fi
  sleep 2
done
if (( ready == 0 )); then
  echo "[preload] ERROR: Ollama not reachable at $OLLAMA_URL after 60s" >&2
  exit 1
fi

# Pin qwen3.5:35b-a3b (MoE, 38.7GB VRAM, voice pipeline default).
# keep_alive:-1 keeps the model resident indefinitely; num_ctx:512 keeps this
# warm-up request cheap — the tiny "ready" prompt needs no real context.
echo "[preload] Loading qwen3.5:35b-a3b with keep_alive=-1..."
if curl -sf "$OLLAMA_URL/api/generate" \
  -d '{"model":"qwen3.5:35b-a3b","prompt":"ready","stream":false,"keep_alive":-1,"options":{"num_ctx":512}}' \
  > /dev/null; then
  echo "[preload] qwen3.5:35b-a3b pinned in memory"
else
  echo "[preload] ERROR: failed to load qwen3.5:35b-a3b" >&2
  exit 1
fi