#!/bin/bash
# Pre-load voice pipeline models into Ollama with infinite keep_alive.
# Run after Ollama starts (called by launchd or manually).
# Only pins lightweight/MoE models — large dense models (70B) use default expiry.
#
# Env:
#   OLLAMA_URL  Base URL of the Ollama server (default http://localhost:11434).
# Exit status:
#   0 on success; 1 if Ollama is unreachable or the preload request fails.

set -euo pipefail

OLLAMA_URL="${OLLAMA_URL:-http://localhost:11434}"
readonly OLLAMA_URL

# Wait for Ollama to be ready (30 attempts x 2s ≈ 60s), then fail loudly
# instead of silently falling through to the preload call.
ready=0
for _ in {1..30}; do
  if curl -sf "$OLLAMA_URL/api/tags" > /dev/null 2>&1; then
    ready=1
    break
  fi
  sleep 2
done
if (( ready == 0 )); then
  echo "[preload] ERROR: Ollama not reachable at $OLLAMA_URL after 60s" >&2
  exit 1
fi

# Pin qwen3.5:35b-a3b (MoE, 38.7GB VRAM, voice pipeline default).
# keep_alive:-1 keeps the model resident indefinitely; num_ctx:512 keeps this
# warm-up request cheap — the tiny "ready" prompt needs no real context.
echo "[preload] Loading qwen3.5:35b-a3b with keep_alive=-1..."
if curl -sf "$OLLAMA_URL/api/generate" \
  -d '{"model":"qwen3.5:35b-a3b","prompt":"ready","stream":false,"keep_alive":-1,"options":{"num_ctx":512}}' \
  > /dev/null; then
  echo "[preload] qwen3.5:35b-a3b pinned in memory"
else
  echo "[preload] ERROR: failed to load qwen3.5:35b-a3b" >&2
  exit 1
fi