Files
homeai/homeai-voice/scripts/benchmark_pipeline.py
Aodhan Collins af6b7bd945 feat: upgrade voice pipeline — MLX Whisper STT (20x faster), Qwen3.5 MoE LLM, fix HA tool calling
- Replace faster-whisper with wyoming-mlx-whisper (whisper-large-v3-turbo, MLX Metal GPU)
  STT latency: 8.4s → 400ms for short voice commands
- Add Qwen3.5-35B-A3B (MoE, 3B active params, Q8_0) to Ollama — 26.7 tok/s vs 5.4 tok/s (70B)
- Add model preload launchd service to pin voice model in VRAM permanently
- Fix HA tool calling: set commands.native=true, symlink ha-ctl to PATH
- Add pipeline benchmark script (STT/LLM/TTS latency profiling)
- Add service restart buttons and STT endpoint to dashboard
- Bind Vite dev server to 0.0.0.0 for LAN access

Total estimated pipeline latency: ~27s → ~4s

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-13 18:03:12 +00:00

382 lines
14 KiB
Python

#!/usr/bin/env python3
"""
Voice Pipeline Benchmark
Measures latency of each stage independently:
1. STT (Wyoming Whisper, port 10300)
2. LLM (Ollama API, port 11434) — multiple models
3. TTS (Wyoming Kokoro, port 10301)
4. End-to-end via OpenClaw HTTP Bridge (port 8081)
Usage:
python benchmark_pipeline.py [--rounds 3] [--models llama3.3:70b,qwen3:32b]
"""
import argparse
import asyncio
import io
import json
import statistics
import sys
import time
import wave
from urllib.request import Request, urlopen
from urllib.error import URLError
from wyoming.audio import AudioChunk, AudioStart, AudioStop
from wyoming.asr import Transcribe, Transcript
from wyoming.client import AsyncTcpClient
from wyoming.tts import Synthesize, SynthesizeVoice
# --- Config ---
# Wyoming STT service endpoint (Whisper, see module docstring).
STT_HOST, STT_PORT = "127.0.0.1", 10300
# Wyoming TTS service endpoint (Kokoro).
TTS_HOST, TTS_PORT = "127.0.0.1", 10301
# Ollama REST API base URL.
OLLAMA_URL = "http://localhost:11434"
# OpenClaw HTTP bridge base URL (used by the end-to-end benchmark).
BRIDGE_URL = "http://localhost:8081"
# Short utterances cycled through the TTS benchmark rounds.
TEST_PROMPTS = [
    "What is the capital of France?",
    "Turn on the living room lights.",
    "What's the weather like today?",
]
# Default LLM benchmark prompt (overridable via --prompt).
LONG_PROMPT = "Explain in two sentences how a heat pump works."
# --- Helpers ---
def http_post_json(url: str, data: dict, timeout: int = 180) -> tuple[dict, float]:
    """POST *data* as JSON to *url*; return (parsed_response, elapsed_seconds).

    Raises urllib.error.URLError/HTTPError on connection or HTTP failure.
    Timing covers the request plus reading the full response body.
    """
    body = json.dumps(data).encode()
    req = Request(url, data=body, headers={"Content-Type": "application/json"})
    t0 = time.perf_counter()
    # Context manager closes the response socket (the original leaked it).
    with urlopen(req, timeout=timeout) as resp:
        raw = resp.read()
    elapsed = time.perf_counter() - t0
    return json.loads(raw), elapsed
def http_post_raw(url: str, data: bytes, content_type: str, timeout: int = 180) -> tuple[bytes, float]:
    """POST raw *data* bytes to *url*; return (response_bytes, elapsed_seconds).

    Raises urllib.error.URLError/HTTPError on connection or HTTP failure.
    """
    req = Request(url, data=data, headers={"Content-Type": content_type})
    t0 = time.perf_counter()
    # Context manager closes the response socket (the original leaked it).
    with urlopen(req, timeout=timeout) as resp:
        raw = resp.read()
    elapsed = time.perf_counter() - t0
    return raw, elapsed
# --- Stage 1: TTS ---
async def benchmark_tts(text: str) -> tuple[bytes, float, float]:
    """Synthesize *text* via the Wyoming TTS service.

    Returns (wav_bytes, elapsed_seconds, audio_duration_seconds).
    (The original annotation claimed a 2-tuple, but three values are
    returned and all callers unpack three.)
    """
    t0 = time.perf_counter()
    client = AsyncTcpClient(TTS_HOST, TTS_PORT)
    await client.connect()
    await client.read_event()  # Info — per original; assumes server sends one first
    await client.write_event(
        Synthesize(text=text, voice=SynthesizeVoice(name="af_heart")).event()
    )
    audio_data = bytearray()
    # Fallback format in case no AudioStart arrives (24 kHz, 16-bit mono).
    rate, width, channels = 24000, 2, 1
    while True:
        event = await client.read_event()
        if event is None:
            break  # connection closed before AudioStop
        if AudioStart.is_type(event.type):
            start = AudioStart.from_event(event)
            rate, width, channels = start.rate, start.width, start.channels
        elif AudioChunk.is_type(event.type):
            audio_data.extend(AudioChunk.from_event(event).audio)
        elif AudioStop.is_type(event.type):
            break
    await client.disconnect()
    elapsed = time.perf_counter() - t0
    # Package raw PCM as a WAV container so the STT benchmark can reuse it.
    wav_io = io.BytesIO()
    with wave.open(wav_io, "wb") as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(width)
        wf.setframerate(rate)
        wf.writeframes(audio_data)
    duration_s = len(audio_data) / (rate * width * channels)
    return wav_io.getvalue(), elapsed, duration_s
# --- Stage 2: STT ---
async def benchmark_stt(wav_bytes: bytes) -> tuple[str, float]:
    """Transcribe WAV audio through the Wyoming STT service.

    Returns (transcript_text, elapsed_seconds). The clock starts after the
    WAV is decoded, so only network + inference time is measured.
    """
    # Decode the WAV container into raw PCM plus its format parameters.
    with wave.open(io.BytesIO(wav_bytes), "rb") as wav:
        sample_rate = wav.getframerate()
        sample_width = wav.getsampwidth()
        n_channels = wav.getnchannels()
        pcm = wav.readframes(wav.getnframes())

    start_time = time.perf_counter()
    client = AsyncTcpClient(STT_HOST, STT_PORT)
    await client.connect()
    await client.write_event(Transcribe(language="en").event())
    await client.write_event(
        AudioStart(rate=sample_rate, width=sample_width, channels=n_channels).event()
    )

    # Stream the PCM in one-second chunks.
    bytes_per_second = sample_rate * sample_width * n_channels
    offset = 0
    while offset < len(pcm):
        await client.write_event(
            AudioChunk(
                rate=sample_rate,
                width=sample_width,
                channels=n_channels,
                audio=pcm[offset:offset + bytes_per_second],
            ).event()
        )
        offset += bytes_per_second
    await client.write_event(AudioStop().event())

    # Wait for the transcript (or a dropped connection).
    transcript = ""
    while (event := await client.read_event()) is not None:
        if Transcript.is_type(event.type):
            transcript = Transcript.from_event(event).text
            break
    await client.disconnect()
    return transcript, time.perf_counter() - start_time
# --- Stage 3: LLM ---
def benchmark_llm(model: str, prompt: str, warm: bool = False) -> dict:
    """Call Ollama /api/generate once and return a timing-breakdown dict.

    model: Ollama model tag (e.g. "qwen3:32b").
    prompt: text sent with stream=False (single blocking response).
    warm: informational only — callers pass True after warm_model(); the
        flag does not change the request. The "load_s" field in the result
        shows the actual model-load cost either way.
    """
    data = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {"num_ctx": 2048},  # small ctx for benchmark speed
    }
    body = json.dumps(data).encode()
    req = Request(
        f"{OLLAMA_URL}/api/generate",
        data=body,
        headers={"Content-Type": "application/json"},
    )
    t0 = time.perf_counter()
    # Context manager closes the response socket (the original leaked it).
    with urlopen(req, timeout=300) as resp:
        raw = resp.read()
    wall_time = time.perf_counter() - t0
    result = json.loads(raw)
    # Ollama reports all durations in nanoseconds.
    load_ns = result.get("load_duration", 0)
    prompt_ns = result.get("prompt_eval_duration", 0)
    eval_ns = result.get("eval_duration", 0)
    total_ns = result.get("total_duration", 0)
    prompt_tokens = result.get("prompt_eval_count", 0)
    eval_tokens = result.get("eval_count", 0)
    return {
        "model": model,
        "wall_time_s": wall_time,
        "load_s": load_ns / 1e9,
        "prompt_eval_s": prompt_ns / 1e9,
        "eval_s": eval_ns / 1e9,
        "total_s": total_ns / 1e9,
        "prompt_tokens": prompt_tokens,
        "eval_tokens": eval_tokens,
        # Guard against zero durations (e.g. fully cached prompt).
        "tokens_per_sec": eval_tokens / (eval_ns / 1e9) if eval_ns > 0 else 0,
        "prompt_tokens_per_sec": prompt_tokens / (prompt_ns / 1e9) if prompt_ns > 0 else 0,
        "response": result.get("response", "")[:200],  # preview only
    }
def warm_model(model: str):
    """Send a tiny generate request so Ollama loads *model* into GPU memory.

    Best-effort: any failure is printed as a warning, never raised, so the
    benchmark continues even when a model is missing or the server is down.
    """
    print(f" Warming up {model}...", end=" ", flush=True)
    try:
        data = json.dumps({"model": model, "prompt": "hi", "stream": False, "options": {"num_ctx": 512}}).encode()
        req = Request(f"{OLLAMA_URL}/api/generate", data=data, headers={"Content-Type": "application/json"})
        # Context manager closes the response socket (the original leaked it).
        with urlopen(req, timeout=300) as resp:
            resp.read()
        print("ready")
    except Exception as e:
        # Deliberately broad: warm-up failures must not abort the benchmark.
        print(f"warning: {e}")
# --- Stage 4: End-to-end via bridge ---
def benchmark_e2e(message: str) -> tuple[str, float]:
    """Send *message* through the OpenClaw HTTP bridge; return (reply_text, elapsed_seconds)."""
    payload = {"message": message, "agent": "main"}
    reply, elapsed = http_post_json(f"{BRIDGE_URL}/api/agent/message", payload, timeout=300)
    return reply.get("response", ""), elapsed
# --- Formatting ---
def fmt_time(seconds: float) -> str:
    """Render a duration: sub-second values as "450ms", otherwise "2.3s"."""
    return f"{seconds * 1000:.0f}ms" if seconds < 1 else f"{seconds:.1f}s"
def print_table(rows: list[dict], columns: list[tuple[str, str, int]]):
    """Print *rows* as a fixed-width table.

    columns is a list of (header, row_key, width) triples; keys missing
    from a row render as the empty string.
    """
    header_line = " | ".join(header.ljust(width) for header, _, width in columns)
    print(header_line)
    print("-" * len(header_line))
    for row in rows:
        cells = (str(row.get(key, "")).ljust(width) for _, key, width in columns)
        print(" | ".join(cells))
# --- Main ---
def main():
    """Parse CLI flags and run the selected benchmark stages in order.

    Stages (each individually skippable): TTS, STT (reuses the last TTS
    WAV when available), LLM (per-model, with warm-up), end-to-end bridge.
    """
    parser = argparse.ArgumentParser(description="Voice Pipeline Benchmark")
    parser.add_argument("--rounds", type=int, default=2, help="Rounds per test (default: 2)")
    parser.add_argument(
        "--models",
        default="qwen2.5:7b,qwen3:32b,llama3.3:70b",
        help="Comma-separated Ollama models to test",
    )
    parser.add_argument("--skip-stt", action="store_true", help="Skip STT benchmark")
    parser.add_argument("--skip-tts", action="store_true", help="Skip TTS benchmark")
    parser.add_argument("--skip-llm", action="store_true", help="Skip LLM benchmark")
    parser.add_argument("--skip-e2e", action="store_true", help="Skip end-to-end benchmark")
    parser.add_argument("--prompt", default=None, help="Custom prompt for LLM benchmark")
    args = parser.parse_args()

    models = [m.strip() for m in args.models.split(",")]
    llm_prompt = args.prompt or LONG_PROMPT

    print("=" * 70)
    print(" VOICE PIPELINE BENCHMARK")
    print("=" * 70)
    print(f" Rounds: {args.rounds}")
    print(f" Models: {', '.join(models)}")
    print(f" LLM prompt: {llm_prompt!r}")
    print()

    # ── TTS Benchmark ──
    test_wav = None  # last synthesized WAV, reused by the STT benchmark
    if not args.skip_tts:
        print("── TTS (Kokoro, Wyoming port 10301) ──")
        tts_times = []
        tts_durations = []
        for i in range(args.rounds):
            text = TEST_PROMPTS[i % len(TEST_PROMPTS)]
            wav, elapsed, audio_dur = asyncio.run(benchmark_tts(text))
            tts_times.append(elapsed)
            tts_durations.append(audio_dur)
            test_wav = wav
            # Guard: zero-length audio would make the RTF division crash.
            rtf = elapsed / audio_dur if audio_dur > 0 else float("inf")
            print(f" Round {i+1}: {fmt_time(elapsed)} → {audio_dur:.1f}s audio (RTF: {rtf:.2f}x) text={text!r}")
        avg_tts = statistics.mean(tts_times)
        avg_dur = statistics.mean(tts_durations)
        avg_rtf = avg_tts / avg_dur if avg_dur > 0 else float("inf")
        print(f" Average: {fmt_time(avg_tts)} for {avg_dur:.1f}s audio (RTF: {avg_rtf:.2f}x)")
        print()

    # ── STT Benchmark ──
    if not args.skip_stt:
        print("── STT (Whisper large-v3, Wyoming port 10300) ──")
        if test_wav is None:
            # TTS stage was skipped — synthesize a fixed utterance to transcribe.
            print(" Generating test audio via TTS...")
            test_wav, _, _ = asyncio.run(benchmark_tts("The quick brown fox jumps over the lazy dog."))
        stt_times = []
        for i in range(args.rounds):
            text, elapsed = asyncio.run(benchmark_stt(test_wav))
            stt_times.append(elapsed)
            print(f" Round {i+1}: {fmt_time(elapsed)} → {text!r}")
        print(f" Average: {fmt_time(statistics.mean(stt_times))}")
        print()

    # ── LLM Benchmark ──
    if not args.skip_llm:
        print("── LLM (Ollama) ──")
        print(f" Prompt: {llm_prompt!r}")
        print()
        all_results = []
        for model in models:
            print(f" Model: {model}")
            # Load the model first so rounds measure steady-state speed.
            warm_model(model)
            model_runs = []
            for i in range(args.rounds):
                result = benchmark_llm(model, llm_prompt, warm=True)
                model_runs.append(result)
                print(
                    f" Round {i+1}: wall={fmt_time(result['wall_time_s'])} "
                    f"load={fmt_time(result['load_s'])} "
                    f"prompt_eval={fmt_time(result['prompt_eval_s'])} ({result['prompt_tokens']}tok, {result['prompt_tokens_per_sec']:.0f}t/s) "
                    f"gen={fmt_time(result['eval_s'])} ({result['eval_tokens']}tok, {result['tokens_per_sec']:.1f}t/s)"
                )
                # Truncate response for display
                resp_preview = result["response"][:100].replace("\n", " ")
                print(f" → {resp_preview}")
            # Summarize this model's rounds.
            avg_wall = statistics.mean(r["wall_time_s"] for r in model_runs)
            avg_tps = statistics.mean(r["tokens_per_sec"] for r in model_runs)
            avg_prompt_tps = statistics.mean(r["prompt_tokens_per_sec"] for r in model_runs)
            avg_tokens = statistics.mean(r["eval_tokens"] for r in model_runs)
            all_results.append({
                "model": model,
                "avg_wall": fmt_time(avg_wall),
                "avg_gen_tps": f"{avg_tps:.1f}",
                "avg_prompt_tps": f"{avg_prompt_tps:.0f}",
                "avg_tokens": f"{avg_tokens:.0f}",
            })
            print()
        # Summary table
        print(" ┌─ LLM Summary ─────────────────────────────────────────────┐")
        print(f" {'Model':<25s} {'Wall time':>10s} {'Gen t/s':>10s} {'Prompt t/s':>11s} {'Avg tokens':>11s}")
        # BUG FIX: the original printed {''*25} etc. (empty string repeated),
        # which produced a blank line instead of the intended header underline.
        print(f" {'─'*25} {'─'*10} {'─'*10} {'─'*11} {'─'*11}")
        for r in all_results:
            print(f" {r['model']:<25s} {r['avg_wall']:>10s} {r['avg_gen_tps']:>10s} {r['avg_prompt_tps']:>11s} {r['avg_tokens']:>11s}")
        print()

    # ── End-to-end ──
    if not args.skip_e2e:
        print("── End-to-End (Bridge → OpenClaw → Ollama → response) ──")
        print(" (Does not include STT/TTS — just text in → text out via bridge)")
        e2e_prompt = "What time is it?"
        for i in range(args.rounds):
            try:
                resp, elapsed = benchmark_e2e(e2e_prompt)
                preview = resp[:100].replace("\n", " ")
                print(f" Round {i+1}: {fmt_time(elapsed)} → {preview}")
            except Exception as e:
                # Best-effort: the bridge may be down — report and continue.
                print(f" Round {i+1}: ERROR - {e}")
        print()

    # ── Pipeline estimate ──
    print("=" * 70)
    print(" ESTIMATED PIPELINE LATENCY (per voice interaction)")
    print("=" * 70)
    print(" wake word detection ~instant (runs locally)")
    print(" + STT (Whisper) see above")
    print(" + LLM (inference) see above (dominant cost)")
    print(" + TTS (Kokoro) see above")
    print(" ─────────────────────────────────────")
    print(" Tip: smaller models (7B, 32B) dramatically reduce LLM latency.")
    print(" The 70B model at ~12 tok/s needs ~5-8s for a typical reply.")
    print(" A 7B model at ~80 tok/s would need <1s for the same reply.")
    print()
# Script entry point — run all benchmark stages per CLI flags.
if __name__ == "__main__":
    main()