#!/usr/bin/env python3 """ Voice Pipeline Benchmark Measures latency of each stage independently: 1. STT (Wyoming Whisper, port 10300) 2. LLM (Ollama API, port 11434) — multiple models 3. TTS (Wyoming Kokoro, port 10301) 4. End-to-end via OpenClaw HTTP Bridge (port 8081) Usage: python benchmark_pipeline.py [--rounds 3] [--models llama3.3:70b,qwen3:32b] """ import argparse import asyncio import io import json import statistics import sys import time import wave from urllib.request import Request, urlopen from urllib.error import URLError from wyoming.audio import AudioChunk, AudioStart, AudioStop from wyoming.asr import Transcribe, Transcript from wyoming.client import AsyncTcpClient from wyoming.tts import Synthesize, SynthesizeVoice # --- Config --- STT_HOST, STT_PORT = "127.0.0.1", 10300 TTS_HOST, TTS_PORT = "127.0.0.1", 10301 OLLAMA_URL = "http://localhost:11434" BRIDGE_URL = "http://localhost:8081" TEST_PROMPTS = [ "What is the capital of France?", "Turn on the living room lights.", "What's the weather like today?", ] LONG_PROMPT = "Explain in two sentences how a heat pump works." 
# --- Helpers ---
def http_post_json(url: str, data: dict, timeout: int = 180) -> tuple[dict, float]:
    """POST JSON, return (response_dict, elapsed_seconds)."""
    body = json.dumps(data).encode()
    req = Request(url, data=body, headers={"Content-Type": "application/json"})
    t0 = time.perf_counter()
    # Context manager closes the HTTP connection even if read() raises.
    with urlopen(req, timeout=timeout) as resp:
        raw = resp.read()
    elapsed = time.perf_counter() - t0
    return json.loads(raw), elapsed


def http_post_raw(url: str, data: bytes, content_type: str, timeout: int = 180) -> tuple[bytes, float]:
    """POST raw bytes, return (response_bytes, elapsed_seconds)."""
    req = Request(url, data=data, headers={"Content-Type": content_type})
    t0 = time.perf_counter()
    with urlopen(req, timeout=timeout) as resp:
        raw = resp.read()
    elapsed = time.perf_counter() - t0
    return raw, elapsed


# --- Stage 1: TTS ---
async def benchmark_tts(text: str) -> tuple[bytes, float, float]:
    """Synthesize *text* via Wyoming TTS.

    Returns:
        (wav_bytes, elapsed_seconds, audio_duration_seconds) — note the
        third element: duration of the synthesized audio, used for RTF.
    """
    t0 = time.perf_counter()
    client = AsyncTcpClient(TTS_HOST, TTS_PORT)
    await client.connect()
    audio_data = bytearray()
    # Defaults match Kokoro's usual output; overwritten by AudioStart below.
    rate, width, channels = 24000, 2, 1
    try:
        await client.read_event()  # Info
        await client.write_event(
            Synthesize(text=text, voice=SynthesizeVoice(name="af_heart")).event()
        )
        while True:
            event = await client.read_event()
            if event is None:
                break
            if AudioStart.is_type(event.type):
                start = AudioStart.from_event(event)
                rate, width, channels = start.rate, start.width, start.channels
            elif AudioChunk.is_type(event.type):
                audio_data.extend(AudioChunk.from_event(event).audio)
            elif AudioStop.is_type(event.type):
                break
    finally:
        # Always release the TCP connection, even on a mid-stream error.
        await client.disconnect()
    elapsed = time.perf_counter() - t0

    # Package the raw PCM as a WAV container for the STT stage.
    wav_io = io.BytesIO()
    with wave.open(wav_io, "wb") as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(width)
        wf.setframerate(rate)
        wf.writeframes(audio_data)
    # bytes / (bytes-per-second) = seconds of audio.
    duration_s = len(audio_data) / (rate * width * channels)
    return wav_io.getvalue(), elapsed, duration_s


# --- Stage 2: STT ---
async def benchmark_stt(wav_bytes: bytes) -> tuple[str, float]:
    """Transcribe WAV via Wyoming STT, return (text, elapsed_seconds)."""
    wav_io = io.BytesIO(wav_bytes)
    with wave.open(wav_io, "rb") as wf:
        rate = wf.getframerate()
        width = wf.getsampwidth()
        channels = wf.getnchannels()
        pcm = wf.readframes(wf.getnframes())

    t0 = time.perf_counter()
    client = AsyncTcpClient(STT_HOST, STT_PORT)
    await client.connect()
    text = ""
    try:
        await client.write_event(Transcribe(language="en").event())
        await client.write_event(AudioStart(rate=rate, width=width, channels=channels).event())
        chunk_size = rate * width * channels  # 1 second of audio per chunk
        for off in range(0, len(pcm), chunk_size):
            await client.write_event(
                AudioChunk(rate=rate, width=width, channels=channels,
                           audio=pcm[off:off + chunk_size]).event()
            )
        await client.write_event(AudioStop().event())
        while True:
            event = await client.read_event()
            if event is None:
                break
            if Transcript.is_type(event.type):
                text = Transcript.from_event(event).text
                break
    finally:
        await client.disconnect()
    elapsed = time.perf_counter() - t0
    return text, elapsed


# --- Stage 3: LLM ---
def benchmark_llm(model: str, prompt: str, warm: bool = False) -> dict:
    """Call Ollama /api/generate, return a timing-breakdown dict.

    Args:
        model: Ollama model tag (e.g. "qwen3:32b").
        prompt: Prompt text to generate from.
        warm: Kept for interface compatibility; currently informational only
            (callers pass True after warm_model() has loaded the model).

    Returns:
        Dict with wall time, Ollama's ns-resolution stage timings converted
        to seconds, token counts, throughput, and a truncated response.
    """
    data = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {"num_ctx": 2048},  # small ctx for benchmark speed
    }
    body = json.dumps(data).encode()
    req = Request(
        f"{OLLAMA_URL}/api/generate",
        data=body,
        headers={"Content-Type": "application/json"},
    )
    t0 = time.perf_counter()
    with urlopen(req, timeout=300) as resp:
        raw = resp.read()
    wall_time = time.perf_counter() - t0
    result = json.loads(raw)

    # Ollama returns timing in nanoseconds
    load_ns = result.get("load_duration", 0)
    prompt_ns = result.get("prompt_eval_duration", 0)
    eval_ns = result.get("eval_duration", 0)
    total_ns = result.get("total_duration", 0)
    prompt_tokens = result.get("prompt_eval_count", 0)
    eval_tokens = result.get("eval_count", 0)

    return {
        "model": model,
        "wall_time_s": wall_time,
        "load_s": load_ns / 1e9,
        "prompt_eval_s": prompt_ns / 1e9,
        "eval_s": eval_ns / 1e9,
        "total_s": total_ns / 1e9,
        "prompt_tokens": prompt_tokens,
        "eval_tokens": eval_tokens,
        # Guard against zero durations (e.g. fully cached prompt).
        "tokens_per_sec": eval_tokens / (eval_ns / 1e9) if eval_ns > 0 else 0,
        "prompt_tokens_per_sec": prompt_tokens / (prompt_ns / 1e9) if prompt_ns > 0 else 0,
        "response": result.get("response", "")[:200],
    }


def warm_model(model: str):
    """Send a tiny request to load the model into GPU memory."""
    print(f"  Warming up {model}...", end=" ", flush=True)
    try:
        data = json.dumps({"model": model, "prompt": "hi", "stream": False,
                           "options": {"num_ctx": 512}}).encode()
        req = Request(f"{OLLAMA_URL}/api/generate", data=data,
                      headers={"Content-Type": "application/json"})
        with urlopen(req, timeout=300) as resp:
            resp.read()
        print("ready")
    except Exception as e:
        # Best-effort: a failed warm-up only makes the first round slower.
        print(f"warning: {e}")


# --- Stage 4: End-to-end via bridge ---
def benchmark_e2e(message: str) -> tuple[str, float]:
    """Call the OpenClaw HTTP bridge end-to-end, return (response_text, elapsed)."""
    data = {"message": message, "agent": "main"}
    resp, elapsed = http_post_json(f"{BRIDGE_URL}/api/agent/message", data, timeout=300)
    return resp.get("response", ""), elapsed


# --- Formatting ---
def fmt_time(seconds: float) -> str:
    """Format a duration: sub-second as integer ms, otherwise one-decimal s."""
    if seconds < 1:
        return f"{seconds*1000:.0f}ms"
    return f"{seconds:.1f}s"


def print_table(rows: list[dict], columns: list[tuple[str, str, int]]):
    """Print a formatted table. columns = [(header, key, width), ...]"""
    header = " | ".join(h.ljust(w) for h, _, w in columns)
    print(header)
    print("-" * len(header))
    for row in rows:
        line = " | ".join(str(row.get(k, "")).ljust(w) for _, k, w in columns)
        print(line)


# --- Main ---
def main():
    """Run the selected benchmark stages and print a report to stdout."""
    parser = argparse.ArgumentParser(description="Voice Pipeline Benchmark")
    parser.add_argument("--rounds", type=int, default=2, help="Rounds per test (default: 2)")
    parser.add_argument(
        "--models",
        default="qwen2.5:7b,qwen3:32b,llama3.3:70b",
        help="Comma-separated Ollama models to test",
    )
    parser.add_argument("--skip-stt", action="store_true", help="Skip STT benchmark")
    parser.add_argument("--skip-tts", action="store_true", help="Skip TTS benchmark")
    parser.add_argument("--skip-llm", action="store_true", help="Skip LLM benchmark")
    parser.add_argument("--skip-e2e", action="store_true", help="Skip end-to-end benchmark")
    parser.add_argument("--prompt", default=None, help="Custom prompt for LLM benchmark")
    args = parser.parse_args()

    models = [m.strip() for m in args.models.split(",")]
    llm_prompt = args.prompt or LONG_PROMPT

    print("=" * 70)
    print("  VOICE PIPELINE BENCHMARK")
    print("=" * 70)
    print(f"  Rounds: {args.rounds}")
    print(f"  Models: {', '.join(models)}")
    print(f"  LLM prompt: {llm_prompt!r}")
    print()

    # ── TTS Benchmark ──
    test_wav = None
    if not args.skip_tts:
        print("── TTS (Kokoro, Wyoming port 10301) ──")
        tts_times = []
        tts_durations = []
        for i in range(args.rounds):
            text = TEST_PROMPTS[i % len(TEST_PROMPTS)]
            wav, elapsed, audio_dur = asyncio.run(benchmark_tts(text))
            tts_times.append(elapsed)
            tts_durations.append(audio_dur)
            test_wav = wav
            # Guard RTF against zero-length audio (failed synthesis).
            rtf = elapsed / audio_dur if audio_dur else float("inf")
            print(f"  Round {i+1}: {fmt_time(elapsed)} → {audio_dur:.1f}s audio (RTF: {rtf:.2f}x) text={text!r}")
        avg_tts = statistics.mean(tts_times)
        avg_dur = statistics.mean(tts_durations)
        avg_rtf = avg_tts / avg_dur if avg_dur else float("inf")
        print(f"  Average: {fmt_time(avg_tts)} for {avg_dur:.1f}s audio (RTF: {avg_rtf:.2f}x)")
        print()

    # ── STT Benchmark ──
    if not args.skip_stt:
        print("── STT (Whisper large-v3, Wyoming port 10300) ──")
        if test_wav is None:
            # Generate a test WAV first
            print("  Generating test audio via TTS...")
            test_wav, _, _ = asyncio.run(benchmark_tts("The quick brown fox jumps over the lazy dog."))
        stt_times = []
        for i in range(args.rounds):
            text, elapsed = asyncio.run(benchmark_stt(test_wav))
            stt_times.append(elapsed)
            print(f"  Round {i+1}: {fmt_time(elapsed)} → {text!r}")
        print(f"  Average: {fmt_time(statistics.mean(stt_times))}")
        print()

    # ── LLM Benchmark ──
    if not args.skip_llm:
        print("── LLM (Ollama) ──")
        print(f"  Prompt: {llm_prompt!r}")
        print()
        all_results = []
        for model in models:
            print(f"  Model: {model}")
            warm_model(model)
            model_runs = []
            for i in range(args.rounds):
                result = benchmark_llm(model, llm_prompt, warm=True)
                model_runs.append(result)
                print(
                    f"  Round {i+1}: wall={fmt_time(result['wall_time_s'])} "
                    f"load={fmt_time(result['load_s'])} "
                    f"prompt_eval={fmt_time(result['prompt_eval_s'])} ({result['prompt_tokens']}tok, {result['prompt_tokens_per_sec']:.0f}t/s) "
                    f"gen={fmt_time(result['eval_s'])} ({result['eval_tokens']}tok, {result['tokens_per_sec']:.1f}t/s)"
                )
                # Truncate response for display
                resp_preview = result["response"][:100].replace("\n", " ")
                print(f"    → {resp_preview}")
            # Summarize
            avg_wall = statistics.mean(r["wall_time_s"] for r in model_runs)
            avg_tps = statistics.mean(r["tokens_per_sec"] for r in model_runs)
            avg_prompt_tps = statistics.mean(r["prompt_tokens_per_sec"] for r in model_runs)
            avg_tokens = statistics.mean(r["eval_tokens"] for r in model_runs)
            all_results.append({
                "model": model,
                "avg_wall": fmt_time(avg_wall),
                "avg_gen_tps": f"{avg_tps:.1f}",
                "avg_prompt_tps": f"{avg_prompt_tps:.0f}",
                "avg_tokens": f"{avg_tokens:.0f}",
            })
            print()

        # Summary table
        print("  ┌─ LLM Summary ─────────────────────────────────────────────┐")
        print(f"  {'Model':<25s} {'Wall time':>10s} {'Gen t/s':>10s} {'Prompt t/s':>11s} {'Avg tokens':>11s}")
        print(f"  {'─'*25} {'─'*10} {'─'*10} {'─'*11} {'─'*11}")
        for r in all_results:
            print(f"  {r['model']:<25s} {r['avg_wall']:>10s} {r['avg_gen_tps']:>10s} {r['avg_prompt_tps']:>11s} {r['avg_tokens']:>11s}")
        print()

    # ── End-to-end ──
    if not args.skip_e2e:
        print("── End-to-End (Bridge → OpenClaw → Ollama → response) ──")
        print("  (Does not include STT/TTS — just text in → text out via bridge)")
        e2e_prompt = "What time is it?"
        for i in range(args.rounds):
            try:
                resp, elapsed = benchmark_e2e(e2e_prompt)
                preview = resp[:100].replace("\n", " ")
                print(f"  Round {i+1}: {fmt_time(elapsed)} → {preview}")
            except Exception as e:
                print(f"  Round {i+1}: ERROR - {e}")
        print()

    # ── Pipeline estimate ──
    print("=" * 70)
    print("  ESTIMATED PIPELINE LATENCY (per voice interaction)")
    print("=" * 70)
    print("  wake word detection    ~instant (runs locally)")
    print("  + STT (Whisper)        see above")
    print("  + LLM (inference)      see above (dominant cost)")
    print("  + TTS (Kokoro)         see above")
    print("  ─────────────────────────────────────")
    print("  Tip: smaller models (7B, 32B) dramatically reduce LLM latency.")
    print("  The 70B model at ~12 tok/s needs ~5-8s for a typical reply.")
    print("  A 7B model at ~80 tok/s would need <1s for the same reply.")
    print()


if __name__ == "__main__":
    main()