feat: complete voice pipeline — fix wake word crash, bridge timeout, HA conversation agent

- Fix Wyoming satellite crash on wake word: convert macOS .aiff chimes to .wav
  (Python wave module only reads RIFF format, not AIFF)
- Fix OpenClaw HTTP bridge: increase subprocess timeout 30s → 120s, add SO_REUSEADDR
- Fix HA conversation component: use HTTP agent (not CLI) since HA runs in Docker
  on a different machine; update default host to Mac Mini IP, timeout to 120s
- Rewrite character manager as Vite+React app with schema validation
- Add Wyoming satellite wake word command, ElevenLabs TTS server, wakeword monitor
- Add Phase 5 development plan
- Update TODO.md: mark voice pipeline and agent tasks complete

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Aodhan Collins
2026-03-11 00:15:55 +00:00
parent 664bb6d275
commit 6db8ae4492
34 changed files with 4649 additions and 1083 deletions

View File

@@ -26,8 +26,29 @@ import argparse
import json
import subprocess
import sys
import asyncio
from http.server import HTTPServer, BaseHTTPRequestHandler
from urllib.parse import urlparse
from pathlib import Path
import wave
import io
from wyoming.client import AsyncTcpClient
from wyoming.tts import Synthesize
from wyoming.audio import AudioStart, AudioChunk, AudioStop
from wyoming.info import Info
def load_character_prompt() -> str:
"""Load the active character system prompt."""
character_path = Path.home() / ".openclaw" / "characters" / "aria.json"
if not character_path.exists():
return ""
try:
with open(character_path) as f:
data = json.load(f)
return data.get("system_prompt", "")
except Exception:
return ""
class OpenClawBridgeHandler(BaseHTTPRequestHandler):
@@ -48,17 +69,129 @@ class OpenClawBridgeHandler(BaseHTTPRequestHandler):
"""Handle POST requests."""
parsed_path = urlparse(self.path)
# Only handle the agent message endpoint
if parsed_path.path != "/api/agent/message":
self._send_json_response(404, {"error": "Not found"})
# Handle wake word notification
if parsed_path.path == "/wake":
self._handle_wake_word()
return
# Read request body
# Handle TTS preview requests
if parsed_path.path == "/api/tts":
self._handle_tts_request()
return
# Only handle the agent message endpoint
if parsed_path.path == "/api/agent/message":
self._handle_agent_request()
return
self._send_json_response(404, {"error": "Not found"})
def _handle_tts_request(self):
"""Handle TTS request and return wav audio."""
content_length = int(self.headers.get("Content-Length", 0))
if content_length == 0:
self._send_json_response(400, {"error": "Empty request body"})
self._send_json_response(400, {"error": "Empty body"})
return
try:
body = self.rfile.read(content_length).decode()
data = json.loads(body)
except json.JSONDecodeError:
self._send_json_response(400, {"error": "Invalid JSON"})
return
text = data.get("text", "Hello, this is a test.")
voice = data.get("voice", "af_heart")
try:
# Run the async Wyoming client
audio_bytes = asyncio.run(self._synthesize_audio(text, voice))
# Send WAV response
self.send_response(200)
self.send_header("Content-Type", "audio/wav")
# Allow CORS for local testing from Vite
self.send_header("Access-Control-Allow-Origin", "*")
self.end_headers()
self.wfile.write(audio_bytes)
except Exception as e:
self._send_json_response(500, {"error": str(e)})
def do_OPTIONS(self):
"""Handle CORS preflight requests."""
self.send_response(204)
self.send_header("Access-Control-Allow-Origin", "*")
self.send_header("Access-Control-Allow-Methods", "POST, GET, OPTIONS")
self.send_header("Access-Control-Allow-Headers", "Content-Type")
self.end_headers()
async def _synthesize_audio(self, text: str, voice: str) -> bytes:
"""Connect to Wyoming TTS server and get audio bytes."""
client = AsyncTcpClient("127.0.0.1", 10301)
await client.connect()
# Read the initial Info event
await client.read_event()
# Send Synthesize event
await client.write_event(Synthesize(text=text, voice=voice).event())
audio_data = bytearray()
rate = 24000
width = 2
channels = 1
while True:
event = await client.read_event()
if event is None:
break
if AudioStart.is_type(event.type):
start = AudioStart.from_event(event)
rate = start.rate
width = start.width
channels = start.channels
elif AudioChunk.is_type(event.type):
chunk = AudioChunk.from_event(event)
audio_data.extend(chunk.audio)
elif AudioStop.is_type(event.type):
break
await client.disconnect()
# Package raw PCM into WAV
wav_io = io.BytesIO()
with wave.open(wav_io, 'wb') as wav_file:
wav_file.setnchannels(channels)
wav_file.setsampwidth(width)
wav_file.setframerate(rate)
wav_file.writeframes(audio_data)
return wav_io.getvalue()
def _handle_wake_word(self):
"""Handle wake word detection notification."""
content_length = int(self.headers.get("Content-Length", 0))
wake_word_data = {}
if content_length > 0:
try:
body = self.rfile.read(content_length).decode()
wake_word_data = json.loads(body)
except (json.JSONDecodeError, ConnectionResetError, OSError):
# Client may close connection early, that's ok
pass
print(f"[OpenClaw Bridge] Wake word detected: {wake_word_data.get('wake_word', 'unknown')}")
self._send_json_response(200, {"status": "ok", "message": "Wake word received"})
def _handle_agent_request(self):
"""Handle agent message request."""
content_length = int(self.headers.get("Content-Length", 0))
if content_length == 0:
self._send_json_response(400, {"error": "Empty body"})
return
try:
body = self.rfile.read(content_length).decode()
data = json.loads(body)
@@ -66,21 +199,25 @@ class OpenClawBridgeHandler(BaseHTTPRequestHandler):
self._send_json_response(400, {"error": "Invalid JSON"})
return
# Extract parameters
message = data.get("message", "").strip()
message = data.get("message")
agent = data.get("agent", "main")
if not message:
self._send_json_response(400, {"error": "Message is required"})
return
# Inject system prompt
system_prompt = load_character_prompt()
if system_prompt:
message = f"System Context: {system_prompt}\n\nUser Request: {message}"
# Call OpenClaw CLI (use full path for launchd compatibility)
try:
result = subprocess.run(
["/opt/homebrew/bin/openclaw", "agent", "--message", message, "--agent", agent],
capture_output=True,
text=True,
timeout=30,
timeout=120,
check=True
)
response_text = result.stdout.strip()
@@ -125,6 +262,7 @@ def main():
)
args = parser.parse_args()
HTTPServer.allow_reuse_address = True
server = HTTPServer((args.host, args.port), OpenClawBridgeHandler)
print(f"OpenClaw HTTP Bridge running on http://{args.host}:{args.port}")
print(f"Endpoint: POST http://{args.host}:{args.port}/api/agent/message")