Files
homeai/homeai-voice/tts/wyoming_kokoro_server.py
Aodhan Collins 664bb6d275 feat: OpenClaw HTTP bridge, HA conversation agent fixes, voice pipeline tooling
- Add openclaw-http-bridge.py: HTTP server translating POST requests to OpenClaw CLI calls
- Add launchd plist for HTTP bridge (port 8081, auto-start)
- Add install-to-docker-ha.sh: deploy custom component to Docker HA via SSH
- Add package-for-ha.sh: create distributable tarball of custom component
- Add test-services.sh: comprehensive voice pipeline service checker

Fixes from code review:
- Use OpenClawAgent (HTTP) in async_setup_entry instead of OpenClawCLIAgent
  (CLI agent fails inside Docker HA where openclaw binary doesn't exist)
- Update all port references from 8080 to 8081 (HTTP bridge port)
- Remove overly permissive CORS headers from HTTP bridge
- Fix zombie process leak: kill child process on CLI timeout
- Remove unused subprocess import in conversation.py
- Add version field to Kokoro TTS Wyoming info
- Update TODO.md with voice pipeline progress
2026-03-08 22:46:04 +00:00

148 lines
4.9 KiB
Python

#!/usr/bin/env python3
"""Wyoming TTS server backed by Kokoro ONNX.
Usage:
python wyoming_kokoro_server.py --uri tcp://0.0.0.0:10301 --voice af_heart
"""
import argparse
import asyncio
import logging
import os

import numpy as np
from wyoming.audio import AudioChunk, AudioStart, AudioStop
from wyoming.event import Event
from wyoming.info import Attribution, Info, TtsProgram, TtsVoice, TtsVoiceSpeaker
from wyoming.info import Describe
from wyoming.server import AsyncEventHandler, AsyncServer
from wyoming.tts import Synthesize
# Module-level logger, named after this module.
_LOGGER = logging.getLogger(__name__)

# PCM parameters used for the Wyoming audio events this server emits.
SAMPLE_RATE = 24000  # samples per second
SAMPLE_WIDTH = 2  # bytes per sample (int16)
CHANNELS = 1  # mono
CHUNK_SECONDS = 1  # stream in 1-second chunks
def _load_kokoro(model_dir=None):
    """Load the Kokoro ONNX model together with its voice bank.

    Args:
        model_dir: Directory holding ``kokoro-v1.0.onnx`` and
            ``voices-v1.0.bin``. A leading ``~`` is expanded. When omitted
            (or empty) ``~/models/kokoro`` is used.

    Returns:
        The ``kokoro_onnx.Kokoro`` instance built from the two files.

    Raises:
        FileNotFoundError: If either of the two files does not exist. The
            message lists every missing path.
    """
    model_dir = os.path.expanduser(model_dir or "~/models/kokoro")
    model_path = os.path.join(model_dir, "kokoro-v1.0.onnx")
    voices_path = os.path.join(model_dir, "voices-v1.0.bin")

    # Check up front so a missing download is reported with the exact path
    # instead of whatever error the backend raises while opening the file.
    missing = [path for path in (model_path, voices_path) if not os.path.isfile(path)]
    if missing:
        raise FileNotFoundError(
            "Kokoro model file(s) not found: " + ", ".join(missing)
        )

    # Kept function-local so importing this module does not require
    # kokoro_onnx to be installed.
    from kokoro_onnx import Kokoro

    return Kokoro(model_path, voices_path)
class KokoroEventHandler(AsyncEventHandler):
    """Per-connection Wyoming handler that synthesizes speech with Kokoro.

    ``Describe`` events are answered with the service ``Info``; ``Synthesize``
    events are answered with an ``AudioStart``, one ``AudioChunk`` per
    ``CHUNK_SECONDS`` of audio, and a final ``AudioStop``. ``Info`` is also
    pushed once, unsolicited, right after the handler is created.
    """

    def __init__(self, tts, default_voice: str, speed: float, *args, **kwargs):
        """Store the engine and defaults, then schedule the initial info push.

        Args:
            tts: Engine object whose ``create(text, voice=..., speed=...)``
                returns ``(samples, sample_rate)``.
            default_voice: Voice name used when a request does not name one.
            speed: Speed value passed to every ``create`` call.
            *args: Forwarded to ``AsyncEventHandler``.
            **kwargs: Forwarded to ``AsyncEventHandler``.
        """
        super().__init__(*args, **kwargs)
        self._tts = tts
        self._default_voice = default_voice
        self._speed = speed
        # Send info immediately on connect. The task is stored on the
        # instance because the event loop keeps only a weak reference to
        # tasks; an unreferenced task may be collected before it finishes.
        self._info_task = asyncio.ensure_future(self._send_info_on_connect())

    async def _send_info_on_connect(self):
        """Push ``Info`` once and log (rather than lose) any failure."""
        try:
            await self._send_info()
        except OSError:
            # Covers connection resets: the peer closed before we could write.
            _LOGGER.debug("Client went away before info could be sent", exc_info=True)
        except Exception:
            _LOGGER.exception("Failed to send info on connect")

    async def _send_info(self):
        """Write an ``Info`` event describing this TTS program.

        Only the configured default voice is advertised, although
        ``Synthesize`` requests may name any voice.
        """
        voice = TtsVoice(
            name=self._default_voice,
            description="Kokoro voice",
            attribution=Attribution(name="kokoro", url=""),
            installed=True,
            languages=["en-us"],
            version="1.0",
            speakers=[TtsVoiceSpeaker(name=self._default_voice)],
        )
        program = TtsProgram(
            name="kokoro",
            description="Kokoro ONNX TTS",
            attribution=Attribution(
                name="thewh1teagle/kokoro-onnx",
                url="https://github.com/thewh1teagle/kokoro-onnx",
            ),
            installed=True,
            version="1.0.0",
            voices=[voice],
        )
        await self.write_event(Info(tts=[program]).event())

    async def handle_event(self, event: Event) -> bool:
        """Dispatch one incoming event.

        Args:
            event: Event read from the client.

        Returns:
            Always ``True`` so the connection stays open; event types other
            than ``Describe`` and ``Synthesize`` are ignored.
        """
        if Describe.is_type(event.type):
            await self._send_info()
        elif Synthesize.is_type(event.type):
            await self._handle_synthesize(Synthesize.from_event(event))
        return True  # keep connection open

    async def _handle_synthesize(self, synthesize):
        """Synthesize the requested text and stream it back as PCM chunks.

        Failures are logged and followed by a best-effort ``AudioStop`` so a
        waiting client is released; they never propagate to the caller.
        """
        text = synthesize.text
        voice = self._default_voice
        if synthesize.voice and synthesize.voice.name:
            voice = synthesize.voice.name
        _LOGGER.debug("Synthesizing %r with voice=%s speed=%.1f", text, voice, self._speed)

        try:
            # create() is a blocking call; run it in the default executor so
            # the event loop keeps serving other connections meanwhile.
            loop = asyncio.get_running_loop()
            samples, sample_rate = await loop.run_in_executor(
                None, lambda: self._tts.create(text, voice=voice, speed=self._speed)
            )
            # Advertise the rate the engine actually produced; fall back to
            # the module constant only if it reported none.
            rate = int(sample_rate or SAMPLE_RATE)
            audio_bytes = self._to_pcm16(samples)

            await self.write_event(
                AudioStart(rate=rate, width=SAMPLE_WIDTH, channels=CHANNELS).event()
            )
            chunk_size = rate * SAMPLE_WIDTH * CHANNELS * CHUNK_SECONDS
            for offset in range(0, len(audio_bytes), chunk_size):
                await self.write_event(
                    AudioChunk(
                        rate=rate,
                        width=SAMPLE_WIDTH,
                        channels=CHANNELS,
                        audio=audio_bytes[offset : offset + chunk_size],
                    ).event()
                )
            await self.write_event(AudioStop().event())
            _LOGGER.info("Synthesized %.1fs of audio", len(samples) / rate)
        except Exception:
            _LOGGER.exception("Synthesis error")
            try:
                await self.write_event(AudioStop().event())
            except OSError:
                # The failure may have been the connection itself; a second
                # write error must not escape and mask the logged one.
                _LOGGER.debug("Could not send AudioStop after error", exc_info=True)

    @staticmethod
    def _to_pcm16(samples) -> bytes:
        """Convert samples to 16-bit PCM bytes in native byte order.

        Values are clipped to [-1.0, 1.0] before scaling by 32767.
        Presumably ``samples`` is a float array nominally in that range;
        confirm against the engine's ``create`` documentation.
        """
        clipped = np.clip(samples, -1.0, 1.0)
        return (clipped * 32767).astype(np.int16).tobytes()
async def main():
    """Parse the command line, load Kokoro, and serve Wyoming TTS requests.

    Options: ``--uri`` (listen address), ``--voice`` (default voice),
    ``--speed`` (float passed to the handler), ``--debug`` (DEBUG logging).
    Returns when the server's ``run`` coroutine returns.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--uri", default="tcp://0.0.0.0:10301")
    cli.add_argument("--voice", default="af_heart")
    cli.add_argument("--speed", type=float, default=1.0)
    cli.add_argument("--debug", action="store_true")
    opts = cli.parse_args()

    log_level = logging.DEBUG if opts.debug else logging.INFO
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
    )

    # The model is loaded once here and shared by every connection handler.
    _LOGGER.info("Loading Kokoro ONNX model...")
    engine = _load_kokoro()
    _LOGGER.info("Kokoro loaded. Starting Wyoming TTS on %s (voice=%s)", opts.uri, opts.voice)

    def make_handler(reader, writer):
        """Build the handler for one new client connection."""
        return KokoroEventHandler(engine, opts.voice, opts.speed, reader, writer)

    await AsyncServer.from_uri(opts.uri).run(make_handler)
# Start the server only when this file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())