Files
homeai/homeai-voice/tts/wyoming_kokoro_server.py
Aodhan Collins 6a0bae2a0b feat(phase-04): Wyoming Satellite integration + OpenClaw HA components
## Voice Pipeline (P3)
- Replace openWakeWord daemon with Wyoming Satellite approach
- Add Wyoming Satellite service on port 10700 for HA voice pipeline
- Update setup.sh with cross-platform sed compatibility (macOS/Linux)
- Add version field to Kokoro TTS voice info
- Update launchd service loader to use Wyoming Satellite

## Home Assistant Integration (P4)
- Add custom conversation agent component (openclaw_conversation)
  - Fix: Use IntentResponse instead of plain strings (HA API requirement)
  - Support both HTTP API and CLI fallback modes
  - Config flow for easy HA UI setup
- Add OpenClaw bridge scripts (Python + Bash)
- Add ha-ctl utility for HA entity control
  - Fix: Use context manager for token file reading
- Add HA configuration examples and documentation

## Infrastructure
- Add mem0 backup automation (launchd + script)
- Add n8n workflow templates (morning briefing, notification router)
- Add VS Code workspace configuration
- Reorganize model files into categorized folders:
  - lmstudio-community/
  - mlx-community/
  - bartowski/
  - mradermacher/

## Documentation
- Update PROJECT_PLAN.md with Wyoming Satellite architecture
- Update TODO.md with completed Wyoming integration tasks
- Add OPENCLAW_INTEGRATION.md for HA setup guide

## Testing
- Verified Wyoming services running (STT:10300, TTS:10301, Satellite:10700)
- Verified OpenClaw CLI accessibility
- Confirmed cross-platform compatibility fixes
2026-03-08 02:06:37 +00:00

147 lines
4.9 KiB
Python

#!/usr/bin/env python3
"""Wyoming TTS server backed by Kokoro ONNX.
Usage:
python wyoming_kokoro_server.py --uri tcp://0.0.0.0:10301 --voice af_heart
"""
import argparse
import asyncio
import logging
import os
import numpy as np
from wyoming.audio import AudioChunk, AudioStart, AudioStop
from wyoming.event import Event
from wyoming.info import Attribution, Info, TtsProgram, TtsVoice, TtsVoiceSpeaker
from wyoming.server import AsyncEventHandler, AsyncServer
from wyoming.tts import Synthesize
# Module-level logger, named after this module.
_LOGGER = logging.getLogger(__name__)

# Audio stream parameters used by the event handler below.
SAMPLE_RATE = 24000  # samples per second
SAMPLE_WIDTH = 2  # bytes per sample (int16)
CHANNELS = 1  # single channel (mono)
CHUNK_SECONDS = 1  # stream in 1-second chunks
def _load_kokoro(model_dir=None):
    """Create the Kokoro ONNX engine from model files on disk.

    Args:
        model_dir: Directory containing ``kokoro-v1.0.onnx`` and
            ``voices-v1.0.bin``. A leading ``~`` is expanded. When omitted,
            ``~/models/kokoro`` is used.

    Returns:
        A ``kokoro_onnx.Kokoro`` instance built from the two files.

    Raises:
        FileNotFoundError: If either model file is missing. The message lists
            every missing path.
    """
    if model_dir is None:
        model_dir = "~/models/kokoro"
    model_dir = os.path.expanduser(model_dir)

    model_path = os.path.join(model_dir, "kokoro-v1.0.onnx")
    voices_path = os.path.join(model_dir, "voices-v1.0.bin")

    # Fail fast with a readable message instead of whatever error the ONNX
    # runtime would raise for a missing file.
    missing = [path for path in (model_path, voices_path) if not os.path.isfile(path)]
    if missing:
        raise FileNotFoundError(
            "Kokoro model file(s) not found: " + ", ".join(missing)
        )

    # Imported inside the function so that merely importing this module does
    # not require kokoro_onnx to be importable.
    from kokoro_onnx import Kokoro

    return Kokoro(model_path, voices_path)
class KokoroEventHandler(AsyncEventHandler):
    """Wyoming event handler that answers ``Synthesize`` requests with Kokoro.

    On construction the handler schedules an ``Info`` event describing the
    TTS program and its default voice. Each ``Synthesize`` event is rendered
    by the shared engine in an executor and streamed back as
    ``AudioStart`` / ``AudioChunk``... / ``AudioStop``.

    NOTE(review): no other event type is handled here; in particular an
    explicit describe request gets no dedicated reply beyond the info that is
    pushed at connect time.
    """

    def __init__(self, tts, default_voice: str, speed: float, *args, **kwargs):
        """Store the engine and defaults, then schedule the info announcement.

        Args:
            tts: Engine exposing ``create(text, voice=..., speed=...)`` that
                returns ``(samples, sample_rate)``.
            default_voice: Voice name used when a request does not name one.
            speed: Speed value forwarded to every ``create`` call.
            *args: Forwarded to ``AsyncEventHandler.__init__``.
            **kwargs: Forwarded to ``AsyncEventHandler.__init__``.
        """
        super().__init__(*args, **kwargs)
        self._tts = tts
        self._default_voice = default_voice
        self._speed = speed
        # Send info immediately on connect. The task is stored on the instance
        # because the event loop keeps only a weak reference to it; without a
        # strong reference it may be garbage-collected before it finishes.
        self._info_task = asyncio.ensure_future(self._send_info())
        self._info_task.add_done_callback(self._log_info_failure)

    @staticmethod
    def _log_info_failure(task) -> None:
        """Log a failed info announcement instead of leaving it unretrieved."""
        if task.cancelled():
            return
        error = task.exception()
        if error is not None:
            _LOGGER.warning("Failed to send info event: %r", error)

    async def _send_info(self):
        """Write an ``Info`` event advertising Kokoro and the default voice."""
        info = Info(
            tts=[
                TtsProgram(
                    name="kokoro",
                    description="Kokoro ONNX TTS",
                    attribution=Attribution(
                        name="thewh1teagle/kokoro-onnx",
                        url="https://github.com/thewh1teagle/kokoro-onnx",
                    ),
                    installed=True,
                    voices=[
                        TtsVoice(
                            name=self._default_voice,
                            description="Kokoro voice",
                            attribution=Attribution(name="kokoro", url=""),
                            installed=True,
                            languages=["en-us"],
                            version="1.0",
                            speakers=[TtsVoiceSpeaker(name=self._default_voice)],
                        )
                    ],
                )
            ]
        )
        await self.write_event(info.event())

    async def handle_event(self, event: Event) -> bool:
        """Handle one incoming event.

        Args:
            event: The event read from the client.

        Returns:
            Always ``True`` so the connection stays open, including for
            event types this handler ignores and for failed syntheses.
        """
        if not Synthesize.is_type(event.type):
            return True  # not a synthesis request; keep connection open

        synthesize = Synthesize.from_event(event)
        text = synthesize.text
        voice = self._default_voice
        if synthesize.voice and synthesize.voice.name:
            voice = synthesize.voice.name
        _LOGGER.debug("Synthesizing %r with voice=%s speed=%.1f", text, voice, self._speed)

        # Tracks whether AudioStop already went out, so the error path never
        # sends a second one.
        stop_sent = False
        try:
            # Synthesis is blocking; run it in the default executor so the
            # event loop keeps serving other connections.
            loop = asyncio.get_running_loop()
            samples, sample_rate = await loop.run_in_executor(
                None, lambda: self._tts.create(text, voice=voice, speed=self._speed)
            )
            # Advertise the rate the engine actually produced; the module
            # constant is only a fallback when none is reported.
            rate = int(sample_rate) if sample_rate else SAMPLE_RATE

            # Assumes samples are floats nominally in [-1.0, 1.0] -- TODO
            # confirm; they are clipped to that range and scaled to int16.
            samples_int16 = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
            audio_bytes = samples_int16.tobytes()

            await self.write_event(
                AudioStart(rate=rate, width=SAMPLE_WIDTH, channels=CHANNELS).event()
            )
            chunk_size = rate * SAMPLE_WIDTH * CHANNELS * CHUNK_SECONDS
            for offset in range(0, len(audio_bytes), chunk_size):
                await self.write_event(
                    AudioChunk(
                        rate=rate,
                        width=SAMPLE_WIDTH,
                        channels=CHANNELS,
                        audio=audio_bytes[offset : offset + chunk_size],
                    ).event()
                )
            await self.write_event(AudioStop().event())
            stop_sent = True
            _LOGGER.info("Synthesized %.1fs of audio", len(samples) / rate)
        except Exception:
            _LOGGER.exception("Synthesis error")
            # Best effort: still terminate the audio stream for the client.
            if not stop_sent:
                await self.write_event(AudioStop().event())
        return True  # keep connection open
async def main():
    """Parse command-line options, load Kokoro, and run the Wyoming server.

    Options: ``--uri`` (listen address), ``--voice`` (default voice name),
    ``--speed`` (float forwarded to the handler) and ``--debug`` (switches
    logging from INFO to DEBUG).
    """
    cli = argparse.ArgumentParser()
    option_table = (
        ("--uri", {"default": "tcp://0.0.0.0:10301"}),
        ("--voice", {"default": "af_heart"}),
        ("--speed", {"type": float, "default": 1.0}),
        ("--debug", {"action": "store_true"}),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    opts = cli.parse_args()

    if opts.debug:
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
    )

    _LOGGER.info("Loading Kokoro ONNX model...")
    engine = _load_kokoro()
    _LOGGER.info("Kokoro loaded. Starting Wyoming TTS on %s (voice=%s)", opts.uri, opts.voice)

    # The server calls the factory with a (reader, writer) pair; every
    # handler it builds shares the single engine loaded above.
    await AsyncServer.from_uri(opts.uri).run(
        lambda reader, writer: KokoroEventHandler(
            engine, opts.voice, opts.speed, reader, writer
        )
    )


# Script entry point: run the server until the event loop exits.
if __name__ == "__main__":
    asyncio.run(main())