Complete P2 (LLM) and P3 (voice pipeline) implementation
P2 — homeai-llm:
- Fix ollama launchd plist path for Apple Silicon (/opt/homebrew/bin/ollama)
- Add Modelfiles for local GGUF models: llama3.3:70b, qwen3:32b, codestral:22b
  (registered via `ollama create` — no re-download needed)

P3 — homeai-voice:
- Wyoming STT: wyoming-faster-whisper, large-v3 model, port 10300
- Wyoming TTS: custom Kokoro ONNX server (wyoming_kokoro_server.py), port 10301;
  voice af_heart; models at ~/models/kokoro/
- Wake word: openWakeWord daemon (hey_jarvis), notifies OpenClaw at /wake
- launchd plists for all three services + load-all-launchd.sh helper
- Smoke test: wyoming/test-pipeline.sh — 3/3 passing

HA Wyoming integration pending manual UI config (STT 10.0.0.200:10300, TTS 10.0.0.200:10301).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
145
homeai-voice/tts/wyoming_kokoro_server.py
Normal file
145
homeai-voice/tts/wyoming_kokoro_server.py
Normal file
@@ -0,0 +1,145 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Wyoming TTS server backed by Kokoro ONNX.
|
||||
|
||||
Usage:
|
||||
python wyoming_kokoro_server.py --uri tcp://0.0.0.0:10301 --voice af_heart
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
|
||||
from wyoming.audio import AudioChunk, AudioStart, AudioStop
|
||||
from wyoming.event import Event
|
||||
from wyoming.info import Attribution, Info, TtsProgram, TtsVoice, TtsVoiceSpeaker
|
||||
from wyoming.server import AsyncEventHandler, AsyncServer
|
||||
from wyoming.tts import Synthesize
|
||||
|
||||
# Module-wide logger, named after this module.
_LOGGER = logging.getLogger(__name__)

# PCM format constants used when building the Wyoming audio events.
SAMPLE_RATE = 24000  # samples per second
SAMPLE_WIDTH = 2  # bytes per sample (int16)
CHANNELS = 1

# Length, in seconds, of each audio chunk written to the client.
CHUNK_SECONDS = 1
|
||||
|
||||
|
||||
def _load_kokoro(model_dir=None):
    """Create the Kokoro ONNX engine from the model files on disk.

    Args:
        model_dir: Directory containing ``kokoro-v1.0.onnx`` and
            ``voices-v1.0.bin``. ``~`` is expanded. Defaults to
            ``~/models/kokoro`` when omitted.

    Returns:
        A ``kokoro_onnx.Kokoro`` instance built from the two files.

    Raises:
        FileNotFoundError: If either model file is missing, listing the
            missing path(s).
    """
    if model_dir is None:
        model_dir = "~/models/kokoro"
    model_dir = os.path.expanduser(model_dir)

    model_path = os.path.join(model_dir, "kokoro-v1.0.onnx")
    voices_path = os.path.join(model_dir, "voices-v1.0.bin")

    # Fail early with the exact paths instead of an opaque loader error.
    missing = [path for path in (model_path, voices_path) if not os.path.isfile(path)]
    if missing:
        raise FileNotFoundError("Kokoro model file(s) not found: " + ", ".join(missing))

    # Function-scope import: only needed once the files are known to exist.
    from kokoro_onnx import Kokoro

    return Kokoro(model_path, voices_path)
|
||||
|
||||
|
||||
class KokoroEventHandler(AsyncEventHandler):
    """Wyoming event handler that answers Synthesize events with Kokoro audio.

    On construction it schedules an Info event describing the TTS program and
    its default voice. Each Synthesize event is rendered with the shared
    Kokoro engine and returned as AudioStart / AudioChunk... / AudioStop.
    """

    def __init__(self, tts, default_voice: str, speed: float, *args, **kwargs):
        """Store the engine and settings, then schedule the Info event.

        Args:
            tts: Engine object exposing ``create(text, voice=..., speed=...)``
                that returns ``(samples, sample_rate)``.
            default_voice: Voice name used when a request does not name one.
            speed: Speed value passed through to ``tts.create``.
            *args: Forwarded to ``AsyncEventHandler`` (reader, writer).
            **kwargs: Forwarded to ``AsyncEventHandler``.
        """
        super().__init__(*args, **kwargs)
        self._tts = tts
        self._default_voice = default_voice
        self._speed = speed

        # Send info immediately on connect. The task is stored on the instance
        # because asyncio keeps only weak references to tasks; an unreferenced
        # task may be garbage-collected before it runs.
        # NOTE(review): Wyoming clients normally request Info with a Describe
        # event; this handler pushes it unsolicited instead — confirm intended.
        self._send_info_task = asyncio.ensure_future(self._send_info())

    def _build_info(self) -> Info:
        """Return the Info payload advertising Kokoro and the default voice."""
        voice = TtsVoice(
            name=self._default_voice,
            description="Kokoro voice",
            attribution=Attribution(name="kokoro", url=""),
            installed=True,
            languages=["en-us"],
            speakers=[TtsVoiceSpeaker(name=self._default_voice)],
        )
        program = TtsProgram(
            name="kokoro",
            description="Kokoro ONNX TTS",
            attribution=Attribution(
                name="thewh1teagle/kokoro-onnx",
                url="https://github.com/thewh1teagle/kokoro-onnx",
            ),
            installed=True,
            voices=[voice],
        )
        return Info(tts=[program])

    async def _send_info(self):
        """Write the Info event to the client."""
        await self.write_event(self._build_info().event())

    async def _synthesize_pcm(self, text: str, voice: str):
        """Run Kokoro in the default executor and convert the result to PCM.

        Returns:
            ``(audio_bytes, rate, seconds)``: int16 PCM bytes, the sample rate
            reported by the engine, and the clip duration in seconds.
        """
        loop = asyncio.get_running_loop()
        # tts.create is a blocking call; run it off the event loop.
        samples, sample_rate = await loop.run_in_executor(
            None, lambda: self._tts.create(text, voice=voice, speed=self._speed)
        )
        # Clip to [-1, 1] before scaling so out-of-range samples do not wrap
        # around when cast to int16.
        pcm = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
        return pcm.tobytes(), int(sample_rate), len(samples) / sample_rate

    async def _stream_audio(self, audio_bytes: bytes, rate: int) -> None:
        """Write AudioStart, the PCM data in CHUNK_SECONDS slices, and AudioStop."""
        await self.write_event(
            AudioStart(rate=rate, width=SAMPLE_WIDTH, channels=CHANNELS).event()
        )

        chunk_size = rate * SAMPLE_WIDTH * CHANNELS * CHUNK_SECONDS
        for offset in range(0, len(audio_bytes), chunk_size):
            await self.write_event(
                AudioChunk(
                    rate=rate,
                    width=SAMPLE_WIDTH,
                    channels=CHANNELS,
                    audio=audio_bytes[offset : offset + chunk_size],
                ).event()
            )

        await self.write_event(AudioStop().event())

    async def handle_event(self, event: Event) -> bool:
        """Handle one incoming event; only Synthesize events produce output.

        Returns:
            Always ``True``, which keeps the connection open.
        """
        if not Synthesize.is_type(event.type):
            return True  # keep connection open

        synthesize = Synthesize.from_event(event)
        text = synthesize.text
        voice = self._default_voice
        if synthesize.voice and synthesize.voice.name:
            voice = synthesize.voice.name

        _LOGGER.debug("Synthesizing %r with voice=%s speed=%.1f", text, voice, self._speed)

        # Only the synthesis step is guarded: a failed socket write should
        # propagate rather than be logged as a synthesis error and retried on
        # the same broken connection.
        try:
            audio_bytes, rate, seconds = await self._synthesize_pcm(text, voice)
        except Exception:
            _LOGGER.exception("Synthesis error")
            # Still send AudioStop so the client is not left waiting for audio.
            await self.write_event(AudioStop().event())
            return True  # keep connection open

        # Use the rate reported by the engine rather than assuming SAMPLE_RATE.
        await self._stream_audio(audio_bytes, rate)
        _LOGGER.info("Synthesized %.1fs of audio", seconds)

        return True  # keep connection open
|
||||
|
||||
|
||||
async def main():
    """Parse command-line options, load Kokoro, and run the Wyoming TTS server.

    Options: ``--uri`` (listen URI), ``--voice`` (default voice name),
    ``--speed`` (float passed to the handler) and ``--debug`` (DEBUG logging).
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--uri", default="tcp://0.0.0.0:10301")
    cli.add_argument("--voice", default="af_heart")
    cli.add_argument("--speed", type=float, default=1.0)
    cli.add_argument("--debug", action="store_true")
    opts = cli.parse_args()

    log_level = logging.INFO
    if opts.debug:
        log_level = logging.DEBUG
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
    )

    # Load the model once; every handler created below shares this engine.
    _LOGGER.info("Loading Kokoro ONNX model...")
    engine = _load_kokoro()
    _LOGGER.info("Kokoro loaded. Starting Wyoming TTS on %s (voice=%s)", opts.uri, opts.voice)

    # The factory is called with (reader, writer) and builds one handler per call.
    await AsyncServer.from_uri(opts.uri).run(
        lambda reader, writer: KokoroEventHandler(
            engine, opts.voice, opts.speed, reader, writer
        )
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run the async main() until it returns.
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        # Ctrl-C in an interactive run is a normal shutdown, not a crash;
        # exit quietly instead of printing a traceback.
        pass
|
||||
Reference in New Issue
Block a user