feat: Raspberry Pi 5 kitchen satellite — Wyoming voice satellite with ReSpeaker pHAT

Add full Pi 5 satellite setup with ReSpeaker 2-Mics pHAT for kitchen
voice control via Wyoming protocol. Includes satellite_wrapper.py that
monkey-patches WakeStreamingSatellite to fix four compounding issues:

- TTS echo suppression: mutes wake word detection while speaker plays
- Server writer race fix: checks _writer before streaming, re-arms on None
- Streaming timeout: auto-recovers after 30s if pipeline hangs
- Error recovery: resets streaming state on server Error events

Also includes Pi 5 hardware workarounds (wm8960 overlay, stereo-only
audio wrappers, ALSA mixer calibration) and deploy.sh with fast
iteration commands (--push-wrapper, --test-logs).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Aodhan Collins
2026-03-14 20:09:47 +00:00
parent 5f147cae61
commit 1e52c002c2
7 changed files with 1024 additions and 79 deletions

View File

@@ -64,6 +64,9 @@ ESP32-S3-BOX-3
| Display Reset | GPIO48 | inverted |
| Backlight | GPIO47 | LEDC PWM |
| Left top button | GPIO0 | strapping pin — mute toggle / factory reset |
| Sensor dock I2C SCL | GPIO40 | sensor bus (AHT-30, AT581x radar) |
| Sensor dock I2C SDA | GPIO41 | sensor bus (AHT-30, AT581x radar) |
| Radar presence output | GPIO21 | AT581x digital detection pin |
---
@@ -102,7 +105,18 @@ On-device `micro_wake_word` component with `hey_jarvis` model. Can optionally be
### Display
`ili9xxx` platform with model `S3BOX`. Uses `update_interval: never` — display updates are triggered by scripts on voice assistant state changes. Static 320×240 PNG images for each state are compiled into firmware.
`ili9xxx` platform with model `S3BOX`. Uses `update_interval: never` — display updates are triggered by scripts on voice assistant state changes. Static 320×240 PNG images for each state are compiled into firmware. No text overlays — voice-only interaction.
Screen auto-dims after a configurable idle timeout (default 1 min, adjustable 1–60 min via HA entity). Wakes on voice activity or radar presence detection.
### Sensor Dock (ESP32-S3-BOX-3-SENSOR)
Optional accessory dock connected via secondary I2C bus (GPIO40/41, 100kHz):
- **AHT-30** (temp/humidity) — `aht10` component with variant AHT20, 30s update interval
- **AT581x mmWave radar** — presence detection via GPIO21, I2C for settings config
- **Radar RF switch** — toggle radar on/off from HA
- Radar configured on boot: sensing_distance=600, trigger_keep=5s, hw_frontend_reset=true
### Voice Assistant
@@ -202,7 +216,7 @@ cd ~/gitea/homeai/homeai-esp32
## Known Constraints
- **Memory**: voice_assistant + micro_wake_word + display is near the limit. Do NOT add Bluetooth or LVGL widgets — they will cause crashes.
- **Memory**: voice_assistant + micro_wake_word + display + sensor dock is near the limit. Do NOT add Bluetooth or LVGL widgets — they will cause crashes.
- **WiFi**: 2.4GHz only. 5GHz networks are not supported.
- **Speaker**: 1W built-in. Volume capped at 85% to avoid distortion.
- **Display**: Static PNGs compiled into firmware. To change images, reflash via OTA (~1-2 min).

View File

@@ -116,7 +116,28 @@ check_images() {
Place 320x240 PNGs in ${ILLUSTRATIONS_DIR}/ or use --regen-images to generate placeholders."
fi
log_ok "All ${#REQUIRED_IMAGES[@]} illustrations present in illustrations/"
# Resize any images that aren't 320x240
local resized=0
for name in "${REQUIRED_IMAGES[@]}"; do
local img_path="${ILLUSTRATIONS_DIR}/${name}.png"
local dims
dims=$("${PYTHON}" -c "from PIL import Image; im=Image.open('${img_path}'); print(f'{im.width}x{im.height}')")
if [[ "$dims" != "320x240" ]]; then
log_warn "${name}.png is ${dims}, resizing to 320x240..."
"${PYTHON}" -c "
from PIL import Image
im = Image.open('${img_path}')
im = im.resize((320, 240), Image.LANCZOS)
im.save('${img_path}')
"
resized=$((resized + 1))
fi
done
if [[ $resized -gt 0 ]]; then
log_ok "Resized ${resized} image(s) to 320x240"
fi
log_ok "All ${#REQUIRED_IMAGES[@]} illustrations present and 320x240"
for name in "${REQUIRED_IMAGES[@]}"; do
local size
size=$(wc -c < "${ILLUSTRATIONS_DIR}/${name}.png" | tr -d ' ')
@@ -208,10 +229,8 @@ if $REGEN_IMAGES; then
generate_faces
fi
# Check existing images if deploying with --images-only (or always before deploy)
if $IMAGES_ONLY; then
check_images
fi
# Check existing images (verify present + resize if not 320x240)
check_images
# Validate only
if $VALIDATE_ONLY; then

View File

@@ -33,8 +33,8 @@ substitutions:
voice_assist_muted_phase_id: "12"
voice_assist_timer_finished_phase_id: "20"
font_glyphsets: "GF_Latin_Core"
font_family: Figtree
font_glyphsets: "GF_Latin_Core"
esphome:
name: ${name}
@@ -45,6 +45,11 @@ esphome:
priority: 600
then:
- script.execute: draw_display
- at581x.settings:
id: radar
hw_frontend_reset: true
sensing_distance: 600
trigger_keep: 5000ms
- delay: 30s
- if:
condition:
@@ -136,6 +141,14 @@ binary_sensor:
- ON for at least 10s
then:
- button.press: factory_reset_btn
- platform: gpio
pin: GPIO21
name: Presence
id: radar_presence
device_class: occupancy
on_press:
- script.execute: screen_wake
- script.execute: screen_idle_timer
# --- Display backlight ---
@@ -157,8 +170,13 @@ light:
# --- Audio hardware ---
i2c:
scl: GPIO18
sda: GPIO8
- id: audio_bus
scl: GPIO18
sda: GPIO8
- id: sensor_bus
scl: GPIO40
sda: GPIO41
frequency: 100kHz
i2s_audio:
- id: i2s_audio_bus
@@ -171,12 +189,14 @@ i2s_audio:
audio_adc:
- platform: es7210
id: es7210_adc
i2c_id: audio_bus
bits_per_sample: 16bit
sample_rate: 16000
audio_dac:
- platform: es8311
id: es8311_dac
i2c_id: audio_bus
bits_per_sample: 16bit
sample_rate: 48000
@@ -265,25 +285,11 @@ voice_assistant:
volume_multiplier: 2.0
on_listening:
- lambda: id(voice_assistant_phase) = ${voice_assist_listening_phase_id};
- text_sensor.template.publish:
id: text_request
state: "..."
- text_sensor.template.publish:
id: text_response
state: "..."
- script.execute: draw_display
on_stt_vad_end:
- lambda: id(voice_assistant_phase) = ${voice_assist_thinking_phase_id};
- script.execute: draw_display
on_stt_end:
- text_sensor.template.publish:
id: text_request
state: !lambda return x;
- script.execute: draw_display
on_tts_start:
- text_sensor.template.publish:
id: text_response
state: !lambda return x;
- lambda: id(voice_assistant_phase) = ${voice_assist_replying_phase_id};
- script.execute: draw_display
on_end:
@@ -305,12 +311,6 @@ voice_assistant:
- micro_wake_word.start:
- script.execute: set_idle_or_mute_phase
- script.execute: draw_display
- text_sensor.template.publish:
id: text_request
state: ""
- text_sensor.template.publish:
id: text_response
state: ""
on_error:
- if:
condition:
@@ -371,36 +371,43 @@ script:
- lambda: |
// Show the display page that matches the current voice-assistant phase and
// manage the backlight idle countdown.
//
// Active phases (listening, thinking, replying, error, timer finished) wake
// the backlight AND cancel any pending screen_idle_timer run. Without the
// cancel, a countdown that was started while idle keeps running and can
// switch the backlight off in the middle of an interaction.
// The muted and idle phases (re)start the countdown; because the timer script
// runs in restart mode, each execute() begins a fresh delay.
// The not-ready phase leaves both the backlight and the countdown untouched.
switch(id(voice_assistant_phase)) {
  case ${voice_assist_listening_phase_id}:
    id(screen_idle_timer).stop();
    id(screen_wake).execute();
    id(s3_box_lcd).show_page(listening_page);
    id(s3_box_lcd).update();
    break;
  case ${voice_assist_thinking_phase_id}:
    id(screen_idle_timer).stop();
    id(screen_wake).execute();
    id(s3_box_lcd).show_page(thinking_page);
    id(s3_box_lcd).update();
    break;
  case ${voice_assist_replying_phase_id}:
    id(screen_idle_timer).stop();
    id(screen_wake).execute();
    id(s3_box_lcd).show_page(replying_page);
    id(s3_box_lcd).update();
    break;
  case ${voice_assist_error_phase_id}:
    id(screen_idle_timer).stop();
    id(screen_wake).execute();
    id(s3_box_lcd).show_page(error_page);
    id(s3_box_lcd).update();
    break;
  case ${voice_assist_muted_phase_id}:
    id(s3_box_lcd).show_page(muted_page);
    id(s3_box_lcd).update();
    // Muted is a resting state: let the screen time out.
    id(screen_idle_timer).execute();
    break;
  case ${voice_assist_not_ready_phase_id}:
    id(s3_box_lcd).show_page(no_ha_page);
    id(s3_box_lcd).update();
    break;
  case ${voice_assist_timer_finished_phase_id}:
    id(screen_idle_timer).stop();
    id(screen_wake).execute();
    id(s3_box_lcd).show_page(timer_finished_page);
    id(s3_box_lcd).update();
    break;
  default:
    // Any other phase is treated as idle: show the idle page and start the
    // countdown that eventually turns the backlight off.
    id(s3_box_lcd).show_page(idle_page);
    id(s3_box_lcd).update();
    id(screen_idle_timer).execute();
    break;
}
else:
- display.page.show: no_ha_page
@@ -545,6 +552,23 @@ script:
else:
- lambda: id(voice_assistant_phase) = ${voice_assist_muted_phase_id};
- id: screen_idle_timer
mode: restart
then:
- delay: !lambda return id(screen_off_delay).state * 60000;
- light.turn_off: led
- id: screen_wake
mode: restart
then:
- if:
condition:
light.is_off: led
then:
- light.turn_on:
id: led
brightness: 100%
# --- Switches ---
switch:
@@ -556,6 +580,10 @@ switch:
restore_mode: RESTORE_DEFAULT_ON
entity_category: config
disabled_by_default: true
- platform: at581x
at581x_id: radar
name: Radar RF
entity_category: config
- platform: template
name: Mute
id: mute
@@ -646,6 +674,46 @@ select:
then:
- micro_wake_word.start
# --- Screen idle timeout (minutes) ---
number:
- platform: template
name: Screen off delay
id: screen_off_delay
icon: "mdi:timer-outline"
entity_category: config
unit_of_measurement: min
optimistic: true
restore_value: true
min_value: 1
max_value: 60
step: 1
initial_value: 1
# --- Sensor dock (ESP32-S3-BOX-3-SENSOR) ---
sensor:
- platform: aht10
variant: AHT20
i2c_id: sensor_bus
temperature:
name: Temperature
filters:
- sliding_window_moving_average:
window_size: 5
send_every: 5
humidity:
name: Humidity
filters:
- sliding_window_moving_average:
window_size: 5
send_every: 5
update_interval: 30s
at581x:
i2c_id: sensor_bus
id: radar
# --- Global variables ---
globals:
@@ -719,26 +787,9 @@ image:
type: RGB
transparency: alpha_channel
# --- Fonts ---
# --- Fonts (timer widget only) ---
font:
- file:
type: gfonts
family: ${font_family}
weight: 300
italic: true
id: font_request
size: 15
glyphsets:
- ${font_glyphsets}
- file:
type: gfonts
family: ${font_family}
weight: 300
id: font_response
size: 15
glyphsets:
- ${font_glyphsets}
- file:
type: gfonts
family: ${font_family}
@@ -748,28 +799,6 @@ font:
glyphsets:
- ${font_glyphsets}
# --- Text sensors (request/response display) ---
text_sensor:
- id: text_request
platform: template
on_value:
lambda: |-
if(id(text_request).state.length()>32) {
std::string name = id(text_request).state.c_str();
std::string truncated = esphome::str_truncate(name.c_str(),31);
id(text_request).state = (truncated+"...").c_str();
}
- id: text_response
platform: template
on_value:
lambda: |-
if(id(text_response).state.length()>32) {
std::string name = id(text_response).state.c_str();
std::string truncated = esphome::str_truncate(name.c_str(),31);
id(text_response).state = (truncated+"...").c_str();
}
# --- Colors ---
color:
@@ -825,20 +854,11 @@ display:
lambda: |-
it.fill(id(thinking_color));
it.image((it.get_width() / 2), (it.get_height() / 2), id(casita_thinking), ImageAlign::CENTER);
it.filled_rectangle(20, 20, 280, 30, Color::WHITE);
it.rectangle(20, 20, 280, 30, Color::BLACK);
it.printf(30, 25, id(font_request), Color::BLACK, "%s", id(text_request).state.c_str());
id(draw_timer_timeline).execute();
- id: replying_page
lambda: |-
it.fill(id(replying_color));
it.image((it.get_width() / 2), (it.get_height() / 2), id(casita_replying), ImageAlign::CENTER);
it.filled_rectangle(20, 20, 280, 30, Color::WHITE);
it.rectangle(20, 20, 280, 30, Color::BLACK);
it.filled_rectangle(20, 190, 280, 30, Color::WHITE);
it.rectangle(20, 190, 280, 30, Color::BLACK);
it.printf(30, 25, id(font_request), Color::BLACK, "%s", id(text_request).state.c_str());
it.printf(30, 195, id(font_response), Color::BLACK, "%s", id(text_response).state.c_str());
id(draw_timer_timeline).execute();
- id: timer_finished_page
lambda: |-