ESPHome  2025.2.2
voice_assistant.h
Go to the documentation of this file.
1 #pragma once
2 
3 #include "esphome/core/defines.h"
4 
5 #ifdef USE_VOICE_ASSISTANT
6 
9 #include "esphome/core/helpers.h"
11 
15 #ifdef USE_SPEAKER
17 #endif
18 #ifdef USE_MEDIA_PLAYER
20 #endif
22 
23 #ifdef USE_ESP_ADF
24 #include <esp_vad.h>
25 #endif
26 
27 #include <unordered_map>
28 #include <vector>
29 
30 namespace esphome {
31 namespace voice_assistant {
32 
// Legacy version numbers returned by VoiceAssistant::get_legacy_version():
// LEGACY_SPEAKER_SUPPORT when built with USE_SPEAKER and a speaker has been set,
// LEGACY_INITIAL_VERSION otherwise.
33 // Version 1: Initial version
34 // Version 2: Adds raw speaker support
35 static const uint32_t LEGACY_INITIAL_VERSION = 1;
36 static const uint32_t LEGACY_SPEAKER_SUPPORT = 2;
37 
// Feature flags stored in a uint32_t; every visible enumerator is a distinct single-bit mask.
// NOTE(review): this listing skips source lines 39 and 41, so the enumerators for
// bits 0 and 2 are presumably elided here — confirm against the original header.
38 enum VoiceAssistantFeature : uint32_t {
40  FEATURE_SPEAKER = 1 << 1,  // bit 1
42  FEATURE_TIMERS = 1 << 3,  // bit 3
43  FEATURE_ANNOUNCE = 1 << 4,  // bit 4
44 };
45 
// State of the voice assistant. VoiceAssistant::is_running() reports true for
// every state other than IDLE.
// NOTE(review): source lines 48-59 are missing from this listing, so the remaining
// enumerators are presumably elided — confirm against the original header.
46 enum class State {
47  IDLE,
60 };
61 
// Audio mode selector with a one-byte underlying type.
// NOTE(review): the enumerators (source lines 63-64) are missing from this listing.
// AUDIO_MODE_UDP, used below as the default value of VoiceAssistant::audio_mode_,
// is presumably one of them — confirm against the original header.
62 enum AudioMode : uint8_t {
65 };
66 
// Plain record describing one timer, plus a helper that renders it as text.
// The numeric and bool members have no in-class initializer, so their values
// depend on how the object is initialized by the code that creates it.
67 struct Timer {
68  std::string id;
69  std::string name;
70  uint32_t total_seconds;
71  uint32_t seconds_left;
72  bool is_active;
73 
 // Returns all five fields formatted as
 // "Timer(id=..., name=..., total_seconds=..., seconds_left=..., is_active=...)".
 // is_active is rendered through the YESNO macro; the string is built with str_sprintf.
74  std::string to_string() const {
75  return str_sprintf("Timer(id=%s, name=%s, total_seconds=%" PRIu32 ", seconds_left=%" PRIu32 ", is_active=%s)",
76  this->id.c_str(), this->name.c_str(), this->total_seconds, this->seconds_left,
77  YESNO(this->is_active));
78  }
79 };
80 
// Plain record describing one wake word: an id, the wake word text, and a list of
// language strings (presumably the languages the wake word was trained for — confirm).
81 struct WakeWord {
82  std::string id;
83  std::string wake_word;
84  std::vector<std::string> trained_languages;
85 };
86 
// Wake word configuration held by VoiceAssistant (config_) and exposed through
// VoiceAssistant::get_configuration().
// NOTE(review): source line 90 is missing from this listing, so one more member is
// presumably elided — confirm against the original header.
87 struct Configuration {
88  std::vector<WakeWord> available_wake_words;  // all wake words known to this configuration
89  std::vector<std::string> active_wake_words;  // strings naming the active ones (presumably WakeWord::id values — confirm)
91 };
92 
// Voice assistant component. This header declares the control entry points
// (request_start/request_stop, start_streaming), the handlers for incoming API
// messages (on_event, on_audio, on_timer_event, on_announce), the configuration
// setters, and the automation triggers. Only the short inline accessors are
// defined here; everything else is implemented outside this file.
// NOTE(review): this is a Doxygen listing with lines elided (gaps in the gutter
// numbers). Members referenced below but not declared in view (state_,
// use_wake_word_, noise_suppression_level_, volume_multiplier_,
// conversation_timeout_, error_trigger_, timer_tick_trigger_) are presumably
// declared on the missing lines — confirm against the original header.
93 class VoiceAssistant : public Component {
94  public:
 // NOTE(review): source line 95 is missing from this listing.
96 
97  void loop() override;
98  float get_setup_priority() const override;
99  void start_streaming();
100  void start_streaming(struct sockaddr_storage *addr, uint16_t port);
101  void failed_to_start();
102 
 // Stores the microphone pointer; the pointer is not owned or checked here.
103  void set_microphone(microphone::Microphone *mic) { this->mic_ = mic; }
104 #ifdef USE_SPEAKER
 // Stores the speaker pointer and sets local_output_ to true.
105  void set_speaker(speaker::Speaker *speaker) {
106  this->speaker_ = speaker;
107  this->local_output_ = true;
108  }
109 #endif
110 #ifdef USE_MEDIA_PLAYER
 // NOTE(review): the signature line (source line 111) is missing from this listing.
 // The tooltip text at the end of the page lists
 // `void set_media_player(media_player::MediaPlayer *media_player)`, which matches
 // this body: it stores the media player pointer and sets local_output_ to true.
112  this->media_player_ = media_player;
113  this->local_output_ = true;
114  }
115 #endif
116 
 // Returns LEGACY_SPEAKER_SUPPORT when built with USE_SPEAKER and a speaker has
 // been set; otherwise returns LEGACY_INITIAL_VERSION.
117  uint32_t get_legacy_version() const {
118 #ifdef USE_SPEAKER
119  if (this->speaker_ != nullptr) {
120  return LEGACY_SPEAKER_SUPPORT;
121  }
122 #endif
123  return LEGACY_INITIAL_VERSION;
124  }
125 
 // Builds and returns a uint32_t feature mask, starting from 0.
 // NOTE(review): the statements that modify `flags` (source lines 128-129, 132,
 // 137, 142) are missing from this listing. Presumably each branch ORs in the
 // matching VoiceAssistantFeature bit — confirm against the original header.
126  uint32_t get_feature_flags() const {
127  uint32_t flags = 0;
130 #ifdef USE_SPEAKER
131  if (this->speaker_ != nullptr) {
133  }
134 #endif
135 
136  if (this->has_timers_) {
138  }
139 
140 #ifdef USE_MEDIA_PLAYER
141  if (this->media_player_ != nullptr) {
143  }
144 #endif
145 
146  return flags;
147  }
148 
149  void request_start(bool continuous, bool silence_detection);
150  void request_stop();
151 
 // Handlers taking API message types; implemented outside this header.
152  void on_event(const api::VoiceAssistantEventResponse &msg);
153  void on_audio(const api::VoiceAssistantAudio &msg);
154  void on_timer_event(const api::VoiceAssistantTimerEventResponse &msg);
155  void on_announce(const api::VoiceAssistantAnnounceRequest &msg);
 // Empty body: the argument is ignored and nothing is changed.
156  void on_set_configuration(const std::vector<std::string> &active_wake_words){};
 // Returns a const reference to the stored configuration (config_).
157  const Configuration &get_configuration() { return this->config_; };
158 
 // True whenever the current state is anything other than State::IDLE.
159  bool is_running() const { return this->state_ != State::IDLE; }
160  void set_continuous(bool continuous) { this->continuous_ = continuous; }
161  bool is_continuous() const { return this->continuous_; }
162 
163  void set_use_wake_word(bool use_wake_word) { this->use_wake_word_ = use_wake_word; }
164 #ifdef USE_ESP_ADF
165  void set_vad_threshold(uint8_t vad_threshold) { this->vad_threshold_ = vad_threshold; }
166 #endif
167 
 // Plain setters: each stores its argument in the matching member without validation.
168  void set_noise_suppression_level(uint8_t noise_suppression_level) {
169  this->noise_suppression_level_ = noise_suppression_level;
170  }
171  void set_auto_gain(uint8_t auto_gain) { this->auto_gain_ = auto_gain; }
172  void set_volume_multiplier(float volume_multiplier) { this->volume_multiplier_ = volume_multiplier; }
173  void set_conversation_timeout(uint32_t conversation_timeout) { this->conversation_timeout_ = conversation_timeout; }
174  void reset_conversation_id();
175 
 // Trigger accessors: each returns the raw pointer held in the matching member.
176  Trigger<> *get_intent_end_trigger() const { return this->intent_end_trigger_; }
177  Trigger<> *get_intent_start_trigger() const { return this->intent_start_trigger_; }
178  Trigger<> *get_listening_trigger() const { return this->listening_trigger_; }
179  Trigger<> *get_end_trigger() const { return this->end_trigger_; }
180  Trigger<> *get_start_trigger() const { return this->start_trigger_; }
181  Trigger<> *get_stt_vad_end_trigger() const { return this->stt_vad_end_trigger_; }
182  Trigger<> *get_stt_vad_start_trigger() const { return this->stt_vad_start_trigger_; }
183 #ifdef USE_SPEAKER
184  Trigger<> *get_tts_stream_start_trigger() const { return this->tts_stream_start_trigger_; }
185  Trigger<> *get_tts_stream_end_trigger() const { return this->tts_stream_end_trigger_; }
186 #endif
187  Trigger<> *get_wake_word_detected_trigger() const { return this->wake_word_detected_trigger_; }
188  Trigger<std::string> *get_stt_end_trigger() const { return this->stt_end_trigger_; }
189  Trigger<std::string> *get_tts_end_trigger() const { return this->tts_end_trigger_; }
190  Trigger<std::string> *get_tts_start_trigger() const { return this->tts_start_trigger_; }
191  Trigger<std::string, std::string> *get_error_trigger() const { return this->error_trigger_; }
192  Trigger<> *get_idle_trigger() const { return this->idle_trigger_; }
193 
194  Trigger<> *get_client_connected_trigger() const { return this->client_connected_trigger_; }
195  Trigger<> *get_client_disconnected_trigger() const { return this->client_disconnected_trigger_; }
196 
197  void client_subscription(api::APIConnection *client, bool subscribe);
 // Returns the stored API connection pointer; it is nullptr by default.
198  api::APIConnection *get_api_connection() const { return this->api_client_; }
199 
200  void set_wake_word(const std::string &wake_word) { this->wake_word_ = wake_word; }
201 
202  Trigger<Timer> *get_timer_started_trigger() const { return this->timer_started_trigger_; }
203  Trigger<Timer> *get_timer_updated_trigger() const { return this->timer_updated_trigger_; }
204  Trigger<Timer> *get_timer_cancelled_trigger() const { return this->timer_cancelled_trigger_; }
205  Trigger<Timer> *get_timer_finished_trigger() const { return this->timer_finished_trigger_; }
206  Trigger<std::vector<Timer>> *get_timer_tick_trigger() const { return this->timer_tick_trigger_; }
207  void set_has_timers(bool has_timers) { this->has_timers_ = has_timers; }
 // Read-only view of the timer map, keyed by std::string.
208  const std::unordered_map<std::string, Timer> &get_timers() const { return this->timers_; }
209 
210  protected:
211  bool allocate_buffers_();
212  void clear_buffers_();
213  void deallocate_buffers_();
214 
215  int read_microphone_();
216  void set_state_(State state);
217  void set_state_(State state, State desired_state);
218  void signal_stop_();
219  void start_playback_timeout_();
220 
221  std::unique_ptr<socket::Socket> socket_ = nullptr;
222  struct sockaddr_storage dest_addr_;
223 
 // Triggers are created with `new` in their default member initializers and handed
 // out as raw pointers by the getters above; no matching delete appears in this listing.
224  Trigger<> *intent_end_trigger_ = new Trigger<>();
225  Trigger<> *intent_start_trigger_ = new Trigger<>();
226  Trigger<> *listening_trigger_ = new Trigger<>();
227  Trigger<> *end_trigger_ = new Trigger<>();
228  Trigger<> *start_trigger_ = new Trigger<>();
229  Trigger<> *stt_vad_start_trigger_ = new Trigger<>();
230  Trigger<> *stt_vad_end_trigger_ = new Trigger<>();
231 #ifdef USE_SPEAKER
232  Trigger<> *tts_stream_start_trigger_ = new Trigger<>();
233  Trigger<> *tts_stream_end_trigger_ = new Trigger<>();
234 #endif
235  Trigger<> *wake_word_detected_trigger_ = new Trigger<>();
236  Trigger<std::string> *stt_end_trigger_ = new Trigger<std::string>();
237  Trigger<std::string> *tts_end_trigger_ = new Trigger<std::string>();
238  Trigger<std::string> *tts_start_trigger_ = new Trigger<std::string>();
 // NOTE(review): source line 239 is missing here; presumably the declaration of
 // error_trigger_ used by get_error_trigger() — confirm.
240  Trigger<> *idle_trigger_ = new Trigger<>();
241 
242  Trigger<> *client_connected_trigger_ = new Trigger<>();
243  Trigger<> *client_disconnected_trigger_ = new Trigger<>();
244 
245  api::APIConnection *api_client_{nullptr};
246 
247  std::unordered_map<std::string, Timer> timers_;
248  void timer_tick_();
249  Trigger<Timer> *timer_started_trigger_ = new Trigger<Timer>();
250  Trigger<Timer> *timer_finished_trigger_ = new Trigger<Timer>();
251  Trigger<Timer> *timer_updated_trigger_ = new Trigger<Timer>();
252  Trigger<Timer> *timer_cancelled_trigger_ = new Trigger<Timer>();
 // NOTE(review): source line 253 is missing here; presumably the declaration of
 // timer_tick_trigger_ used by get_timer_tick_trigger() — confirm.
254  bool has_timers_{false};
255  bool timer_tick_running_{false};
256 
257  microphone::Microphone *mic_{nullptr};
258 #ifdef USE_SPEAKER
259  void write_speaker_();
260  speaker::Speaker *speaker_{nullptr};
261  uint8_t *speaker_buffer_{nullptr};
262  size_t speaker_buffer_index_{0};
263  size_t speaker_buffer_size_{0};
264  size_t speaker_bytes_received_{0};
265  bool wait_for_stream_end_{false};
266  bool stream_ended_{false};
267 #endif
268 #ifdef USE_MEDIA_PLAYER
269  media_player::MediaPlayer *media_player_{nullptr};
270 #endif
271 
 // Set to true by set_speaker() and by the media player setter above.
272  bool local_output_{false};
273 
274  std::string conversation_id_{""};
275 
276  std::string wake_word_{""};
277 
 // NOTE(review): source line 278 is missing from this listing.
279 
280 #ifdef USE_ESP_ADF
281  vad_handle_t vad_instance_;
282  uint8_t vad_threshold_{5};
283  uint8_t vad_counter_{0};
284 #endif
285  std::unique_ptr<RingBuffer> ring_buffer_;
286 
 // NOTE(review): source lines 287-288 and 290-291 are missing from this listing.
 // auto_gain_ has no default initializer; it is only assigned in set_auto_gain().
289  uint8_t auto_gain_;
292 
293  uint8_t *send_buffer_{nullptr};
294  int16_t *input_buffer_{nullptr};
295 
296  bool continuous_{false};
 // NOTE(review): source lines 297 and 299 are missing from this listing.
298 
300  State desired_state_{State::IDLE};
301 
302  AudioMode audio_mode_{AUDIO_MODE_UDP};
303  bool udp_socket_running_{false};
304  bool start_udp_socket_();
305 
306  Configuration config_{};
307 };
308 
// Automation action that starts the parent VoiceAssistant in non-continuous mode.
// NOTE(review): source line 321 is missing from this listing; presumably it declares
// the silence_detection_ member used below — confirm against the original header.
309 template<typename... Ts> class StartAction : public Action<Ts...>, public Parented<VoiceAssistant> {
 // Templatable wake word; play() reads it through this->wake_word_.value(x...).
310  TEMPLATABLE_VALUE(std::string, wake_word);
311 
312  public:
 // Evaluates the wake word for the given arguments, stores it on the parent, then
 // calls request_start with continuous=false and the configured silence detection flag.
313  void play(Ts... x) override {
314  this->parent_->set_wake_word(this->wake_word_.value(x...));
315  this->parent_->request_start(false, this->silence_detection_);
316  }
317 
318  void set_silence_detection(bool silence_detection) { this->silence_detection_ = silence_detection; }
319 
320  protected:
322 };
323 
// Automation action that calls request_start(true, true) on the parent
// VoiceAssistant, i.e. continuous=true and silence_detection=true.
324 template<typename... Ts> class StartContinuousAction : public Action<Ts...>, public Parented<VoiceAssistant> {
325  public:
 // The action arguments x are not used.
326  void play(Ts... x) override { this->parent_->request_start(true, true); }
327 };
328 
// Automation action that calls request_stop() on the parent VoiceAssistant.
329 template<typename... Ts> class StopAction : public Action<Ts...>, public Parented<VoiceAssistant> {
330  public:
 // The action arguments x are not used.
331  void play(Ts... x) override { this->parent_->request_stop(); }
332 };
333 
// Automation condition that holds when the parent VoiceAssistant is running
// (state other than IDLE) or has continuous mode enabled.
334 template<typename... Ts> class IsRunningCondition : public Condition<Ts...>, public Parented<VoiceAssistant> {
335  public:
 // The condition arguments x are not used.
336  bool check(Ts... x) override { return this->parent_->is_running() || this->parent_->is_continuous(); }
337 };
338 
// Automation condition that holds when the parent VoiceAssistant has a non-null
// API connection pointer.
339 template<typename... Ts> class ConnectedCondition : public Condition<Ts...>, public Parented<VoiceAssistant> {
340  public:
 // The condition arguments x are not used.
341  bool check(Ts... x) override { return this->parent_->get_api_connection() != nullptr; }
342 };
343 
// Global pointer to a VoiceAssistant; only declared here (extern), defined elsewhere.
344 extern VoiceAssistant *global_voice_assistant; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables)
345 
346 } // namespace voice_assistant
347 } // namespace esphome
348 
349 #endif // USE_VOICE_ASSISTANT
Trigger< Timer > * get_timer_finished_trigger() const
void loop()
void set_microphone(microphone::Microphone *mic)
Trigger< Timer > * get_timer_started_trigger() const
void on_set_configuration(const std::vector< std::string > &active_wake_words)
std::unordered_map< std::string, Timer > timers_
HighFrequencyLoopRequester high_freq_
VoiceAssistant * global_voice_assistant
Trigger< Timer > * get_timer_cancelled_trigger() const
uint16_t x
Definition: tt21100.cpp:17
Helper class to request loop() to be called as fast as possible.
Definition: helpers.h:630
api::APIConnection * get_api_connection() const
const std::unordered_map< std::string, Timer > & get_timers() const
Trigger< std::vector< Timer > > * get_timer_tick_trigger() const
void set_wake_word(const std::string &wake_word)
void set_noise_suppression_level(uint8_t noise_suppression_level)
std::vector< std::string > trained_languages
void set_volume_multiplier(float volume_multiplier)
Base class for all automation conditions.
Definition: automation.h:74
Trigger< std::string > * get_tts_start_trigger() const
std::string str_sprintf(const char *fmt,...)
Definition: helpers.cpp:324
void set_conversation_timeout(uint32_t conversation_timeout)
std::vector< std::string > active_wake_words
const uint32_t flags
Definition: stm32flash.h:85
Trigger< std::string, std::string > * get_error_trigger() const
std::unique_ptr< RingBuffer > ring_buffer_
void set_speaker(speaker::Speaker *speaker)
void set_vad_threshold(uint8_t vad_threshold)
Trigger< std::string > * get_tts_end_trigger() const
void set_use_wake_word(bool use_wake_word)
void set_media_player(media_player::MediaPlayer *media_player)
std::vector< WakeWord > available_wake_words
void set_silence_detection(bool silence_detection)
Implementation of SPI Controller mode.
Definition: a01nyub.cpp:7
Trigger< std::string > * get_stt_end_trigger() const
Trigger< Timer > * get_timer_updated_trigger() const
Helper class to easily give an object a parent of type T.
Definition: helpers.h:538
bool state
Definition: fan.h:34