11 #include <frontend_util.h> 13 #include <tensorflow/lite/core/c/common.h> 14 #include <tensorflow/lite/micro/micro_interpreter.h> 15 #include <tensorflow/lite/micro/micro_mutable_op_resolver.h> 20 namespace micro_wake_word {
22 static const char *
const TAG =
"micro_wake_word";
24 static const size_t SAMPLE_RATE_HZ = 16000;
25 static const size_t BUFFER_LENGTH = 64;
26 static const size_t BUFFER_SIZE = SAMPLE_RATE_HZ / 1000 * BUFFER_LENGTH;
27 static const size_t INPUT_BUFFER_SIZE = 16 * SAMPLE_RATE_HZ / 1000;
31 static const LogString *micro_wake_word_state_to_string(
State state) {
34 return LOG_STR(
"IDLE");
36 return LOG_STR(
"START_MICROPHONE");
38 return LOG_STR(
"STARTING_MICROPHONE");
40 return LOG_STR(
"DETECTING_WAKE_WORD");
42 return LOG_STR(
"STOP_MICROPHONE");
44 return LOG_STR(
"STOPPING_MICROPHONE");
46 return LOG_STR(
"UNKNOWN");
51 ESP_LOGCONFIG(TAG,
"microWakeWord:");
52 ESP_LOGCONFIG(TAG,
" models:");
54 model.log_model_config();
56 #ifdef USE_MICRO_WAKE_WORD_VAD 62 ESP_LOGCONFIG(TAG,
"Setting up microWakeWord...");
69 ESP_LOGCONFIG(TAG,
"Micro Wake Word initialized");
89 size_t sliding_window_average_size,
const std::string &wake_word,
90 size_t tensor_arena_size) {
91 this->
wake_word_models_.emplace_back(model_start, probability_cutoff, sliding_window_average_size, wake_word,
95 #ifdef USE_MICRO_WAKE_WORD_VAD 97 size_t tensor_arena_size) {
98 this->
vad_model_ = make_unique<VADModel>(model_start, probability_cutoff, sliding_window_size, tensor_arena_size);
107 ESP_LOGD(TAG,
"Starting Microphone");
129 ESP_LOGD(TAG,
"Stopping Microphone");
151 ESP_LOGW(TAG,
"Wake word detection can't start as the component hasn't been setup yet");
156 ESP_LOGW(TAG,
"Wake word component is marked as failed. Please check setup logs");
161 ESP_LOGE(TAG,
"Failed to load the wake word model(s) or allocate buffers");
168 ESP_LOGW(TAG,
"Wake word component has an error. Please check logs");
173 ESP_LOGW(TAG,
"Wake word is already running");
183 ESP_LOGW(TAG,
"Wake word is already stopped");
187 ESP_LOGW(TAG,
"Wake word is already stopping");
194 ESP_LOGD(TAG,
"State changed from %s to %s", LOG_STR_ARG(micro_wake_word_state_to_string(this->
state_)),
195 LOG_STR_ARG(micro_wake_word_state_to_string(state)));
201 if (bytes_read == 0) {
207 if (bytes_free < bytes_read) {
209 "Not enough free bytes in ring buffer to store incoming audio data (free bytes=%d, incoming bytes=%d). " 210 "Resetting the ring buffer. Wake word detection accuracy will be reduced.",
211 bytes_free, bytes_read);
225 ESP_LOGE(TAG,
"Could not allocate input buffer");
233 ESP_LOGE(TAG,
"Could not allocate the audio preprocessor's buffer.");
241 ESP_LOGE(TAG,
"Could not allocate ring buffer");
260 ESP_LOGD(TAG,
"Failed to populate frontend state");
267 if (!model.load_model(this->streaming_op_resolver_)) {
268 ESP_LOGE(TAG,
"Failed to initialize a wake word model.");
272 #ifdef USE_MICRO_WAKE_WORD_VAD 273 if (!this->
vad_model_->load_model(this->streaming_op_resolver_)) {
274 ESP_LOGE(TAG,
"Failed to initialize VAD model.");
286 model.unload_model();
288 #ifdef USE_MICRO_WAKE_WORD_VAD 294 int8_t audio_features[PREPROCESSOR_FEATURE_SIZE];
305 model.perform_streaming_inference(audio_features);
307 #ifdef USE_MICRO_WAKE_WORD_VAD 308 this->
vad_model_->perform_streaming_inference(audio_features);
318 #ifdef USE_MICRO_WAKE_WORD_VAD 319 bool vad_state = this->
vad_model_->determine_detected();
323 if (model.determine_detected()) {
324 #ifdef USE_MICRO_WAKE_WORD_VAD 329 #ifdef USE_MICRO_WAKE_WORD_VAD 331 ESP_LOGD(TAG,
"Wake word model predicts %s, but VAD model doesn't.", model.get_wake_word().c_str());
354 if (bytes_read == 0) {
355 ESP_LOGE(TAG,
"Could not read data from Ring Buffer");
357 ESP_LOGD(TAG,
"Partial Read of Data by Model");
358 ESP_LOGD(TAG,
"Could only read %d bytes when required %d bytes ", bytes_read,
363 size_t num_samples_read;
364 struct FrontendOutput frontend_output = FrontendProcessSamples(
367 for (
size_t i = 0; i < frontend_output.size; ++i) {
382 constexpr int32_t value_scale = 256;
383 constexpr int32_t value_div = 666;
384 int32_t value = ((frontend_output.values[i] * value_scale) + (value_div / 2)) / value_div;
399 ESP_LOGD(TAG,
"Resetting buffers and probabilities");
403 model.reset_probabilities();
405 #ifdef USE_MICRO_WAKE_WORD_VAD 411 if (op_resolver.AddCallOnce() != kTfLiteOk)
413 if (op_resolver.AddVarHandle() != kTfLiteOk)
415 if (op_resolver.AddReshape() != kTfLiteOk)
417 if (op_resolver.AddReadVariable() != kTfLiteOk)
419 if (op_resolver.AddStridedSlice() != kTfLiteOk)
421 if (op_resolver.AddConcatenation() != kTfLiteOk)
423 if (op_resolver.AddAssignVariable() != kTfLiteOk)
425 if (op_resolver.AddConv2D() != kTfLiteOk)
427 if (op_resolver.AddMul() != kTfLiteOk)
429 if (op_resolver.AddAdd() != kTfLiteOk)
431 if (op_resolver.AddMean() != kTfLiteOk)
433 if (op_resolver.AddFullyConnected() != kTfLiteOk)
435 if (op_resolver.AddLogistic() != kTfLiteOk)
437 if (op_resolver.AddQuantize() != kTfLiteOk)
439 if (op_resolver.AddDepthwiseConv2D() != kTfLiteOk)
441 if (op_resolver.AddAveragePool2D() != kTfLiteOk)
443 if (op_resolver.AddMaxPool2D() != kTfLiteOk)
445 if (op_resolver.AddPad() != kTfLiteOk)
447 if (op_resolver.AddPack() != kTfLiteOk)
449 if (op_resolver.AddSplitV() != kTfLiteOk)
458 #endif // USE_ESP_IDF
void add_wake_word_model(const uint8_t *model_start, float probability_cutoff, size_t sliding_window_average_size, const std::string &wake_word, size_t tensor_arena_size)
const float AFTER_CONNECTION
For components that should be initialized after a data connection (API/MQTT) is connected.
int16_t * preprocessor_audio_buffer_
void set_state_(State state)
Trigger< std::string > * wake_word_detected_trigger_
bool detect_wake_words_()
Checks every model's recent probabilities to determine if the wake word has been predicted.
std::unique_ptr< RingBuffer > ring_buffer_
An STL allocator that uses SPI RAM.
float get_setup_priority() const override
void deallocate(T *p, size_t n)
HighFrequencyLoopRequester high_freq_
struct FrontendConfig frontend_config_
void trigger(Ts... x)
Inform the parent automation that the event has triggered.
bool status_has_error() const
std::vector< WakeWordModel > wake_word_models_
void update_model_probabilities_()
Performs inference with each configured model.
void dump_config() override
void status_set_error(const char *message="unspecified")
bool register_streaming_ops_(tflite::MicroMutableOpResolver< 20 > &op_resolver)
Returns true if successfully registered the streaming model's TensorFlow operations.
uint8_t features_step_size_
microphone::Microphone * microphone_
void start()
Start running the loop continuously.
bool allocate_buffers_()
Allocates memory for input_buffer_, preprocessor_audio_buffer_, and ring_buffer_. ...
bool has_enough_samples_()
Tests if there are enough samples in the ring buffer to generate new features.
tflite::MicroMutableOpResolver< 20 > streaming_op_resolver_
bool generate_features_for_window_(int8_t features[PREPROCESSOR_FEATURE_SIZE])
Generates features for a window of audio samples.
void deallocate_buffers_()
Frees memory allocated for input_buffer_ and preprocessor_audio_buffer_.
void stop()
Stop running the loop continuously.
uint16_t new_samples_to_get_()
struct FrontendState frontend_state_
void reset_states_()
Resets the ring buffer, ignore_windows_, and sliding window probabilities.
void status_clear_error()
virtual size_t read(int16_t *buf, size_t len)=0
virtual void mark_failed()
Mark this component as failed.
Implementation of SPI Controller mode.
std::string detected_wake_word_
void add_vad_model(const uint8_t *model_start, float probability_cutoff, size_t sliding_window_size, size_t tensor_arena_size)
bool load_models_()
Loads streaming models and prepares the feature generation frontend.
static std::unique_ptr< RingBuffer > create(size_t len)
std::unique_ptr< VADModel > vad_model_
void unload_models_()
Deletes each model's TFLite interpreters and frees tensor arena memory.
size_t read_microphone_()
Reads audio from microphone into the ring buffer.