ESPHome  2024.9.0
streaming_model.cpp
Go to the documentation of this file.
1 #ifdef USE_ESP_IDF
2 
3 #include "streaming_model.h"
4 
5 #include "esphome/core/hal.h"
6 #include "esphome/core/helpers.h"
7 #include "esphome/core/log.h"
8 
9 static const char *const TAG = "micro_wake_word";
10 
11 namespace esphome {
12 namespace micro_wake_word {
13 
15  ESP_LOGCONFIG(TAG, " - Wake Word: %s", this->wake_word_.c_str());
16  ESP_LOGCONFIG(TAG, " Probability cutoff: %.3f", this->probability_cutoff_);
17  ESP_LOGCONFIG(TAG, " Sliding window size: %d", this->sliding_window_size_);
18 }
19 
21  ESP_LOGCONFIG(TAG, " - VAD Model");
22  ESP_LOGCONFIG(TAG, " Probability cutoff: %.3f", this->probability_cutoff_);
23  ESP_LOGCONFIG(TAG, " Sliding window size: %d", this->sliding_window_size_);
24 }
25 
26 bool StreamingModel::load_model(tflite::MicroMutableOpResolver<20> &op_resolver) {
28 
29  if (this->tensor_arena_ == nullptr) {
30  this->tensor_arena_ = arena_allocator.allocate(this->tensor_arena_size_);
31  if (this->tensor_arena_ == nullptr) {
32  ESP_LOGE(TAG, "Could not allocate the streaming model's tensor arena.");
33  return false;
34  }
35  }
36 
37  if (this->var_arena_ == nullptr) {
38  this->var_arena_ = arena_allocator.allocate(STREAMING_MODEL_VARIABLE_ARENA_SIZE);
39  if (this->var_arena_ == nullptr) {
40  ESP_LOGE(TAG, "Could not allocate the streaming model's variable tensor arena.");
41  return false;
42  }
43  this->ma_ = tflite::MicroAllocator::Create(this->var_arena_, STREAMING_MODEL_VARIABLE_ARENA_SIZE);
44  this->mrv_ = tflite::MicroResourceVariables::Create(this->ma_, 20);
45  }
46 
47  const tflite::Model *model = tflite::GetModel(this->model_start_);
48  if (model->version() != TFLITE_SCHEMA_VERSION) {
49  ESP_LOGE(TAG, "Streaming model's schema is not supported");
50  return false;
51  }
52 
53  if (this->interpreter_ == nullptr) {
54  this->interpreter_ = make_unique<tflite::MicroInterpreter>(
55  tflite::GetModel(this->model_start_), op_resolver, this->tensor_arena_, this->tensor_arena_size_, this->mrv_);
56  if (this->interpreter_->AllocateTensors() != kTfLiteOk) {
57  ESP_LOGE(TAG, "Failed to allocate tensors for the streaming model");
58  return false;
59  }
60 
61  // Verify input tensor matches expected values
62  // Dimension 3 will represent the first layer stride, so skip it may vary
63  TfLiteTensor *input = this->interpreter_->input(0);
64  if ((input->dims->size != 3) || (input->dims->data[0] != 1) ||
65  (input->dims->data[2] != PREPROCESSOR_FEATURE_SIZE)) {
66  ESP_LOGE(TAG, "Streaming model tensor input dimensions has improper dimensions.");
67  return false;
68  }
69 
70  if (input->type != kTfLiteInt8) {
71  ESP_LOGE(TAG, "Streaming model tensor input is not int8.");
72  return false;
73  }
74 
75  // Verify output tensor matches expected values
76  TfLiteTensor *output = this->interpreter_->output(0);
77  if ((output->dims->size != 2) || (output->dims->data[0] != 1) || (output->dims->data[1] != 1)) {
78  ESP_LOGE(TAG, "Streaming model tensor output dimension is not 1x1.");
79  }
80 
81  if (output->type != kTfLiteUInt8) {
82  ESP_LOGE(TAG, "Streaming model tensor output is not uint8.");
83  return false;
84  }
85  }
86 
87  return true;
88 }
89 
91  this->interpreter_.reset();
92 
94 
95  arena_allocator.deallocate(this->tensor_arena_, this->tensor_arena_size_);
96  this->tensor_arena_ = nullptr;
97  arena_allocator.deallocate(this->var_arena_, STREAMING_MODEL_VARIABLE_ARENA_SIZE);
98  this->var_arena_ = nullptr;
99 }
100 
101 bool StreamingModel::perform_streaming_inference(const int8_t features[PREPROCESSOR_FEATURE_SIZE]) {
102  if (this->interpreter_ != nullptr) {
103  TfLiteTensor *input = this->interpreter_->input(0);
104 
105  std::memmove(
106  (int8_t *) (tflite::GetTensorData<int8_t>(input)) + PREPROCESSOR_FEATURE_SIZE * this->current_stride_step_,
107  features, PREPROCESSOR_FEATURE_SIZE);
108  ++this->current_stride_step_;
109 
110  uint8_t stride = this->interpreter_->input(0)->dims->data[1];
111 
112  if (this->current_stride_step_ >= stride) {
113  this->current_stride_step_ = 0;
114 
115  TfLiteStatus invoke_status = this->interpreter_->Invoke();
116  if (invoke_status != kTfLiteOk) {
117  ESP_LOGW(TAG, "Streaming interpreter invoke failed");
118  return false;
119  }
120 
121  TfLiteTensor *output = this->interpreter_->output(0);
122 
123  ++this->last_n_index_;
124  if (this->last_n_index_ == this->sliding_window_size_)
125  this->last_n_index_ = 0;
126  this->recent_streaming_probabilities_[this->last_n_index_] = output->data.uint8[0]; // probability;
127  }
128  return true;
129  }
130  ESP_LOGE(TAG, "Streaming interpreter is not initialized.");
131  return false;
132 }
133 
135  for (auto &prob : this->recent_streaming_probabilities_) {
136  prob = 0;
137  }
138 }
139 
140 WakeWordModel::WakeWordModel(const uint8_t *model_start, float probability_cutoff, size_t sliding_window_average_size,
141  const std::string &wake_word, size_t tensor_arena_size) {
142  this->model_start_ = model_start;
143  this->probability_cutoff_ = probability_cutoff;
144  this->sliding_window_size_ = sliding_window_average_size;
145  this->recent_streaming_probabilities_.resize(sliding_window_average_size, 0);
146  this->wake_word_ = wake_word;
147  this->tensor_arena_size_ = tensor_arena_size;
148 };
149 
151  uint32_t sum = 0;
152  for (auto &prob : this->recent_streaming_probabilities_) {
153  sum += prob;
154  }
155 
156  float sliding_window_average = static_cast<float>(sum) / static_cast<float>(255 * this->sliding_window_size_);
157 
158  // Detect the wake word if the sliding window average is above the cutoff
159  if (sliding_window_average > this->probability_cutoff_) {
160  ESP_LOGD(TAG, "The '%s' model sliding average probability is %.3f and most recent probability is %.3f",
161  this->wake_word_.c_str(), sliding_window_average,
162  this->recent_streaming_probabilities_[this->last_n_index_] / (255.0));
163  return true;
164  }
165  return false;
166 }
167 
168 VADModel::VADModel(const uint8_t *model_start, float probability_cutoff, size_t sliding_window_size,
169  size_t tensor_arena_size) {
170  this->model_start_ = model_start;
171  this->probability_cutoff_ = probability_cutoff;
172  this->sliding_window_size_ = sliding_window_size;
173  this->recent_streaming_probabilities_.resize(sliding_window_size, 0);
174  this->tensor_arena_size_ = tensor_arena_size;
175 };
176 
178  uint32_t sum = 0;
179  for (auto &prob : this->recent_streaming_probabilities_) {
180  sum += prob;
181  }
182 
183  float sliding_window_average = static_cast<float>(sum) / static_cast<float>(255 * this->sliding_window_size_);
184 
185  return sliding_window_average > this->probability_cutoff_;
186 }
187 
188 } // namespace micro_wake_word
189 } // namespace esphome
190 
191 #endif
tflite::MicroResourceVariables * mrv_
void unload_model()
Destroys the TFLite interpreter and frees the tensor and variable arenas' memory. ...
std::vector< uint8_t > recent_streaming_probabilities_
void deallocate(T *p, size_t n)
Definition: helpers.h:678
VADModel(const uint8_t *model_start, float probability_cutoff, size_t sliding_window_size, size_t tensor_arena_size)
bool load_model(tflite::MicroMutableOpResolver< 20 > &op_resolver)
Allocates tensor and variable arenas and sets up the model interpreter.
void reset_probabilities()
Sets all recent_streaming_probabilities to 0.
std::unique_ptr< tflite::MicroInterpreter > interpreter_
bool perform_streaming_inference(const int8_t features[PREPROCESSOR_FEATURE_SIZE])
bool determine_detected() override
Checks for voice activity by comparing the mean probability in the sliding window with the probability...
WakeWordModel(const uint8_t *model_start, float probability_cutoff, size_t sliding_window_average_size, const std::string &wake_word, size_t tensor_arena_size)
bool determine_detected() override
Checks for the wake word by comparing the mean probability in the sliding window with the probability...
Implementation of SPI Controller mode.
Definition: a01nyub.cpp:7