Today I spent a bit of time implementing a C++ version of this library that can run the "First iteration siamese" model using TFLite/LiteRT. I'm providing it here in case someone else needs it.
main.cpp
#include <algorithm>
#include <array>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <memory>
#include <vector>
#include <tensorflow/lite/interpreter.h>
#include <tensorflow/lite/kernels/register.h>
#include <tensorflow/lite/model.h>
#include <tensorflow/lite/optional_debug_tools.h>

using f32 = float;

#include "embeddings.h"

class HotwordDetector {
    std::unique_ptr<tflite::FlatBufferModel> lmc;
    std::unique_ptr<tflite::FlatBufferModel> bm;
    std::unique_ptr<tflite::Interpreter> lmcInterpreter;
    int lmcInputIndex;
    int lmcOutputIndex;
    std::unique_ptr<tflite::Interpreter> bmInterpreter;
    int bmInputIndex;
    int bmOutputIndex;
    std::vector<std::array<f32, 128>> embeddings;

public:
    HotwordDetector()
    {
    }

    int init(
        const char* logMelCalcPath,
        const char* baseModelPath,
        std::vector<std::array<f32, 128>> newEmbeddings
    ) {
        lmc = tflite::FlatBufferModel::BuildFromFile(logMelCalcPath);
        if (lmc == nullptr) {
            return 2 + 0;
        }
        bm = tflite::FlatBufferModel::BuildFromFile(baseModelPath);
        if (bm == nullptr) {
            return 2 + 1;
        }
        tflite::ops::builtin::BuiltinOpResolver lmcResolver;
        tflite::InterpreterBuilder lmcInterpreterBuilder(*lmc, lmcResolver);
        lmcInterpreterBuilder(&lmcInterpreter);
        if (lmcInterpreter == nullptr) {
            return 4 + 0;
        }
        if (lmcInterpreter->AllocateTensors() != kTfLiteOk) {
            return 6 + 0;
        }
        tflite::ops::builtin::BuiltinOpResolver bmResolver;
        tflite::InterpreterBuilder bmInterpreterBuilder(*bm, bmResolver);
        bmInterpreterBuilder(&bmInterpreter);
        if (bmInterpreter == nullptr) {
            return 4 + 1;
        }
        if (bmInterpreter->AllocateTensors() != kTfLiteOk) {
            return 6 + 1;
        }
        if (newEmbeddings.size() <= 4) {
            return 8;
        }
        embeddings = newEmbeddings;
        return 0;
    }
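
    // process one second of audio: the frame is 16000 mono float samples (16 kHz),
    // and the returned value is a detection confidence between 0 and 1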
    f32 process_frame(std::array<f32, 16000> audio) {
        // TODO implement silence check
        // normalize the input
        f32 max_val = *std::max_element(audio.begin(), audio.end());
        std::transform(audio.begin(), audio.end(), audio.begin(),
            [max_val](f32 x) { return x / max_val; });
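        // note: a completely silent frame (max_val == 0) would divide by zero here,
        // which is another reason to implement the silence check above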
        // set the logMelCalc model input
        f32* lmcInputTensor = lmcInterpreter->typed_input_tensor<f32>(0);
        std::copy(audio.begin(), audio.end(), lmcInputTensor);
        // invoke the logMelCalc model
        lmcInterpreter->Invoke();
        // obtain the logMelCalc output and set the baseModel input
        f32* lmcOutputTensor = lmcInterpreter->typed_output_tensor<f32>(0);
        f32* bmInputTensor = bmInterpreter->typed_input_tensor<f32>(0);
        std::copy(lmcOutputTensor, lmcOutputTensor + 98*64, bmInputTensor);
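        // 98*64 floats are copied because that is the size of the log-mel spectrogram
        // the base model expects (presumably 98 time frames x 64 mel bins)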
        // invoke the baseModel model
        bmInterpreter->Invoke();
        // obtain the baseModel output (i.e. the calculated embeddings)
        f32* bmOutputTensor = bmInterpreter->typed_output_tensor<f32>(0);
        std::array<f32, 128> bmOutput;
        std::copy(bmOutputTensor, bmOutputTensor + bmOutput.size(), bmOutput.begin());
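        // the frame's 128-dimensional embedding is compared against each stored
        // reference embedding using the Euclidean (L2) distance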
        // calculate distances from embeddings
        std::vector<f32> distances;
        for (const auto& embedding : embeddings) {
            f32 sum = 0.0;
            for (size_t i = 0; i < 128; ++i) {
                f32 diff = bmOutputTensor[i] - embedding[i];
                sum += diff * diff;
            }
            distances.push_back(std::sqrt(sum));
        }
        // find the best three distances
        std::partial_sort(distances.begin(), distances.begin() + 3, distances.end());
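        // each of the three closest distances is mapped to a score in [0, 1]
        // (a distance of 0.3 or more scores 0) and the scores are combined like
        // independent probabilities, so a single very close match is enough to detect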
        // calculate the final score
        f32 out = 0.0f;
        for (int i = 0; i < 3; ++i) {
            f32 distance = distances[i];
            distance = std::min(0.3f, distance);
            distance = (0.3f - distance) / 0.3f;
            out += (1 - out) * distance;
        }
        return out;
    }
};

int main(int argc, char* argv[]) {
    HotwordDetector hd;
    int ok = hd.init("../logmelcalc.tflite", "../baseModel.tflite", embeddings_alexa);
    if (ok != 0) {
        printf("Error %d\n", ok);
        return ok;
    } else {
        printf("Initialized correctly\n");
    }
    FILE* audioFile = fopen("../test_audio_float.raw", "rb");
    if (audioFile == nullptr) {
        printf("Could not open audio file\n");
        return 1;
    }
    std::vector<float> audioData;
    while (true) {
        float sample;
        if (fread(&sample, sizeof(float), 1, audioFile) == 1) {
            audioData.push_back(sample);
        } else {
            break;
        }
    }
    fclose(audioFile);
    printf("Audio file: %zu samples\n", audioData.size());
    // slide a one second window over the recording in 1000-sample steps
    for (size_t i = 0; i + 16000 <= audioData.size(); i += 1000) {
        std::array<f32, 16000> data;
        std::copy(audioData.begin() + i, audioData.begin() + i + 16000, data.begin());
        f32 confidence = hd.process_frame(data);
        printf("Confidence %.2fs %f%s\n", i / 16000.0f, confidence, confidence > 0.9f ? " DETECTED!" : "");
    }
}
embeddings.h (data is from alexa_ref.json in the commit before the ResNet50 model was added)
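The full embeddings.h isn't reproduced here; roughly, it just needs to declare the reference embeddings that main.cpp expects. The sketch below is my own guess at the declaration: only the embeddings_alexa name and the 128-float element type follow from main.cpp, and the actual numbers come from alexa_ref.json.

// embeddings.h -- sketch only, fill in the real values from alexa_ref.json
#pragma once
#include <array>
#include <vector>

// one 128-dimensional reference embedding per recording of the hotword;
// init() requires more than 4 entries
static const std::vector<std::array<float, 128>> embeddings_alexa = {
    { /* 128 floats from alexa_ref.json */ },
    /* ... more reference embeddings ... */
};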
The command to generate the test_audio_float.raw file is arecord -D plughw:1,0 -f FLOAT_LE -c1 -r16000 -d1 test_audio_float.raw.