/*
 * Copyright (c) 2021 Arm Limited. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "hal.h"                     /* Brings in platform definitions. */
#include "InputFiles.hpp"            /* For input audio clips. */
#include "Labels_micronetkws.hpp"    /* For MicroNetKws label strings. */
#include "Labels_wav2letter.hpp"     /* For Wav2Letter label strings. */
#include "Classifier.hpp"            /* KWS classifier. */
#include "AsrClassifier.hpp"         /* ASR classifier. */
#include "MicroNetKwsModel.hpp"      /* KWS model class for running inference. */
#include "Wav2LetterModel.hpp"       /* ASR model class for running inference. */
#include "UseCaseCommonUtils.hpp"    /* Utils functions. */
#include "UseCaseHandler.hpp"        /* Handlers for different user options. */
#include "Wav2LetterPreprocess.hpp"  /* ASR pre-processing class. */
#include "Wav2LetterPostprocess.hpp" /* ASR post-processing class. */
#include "log_macros.h"

#include <algorithm>                 /* For std::find and std::max. */
#include <cinttypes>                 /* For PRIu32. */
#include <cmath>                     /* For std::round. */
#include <string>
#include <vector>

using KwsClassifier = arm::app::Classifier;

enum opcodes
{
    MENU_OPT_RUN_INF_NEXT = 1,  /* Run on next vector. */
    MENU_OPT_RUN_INF_CHOSEN,    /* Run on a user provided vector index. */
    MENU_OPT_RUN_INF_ALL,       /* Run inference on all. */
    MENU_OPT_SHOW_MODEL_INFO,   /* Show model info. */
    MENU_OPT_LIST_AUDIO_CLIPS   /* List the current baked audio clips. */
};

static void DisplayMenu()
{
    printf("\n\n");
    printf("User input required\n");
    printf("Enter option number from:\n\n");
    printf("  %u. Classify next audio clip\n", MENU_OPT_RUN_INF_NEXT);
    printf("  %u. Classify audio clip at chosen index\n", MENU_OPT_RUN_INF_CHOSEN);
    printf("  %u. Run classification on all audio clips\n", MENU_OPT_RUN_INF_ALL);
    printf("  %u. Show NN model info\n", MENU_OPT_SHOW_MODEL_INFO);
    printf("  %u. List audio clips\n\n", MENU_OPT_LIST_AUDIO_CLIPS);
    printf("  Choice: ");
    fflush(stdout);
}

/** @brief Gets the number of MFCC features for a single window. */
static uint32_t GetNumMfccFeatures(const arm::app::Model& model);

/** @brief Gets the number of MFCC feature vectors to be computed. */
static uint32_t GetNumMfccFeatureVectors(const arm::app::Model& model);

/** @brief Gets the output context length (left and right) for post-processing. */
static uint32_t GetOutputContextLen(const arm::app::Model& model,
                                    uint32_t inputCtxLen);

/** @brief Gets the output inner length for post-processing. */
static uint32_t GetOutputInnerLen(const arm::app::Model& model,
                                  uint32_t outputCtxLen);

void main_loop()
{
    /* Model wrapper objects. */
    arm::app::MicroNetKwsModel kwsModel;
    arm::app::Wav2LetterModel asrModel;

    /* Load the models. */
    if (!kwsModel.Init()) {
        printf_err("Failed to initialise KWS model\n");
        return;
    }

    /* Initialise the ASR model using the same allocator from KWS
     * to re-use the tensor arena. */
    if (!asrModel.Init(kwsModel.GetAllocator())) {
        printf_err("Failed to initialise ASR model\n");
        return;
    }

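    /* Note: the pre-processing below converts raw audio into MFCC feature vectors
     * sized to match the ASR model input. The frame length and stride are given in
     * audio samples, while the feature and window counts are derived from the
     * model's input tensor shape via the helpers defined at the end of this file. */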
    /* Initialise ASR pre-processing. */
    arm::app::audio::asr::Preprocess prep(
            GetNumMfccFeatures(asrModel),
            arm::app::asr::g_FrameLength,
            arm::app::asr::g_FrameStride,
            GetNumMfccFeatureVectors(asrModel));

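    /* Note: for the default Wav2Letter model the last of its 29 output classes
     * (a-z, space, apostrophe, blank) is the CTC blank token, hence index 28. The
     * context lengths tell the post-processing which leading and trailing
     * time-steps of each inference window overlap with neighbouring windows. */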
    /* Initialise ASR post-processing. */
    const uint32_t outputCtxLen = GetOutputContextLen(asrModel, arm::app::asr::g_ctxLen);
    const uint32_t blankTokenIdx = 28;
    arm::app::audio::asr::Postprocess postp(
            outputCtxLen,
            GetOutputInnerLen(asrModel, outputCtxLen),
            blankTokenIdx);

    /* Instantiate application context. */
    arm::app::ApplicationContext caseContext;

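    /* The context acts as a named key-value store: the use-case handlers retrieve
     * the models, classifiers, labels and tuning parameters below by these names. */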
    arm::app::Profiler profiler{"kws_asr"};
    caseContext.Set<arm::app::Profiler&>("profiler", profiler);
    caseContext.Set<arm::app::Model&>("kwsmodel", kwsModel);
    caseContext.Set<arm::app::Model&>("asrmodel", asrModel);
    caseContext.Set<uint32_t>("clipIndex", 0);
    caseContext.Set<uint32_t>("ctxLen", arm::app::asr::g_ctxLen);  /* Left and right context length (MFCC feat vectors). */
    caseContext.Set<int>("kwsframeLength", arm::app::kws::g_FrameLength);
    caseContext.Set<int>("kwsframeStride", arm::app::kws::g_FrameStride);
    caseContext.Set<float>("kwsscoreThreshold", arm::app::kws::g_ScoreThreshold);  /* Normalised score threshold. */
    caseContext.Set<uint32_t>("kwsNumMfcc", arm::app::kws::g_NumMfcc);
    caseContext.Set<uint32_t>("kwsNumAudioWins", arm::app::kws::g_NumAudioWins);

    caseContext.Set<int>("asrframeLength", arm::app::asr::g_FrameLength);
    caseContext.Set<int>("asrframeStride", arm::app::asr::g_FrameStride);
    caseContext.Set<float>("asrscoreThreshold", arm::app::asr::g_ScoreThreshold);  /* Normalised score threshold. */

    KwsClassifier kwsClassifier;            /* Classifier wrapper object. */
    arm::app::AsrClassifier asrClassifier;  /* Classifier wrapper object. */
    caseContext.Set<arm::app::Classifier&>("kwsclassifier", kwsClassifier);
    caseContext.Set<arm::app::AsrClassifier&>("asrclassifier", asrClassifier);

    caseContext.Set<arm::app::audio::asr::Preprocess&>("preprocess", prep);
    caseContext.Set<arm::app::audio::asr::Postprocess&>("postprocess", postp);

    std::vector<std::string> asrLabels;
    arm::app::asr::GetLabelsVector(asrLabels);
    std::vector<std::string> kwsLabels;
    arm::app::kws::GetLabelsVector(kwsLabels);
    caseContext.Set<const std::vector<std::string>&>("asrlabels", asrLabels);
    caseContext.Set<const std::vector<std::string>&>("kwslabels", kwsLabels);

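    /* Note: in this use case the KWS model runs first; only when the trigger
     * keyword is spotted does the handler run ASR on the audio that follows, so
     * the chosen keyword must exist in the KWS label set. */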
    /* KWS keyword that triggers ASR and associated checks. */
    std::string triggerKeyword = std::string("yes");
    if (std::find(kwsLabels.begin(), kwsLabels.end(), triggerKeyword) != kwsLabels.end()) {
        caseContext.Set<const std::string&>("triggerkeyword", triggerKeyword);
    }
    else {
        printf_err("Selected trigger keyword not found in labels file\n");
        return;
    }

    /* Main loop. */
    bool executionSuccessful = true;
    constexpr bool bUseMenu = NUMBER_OF_FILES > 1;

    do {
        int menuOption = MENU_OPT_RUN_INF_NEXT;
        if (bUseMenu) {
            DisplayMenu();
            menuOption = arm::app::ReadUserInputAsInt();
            printf("\n");
        }
        switch (menuOption) {
            case MENU_OPT_RUN_INF_NEXT:
                executionSuccessful = ClassifyAudioHandler(
                        caseContext,
                        caseContext.Get<uint32_t>("clipIndex"),
                        false);
                break;
            case MENU_OPT_RUN_INF_CHOSEN: {
                printf("    Enter the audio clip index [0, %d]: ", NUMBER_OF_FILES - 1);
                fflush(stdout);
                auto clipIndex = static_cast<uint32_t>(arm::app::ReadUserInputAsInt());
                executionSuccessful = ClassifyAudioHandler(caseContext,
                                                           clipIndex,
                                                           false);
                break;
            }
            case MENU_OPT_RUN_INF_ALL:
                executionSuccessful = ClassifyAudioHandler(
                        caseContext,
                        caseContext.Get<uint32_t>("clipIndex"),
                        true);
                break;
            case MENU_OPT_SHOW_MODEL_INFO:
                /* Combine the results so a failure from either model is not masked. */
                executionSuccessful = kwsModel.ShowModelInfoHandler();
                executionSuccessful &= asrModel.ShowModelInfoHandler();
                break;
            case MENU_OPT_LIST_AUDIO_CLIPS:
                executionSuccessful = ListFilesHandler(caseContext);
                break;
            default:
                printf("Incorrect choice, try again.");
                break;
        }
    } while (executionSuccessful && bUseMenu);
    info("Main loop terminated.\n");
}

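/* Note: the ASR input tensor packs each MFCC vector together with its first and
 * second derivatives (deltas), so the column count is expected to be a multiple of
 * three; dividing by three recovers the number of base MFCC features per window. */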
static uint32_t GetNumMfccFeatures(const arm::app::Model& model)
{
    TfLiteTensor* inputTensor = model.GetInputTensor(0);
    const int inputCols = inputTensor->dims->data[arm::app::Wav2LetterModel::ms_inputColsIdx];
    if (0 != inputCols % 3) {
        printf_err("Number of input columns is not a multiple of 3\n");
    }
    return std::max(inputCols / 3, 0);
}

static uint32_t GetNumMfccFeatureVectors(const arm::app::Model& model)
{
    TfLiteTensor* inputTensor = model.GetInputTensor(0);
    const int inputRows = inputTensor->dims->data[arm::app::Wav2LetterModel::ms_inputRowsIdx];
    return std::max(inputRows, 0);
}

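/* Note: the model's output typically has fewer time-steps (rows) than its input, so
 * a context length expressed in input feature-vectors is scaled by the input/output
 * row ratio to get the equivalent number of output time-steps. */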
static uint32_t GetOutputContextLen(const arm::app::Model& model, const uint32_t inputCtxLen)
{
    const uint32_t inputRows = GetNumMfccFeatureVectors(model);
    const uint32_t inputInnerLen = inputRows - (2 * inputCtxLen);
    constexpr uint32_t ms_outputRowsIdx = arm::app::Wav2LetterModel::ms_outputRowsIdx;

    /* Check to make sure that the input tensor supports the above context and inner lengths. */
    if (inputRows <= 2 * inputCtxLen || inputRows <= inputInnerLen) {
        printf_err("Input rows not compatible with ctx of %" PRIu32 "\n",
                   inputCtxLen);
        return 0;
    }

    TfLiteTensor* outputTensor = model.GetOutputTensor(0);
    const uint32_t outputRows = std::max(outputTensor->dims->data[ms_outputRowsIdx], 0);

    const float tensorColRatio = static_cast<float>(inputRows) /
                                 static_cast<float>(outputRows);

    return std::round(static_cast<float>(inputCtxLen) / tensorColRatio);
}

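/* Note: the inner length is simply what remains of the output rows once the left
 * and right context sections have been accounted for. */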
static uint32_t GetOutputInnerLen(const arm::app::Model& model,
                                  const uint32_t outputCtxLen)
{
    constexpr uint32_t ms_outputRowsIdx = arm::app::Wav2LetterModel::ms_outputRowsIdx;
    TfLiteTensor* outputTensor = model.GetOutputTensor(0);
    const uint32_t outputRows = std::max(outputTensor->dims->data[ms_outputRowsIdx], 0);
    return (outputRows - (2 * outputCtxLen));
}