Blame - source/use_case/kws/src/UseCaseHandler.cc - ml/ethos-u/ml-embedded-evaluation-kit

blob: d2cba55e13110d1e2ec3d000fd0bdf4e59bc3c70 [file] [log] [blame]

alexander	3c79893	2021-03-26 21:42:19 +0000	[diff] [blame]	1	/*
				2	* Copyright (c) 2021 Arm Limited. All rights reserved.
				3	* SPDX-License-Identifier: Apache-2.0
				4	*
				5	* Licensed under the Apache License, Version 2.0 (the "License");
				6	* you may not use this file except in compliance with the License.
				7	* You may obtain a copy of the License at
				8	*
				9	* http://www.apache.org/licenses/LICENSE-2.0
				10	*
				11	* Unless required by applicable law or agreed to in writing, software
				12	* distributed under the License is distributed on an "AS IS" BASIS,
				13	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	* See the License for the specific language governing permissions and
				15	* limitations under the License.
				16	*/
				17	#include "UseCaseHandler.hpp"
				18
				19	#include "InputFiles.hpp"
				20	#include "Classifier.hpp"
				21	#include "DsCnnModel.hpp"
				22	#include "hal.h"
				23	#include "DsCnnMfcc.hpp"
				24	#include "AudioUtils.hpp"
				25	#include "UseCaseCommonUtils.hpp"
				26	#include "KwsResult.hpp"
				27
				28	#include <vector>
				29	#include <functional>
				30
				31	using KwsClassifier = arm::app::Classifier;
				32
				33	namespace arm {
				34	namespace app {
				35
				36	/**
				37	* @brief Helper function to advance the current audio clip index stored in
				* the application context; the index wraps back to 0 after the last clip.
				38	* @param[in,out] ctx Reference to the application context object.
				39	**/
				40	static void _IncrementAppCtxClipIdx(ApplicationContext& ctx);
				41
				42	/**
				43	* @brief Helper function to set the audio clip index.
				44	* @param[in,out] ctx Reference to the application context object.
				45	* @param[in] idx Value to be set; must be less than NUMBER_OF_FILES.
				46	* @return true if index is set, false if idx is out of range.
				47	**/
				48	static bool _SetAppCtxClipIdx(ApplicationContext& ctx, uint32_t idx);
				49
				50	/**
				51	* @brief Presents inference results using the data presentation
				52	* object.
				53	* @param[in] platform Reference to the hal platform object.
				54	* @param[in] results Vector of classification results to be displayed.
				57	* @return true if successful, false otherwise.
				58	**/
				59	static bool _PresentInferenceResult(hal_platform& platform,
				60	const std::vector<arm::app::kws::KwsResult>& results);
				61
				62	/**
				63	* @brief Returns a function to perform feature calculation and populates input tensor data with
				64	* MFCC data.
				65	*
				66	* Input tensor quantisation and data type are checked to choose the correct MFCC feature data type.
				67	* If the tensor uses affine quantisation, features are quantised to the tensor's integer type;
				* otherwise floating point features are produced.
				68	*
				69	* Warning: the MFCC calculator is captured by reference, so it must outlive the returned function.
				70	*
				71	* @param[in] mfcc MFCC feature calculator.
				72	* @param[in,out] inputTensor Input tensor pointer to store calculated features.
				73	* @param[in] cacheSize Size of the feature vectors cache (number of feature vectors).
				74	* @return Function to be called providing audio sample and sliding window index; the returned
				* function is empty if the quantised tensor type is not supported.
				75	*/
				76	static std::function<void (std::vector<int16_t>&, int, bool, size_t)>
				77	GetFeatureCalculator(audio::DsCnnMFCC& mfcc,
				78	TfLiteTensor* inputTensor,
				79	size_t cacheSize);
				80
				81	/* Audio inference handler. */
				82	bool ClassifyAudioHandler(ApplicationContext& ctx, uint32_t clipIndex, bool runAll)
				83	{
				84	auto& platform = ctx.Get<hal_platform&>("platform");
Isabella Gottardi	8df12f3	2021-04-07 17:15:31 +0100	[diff] [blame^]	85	auto& profiler = ctx.Get<Profiler&>("profiler");
alexander	3c79893	2021-03-26 21:42:19 +0000	[diff] [blame]	86
				87	constexpr uint32_t dataPsnTxtInfStartX = 20;
				88	constexpr uint32_t dataPsnTxtInfStartY = 40;
				89	constexpr int minTensorDims = static_cast<int>(
				90	(arm::app::DsCnnModel::ms_inputRowsIdx > arm::app::DsCnnModel::ms_inputColsIdx)?
				91	arm::app::DsCnnModel::ms_inputRowsIdx : arm::app::DsCnnModel::ms_inputColsIdx);
				92
				93	platform.data_psn->clear(COLOR_BLACK);
				94
				95	auto& model = ctx.Get<Model&>("model");
				96
				97	/* If the request has a valid size, set the audio index. */
				98	if (clipIndex < NUMBER_OF_FILES) {
				99	if (!_SetAppCtxClipIdx(ctx, clipIndex)) {
				100	return false;
				101	}
				102	}
				103	if (!model.IsInited()) {
				104	printf_err("Model is not initialised! Terminating processing.\n");
				105	return false;
				106	}
				107
				108	const auto frameLength = ctx.Get<int>("frameLength");
				109	const auto frameStride = ctx.Get<int>("frameStride");
				110	const auto scoreThreshold = ctx.Get<float>("scoreThreshold");
				111	auto startClipIdx = ctx.Get<uint32_t>("clipIndex");
				112
				113	TfLiteTensor* outputTensor = model.GetOutputTensor(0);
				114	TfLiteTensor* inputTensor = model.GetInputTensor(0);
				115
				116	if (!inputTensor->dims) {
				117	printf_err("Invalid input tensor dims\n");
				118	return false;
				119	} else if (inputTensor->dims->size < minTensorDims) {
				120	printf_err("Input tensor dimension should be >= %d\n", minTensorDims);
				121	return false;
				122	}
				123
				124	TfLiteIntArray* inputShape = model.GetInputShape(0);
				125	const uint32_t kNumCols = inputShape->data[arm::app::DsCnnModel::ms_inputColsIdx];
				126	const uint32_t kNumRows = inputShape->data[arm::app::DsCnnModel::ms_inputRowsIdx];
				127
				128	audio::DsCnnMFCC mfcc = audio::DsCnnMFCC(kNumCols, frameLength);
				129	mfcc.Init();
				130
				131	/* Deduce the data length required for 1 inference from the network parameters. */
				132	auto audioDataWindowSize = kNumRows * frameStride + (frameLength - frameStride);
				133	auto mfccWindowSize = frameLength;
				134	auto mfccWindowStride = frameStride;
				135
				136	/* We choose to move by half the window size => for a 1 second window size
				137	* there is an overlap of 0.5 seconds. */
				138	auto audioDataStride = audioDataWindowSize / 2;
				139
				140	/* To have the previously calculated features re-usable, stride must be multiple
				141	* of MFCC features window stride. */
				142	if (0 != audioDataStride % mfccWindowStride) {
				143
				144	/* Reduce the stride. */
				145	audioDataStride -= audioDataStride % mfccWindowStride;
				146	}
				147
				148	auto nMfccVectorsInAudioStride = audioDataStride/mfccWindowStride;
				149
				150	/* We expect to be sampling 1 second worth of data at a time.
				151	* NOTE: This is only used for time stamp calculation. */
				152	const float secondsPerSample = 1.0/audio::DsCnnMFCC::ms_defaultSamplingFreq;
				153
				154	do {
				155	auto currentIndex = ctx.Get<uint32_t>("clipIndex");
				156
				157	/* Creating a mfcc features sliding window for the data required for 1 inference. */
				158	auto audioMFCCWindowSlider = audio::SlidingWindow<const int16_t>(
				159	get_audio_array(currentIndex),
				160	audioDataWindowSize, mfccWindowSize,
				161	mfccWindowStride);
				162
				163	/* Creating a sliding window through the whole audio clip. */
				164	auto audioDataSlider = audio::SlidingWindow<const int16_t>(
				165	get_audio_array(currentIndex),
				166	get_audio_array_size(currentIndex),
				167	audioDataWindowSize, audioDataStride);
				168
				169	/* Calculate number of the feature vectors in the window overlap region.
				170	* These feature vectors will be reused.*/
				171	auto numberOfReusedFeatureVectors = audioMFCCWindowSlider.TotalStrides() + 1
				172	- nMfccVectorsInAudioStride;
				173
				174	/* Construct feature calculation function. */
				175	auto mfccFeatureCalc = GetFeatureCalculator(mfcc, inputTensor,
				176	numberOfReusedFeatureVectors);
				177
				178	if (!mfccFeatureCalc){
				179	return false;
				180	}
				181
				182	/* Declare a container for results. */
				183	std::vector<arm::app::kws::KwsResult> results;
				184
				185	/* Display message on the LCD - inference running. */
				186	std::string str_inf{"Running inference... "};
				187	platform.data_psn->present_data_text(
				188	str_inf.c_str(), str_inf.size(),
				189	dataPsnTxtInfStartX, dataPsnTxtInfStartY, 0);
				190	info("Running inference on audio clip %u => %s\n", currentIndex,
				191	get_filename(currentIndex));
				192
				193	/* Start sliding through audio clip. */
				194	while (audioDataSlider.HasNext()) {
				195	const int16_t *inferenceWindow = audioDataSlider.Next();
				196
				197	/* We moved to the next window - set the features sliding to the new address. */
				198	audioMFCCWindowSlider.Reset(inferenceWindow);
				199
				200	/* The first window does not have cache ready. */
				201	bool useCache = audioDataSlider.Index() > 0 && numberOfReusedFeatureVectors > 0;
				202
				203	/* Start calculating features inside one audio sliding window. */
				204	while (audioMFCCWindowSlider.HasNext()) {
				205	const int16_t *mfccWindow = audioMFCCWindowSlider.Next();
				206	std::vector<int16_t> mfccAudioData = std::vector<int16_t>(mfccWindow,
				207	mfccWindow + mfccWindowSize);
				208	/* Compute features for this window and write them to input tensor. */
				209	mfccFeatureCalc(mfccAudioData,
				210	audioMFCCWindowSlider.Index(),
				211	useCache,
				212	nMfccVectorsInAudioStride);
				213	}
				214
				215	info("Inference %zu/%zu\n", audioDataSlider.Index() + 1,
				216	audioDataSlider.TotalStrides() + 1);
				217
				218	/* Run inference over this audio clip sliding window. */
Isabella Gottardi	8df12f3	2021-04-07 17:15:31 +0100	[diff] [blame^]	219	arm::app::RunInference(model, profiler);
alexander	3c79893	2021-03-26 21:42:19 +0000	[diff] [blame]	220
				221	std::vector<ClassificationResult> classificationResult;
				222	auto& classifier = ctx.Get<KwsClassifier&>("classifier");
				223	classifier.GetClassificationResults(outputTensor, classificationResult,
				224	ctx.Get<std::vector<std::string>&>("labels"), 1);
				225
				226	results.emplace_back(kws::KwsResult(classificationResult,
				227	audioDataSlider.Index() * secondsPerSample * audioDataStride,
				228	audioDataSlider.Index(), scoreThreshold));
				229
				230	#if VERIFY_TEST_OUTPUT
				231	arm::app::DumpTensor(outputTensor);
				232	#endif /* VERIFY_TEST_OUTPUT */
				233	} /* while (audioDataSlider.HasNext()) */
				234
				235	/* Erase. */
				236	str_inf = std::string(str_inf.size(), ' ');
				237	platform.data_psn->present_data_text(
				238	str_inf.c_str(), str_inf.size(),
				239	dataPsnTxtInfStartX, dataPsnTxtInfStartY, false);
				240
				241	ctx.Set<std::vector<arm::app::kws::KwsResult>>("results", results);
				242
				243	if (!_PresentInferenceResult(platform, results)) {
				244	return false;
				245	}
				246
Isabella Gottardi	8df12f3	2021-04-07 17:15:31 +0100	[diff] [blame^]	247	profiler.PrintProfilingResult();
				248
alexander	3c79893	2021-03-26 21:42:19 +0000	[diff] [blame]	249	_IncrementAppCtxClipIdx(ctx);
				250
				251	} while (runAll && ctx.Get<uint32_t>("clipIndex") != startClipIdx);
				252
				253	return true;
				254	}
				255
				256	static void _IncrementAppCtxClipIdx(ApplicationContext& ctx)
				257	{
				258	auto curAudioIdx = ctx.Get<uint32_t>("clipIndex");
				259
				260	if (curAudioIdx + 1 >= NUMBER_OF_FILES) {
				261	ctx.Set<uint32_t>("clipIndex", 0);
				262	return;
				263	}
				264	++curAudioIdx;
				265	ctx.Set<uint32_t>("clipIndex", curAudioIdx);
				266	}
				267
				268	static bool _SetAppCtxClipIdx(ApplicationContext& ctx, const uint32_t idx)
				269	{
				270	if (idx >= NUMBER_OF_FILES) {
				271	printf_err("Invalid idx %u (expected less than %u)\n",
				272	idx, NUMBER_OF_FILES);
				273	return false;
				274	}
				275	ctx.Set<uint32_t>("clipIndex", idx);
				276	return true;
				277	}
				278
				279	static bool _PresentInferenceResult(hal_platform& platform,
				280	const std::vector<arm::app::kws::KwsResult>& results)
				281	{
				282	constexpr uint32_t dataPsnTxtStartX1 = 20;
				283	constexpr uint32_t dataPsnTxtStartY1 = 30;
				284	constexpr uint32_t dataPsnTxtYIncr = 16; /* Row index increment. */
				285
				286	platform.data_psn->set_text_color(COLOR_GREEN);
Isabella Gottardi	8df12f3	2021-04-07 17:15:31 +0100	[diff] [blame^]	287	info("Final results:\n");
				288	info("Total number of inferences: %zu\n", results.size());
alexander	3c79893	2021-03-26 21:42:19 +0000	[diff] [blame]	289
				290	/* Display each result */
				291	uint32_t rowIdx1 = dataPsnTxtStartY1 + 2 * dataPsnTxtYIncr;
				292
				293	for (uint32_t i = 0; i < results.size(); ++i) {
				294
				295	std::string topKeyword{"<none>"};
				296	float score = 0.f;
				297
Isabella Gottardi	8df12f3	2021-04-07 17:15:31 +0100	[diff] [blame^]	298	if (!results[i].m_resultVec.empty()) {
alexander	3c79893	2021-03-26 21:42:19 +0000	[diff] [blame]	299	topKeyword = results[i].m_resultVec[0].m_label;
				300	score = results[i].m_resultVec[0].m_normalisedVal;
				301	}
				302
				303	std::string resultStr =
				304	std::string{"@"} + std::to_string(results[i].m_timeStamp) +
				305	std::string{"s: "} + topKeyword + std::string{" ("} +
				306	std::to_string(static_cast<int>(score * 100)) + std::string{"%)"};
				307
				308	platform.data_psn->present_data_text(
				309	resultStr.c_str(), resultStr.size(),
				310	dataPsnTxtStartX1, rowIdx1, false);
				311	rowIdx1 += dataPsnTxtYIncr;
				312
Isabella Gottardi	8df12f3	2021-04-07 17:15:31 +0100	[diff] [blame^]	313	if (results[i].m_resultVec.empty()) {
				314	info("For timestamp: %f (inference #: %u); label: %s; threshold: %f\n",
				315	results[i].m_timeStamp, results[i].m_inferenceNumber,
				316	topKeyword.c_str(),
				317	results[i].m_threshold);
				318	} else {
				319	for (uint32_t j = 0; j < results[i].m_resultVec.size(); ++j) {
				320	info("For timestamp: %f (inference #: %u); label: %s, score: %f; threshold: %f\n",
				321	results[i].m_timeStamp,
				322	results[i].m_inferenceNumber,
				323	results[i].m_resultVec[j].m_label.c_str(),
				324	results[i].m_resultVec[j].m_normalisedVal,
				325	results[i].m_threshold);
				326	}
alexander	3c79893	2021-03-26 21:42:19 +0000	[diff] [blame]	327	}
				328	}
				329
				330	return true;
				331	}
				332
				333	/**
				334	* @brief Generic feature calculator factory.
				335	*
				336	* Returns lambda function to compute features using features cache.
				337	* Real features math is done by a lambda function provided as a parameter.
				338	* Features are written to input tensor memory.
				339	*
				340	* @tparam T Feature vector type.
				341	* @param inputTensor Model input tensor pointer.
				342	* @param cacheSize Number of feature vectors to cache. Defined by the sliding window overlap.
				343	* @param compute Features calculator function.
				344	* @return Lambda function to compute features.
				345	*/
				346	template<class T>
				347	std::function<void (std::vector<int16_t>&, size_t, bool, size_t)>
				348	_FeatureCalc(TfLiteTensor* inputTensor, size_t cacheSize,
				349	std::function<std::vector<T> (std::vector<int16_t>& )> compute)
				350	{
				351	/* Feature cache to be captured by lambda function. */
				352	static std::vector<std::vector<T>> featureCache = std::vector<std::vector<T>>(cacheSize);
				353
				354	return [=](std::vector<int16_t>& audioDataWindow,
				355	size_t index,
				356	bool useCache,
				357	size_t featuresOverlapIndex)
				358	{
				359	T *tensorData = tflite::GetTensorData<T>(inputTensor);
				360	std::vector<T> features;
				361
				362	/* Reuse features from cache if cache is ready and sliding windows overlap.
				363	* Overlap is in the beginning of sliding window with a size of a feature cache. */
				364	if (useCache && index < featureCache.size()) {
				365	features = std::move(featureCache[index]);
				366	} else {
				367	features = std::move(compute(audioDataWindow));
				368	}
				369	auto size = features.size();
				370	auto sizeBytes = sizeof(T) * size;
				371	std::memcpy(tensorData + (index * size), features.data(), sizeBytes);
				372
				373	/* Start renewing cache as soon iteration goes out of the windows overlap. */
				374	if (index >= featuresOverlapIndex) {
				375	featureCache[index - featuresOverlapIndex] = std::move(features);
				376	}
				377	};
				378	}
				379
				/* Explicit instantiations of _FeatureCalc for the feature vector types
				 * int8_t, uint8_t, int16_t and float. */
				380	template std::function<void (std::vector<int16_t>&, size_t , bool, size_t)>
				381	_FeatureCalc<int8_t>(TfLiteTensor* inputTensor,
				382	size_t cacheSize,
				383	std::function<std::vector<int8_t> (std::vector<int16_t>& )> compute);
				384
				385	template std::function<void (std::vector<int16_t>&, size_t , bool, size_t)>
				386	_FeatureCalc<uint8_t>(TfLiteTensor* inputTensor,
				387	size_t cacheSize,
				388	std::function<std::vector<uint8_t> (std::vector<int16_t>& )> compute);
				389
				390	template std::function<void (std::vector<int16_t>&, size_t , bool, size_t)>
				391	_FeatureCalc<int16_t>(TfLiteTensor* inputTensor,
				392	size_t cacheSize,
				393	std::function<std::vector<int16_t> (std::vector<int16_t>& )> compute);
				394
				395	template std::function<void(std::vector<int16_t>&, size_t, bool, size_t)>
				396	_FeatureCalc<float>(TfLiteTensor *inputTensor,
				397	size_t cacheSize,
				398	std::function<std::vector<float>(std::vector<int16_t>&)> compute);
				399
				400
				401	static std::function<void (std::vector<int16_t>&, int, bool, size_t)>
				402	GetFeatureCalculator(audio::DsCnnMFCC& mfcc, TfLiteTensor* inputTensor, size_t cacheSize)
				403	{
				404	std::function<void (std::vector<int16_t>&, size_t, bool, size_t)> mfccFeatureCalc;
				405
				406	TfLiteQuantization quant = inputTensor->quantization;
				407
				408	if (kTfLiteAffineQuantization == quant.type) {
				409
				410	auto quantParams = (TfLiteAffineQuantization ) quant.params;
				411	const float quantScale = quantParams->scale->data[0];
				412	const int quantOffset = quantParams->zero_point->data[0];
				413
				414	switch (inputTensor->type) {
				415	case kTfLiteInt8: {
				416	mfccFeatureCalc = _FeatureCalc<int8_t>(inputTensor,
				417	cacheSize,
				418	[=, &mfcc](std::vector<int16_t>& audioDataWindow) {
				419	return mfcc.MfccComputeQuant<int8_t>(audioDataWindow,
				420	quantScale,
				421	quantOffset);
				422	}
				423	);
				424	break;
				425	}
				426	case kTfLiteUInt8: {
				427	mfccFeatureCalc = _FeatureCalc<uint8_t>(inputTensor,
				428	cacheSize,
				429	[=, &mfcc](std::vector<int16_t>& audioDataWindow) {
				430	return mfcc.MfccComputeQuant<uint8_t>(audioDataWindow,
				431	quantScale,
				432	quantOffset);
				433	}
				434	);
				435	break;
				436	}
				437	case kTfLiteInt16: {
				438	mfccFeatureCalc = _FeatureCalc<int16_t>(inputTensor,
				439	cacheSize,
				440	[=, &mfcc](std::vector<int16_t>& audioDataWindow) {
				441	return mfcc.MfccComputeQuant<int16_t>(audioDataWindow,
				442	quantScale,
				443	quantOffset);
				444	}
				445	);
				446	break;
				447	}
				448	default:
				449	printf_err("Tensor type %s not supported\n", TfLiteTypeGetName(inputTensor->type));
				450	}
				451
				452
				453	} else {
				454	mfccFeatureCalc = mfccFeatureCalc = _FeatureCalc<float>(inputTensor,
				455	cacheSize,
				456	[&mfcc](std::vector<int16_t>& audioDataWindow) {
				457	return mfcc.MfccCompute(audioDataWindow);
				458	});
				459	}
				460	return mfccFeatureCalc;
				461	}
				462
				463	} /* namespace app */
				464	} /* namespace arm */