blob: f03d2e1290e829d00cac7995c25cf29334a73dae [file] [log] [blame]
Éanna Ó Catháin145c88f2020-11-16 14:12:11 +00001# Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
2# SPDX-License-Identifier: MIT
3
4"""Utilities for speech recognition apps."""
5
6import numpy as np
7import pyarmnn as ann
8
9
def decode(model_output: np.ndarray, labels: dict) -> str:
    """Turns raw inference output into a readable string.

    Takes the top-1 label for every output step, then strips blanks and
    run-on duplicates via filter_characters.

    Args:
        model_output: Results from running inference.
        labels: Dictionary of labels keyed on the classification index.

    Returns:
        Decoded string.
    """
    best_per_step = [labels[int(np.argmax(step))] for step in model_output]
    return filter_characters(best_per_step)
22
23
def filter_characters(results: list) -> str:
    """Filters unwanted and duplicate characters.

    Drops the "$" blank symbol and collapses consecutive repeats, keeping
    only the last element of each run.

    Args:
        results: List of top 1 results from inference.

    Returns:
        Final output string to present to user.
    """
    kept = []
    last_index = len(results) - 1
    for index, char in enumerate(results):
        if char == "$":
            continue
        # Skip all but the final occurrence in a run of identical chars.
        if index < last_index and char == results[index + 1]:
            continue
        kept.append(char)
    return "".join(kept)
42
43
def display_text(text: str):
    """Presents the results on the console.

    Prints without a trailing newline and flushes immediately so partial
    transcriptions appear as soon as they are decoded.

    Args:
        text: Results of performing ASR on the input audio data.
    """
    print(text, end="", flush=True)
51
52
def quantize_input(data, input_binding_info):
    """Quantize the float input to (u)int8 ready for inputting to model.

    Applies the affine quantization `q = data / scale + offset`, clips to the
    representable range of the target type, then casts (truncating toward
    zero, matching NumPy astype semantics).

    Args:
        data: 2D float numpy array of input features.
        input_binding_info: ArmNN input binding info; element [1] is the
            TensorInfo providing scale, offset and data type.

    Returns:
        A new numpy array of the quantized values in the target (u)int8 type.
        The input array is not modified.

    Raises:
        RuntimeError: If `data` is not 2-dimensional.
        ValueError: If the model's input type is not QAsymmS8 or QAsymmU8.
    """
    if data.ndim != 2:
        raise RuntimeError("Audio data must have 2 dimensions for quantization")

    quant_scale = input_binding_info[1].GetQuantizationScale()
    quant_offset = input_binding_info[1].GetQuantizationOffset()
    data_type = input_binding_info[1].GetDataType()

    if data_type == ann.DataType_QAsymmS8:
        np_data_type = np.int8
    elif data_type == ann.DataType_QAsymmU8:
        np_data_type = np.uint8
    else:
        raise ValueError("Could not quantize data to required data type")

    d_min = np.iinfo(np_data_type).min
    d_max = np.iinfo(np_data_type).max

    # Vectorized quantize-and-clip; replaces the previous per-element Python
    # loop and avoids mutating the caller's array.
    quantized = np.clip((data / quant_scale) + quant_offset, d_min, d_max)
    return quantized.astype(np_data_type)
78
79
def decode_text(is_first_window, labels, output_result):
    """
    Slices the text appropriately depending on the window, and decodes for wav2letter output.
    * First run, take the left context, and inner context.
    * Every other run, take the inner context.
    Stores the current right context, and updates it for each inference. Will get used after last inference.

    Args:
        is_first_window: Boolean to show if it is the first window we are running inference on
        labels: the label set
        output_result: the output from the inference
    Returns:
        current_r_context: the current right context
        text: the decoded text for this window
    """
    # For wav2letter with 148 output steps:
    # Left context is index 0-48, inner context 49-99, right context 100-147
    inner_context_start = 49
    inner_context_end = 99
    right_context_start = 100

    if is_first_window:
        # Since it's the first inference, keep the left context, and inner context, and decode.
        # +1 because Python slice ends are exclusive and index inner_context_end
        # is part of the inner context; without it step 99 was never decoded.
        text = decode(output_result[0][0][0][0:inner_context_end + 1], labels)
    else:
        # Only decode the inner context (again inclusive of inner_context_end)
        text = decode(output_result[0][0][0][inner_context_start:inner_context_end + 1], labels)

    # Store the right context, we will need it after the last inference
    current_r_context = decode(output_result[0][0][0][right_context_start:], labels)
    return current_r_context, text
112
113
def prepare_input_tensors(audio_data, input_binding_info, mfcc_preprocessor):
    """
    Takes a block of audio data, extracts the MFCC features, quantizes the array, and uses ArmNN to create the
    input tensors.

    Args:
        audio_data: The audio data to process
        input_binding_info: the model input binding info
        mfcc_preprocessor: the mfcc preprocessor instance
    Returns:
        input_tensors: the prepared input tensors, ready to be consumed by the ArmNN NetworkExecutor
    """
    features = mfcc_preprocessor.extract_features(audio_data)
    # Float32 models take the features as-is; quantized models need the
    # features converted to the model's (u)int8 input type first.
    if input_binding_info[1].GetDataType() != ann.DataType_Float32:
        features = quantize_input(features, input_binding_info)
    return ann.make_input_tensors([input_binding_info], [features])