Blame - python/pyarmnn/examples/speech_recognition/audio_utils.py - ml/armnn

blob: a522a0e2a7a414120ef7df073f0b298bc3d8bf77 [file] [log] [blame]

Éanna Ó Catháin	145c88f	2020-11-16 14:12:11 +0000	[diff] [blame]	1	# Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
				2	# SPDX-License-Identifier: MIT
				3
				4	"""Utilities for speech recognition apps."""
				5
				6	import numpy as np
				7	import pyarmnn as ann
				8
				9
				10	def decode(model_output: np.ndarray, labels: dict) -> str:
				11	"""Decodes the integer encoded results from inference into a string.
				12
				13	Args:
				14	model_output: Results from running inference.
				15	labels: Dictionary of labels keyed on the classification index.
				16
				17	Returns:
				18	Decoded string.
				19	"""
				20	top1_results = [labels[np.argmax(row[0])] for row in model_output]
				21	return filter_characters(top1_results)
				22
				23
				24	def filter_characters(results: list) -> str:
				25	"""Filters unwanted and duplicate characters.
				26
				27	Args:
				28	results: List of top 1 results from inference.
				29
				30	Returns:
				31	Final output string to present to user.
				32	"""
				33	text = ""
				34	for i in range(len(results)):
				35	if results[i] == "$":
				36	continue
				37	elif i + 1 < len(results) and results[i] == results[i + 1]:
				38	continue
				39	else:
				40	text += results[i]
				41	return text
				42
				43
				44	def display_text(text: str):
				45	"""Presents the results on the console.
				46
				47	Args:
				48	text: Results of performing ASR on the input audio data.
				49	"""
				50	print(text, sep="", end="", flush=True)
				51
				52
				53	def quantize_input(data, input_binding_info):
				54	"""Quantize the float input to (u)int8 ready for inputting to model."""
				55	if data.ndim != 2:
				56	raise RuntimeError("Audio data must have 2 dimensions for quantization")
				57
				58	quant_scale = input_binding_info[1].GetQuantizationScale()
				59	quant_offset = input_binding_info[1].GetQuantizationOffset()
				60	data_type = input_binding_info[1].GetDataType()
				61
				62	if data_type == ann.DataType_QAsymmS8:
				63	data_type = np.int8
				64	elif data_type == ann.DataType_QAsymmU8:
				65	data_type = np.uint8
				66	else:
				67	raise ValueError("Could not quantize data to required data type")
				68
				69	d_min = np.iinfo(data_type).min
				70	d_max = np.iinfo(data_type).max
				71
				72	for row in range(data.shape[0]):
				73	for col in range(data.shape[1]):
				74	data[row, col] = (data[row, col] / quant_scale) + quant_offset
				75	data[row, col] = np.clip(data[row, col], d_min, d_max)
				76	data = data.astype(data_type)
				77	return data
				78
				79
				80	def decode_text(is_first_window, labels, output_result):
				81	"""
				82	Slices the text appropriately depending on the window, and decodes for wav2letter output.
				83	* First run, take the left context, and inner context.
				84	* Every other run, take the inner context.
				85	Stores the current right context, and updates it for each inference. Will get used after last inference
				86
				87	Args:
				88	is_first_window: Boolean to show if it is the first window we are running inference on
				89	labels: the label set
				90	output_result: the output from the inference
				91	text: the current text string, to be displayed at the end
				92	Returns:
				93	current_r_context: the current right context
				94	text: the current text string, with the latest output decoded and appended
				95	"""
				96
				97	if is_first_window:
				98	# Since it's the first inference, keep the left context, and inner context, and decode
				99	text = decode(output_result[0][0:472], labels)
				100	else:
				101	# Only decode the inner context
				102	text = decode(output_result[0][49:472], labels)
				103
				104	# Store the right context, we will need it after the last inference
				105	current_r_context = decode(output_result[0][473:521], labels)
				106	return current_r_context, text
				107
				108
				109	def prepare_input_tensors(audio_data, input_binding_info, mfcc_preprocessor):
				110	"""
				111	Takes a block of audio data, extracts the MFCC features, quantizes the array, and uses ArmNN to create the
				112	input tensors.
				113
				114	Args:
				115	audio_data: The audio data to process
				116	mfcc_instance: the mfcc class instance
				117	input_binding_info: the model input binding info
				118	mfcc_preprocessor: the mfcc preprocessor instance
				119	Returns:
				120	input_tensors: the prepared input tensors, ready to be consumed by the ArmNN NetworkExecutor
				121	"""
				122
				123	data_type = input_binding_info[1].GetDataType()
				124	input_tensor = mfcc_preprocessor.extract_features(audio_data)
				125	if data_type != ann.DataType_Float32:
				126	input_tensor = quantize_input(input_tensor, input_binding_info)
				127	input_tensors = ann.make_input_tensors([input_binding_info], [input_tensor])
				128	return input_tensors