blob: f03d2e1290e829d00cac7995c25cf29334a73dae [file] [log] [blame]
Éanna Ó Catháin145c88f2020-11-16 14:12:11 +00001# Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
2# SPDX-License-Identifier: MIT
3
4"""Utilities for speech recognition apps."""
5
6import numpy as np
7import pyarmnn as ann
8
9
def decode(model_output: np.ndarray, labels: dict) -> str:
    """Turns raw inference output into a readable string.

    Takes the top-1 label for every output step, then strips blanks and
    run-on duplicates via filter_characters.

    Args:
        model_output: Results from running inference.
        labels: Dictionary of labels keyed on the classification index.

    Returns:
        Decoded string.
    """
    best_per_step = [labels[int(np.argmax(step))] for step in model_output]
    return filter_characters(best_per_step)
22
23
def filter_characters(results: list) -> str:
    """Filters unwanted and duplicate characters.

    Drops the "$" blank symbol and collapses consecutive repeats, keeping
    only the last element of each run.

    Args:
        results: List of top 1 results from inference.

    Returns:
        Final output string to present to user.
    """
    kept = []
    last_index = len(results) - 1
    for index, char in enumerate(results):
        if char == "$":
            continue
        # Skip all but the final occurrence in a run of identical chars.
        if index < last_index and char == results[index + 1]:
            continue
        kept.append(char)
    return "".join(kept)
42
43
def display_text(text: str):
    """Presents the results on the console.

    Prints without a trailing newline and flushes immediately so partial
    transcriptions appear as soon as they are decoded.

    Args:
        text: Results of performing ASR on the input audio data.
    """
    print(text, end="", flush=True)
51
52
def quantize_input(data, input_binding_info):
    """Quantize the float input to (u)int8 ready for inputting to model.

    Applies the affine quantization `q = data / scale + offset`, clips to the
    representable range of the target type, then casts (truncating toward
    zero, matching NumPy astype semantics).

    Args:
        data: 2D float numpy array of input features.
        input_binding_info: ArmNN input binding info; element [1] is the
            TensorInfo providing scale, offset and data type.

    Returns:
        A new numpy array of the quantized values in the target (u)int8 type.
        The input array is not modified.

    Raises:
        RuntimeError: If `data` is not 2-dimensional.
        ValueError: If the model's input type is not QAsymmS8 or QAsymmU8.
    """
    if data.ndim != 2:
        raise RuntimeError("Audio data must have 2 dimensions for quantization")

    quant_scale = input_binding_info[1].GetQuantizationScale()
    quant_offset = input_binding_info[1].GetQuantizationOffset()
    data_type = input_binding_info[1].GetDataType()

    if data_type == ann.DataType_QAsymmS8:
        np_data_type = np.int8
    elif data_type == ann.DataType_QAsymmU8:
        np_data_type = np.uint8
    else:
        raise ValueError("Could not quantize data to required data type")

    d_min = np.iinfo(np_data_type).min
    d_max = np.iinfo(np_data_type).max

    # Vectorized quantize-and-clip; replaces the previous per-element Python
    # loop and avoids mutating the caller's array.
    quantized = np.clip((data / quant_scale) + quant_offset, d_min, d_max)
    return quantized.astype(np_data_type)
78
79
def decode_text(is_first_window, labels, output_result):
    """
    Slices the text appropriately depending on the window, and decodes for wav2letter output.
    * First run, take the left context, and inner context.
    * Every other run, take the inner context.
    Stores the current right context, and updates it for each inference. Will get used after last inference.

    Args:
        is_first_window: Boolean to show if it is the first window we are running inference on
        labels: the label set
        output_result: the output from the inference
    Returns:
        current_r_context: the current right context
        text: the decoded text for this window
    """
    # For wav2letter with 148 output steps:
    # Left context is index 0-48, inner context 49-99, right context 100-147
    inner_context_start = 49
    inner_context_end = 99
    right_context_start = 100

    if is_first_window:
        # Since it's the first inference, keep the left context, and inner context, and decode.
        # +1 because Python slice ends are exclusive and index inner_context_end
        # is part of the inner context; without it step 99 was never decoded.
        text = decode(output_result[0][0][0][0:inner_context_end + 1], labels)
    else:
        # Only decode the inner context (again inclusive of inner_context_end)
        text = decode(output_result[0][0][0][inner_context_start:inner_context_end + 1], labels)

    # Store the right context, we will need it after the last inference
    current_r_context = decode(output_result[0][0][0][right_context_start:], labels)
    return current_r_context, text
112
113
def prepare_input_tensors(audio_data, input_binding_info, mfcc_preprocessor):
    """
    Takes a block of audio data, extracts the MFCC features, quantizes the array, and uses ArmNN to create the
    input tensors.

    Args:
        audio_data: The audio data to process
        input_binding_info: the model input binding info
        mfcc_preprocessor: the mfcc preprocessor instance
    Returns:
        input_tensors: the prepared input tensors, ready to be consumed by the ArmNN NetworkExecutor
    """
    features = mfcc_preprocessor.extract_features(audio_data)
    # Float32 models take the features as-is; quantized models need the
    # features converted to the model's (u)int8 input type first.
    if input_binding_info[1].GetDataType() != ann.DataType_Float32:
        features = quantize_input(features, input_binding_info)
    return ann.make_input_tensors([input_binding_info], [features])