# Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
# SPDX-License-Identifier: MIT

"""Utilities for speech recognition apps."""
5
6import numpy as np
Éanna Ó Catháin145c88f2020-11-16 14:12:11 +00007
8
def decode(model_output: np.ndarray, labels: dict) -> str:
    """Turns the integer encoded inference output into text.

    For every row of `model_output` the index of the highest value is looked
    up in `labels`; the resulting label sequence is then cleaned up by
    `filter_characters`.

    Args:
        model_output: Results from running inference, iterated row by row.
        labels: Dictionary of labels keyed on the classification index.

    Returns:
        Decoded string.
    """
    best_labels = []
    for scores in model_output:
        # Top-1 class of this row.
        best_index = np.argmax(scores)
        best_labels.append(labels[best_index])
    return filter_characters(best_labels)
21
22
def filter_characters(results: list) -> str:
    """Filters unwanted and duplicate characters.

    Each run of identical consecutive entries contributes at most one
    character, and every "$" entry is dropped. Runs are determined on the raw
    sequence, so a "$" between two equal labels keeps them as two separate
    characters (e.g. ["a", "$", "a"] -> "aa").

    Args:
        results: List of top 1 results from inference.

    Returns:
        Final output string to present to user.
    """
    kept = []
    # Sentinel that compares unequal to every label, so the first entry is
    # always treated as the start of a new run.
    previous = object()
    for label in results:
        # Keep only the first entry of each run, and never the "$" marker.
        if label != previous and label != "$":
            kept.append(label)
        previous = label
    # Join once instead of growing a string with += inside the loop.
    return "".join(kept)
41
42
def display_text(text: str):
    """Writes the recognised text to the console.

    Nothing is appended after the text (no trailing newline) and the output
    stream is flushed immediately.

    Args:
        text: Results of performing ASR on the input audio data.
    """
    # With a single positional argument the separator never comes into play.
    print(text, end="", flush=True)
50
51
def decode_text(is_first_window, labels, output_result):
    """
    Slices the text appropriately depending on the window, and decodes for wav2letter output.
        * First run, take the left context, and inner context.
        * Every other run, take the inner context.
    The right context of the current window is always decoded and returned separately, so the
    caller can keep the most recent one and use it after the last inference.

    Args:
        is_first_window: Boolean to show if it is the first window we are running inference on
        labels: the label set
        output_result: the output from the inference; the output steps are read from
            output_result[0][0][0]
    Returns:
        current_r_context: the decoded right context of this window
        text: the decoded text of this window (left + inner context for the first window,
            inner context only otherwise)
    """
    # For wav2letter with 148 output steps the slices below split the output as:
    #   left context  [0:49)   -> steps 0-48
    #   inner context [49:99)  -> steps 49-98
    #   right context [99:148) -> steps 99-147
    # NOTE(review): this assumes consecutive windows advance by 50 output steps (the inner
    # context length) - confirm against the overlap used when capturing audio.
    inner_context_start = 49
    inner_context_end = 99
    # The right context begins exactly where the inner context stops. A hard-coded start of
    # 100 left step 99 outside every slice, so it was never decoded.
    right_context_start = inner_context_end

    steps = output_result[0][0][0]

    if is_first_window:
        # Since it's the first inference, keep the left context, and inner context, and decode
        text = decode(steps[:inner_context_end], labels)
    else:
        # Only decode the inner context
        text = decode(steps[inner_context_start:inner_context_end], labels)

    # Store the right context, we will need it after the last inference
    current_r_context = decode(steps[right_context_start:], labels)
    return current_r_context, text