alexander | f42f568 | 2021-07-16 11:30:56 +0100 | [diff] [blame] | 1 | # Copyright © 2021 Arm Ltd and Contributors. All rights reserved. |
Éanna Ó Catháin | 145c88f | 2020-11-16 14:12:11 +0000 | [diff] [blame] | 2 | # SPDX-License-Identifier: MIT |
| 3 | |
| 4 | """Utilities for speech recognition apps.""" |
| 5 | |
| 6 | import numpy as np |
Éanna Ó Catháin | 145c88f | 2020-11-16 14:12:11 +0000 | [diff] [blame] | 7 | |
| 8 | |
def decode(model_output: np.ndarray, labels: dict) -> str:
    """Turns raw inference scores into readable text.

    Each row of the output is reduced to the index of its highest score, that
    index is looked up in the label dictionary, and the resulting sequence is
    cleaned up by filter_characters.

    Args:
        model_output: Results from running inference, iterated row by row.
        labels: Dictionary of labels keyed on the classification index.

    Returns:
        Decoded string.
    """
    best_labels = []
    for scores in model_output:
        # Top-1 decoding: only the highest scoring class of each row is kept.
        best_index = np.argmax(scores)
        best_labels.append(labels[best_index])
    return filter_characters(best_labels)
| 21 | |
| 22 | |
def filter_characters(results: list) -> str:
    """Filters unwanted and duplicate characters.

    Two rules are applied to the sequence of top 1 labels:
      * every "$" entry is dropped;
      * an entry that is immediately followed by an identical entry is
        dropped, so a run of repeated labels contributes a single character.

    Because "$" entries are only removed and never merged, a "$" between two
    equal labels keeps both of them (e.g. ["l", "$", "l"] gives "ll").

    Args:
        results: List of top 1 results from inference.

    Returns:
        Final output string to present to user. Empty if `results` is empty.
    """
    kept = []
    last_index = len(results) - 1
    for index, label in enumerate(results):
        if label == "$":
            continue
        # Only the final element of a run of identical labels is emitted.
        if index < last_index and label == results[index + 1]:
            continue
        kept.append(label)
    # Join once at the end instead of growing a string inside the loop.
    return "".join(kept)
| 41 | |
| 42 | |
def display_text(text: str):
    """Prints recognised text to the console.

    No separator or trailing newline is added and the stream is flushed on
    every call, so successive calls extend the same console line.

    Args:
        text: Results of performing ASR on the input audio data.
    """
    print(text, end="", sep="", flush=True)
| 50 | |
| 51 | |
def decode_text(is_first_window, labels, output_result):
    """
    Decodes one window of wav2letter output, keeping only the part that belongs to this window.
        * First window: the left context and the inner context are decoded.
        * Any later window: only the inner context is decoded.
    The right context is decoded separately on every call and handed back, so the caller
    can use the most recent one after the last inference.

    Args:
        is_first_window: Boolean to show if it is the first window we are running inference on
        labels: the label set
        output_result: the output from the inference; indexed as output_result[0][0][0] to
            reach the per-step scores (presumably a 4-D network output - confirm against caller)
    Returns:
        current_r_context: the current right context
        text: the decoded text for this window
    """
    # Boundaries for wav2letter with 148 output steps.
    # NOTE(review): slice ends are exclusive, so step 99 is covered neither by the text
    # slice (which stops at 98) nor by the right context (which starts at 100). Confirm
    # whether that single step is meant to be skipped.
    inner_context_start = 49
    inner_context_end = 99
    right_context_start = 100

    timesteps = output_result[0][0][0]

    # The first window has no predecessor, so its left context is decoded as well.
    text_start = 0 if is_first_window else inner_context_start
    text = decode(timesteps[text_start:inner_context_end], labels)

    # Refreshed on every inference; only the one from the final window is needed.
    current_r_context = decode(timesteps[right_context_start:], labels)
    return current_r_context, text