alexander | f42f568 | 2021-07-16 11:30:56 +0100 | [diff] [blame] | 1 | # Copyright © 2021 Arm Ltd and Contributors. All rights reserved. |
Éanna Ó Catháin | 145c88f | 2020-11-16 14:12:11 +0000 | [diff] [blame] | 2 | # SPDX-License-Identifier: MIT |
| 3 | |
| 4 | """Automatic speech recognition with PyArmNN demo for processing audio clips to text.""" |
| 5 | |
| 6 | import sys |
| 7 | import os |
alexander | f42f568 | 2021-07-16 11:30:56 +0100 | [diff] [blame] | 8 | import numpy as np |
Éanna Ó Catháin | 145c88f | 2020-11-16 14:12:11 +0000 | [diff] [blame] | 9 | |
| 10 | script_dir = os.path.dirname(__file__) |
| 11 | sys.path.insert(1, os.path.join(script_dir, '..', 'common')) |
| 12 | |
alexander | f42f568 | 2021-07-16 11:30:56 +0100 | [diff] [blame] | 13 | from argparse import ArgumentParser |
Éanna Ó Catháin | 145c88f | 2020-11-16 14:12:11 +0000 | [diff] [blame] | 14 | from network_executor import ArmnnNetworkExecutor |
alexander | f42f568 | 2021-07-16 11:30:56 +0100 | [diff] [blame] | 15 | from utils import prepare_input_tensors |
| 16 | from audio_capture import AudioCaptureParams, capture_audio |
| 17 | from audio_utils import decode_text, display_text |
| 18 | from wav2letter_mfcc import Wav2LetterMFCC, W2LAudioPreprocessor |
| 19 | from mfcc import MFCCParams |
| 20 | |
# Model-specific output alphabet: indices 0-25 map to 'a'-'z', then
# 26 -> apostrophe, 27 -> space, 28 -> '$' (the CTC blank symbol).
labels = {idx: ch for idx, ch in enumerate("abcdefghijklmnopqrstuvwxyz' $")}
Éanna Ó Catháin | 145c88f | 2020-11-16 14:12:11 +0000 | [diff] [blame] | 27 | |
| 28 | |
def parse_args():
    """Build and evaluate the command-line interface for the ASR demo.

    Returns:
        argparse.Namespace with `audio_file_path`, `model_file_path`
        and `preferred_backends` attributes.
    """
    arg_parser = ArgumentParser(description="ASR with PyArmNN")
    # Both file paths are mandatory string arguments.
    for flag, help_text in (
        ("--audio_file_path", "Path to the audio file to perform ASR"),
        ("--model_file_path", "Path to ASR model to use"),
    ):
        arg_parser.add_argument(flag, required=True, type=str, help=help_text)
    # Backend preference list; unsupported layers fall through to the
    # next entry, so a reference backend last keeps the net runnable.
    arg_parser.add_argument(
        "--preferred_backends",
        type=str,
        nargs="+",
        default=["CpuAcc", "CpuRef"],
        help="""List of backends in order of preference for optimizing
        subgraphs, falling back to the next backend in the list on unsupported
        layers. Defaults to [CpuAcc, CpuRef]""",
    )
    return arg_parser.parse_args()
| 53 | |
| 54 | |
def main(args):
    """Run ASR over an audio clip and print the decoded transcript.

    Args:
        args: Parsed command-line namespace providing `audio_file_path`,
            `model_file_path` and `preferred_backends`.
    """
    # Build the Arm NN inference runner for the requested model/backends.
    executor = ArmnnNetworkExecutor(args.model_file_path, args.preferred_backends)

    # Model-specific capture settings: 16 kHz mono float32 windows of
    # 47712 samples overlapping by 31712 samples.
    capture_params = AudioCaptureParams(dtype=np.float32,
                                        overlap=31712,
                                        min_samples=47712,
                                        sampling_freq=16000,
                                        mono=True)
    audio_windows = capture_audio(args.audio_file_path, capture_params)

    # MFCC feature extraction configured for the Wav2Letter model.
    mfcc_params = MFCCParams(sampling_freq=16000,
                             num_fbank_bins=128,
                             mel_lo_freq=0,
                             mel_hi_freq=8000,
                             num_mfcc_feats=13,
                             frame_len=512,
                             use_htk_method=False,
                             n_fft=512)
    preprocessor = W2LAudioPreprocessor(Wav2LetterMFCC(mfcc_params),
                                        model_input_size=296,
                                        stride=160)

    right_context = ""
    first_window = True

    print("Processing Audio Frames...")
    for window in audio_windows:
        # Preprocess the window into the network's input tensors.
        tensors = prepare_input_tensors(window, executor.input_binding_info, preprocessor)

        # Run inference on this window.
        inference_output = executor.run(tensors)

        # Decode the window's output, keeping the right context so the
        # next window's overlap is not printed twice.
        right_context, decoded = decode_text(first_window, labels, inference_output)
        first_window = False

        display_text(decoded)

    # Emit the trailing right context of the final window.
    print(right_context, flush=True)
| 94 | |
| 95 | |
if __name__ == "__main__":
    # Script entry point: parse CLI arguments and run the demo.
    main(parse_args())