| # Copyright © 2020 Arm Ltd and Contributors. All rights reserved. |
| # SPDX-License-Identifier: MIT |
| |
| """Contains AudioCapture class for capturing chunks of audio data from file.""" |
| |
| from typing import Generator |
| |
| import numpy as np |
| import soundfile as sf |
| |
| |
class ModelParams:
    """Bundle of audio sampling settings expected by the ASR model."""

    def __init__(self, model_file_path: str) -> None:
        """Records the model location together with fixed sampling settings.

        Args:
            model_file_path: Path to ASR model to use.
        """
        # Location of the model file, stored exactly as supplied.
        self.path: str = model_file_path
        # Whether audio should be reduced to a single channel.
        self.mono: bool = True
        # Sample dtype requested when reading audio.
        self.dtype = np.float32
        # Sample rate, in Hz, associated with the model.
        self.samplerate: int = 16000
        # Samples needed per inference:
        # (model_input_size - 1) * stride + frame_len
        self.min_samples: int = 47712
| |
| |
class AudioCapture:
    """Reads an audio file as fixed-size, overlapping blocks of samples."""

    def __init__(self, model_params: "ModelParams"):
        """Stores the sampling parameters of the model being fed.

        Args:
            model_params: Object exposing ``min_samples``, ``dtype`` and
                ``mono`` attributes (e.g. a ``ModelParams`` instance).
        """
        self.model_params = model_params

    @staticmethod
    def _to_mono(block: np.ndarray) -> np.ndarray:
        """Averages a 2-D (frames, channels) block over its channel axis.

        Returns:
            A 1-D array with one value per frame.
        """
        return np.mean(block, axis=1)

    def from_audio_file(self, audio_file_path, overlap=31712) -> Generator[np.ndarray, None, None]:
        """Creates a generator that yields audio data from a file. Data is padded with
        zeros if necessary to make up minimum number of samples.

        Args:
            audio_file_path: Path to audio file provided by user.
            overlap: The overlap with previous buffer. We need the offset to be the same as the inner context
                of the mfcc output, which is sized as 100 x 39. Each mfcc compute produces 1 x 39 vector,
                and consumes 160 audio samples. The default overlap is then calculated to be 47712 - (160 x 100)
                where 47712 is the min_samples needed for 1 inference of wav2letter.

        Yields:
            Blocks of audio data of minimum sample size. When ``model_params.mono`` is set,
            each block is a 1-D array of per-frame channel averages; otherwise it is the
            2-D block as read from the file.
        """
        params = self.model_params
        with sf.SoundFile(audio_file_path) as audio_file:
            for block in audio_file.blocks(
                blocksize=params.min_samples,
                dtype=params.dtype,
                always_2d=True,
                fill_value=0,  # zero-pad the final, short block
                overlap=overlap,
            ):
                # With always_2d=True the block is (frames, channels), so the
                # channel axis is axis 1. Collapse it whenever mono output is
                # requested; the previous guard tested the frame count
                # (shape[0]) rather than the channel count.
                if params.mono:
                    block = self._to_mono(block)
                yield block