blob: 9f28d1006e59c562074667aaefef0bb89111ed05 [file] [log] [blame]
# Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
# SPDX-License-Identifier: MIT
3
4"""Contains AudioCapture class for capturing chunks of audio data from file."""
5
6from typing import Generator
7
8import numpy as np
9import soundfile as sf
10
11
class ModelParams:
    """Bundle of the model location and the audio format it is fed with."""

    def __init__(self, model_file_path: str):
        """Record the model path alongside the fixed sampling settings.

        Args:
            model_file_path: Path to ASR model to use.
        """
        # Caller-supplied location of the model file.
        self.path = model_file_path

        # Fixed audio settings; they do not depend on the constructor argument.
        self.mono = True  # request a single-channel signal
        self.dtype = np.float32  # sample type used when reading audio
        self.samplerate = 16000  # samples per second
        self.min_samples = 167392  # number of samples in one block of audio
24
25
class AudioCapture:
    """Reads an audio file in fixed-size, optionally overlapping blocks."""

    def __init__(self, model_params):
        """Stores the sampling parameters for the model used.

        Args:
            model_params: Object exposing the `min_samples`, `dtype` and `mono`
                attributes read by `from_audio_file` (e.g. a `ModelParams`).
        """
        self.model_params = model_params

    def from_audio_file(self, audio_file_path, overlap=31712) -> Generator[np.ndarray, None, None]:
        """Creates a generator that yields audio data from a file. Data is padded with
        zeros if necessary to make up minimum number of samples.

        Args:
            audio_file_path: Path to audio file provided by user.
            overlap: The overlap with previous buffer. We need the offset to be the same as the inner context
                of the mfcc output, which is sized as 100 x 39. Each mfcc compute produces 1 x 39 vector,
                and consumes 160 audio samples. The default overlap is then calculated to be 47712 - (160 x 100)
                where 47712 is the min_samples needed for 1 inference of wav2letter.
                NOTE(review): the block size actually used is `model_params.min_samples`, which may differ
                from the 47712 quoted here - confirm the default overlap still matches the model in use.

        Yields:
            Blocks of audio data of minimum sample size. When `model_params.mono` is set, each block is
            averaged across its channels and is therefore one-dimensional; otherwise blocks are
            two-dimensional, shaped (frames, channels).
        """
        with sf.SoundFile(audio_file_path) as audio_file:
            for block in audio_file.blocks(
                blocksize=self.model_params.min_samples,
                dtype=self.model_params.dtype,
                always_2d=True,
                fill_value=0,  # zero-pad the final block up to blocksize
                overlap=overlap,
            ):
                # Convert to mono if specified. Because always_2d=True the block is
                # (frames, channels), so channels live on axis 1. The previous guard
                # inspected shape[0] (the frame count), which only worked by accident
                # of the block size; averaging unconditionally gives the same result
                # for every multi-frame block and is also correct for a 1-frame block.
                if self.model_params.mono:
                    block = np.mean(block, axis=1)
                yield block