blob: 0c899208a4b2d4e04ed192e7bd01e56b6873a88a [file] [log] [blame]
# Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
# SPDX-License-Identifier: MIT
"""Contains AudioCapture class for capturing chunks of audio data from file."""
from typing import Generator
import numpy as np
import soundfile as sf
class ModelParams:
    """Holds the sampling configuration for the ASR model in use."""

    def __init__(self, model_file_path: str):
        """Store model path and fixed audio-front-end parameters.

        Args:
            model_file_path: Path to ASR model to use.
        """
        # Model location supplied by the caller.
        self.path = model_file_path
        # Model consumes single-channel audio sampled at 16 kHz as float32.
        self.mono = True
        self.dtype = np.float32
        self.samplerate = 16000
        # Minimum samples for one inference: (model_input_size-1)*stride + frame_len
        self.min_samples = 47712
class AudioCapture:
    def __init__(self, model_params):
        """Captures chunks of audio data from a file.

        Args:
            model_params: ModelParams instance providing dtype, minimum
                sample count and mono-conversion flag for the target model.
        """
        self.model_params = model_params

    def from_audio_file(self, audio_file_path, overlap=31712) -> Generator[np.ndarray, None, None]:
        """Creates a generator that yields audio data from a file. Data is padded with
        zeros if necessary to make up minimum number of samples.
        Args:
            audio_file_path: Path to audio file provided by user.
            overlap: The overlap with previous buffer. We need the offset to be the same as the inner context
                of the mfcc output, which is sized as 100 x 39. Each mfcc compute produces 1 x 39 vector,
                and consumes 160 audio samples. The default overlap is then calculated to be 47712 - (160 x 100)
                where 47712 is the min_samples needed for 1 inference of wav2letter.
        Yields:
            Blocks of audio data of minimum sample size.
        """
        with sf.SoundFile(audio_file_path) as audio_file:
            for block in audio_file.blocks(
                blocksize=self.model_params.min_samples,
                dtype=self.model_params.dtype,
                always_2d=True,
                fill_value=0,  # zero-pad the final block up to blocksize
                overlap=overlap
            ):
                # With always_2d=True each block has shape (frames, channels).
                # Collapse the channel axis so downstream always receives a
                # 1-D buffer. NOTE: the previous check was block.shape[0] > 1,
                # which tested the *frame* count (almost always true) rather
                # than the channel layout; testing ndim fixes the degenerate
                # single-frame case while preserving existing output shapes.
                if self.model_params.mono and block.ndim == 2:
                    block = np.mean(block, axis=1)
                yield block