#!env/bin/python3

# Copyright (c) 2021 Arm Limited. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse

import numpy as np
import resampy
import soundfile as sf


class AudioUtils:
    """Static helpers for validating resample options and loading audio clips."""

    @staticmethod
    def res_data_type(res_type_value):
        """
        Validate a resample type name and return it unchanged.

        Raises argparse.ArgumentTypeError when the value is not one of the
        names returned by `res_type_list()`, which makes this function usable
        as an argparse `type=` callable.
        """
        supported = AudioUtils.res_type_list()
        if res_type_value not in supported:
            raise argparse.ArgumentTypeError(f"{res_type_value} not valid. Supported only {supported}")
        return res_type_value

    @staticmethod
    def res_type_list():
        """
        Returns the list of supported resample type names.
        """
        return ['kaiser_best', 'kaiser_fast']

    @staticmethod
    def _fit_length(y, n_samples, axis=-1):
        """
        Trim or zero-pad `y` along `axis` so that it holds exactly `n_samples` samples.

        The input is returned as-is when it already has the requested length.
        """
        n_current = y.shape[axis]

        if n_current > n_samples:
            slices = [slice(None)] * y.ndim
            slices[axis] = slice(0, n_samples)
            return y[tuple(slices)]

        if n_current < n_samples:
            lengths = [(0, 0)] * y.ndim
            lengths[axis] = (0, n_samples - n_current)
            return np.pad(y, lengths, 'constant', constant_values=0)

        return y

    @staticmethod
    def load_resample_audio_clip(path, target_sr=16000, mono=True, offset=0.0, duration=0, res_type='kaiser_best',
                                 min_len=16000):
        """
        Load and resample an audio clip with the given desired specs.

        Parameters:
        ----------
        path (string):                Path to the input audio clip.
        target_sr (int, optional):    Target sampling rate. Positive numbers are considered valid,
                                      if zero or negative the native sampling rate of the file is preserved.
                                      Default is 16000.
        mono (bool, optional):        Specify if the audio file needs to be converted to mono. Default is True.
        offset (float, optional):     Start reading after this time, in seconds. Default is 0.0.
        duration (float, optional):   Length to read, in seconds. Positive numbers are considered valid,
                                      if zero or negative the file is read to its end. Default is 0.
        res_type (string, optional):  Resample type to use (see `res_type_list()`). Default is 'kaiser_best'.
        min_len (int, optional):      Minimum length, in samples, of the output audio time series. Shorter
                                      outputs are zero-padded at the end; zero or negative disables the
                                      padding. Default is 16000.

        Returns:
        ----------
        y (np.ndarray): Output audio time series of shape (n,) or (channels, n).
        sr (int):       A scalar number > 0 that represents the sampling rate of `y`.

        Raises:
        ----------
        Any exception raised while opening or reading the file is re-raised after a
        diagnostic message has been printed.
        """
        try:
            with sf.SoundFile(path) as audio_file:
                origin_sr = audio_file.samplerate

                if offset:
                    # Seek to the start of the target read
                    audio_file.seek(int(offset * origin_sr))

                # A frame count of -1 asks soundfile to read up to the end of the file
                num_frame_duration = int(duration * origin_sr) if duration > 0 else -1

                # Load the target number of frames; transpose so multi-channel data is (channels, n)
                y = audio_file.read(frames=num_frame_duration, dtype=np.float32, always_2d=False).T
        except Exception:
            # Propagate the real cause instead of failing later on an unbound `y`
            print(f"Failed to open {path} as an audio.")
            raise

        # Convert to mono if requested and if audio has more than one dimension
        if mono and y.ndim > 1:
            y = np.mean(y, axis=0)

        if target_sr > 0 and origin_sr != target_sr:
            ratio = float(target_sr) / origin_sr
            n_samples = int(np.ceil(y.shape[-1] * ratio))

            # Resample using resampy
            y_rs = resampy.resample(y, origin_sr, target_sr, filter=res_type, axis=-1)

            # Adjust the size; this also covers the case where the resampled length is
            # already correct, so the resampled data is always what gets returned
            y = AudioUtils._fit_length(y_rs, n_samples)
            sr = target_sr
        else:
            sr = origin_sr

        # Pad along the sample axis if a minimum length is set (min_len > 0)
        if min_len > 0 and y.shape[-1] < min_len:
            y = AudioUtils._fit_length(y, min_len)

        return y, sr