blob: f2c50f3b41e7b01876ef3842acc011358f09ade0 [file] [log] [blame]
/*
 * Copyright (c) 2021 Arm Limited. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17#include "Wav2LetterMfcc.hpp"
18
19#include "PlatformMath.hpp"
alexander31ae9f02022-02-10 16:15:54 +000020#include "log_macros.h"
alexander3c798932021-03-26 21:42:19 +000021
22#include <cfloat>
23
24namespace arm {
25namespace app {
26namespace audio {
27
28 bool Wav2LetterMFCC::ApplyMelFilterBank(
29 std::vector<float>& fftVec,
30 std::vector<std::vector<float>>& melFilterBank,
alexanderc350cdc2021-04-29 20:36:09 +010031 std::vector<uint32_t>& filterBankFilterFirst,
32 std::vector<uint32_t>& filterBankFilterLast,
alexander3c798932021-03-26 21:42:19 +000033 std::vector<float>& melEnergies)
34 {
35 const size_t numBanks = melEnergies.size();
36
37 if (numBanks != filterBankFilterFirst.size() ||
38 numBanks != filterBankFilterLast.size()) {
39 printf_err("unexpected filter bank lengths\n");
40 return false;
41 }
42
43 for (size_t bin = 0; bin < numBanks; ++bin) {
44 auto filterBankIter = melFilterBank[bin].begin();
alexanderc350cdc2021-04-29 20:36:09 +010045 auto end = melFilterBank[bin].end();
46 /* Avoid log of zero at later stages, same value used in librosa.
47 * The number was used during our default wav2letter model training. */
48 float melEnergy = 1e-10;
49 const uint32_t firstIndex = filterBankFilterFirst[bin];
50 const uint32_t lastIndex = std::min<uint32_t>(filterBankFilterLast[bin], fftVec.size() - 1);
alexander3c798932021-03-26 21:42:19 +000051
alexanderc350cdc2021-04-29 20:36:09 +010052 for (uint32_t i = firstIndex; i <= lastIndex && filterBankIter != end; ++i) {
alexander3c798932021-03-26 21:42:19 +000053 melEnergy += (*filterBankIter++ * fftVec[i]);
54 }
55
56 melEnergies[bin] = melEnergy;
57 }
58
59 return true;
60 }
61
62 void Wav2LetterMFCC::ConvertToLogarithmicScale(
63 std::vector<float>& melEnergies)
64 {
65 float maxMelEnergy = -FLT_MAX;
66
67 /* Container for natural logarithms of mel energies. */
68 std::vector <float> vecLogEnergies(melEnergies.size(), 0.f);
69
70 /* Because we are taking natural logs, we need to multiply by log10(e).
71 * Also, for wav2letter model, we scale our log10 values by 10. */
72 constexpr float multiplier = 10.0 * /* Default scalar. */
73 0.4342944819032518; /* log10f(std::exp(1.0))*/
74
75 /* Take log of the whole vector. */
76 math::MathUtils::VecLogarithmF32(melEnergies, vecLogEnergies);
77
78 /* Scale the log values and get the max. */
79 for (auto iterM = melEnergies.begin(), iterL = vecLogEnergies.begin();
alexanderc350cdc2021-04-29 20:36:09 +010080 iterM != melEnergies.end() && iterL != vecLogEnergies.end(); ++iterM, ++iterL) {
alexander3c798932021-03-26 21:42:19 +000081
82 *iterM = *iterL * multiplier;
83
84 /* Save the max mel energy. */
85 if (*iterM > maxMelEnergy) {
86 maxMelEnergy = *iterM;
87 }
88 }
89
90 /* Clamp the mel energies. */
91 constexpr float maxDb = 80.0;
92 const float clampLevelLowdB = maxMelEnergy - maxDb;
alexanderc350cdc2021-04-29 20:36:09 +010093 for (float & melEnergie : melEnergies) {
94 melEnergie = std::max(melEnergie, clampLevelLowdB);
alexander3c798932021-03-26 21:42:19 +000095 }
96 }
97
98 std::vector<float> Wav2LetterMFCC::CreateDCTMatrix(
99 const int32_t inputLength,
100 const int32_t coefficientCount)
101 {
102 std::vector<float> dctMatix(inputLength * coefficientCount);
103
104 /* Orthonormal normalization. */
105 const float normalizerK0 = 2 * math::MathUtils::SqrtF32(1.0f /
106 static_cast<float>(4*inputLength));
107 const float normalizer = 2 * math::MathUtils::SqrtF32(1.0f /
108 static_cast<float>(2*inputLength));
109
110 const float angleIncr = M_PI/inputLength;
111 float angle = angleIncr; /* We start using it at k = 1 loop. */
112
113 /* First row of DCT will use normalizer K0 */
114 for (int32_t n = 0; n < inputLength; ++n) {
115 dctMatix[n] = normalizerK0 /* cos(0) = 1 */;
116 }
117
118 /* Second row (index = 1) onwards, we use standard normalizer. */
119 for (int32_t k = 1, m = inputLength; k < coefficientCount; ++k, m += inputLength) {
120 for (int32_t n = 0; n < inputLength; ++n) {
121 dctMatix[m+n] = normalizer *
122 math::MathUtils::CosineF32((n + 0.5f) * angle);
123 }
124 angle += angleIncr;
125 }
126 return dctMatix;
127 }
128
129 float Wav2LetterMFCC::GetMelFilterBankNormaliser(
130 const float& leftMel,
131 const float& rightMel,
132 const bool useHTKMethod)
133 {
134 /* Slaney normalization for mel weights. */
135 return (2.0f / (MFCC::InverseMelScale(rightMel, useHTKMethod) -
136 MFCC::InverseMelScale(leftMel, useHTKMethod)));
137 }
138
139} /* namespace audio */
140} /* namespace app */
141} /* namespace arm */