| // |
| // Copyright © 2020 Arm Ltd and Contributors. All rights reserved. |
| // SPDX-License-Identifier: MIT |
| // |
| |
| #include "SpeechRecognitionPipeline.hpp" |
| #include "ArmnnNetworkExecutor.hpp" |
| |
| namespace asr |
| { |
| |
| ASRPipeline::ASRPipeline(std::unique_ptr<common::ArmnnNetworkExecutor<int8_t>> executor, |
| std::unique_ptr<Decoder> decoder, std::unique_ptr<Wav2LetterPreprocessor> preProcessor) : |
| m_executor(std::move(executor)), |
| m_decoder(std::move(decoder)), m_preProcessor(std::move(preProcessor)) {} |
| |
| int ASRPipeline::getInputSamplesSize() |
| { |
| return this->m_preProcessor->m_windowLen + |
| ((this->m_preProcessor->m_mfcc->m_params.m_numMfccVectors - 1) * this->m_preProcessor->m_windowStride); |
| } |
| |
| int ASRPipeline::getSlidingWindowOffset() |
| { |
| // Hardcoded for now until refactor |
| return ASRPipeline::SLIDING_WINDOW_OFFSET; |
| } |
| |
| std::vector<int8_t> ASRPipeline::PreProcessing(std::vector<float>& audio) |
| { |
| int audioDataToPreProcess = m_preProcessor->m_windowLen + |
| ((m_preProcessor->m_mfcc->m_params.m_numMfccVectors - 1) * |
| m_preProcessor->m_windowStride); |
| int outputBufferSize = m_preProcessor->m_mfcc->m_params.m_numMfccVectors |
| * m_preProcessor->m_mfcc->m_params.m_numMfccFeatures * 3; |
| std::vector<int8_t> outputBuffer(outputBufferSize); |
| m_preProcessor->Invoke(audio.data(), audioDataToPreProcess, outputBuffer, m_executor->GetQuantizationOffset(), |
| m_executor->GetQuantizationScale()); |
| return outputBuffer; |
| } |
| |
| IPipelinePtr CreatePipeline(common::PipelineOptions& config, std::map<int, std::string>& labels) |
| { |
| if (config.m_ModelName == "Wav2Letter") |
| { |
| // Wav2Letter ASR SETTINGS |
| int SAMP_FREQ = 16000; |
| int FRAME_LEN_MS = 32; |
| int FRAME_LEN_SAMPLES = SAMP_FREQ * FRAME_LEN_MS * 0.001; |
| int NUM_MFCC_FEATS = 13; |
| int MFCC_WINDOW_LEN = 512; |
| int MFCC_WINDOW_STRIDE = 160; |
| const int NUM_MFCC_VECTORS = 296; |
| int SAMPLES_PER_INFERENCE = MFCC_WINDOW_LEN + ((NUM_MFCC_VECTORS - 1) * MFCC_WINDOW_STRIDE); |
| int MEL_LO_FREQ = 0; |
| int MEL_HI_FREQ = 8000; |
| int NUM_FBANK_BIN = 128; |
| int INPUT_WINDOW_LEFT_CONTEXT = 98; |
| int INPUT_WINDOW_RIGHT_CONTEXT = 98; |
| int INPUT_WINDOW_INNER_CONTEXT = NUM_MFCC_VECTORS - |
| (INPUT_WINDOW_LEFT_CONTEXT + INPUT_WINDOW_RIGHT_CONTEXT); |
| int SLIDING_WINDOW_OFFSET = INPUT_WINDOW_INNER_CONTEXT * MFCC_WINDOW_STRIDE; |
| |
| |
| MfccParams mfccParams(SAMP_FREQ, NUM_FBANK_BIN, |
| MEL_LO_FREQ, MEL_HI_FREQ, NUM_MFCC_FEATS, FRAME_LEN_SAMPLES, false, NUM_MFCC_VECTORS); |
| |
| std::unique_ptr<Wav2LetterMFCC> mfccInst = std::make_unique<Wav2LetterMFCC>(mfccParams); |
| |
| auto executor = std::make_unique<common::ArmnnNetworkExecutor<int8_t>>(config.m_ModelFilePath, |
| config.m_backends); |
| |
| auto decoder = std::make_unique<asr::Decoder>(labels); |
| |
| auto preprocessor = std::make_unique<Wav2LetterPreprocessor>(MFCC_WINDOW_LEN, MFCC_WINDOW_STRIDE, |
| std::move(mfccInst)); |
| |
| auto ptr = std::make_unique<asr::ASRPipeline>( |
| std::move(executor), std::move(decoder), std::move(preprocessor)); |
| |
| ptr->SLIDING_WINDOW_OFFSET = SLIDING_WINDOW_OFFSET; |
| |
| return ptr; |
| } |
| else |
| { |
| throw std::invalid_argument("Unknown Model name: " + config.m_ModelName + " ."); |
| } |
| } |
| |
| }// namespace asr |