blob: 8b7dd11cb4df54807fb548606a97191ab6cae444 [file] [log] [blame]
//
// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
5
6#include "SpeechRecognitionPipeline.hpp"
7#include "ArmnnNetworkExecutor.hpp"
8
George Gekov23c26272021-08-16 11:32:10 +01009namespace asr
Éanna Ó Catháinc6ab02a2021-04-07 14:35:25 +010010{
George Gekov23c26272021-08-16 11:32:10 +010011
Éanna Ó Catháinc6ab02a2021-04-07 14:35:25 +010012ASRPipeline::ASRPipeline(std::unique_ptr<common::ArmnnNetworkExecutor<int8_t>> executor,
George Gekov23c26272021-08-16 11:32:10 +010013 std::unique_ptr<Decoder> decoder, std::unique_ptr<Wav2LetterPreprocessor> preProcessor) :
Éanna Ó Catháinc6ab02a2021-04-07 14:35:25 +010014 m_executor(std::move(executor)),
George Gekov23c26272021-08-16 11:32:10 +010015 m_decoder(std::move(decoder)), m_preProcessor(std::move(preProcessor)) {}
Éanna Ó Catháinc6ab02a2021-04-07 14:35:25 +010016
George Gekov23c26272021-08-16 11:32:10 +010017int ASRPipeline::getInputSamplesSize()
Éanna Ó Catháinc6ab02a2021-04-07 14:35:25 +010018{
George Gekov23c26272021-08-16 11:32:10 +010019 return this->m_preProcessor->m_windowLen +
20 ((this->m_preProcessor->m_mfcc->m_params.m_numMfccVectors - 1) * this->m_preProcessor->m_windowStride);
21}
Éanna Ó Catháinc6ab02a2021-04-07 14:35:25 +010022
George Gekov23c26272021-08-16 11:32:10 +010023int ASRPipeline::getSlidingWindowOffset()
24{
25 // Hardcoded for now until refactor
26 return ASRPipeline::SLIDING_WINDOW_OFFSET;
27}
Éanna Ó Catháinc6ab02a2021-04-07 14:35:25 +010028
George Gekov23c26272021-08-16 11:32:10 +010029std::vector<int8_t> ASRPipeline::PreProcessing(std::vector<float>& audio)
30{
31 int audioDataToPreProcess = m_preProcessor->m_windowLen +
32 ((m_preProcessor->m_mfcc->m_params.m_numMfccVectors - 1) *
33 m_preProcessor->m_windowStride);
34 int outputBufferSize = m_preProcessor->m_mfcc->m_params.m_numMfccVectors
35 * m_preProcessor->m_mfcc->m_params.m_numMfccFeatures * 3;
36 std::vector<int8_t> outputBuffer(outputBufferSize);
37 m_preProcessor->Invoke(audio.data(), audioDataToPreProcess, outputBuffer, m_executor->GetQuantizationOffset(),
38 m_executor->GetQuantizationScale());
39 return outputBuffer;
40}
41
42IPipelinePtr CreatePipeline(common::PipelineOptions& config, std::map<int, std::string>& labels)
43{
44 if (config.m_ModelName == "Wav2Letter")
45 {
46 // Wav2Letter ASR SETTINGS
47 int SAMP_FREQ = 16000;
48 int FRAME_LEN_MS = 32;
49 int FRAME_LEN_SAMPLES = SAMP_FREQ * FRAME_LEN_MS * 0.001;
50 int NUM_MFCC_FEATS = 13;
51 int MFCC_WINDOW_LEN = 512;
52 int MFCC_WINDOW_STRIDE = 160;
53 const int NUM_MFCC_VECTORS = 296;
54 int SAMPLES_PER_INFERENCE = MFCC_WINDOW_LEN + ((NUM_MFCC_VECTORS - 1) * MFCC_WINDOW_STRIDE);
55 int MEL_LO_FREQ = 0;
56 int MEL_HI_FREQ = 8000;
57 int NUM_FBANK_BIN = 128;
58 int INPUT_WINDOW_LEFT_CONTEXT = 98;
59 int INPUT_WINDOW_RIGHT_CONTEXT = 98;
60 int INPUT_WINDOW_INNER_CONTEXT = NUM_MFCC_VECTORS -
61 (INPUT_WINDOW_LEFT_CONTEXT + INPUT_WINDOW_RIGHT_CONTEXT);
62 int SLIDING_WINDOW_OFFSET = INPUT_WINDOW_INNER_CONTEXT * MFCC_WINDOW_STRIDE;
63
64
65 MfccParams mfccParams(SAMP_FREQ, NUM_FBANK_BIN,
66 MEL_LO_FREQ, MEL_HI_FREQ, NUM_MFCC_FEATS, FRAME_LEN_SAMPLES, false, NUM_MFCC_VECTORS);
67
68 std::unique_ptr<Wav2LetterMFCC> mfccInst = std::make_unique<Wav2LetterMFCC>(mfccParams);
69
70 auto executor = std::make_unique<common::ArmnnNetworkExecutor<int8_t>>(config.m_ModelFilePath,
71 config.m_backends);
72
73 auto decoder = std::make_unique<asr::Decoder>(labels);
74
75 auto preprocessor = std::make_unique<Wav2LetterPreprocessor>(MFCC_WINDOW_LEN, MFCC_WINDOW_STRIDE,
76 std::move(mfccInst));
77
78 auto ptr = std::make_unique<asr::ASRPipeline>(
79 std::move(executor), std::move(decoder), std::move(preprocessor));
80
81 ptr->SLIDING_WINDOW_OFFSET = SLIDING_WINDOW_OFFSET;
82
83 return ptr;
84 }
85 else
86 {
87 throw std::invalid_argument("Unknown Model name: " + config.m_ModelName + " .");
88 }
Éanna Ó Catháinc6ab02a2021-04-07 14:35:25 +010089}
90
91}// namespace asr