//
// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
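// Example invocation (binary name and file paths are illustrative only):
//   ./SpeechRecognitionExample --audio-file-path ./audio/sample.wav \
//       --model-file-path ./models/wav2letter_int8.tflite --preferred-backends CpuAcc,CpuRef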
#include <iostream>
#include <map>
#include <vector>
#include <algorithm>
#include <cmath>
#include <sstream> // std::stringstream used in GetPreferredBackendList
#include <string>

#include "CmdArgsParser.hpp"
#include "ArmnnNetworkExecutor.hpp"
#include "AudioCapture.hpp"
#include "Preprocess.hpp"
#include "Decoder.hpp"
#include "SpeechRecognitionPipeline.hpp"


using InferenceResult = std::vector<int8_t>;
using InferenceResults = std::vector<InferenceResult>;

const std::string AUDIO_FILE_PATH = "--audio-file-path";
const std::string MODEL_FILE_PATH = "--model-file-path";
const std::string LABEL_PATH = "--label-path";
const std::string PREFERRED_BACKENDS = "--preferred-backends";
const std::string HELP = "--help";

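// Output alphabet of the Wav2Letter model: the lowercase letters, an apostrophe, a space,
// and a trailing '$' symbol, which the decoder treats as the blank/separator token
// (assumption based on the usual Wav2Letter label set).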
std::map<int, std::string> labels = {
    {0,  "a"},
    {1,  "b"},
    {2,  "c"},
    {3,  "d"},
    {4,  "e"},
    {5,  "f"},
    {6,  "g"},
    {7,  "h"},
    {8,  "i"},
    {9,  "j"},
    {10, "k"},
    {11, "l"},
    {12, "m"},
    {13, "n"},
    {14, "o"},
    {15, "p"},
    {16, "q"},
    {17, "r"},
    {18, "s"},
    {19, "t"},
    {20, "u"},
    {21, "v"},
    {22, "w"},
    {23, "x"},
    {24, "y"},
    {25, "z"},
    {26, "\'"},
    {27, " "},
    {28, "$"}
};

/*
 * The accepted options for this Speech Recognition executable
 */
static std::map<std::string, std::string> CMD_OPTIONS = {
    {AUDIO_FILE_PATH, "[REQUIRED] Path to the Audio file to run speech recognition on"},
    {MODEL_FILE_PATH, "[REQUIRED] Path to the Speech Recognition model to use"},
    {PREFERRED_BACKENDS, "[OPTIONAL] Takes the preferred backends in preference order, separated by comma."
                         " For example: CpuAcc,GpuAcc,CpuRef. Accepted options: [CpuAcc, CpuRef, GpuAcc]."
                         " Defaults to CpuAcc,CpuRef"}
};

/*
 * Reads the user supplied backend preference, splits it by comma, and returns an ordered vector
 */
std::vector<armnn::BackendId> GetPreferredBackendList(const std::string& preferredBackends)
{
    std::vector<armnn::BackendId> backends;
    std::stringstream ss(preferredBackends);

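    // Tokenise on ',' while preserving the user-given order; each token is passed
    // straight through to armnn::BackendId, e.g. "GpuAcc,CpuAcc" -> { GpuAcc, CpuAcc }.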
    while (ss.good())
    {
        std::string backend;
        std::getline(ss, backend, ',');
        backends.emplace_back(backend);
    }
    return backends;
}

int main(int argc, char *argv[])
{
    // Wav2Letter ASR SETTINGS
    int SAMP_FREQ = 16000;
    int FRAME_LEN_MS = 32;
    int FRAME_LEN_SAMPLES = SAMP_FREQ * FRAME_LEN_MS * 0.001;
    int NUM_MFCC_FEATS = 13;
    int MFCC_WINDOW_LEN = 512;
    int MFCC_WINDOW_STRIDE = 160;
    const int NUM_MFCC_VECTORS = 296;
    int SAMPLES_PER_INFERENCE = MFCC_WINDOW_LEN + ((NUM_MFCC_VECTORS - 1) * MFCC_WINDOW_STRIDE);
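    // With a 512-sample window and a 160-sample stride this is 512 + 295 * 160 = 47,712 samples
    // per inference, i.e. roughly 2.98 s of audio at 16 kHz.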
    int MEL_LO_FREQ = 0;
    int MEL_HI_FREQ = 8000;
    int NUM_FBANK_BIN = 128;
    int INPUT_WINDOW_LEFT_CONTEXT = 98;
    int INPUT_WINDOW_RIGHT_CONTEXT = 98;
    int INPUT_WINDOW_INNER_CONTEXT = NUM_MFCC_VECTORS -
                                     (INPUT_WINDOW_LEFT_CONTEXT + INPUT_WINDOW_RIGHT_CONTEXT);
    int SLIDING_WINDOW_OFFSET = INPUT_WINDOW_INNER_CONTEXT * MFCC_WINDOW_STRIDE;
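    // Inner context = 296 - (98 + 98) = 100 feature vectors, so each new window advances by
    // 100 * 160 = 16,000 samples (exactly 1 s at 16 kHz); the left/right context overlaps the
    // neighbouring windows.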


    MfccParams mfccParams(SAMP_FREQ, NUM_FBANK_BIN,
                          MEL_LO_FREQ, MEL_HI_FREQ, NUM_MFCC_FEATS, FRAME_LEN_SAMPLES, false, NUM_MFCC_VECTORS);

    MFCC mfccInst = MFCC(mfccParams);

    Preprocess preprocessor(MFCC_WINDOW_LEN, MFCC_WINDOW_STRIDE, mfccInst);
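    // The preprocessor slides a MFCC_WINDOW_LEN frame over each audio block with a
    // MFCC_WINDOW_STRIDE hop, producing NUM_MFCC_VECTORS vectors of NUM_MFCC_FEATS
    // coefficients per inference.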

    bool isFirstWindow = true;
    std::string currentRContext = "";

    std::map<std::string, std::string> options;

    int result = ParseOptions(options, CMD_OPTIONS, argv, argc);
    if (result != 0)
    {
        return result;
    }

    // Create the network options
    common::PipelineOptions pipelineOptions;
    pipelineOptions.m_ModelFilePath = GetSpecifiedOption(options, MODEL_FILE_PATH);

    if (CheckOptionSpecified(options, PREFERRED_BACKENDS))
    {
        pipelineOptions.m_backends = GetPreferredBackendList(GetSpecifiedOption(options, PREFERRED_BACKENDS));
    }
    else
    {
        pipelineOptions.m_backends = {"CpuAcc", "CpuRef"};
    }

    asr::IPipelinePtr asrPipeline = asr::CreatePipeline(pipelineOptions, labels);
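    // CreatePipeline wires the Arm NN network executor and the decoder (using the label map
    // above) into a single ASR pipeline for the chosen model and backends.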

    asr::AudioCapture capture;
    std::vector<float> audioData = capture.LoadAudioFile(GetSpecifiedOption(options, AUDIO_FILE_PATH));
    capture.InitSlidingWindow(audioData.data(), audioData.size(), SAMPLES_PER_INFERENCE, SLIDING_WINDOW_OFFSET);
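    // The sliding window yields SAMPLES_PER_INFERENCE samples per block and advances by
    // SLIDING_WINDOW_OFFSET, so consecutive blocks overlap by the left/right context.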

    while (capture.HasNext())
    {
        std::vector<float> audioBlock = capture.Next();
        InferenceResults results;

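        // For each block: extract and quantise MFCC features (float -> int8), run inference,
        // then decode the output; the first/last-window flags and currentRContext let the
        // post-processing stitch together the overlapping context between windows.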
        std::vector<int8_t> preprocessedData = asrPipeline->PreProcessing<float, int8_t>(audioBlock, preprocessor);
        asrPipeline->Inference<int8_t>(preprocessedData, results);
        asrPipeline->PostProcessing<int8_t>(results, isFirstWindow, !capture.HasNext(), currentRContext);
    }

    return 0;
}