blob: 704bc368d2bbbda08d8d682c0dba57a0a14a612d [file] [log] [blame]
telsoa014fcda012018-03-09 14:13:49 +00001//
2// Copyright © 2017 Arm Ltd. All rights reserved.
David Beckecb56cd2018-09-05 12:52:57 +01003// SPDX-License-Identifier: MIT
telsoa014fcda012018-03-09 14:13:49 +00004//
5
6#pragma once
7
8#include "RefWorkloadUtils.hpp"
narpra015f703182018-10-26 16:24:58 +01009#include "TensorBufferArrayView.hpp"
telsoa014fcda012018-03-09 14:13:49 +000010
11#include <armnn/Tensor.hpp>
12
Matteo Martincigh21350152018-11-28 16:22:22 +000013#include <DataLayoutIndexed.hpp>
Matthew Bentham8800c002018-11-19 13:19:28 +000014
telsoa014fcda012018-03-09 14:13:49 +000015#include <boost/assert.hpp>
16#include <boost/numeric/conversion/cast.hpp>
17
Matteo Martincigh46315822018-11-28 16:22:36 +000018#include <DataLayoutIndexed.hpp>
19
telsoa014fcda012018-03-09 14:13:49 +000020#include <cmath>
21#include <limits>
22
23namespace armnn
24{
25
/// Performs multiplication of an integer with a multiplier which is less than one,
/// using quantized integer arithmetic which is consistent with AndroidNN's CPU executor.
/// Used by ConvImpl below to rescale the int32 accumulator into the quantized output range.
struct QuantizedMultiplierSmallerThanOne
{
public:
    /// Constructs a QuantizedMultiplierSmallerThanOne which will multiply by the given multiplier.
    /// This stores the appropriate integer quantities (derived from the given multiplier) for later use.
    /// The implementation of this function is adapted from Android NN's QuantizeMultiplierSmallerThanOne().
    /// NOTE(review): non-explicit, so floats convert implicitly at call sites (see its use in ConvImpl);
    /// confirm that implicit conversion is intended before marking it explicit.
    QuantizedMultiplierSmallerThanOne(float multiplier);

    /// Multiplies rhs by the stored multiplier using integer-only arithmetic (rounding, saturating).
    /// The implementation of this function is adapted from Android NN's MultiplyByQuantizedMultiplierSmallerThanOne().
    int32_t operator*(int32_t rhs) const;

private:
    /// The implementation of this function is adapted from gemmlowp's SaturatingRoundingDoublingHighMul().
    static int32_t SaturatingRoundingDoublingHighMul(int32_t a, int32_t b);

    /// The implementation of this function is adapted from gemmlowp's RoundingDivideByPOT().
    static int32_t RoundingDivideByPOT(int32_t x, int exponent);

    /// Integer quantities derived from the float multiplier by the constructor (see above);
    /// presumably a fixed-point mantissa and a right-shift exponent, per the gemmlowp scheme —
    /// the defining .cpp is not in view, so verify there.
    int32_t m_Multiplier;
    int32_t m_RightShift;
};
49
/// An implementation shared by normal and depthwise convolution.
/// Reference (unoptimised) direct convolution over one input/output tensor pair.
///
/// @param data          Workload data; provides m_Parameters (stride, padding, data layout,
///                      bias flag), m_Inputs and m_Outputs.
/// @param inputData     Pointer to the input tensor values.
/// @param inputScale    Quantization scale of the input (used only on the requantization path).
/// @param inputOffset   Quantization zero point of the input; subtracted from each input value.
/// @param filterData    Pointer to the filter (weights) tensor values.
/// @param filterScale   Quantization scale of the filter.
/// @param filterOffset  Quantization zero point of the filter; subtracted from each weight.
/// @param biasData      Per-output-channel bias values; must be non-null when bias is enabled.
/// @param outputData    NOTE(review): appears unused — the output is written through the
///                      TensorBufferArrayView built from GetOutputTensorData below. Confirm
///                      before removing, as this is a widely-used template.
/// @param outputScale   Quantization scale of the output. A value of 0 skips requantization
///                      entirely — presumably the float path passes 0 here; verify at callers.
/// @param outputOffset  Quantization zero point of the output, added after rescaling.
/// @param filterInfo    TensorInfo describing the filter shape.
/// @param depthwise     True for depthwise convolution (filter shape interpreted differently
///                      and each output channel reads exactly one input channel).
///
/// @throws InvalidArgumentException if bias is enabled but biasData is null.
template<typename ConvData, typename InputType, typename BiasType, typename AccumulatorType>
static void ConvImpl(ConvData data,
                     const InputType* inputData,
                     float inputScale,
                     int32_t inputOffset,
                     const InputType* filterData,
                     float filterScale,
                     int32_t filterOffset,
                     const BiasType* biasData,
                     InputType* outputData,
                     float outputScale,
                     int32_t outputOffset,
                     const TensorInfo& filterInfo,
                     bool depthwise = false)
{
    if (data.m_Parameters.m_BiasEnabled && !biasData)
    {
        throw InvalidArgumentException("Bias is enabled but the bias data is invalid");
    }

    const TensorInfo& inputInfo0  = GetTensorInfo(data.m_Inputs[0]);
    const TensorInfo& outputInfo0 = GetTensorInfo(data.m_Outputs[0]);

    // Layout-aware view over the output buffer; Get(n, c, h, w) maps logical coordinates
    // to the correct element for either NCHW or NHWC.
    TensorBufferArrayView<InputType> output(outputInfo0.GetShape(),
                                            GetOutputTensorData<InputType>(0, data),
                                            data.m_Parameters.m_DataLayout);

    // Resolves which dimension index holds channels/height/width for the active layout.
    const armnnUtils::DataLayoutIndexed dataLayoutIndexed(data.m_Parameters.m_DataLayout);

    const unsigned int channelsIndex = dataLayoutIndexed.GetChannelsIndex();
    const unsigned int heightIndex   = dataLayoutIndexed.GetHeightIndex();
    const unsigned int widthIndex    = dataLayoutIndexed.GetWidthIndex();

    // For depthwise, filter dimension 0 is the depth multiplier M and each input channel
    // produces M output channels; for normal convolution dimension 0 is the output channel count.
    unsigned int depthMult      = depthwise ? filterInfo.GetShape()[0] : 1;
    unsigned int channelsInput  = filterInfo.GetShape()[channelsIndex];
    unsigned int channelsOutput = depthwise ? channelsInput * depthMult : filterInfo.GetShape()[0];

    unsigned int batchSize    = outputInfo0.GetShape()[0];
    unsigned int heightOutput = outputInfo0.GetShape()[heightIndex];
    unsigned int widthOutput  = outputInfo0.GetShape()[widthIndex];
    unsigned int heightInput  = inputInfo0.GetShape()[heightIndex];
    unsigned int widthInput   = inputInfo0.GetShape()[widthIndex];

    unsigned int heightFilter = filterInfo.GetShape()[heightIndex];
    unsigned int widthFilter  = filterInfo.GetShape()[widthIndex];

    unsigned int paddingTop  = data.m_Parameters.m_PadTop;
    unsigned int paddingLeft = data.m_Parameters.m_PadLeft;
    unsigned int xStride     = data.m_Parameters.m_StrideX;
    unsigned int yStride     = data.m_Parameters.m_StrideY;

    // The world's least efficient convolution.
    for (unsigned int batchIdx = 0; batchIdx < batchSize; batchIdx++)
    {
        for (unsigned int cOutput = 0; cOutput < channelsOutput; cOutput++)
        {
            for (unsigned int yOutput = 0; yOutput < heightOutput; yOutput++)
            {
                for (unsigned int xOutput = 0; xOutput < widthOutput; xOutput++)
                {
                    // This loop goes over each output element.
                    AccumulatorType sum = AccumulatorType();

                    // For depthwise, each output channel corresponds to exactly one input channel.
                    // For normal, must loop over each input channel.
                    for (unsigned int cInput = 0; cInput < (depthwise ? 1 : channelsInput); cInput++)
                    {
                        unsigned int depthwiseMultiplierIdx = 0;
                        if (depthwise)
                        {
                            // Single-iteration loop: derive the one input channel and the
                            // depth-multiplier slot this output channel corresponds to.
                            cInput = cOutput / depthMult;
                            depthwiseMultiplierIdx = cOutput % depthMult;
                        }

                        for (unsigned int yFilter = 0; yFilter < heightFilter; yFilter++)
                        {
                            for (unsigned int xFilter = 0; xFilter < widthFilter; xFilter++)
                            {
                                // This loop goes over each input element for each output element.

                                unsigned int filterIndex;

                                // Since dimensionality of kernel depends on depthwiseness, so does index.
                                if (depthwise)
                                {
                                    if (data.m_Parameters.m_DataLayout == DataLayout::NHWC)
                                    {
                                        // Depthwise NHWC: [M, H, W, I] flattened.
                                        filterIndex = depthwiseMultiplierIdx * heightFilter * widthFilter
                                                      * channelsInput +
                                                      yFilter * widthFilter * channelsInput +
                                                      xFilter * channelsInput +
                                                      cInput;
                                    }
                                    else
                                    {
                                        // Depthwise NCHW: [M, I, H, W] flattened.
                                        filterIndex = depthwiseMultiplierIdx * widthFilter * heightFilter
                                                      * channelsInput +
                                                      cInput * widthFilter * heightFilter +
                                                      yFilter * widthFilter +
                                                      xFilter;
                                    }
                                }
                                else
                                {
                                    if (data.m_Parameters.m_DataLayout == DataLayout::NHWC)
                                    {
                                        // Normal NHWC: [O, H, W, I] flattened.
                                        filterIndex = cOutput * heightFilter * widthFilter * channelsInput +
                                                      yFilter * widthFilter * channelsInput +
                                                      xFilter * channelsInput +
                                                      cInput;
                                    }
                                    else
                                    {
                                        // Normal NCHW: [O, I, H, W] flattened.
                                        filterIndex = cOutput * widthFilter * heightFilter * channelsInput +
                                                      cInput * widthFilter * heightFilter +
                                                      yFilter * widthFilter +
                                                      xFilter;
                                    }
                                }

                                // Weights are stored zero-point-offset; remove the offset so the
                                // accumulation happens in true (signed) values.
                                AccumulatorType filterValue = filterData[filterIndex] -
                                    boost::numeric_cast<AccumulatorType>(filterOffset);

                                // Input coordinates in the padded coordinate system.
                                unsigned int yInput = yOutput * yStride + yFilter;
                                unsigned int xInput = xOutput * xStride + xFilter;

                                AccumulatorType inputValue;

                                // Check if we're in the padding.
                                if (yInput < paddingTop || yInput >= heightInput + paddingTop ||
                                    xInput < paddingLeft || xInput >= widthInput + paddingLeft )
                                {
                                    // Padding contributes zero (in offset-removed space).
                                    inputValue = AccumulatorType();
                                }
                                else
                                {
                                    unsigned int inputIndex;

                                    if (data.m_Parameters.m_DataLayout == DataLayout::NHWC)
                                    {
                                        inputIndex = batchIdx * heightInput * widthInput * channelsInput +
                                                     (yInput - paddingTop) * widthInput * channelsInput +
                                                     (xInput - paddingLeft) * channelsInput +
                                                     cInput;

                                    }
                                    else
                                    {
                                        inputIndex = batchIdx * widthInput * heightInput * channelsInput +
                                                     widthInput * heightInput * cInput +
                                                     widthInput * (yInput - paddingTop) +
                                                     xInput - paddingLeft;
                                    }

                                    inputValue = inputData[inputIndex] -
                                        boost::numeric_cast<AccumulatorType>(inputOffset);

                                }
                                sum += filterValue * inputValue;
                            }
                        }
                    }

                    // Bias is one value per output channel.
                    if (data.m_Parameters.m_BiasEnabled)
                    {
                        sum += biasData[cOutput];
                    }

                    // Requantization path: only taken when a real output scale is supplied
                    // (outputScale == 0 skips it — presumably the float instantiation).
                    if (outputScale != 0.0f)
                    {
                        float multiplier = (inputScale * filterScale) / outputScale;
                        // Apply the multiplier to sum, but do so using some quantized arithmetic which is consistent
                        // with the AndroidNN CPU implementation. This should be (roughly) equivalent to:
                        //  sum = std::round(multiplier * sum + outputOffset);
                        sum = boost::numeric_cast<AccumulatorType>(
                                QuantizedMultiplierSmallerThanOne(multiplier) * boost::numeric_cast<int32_t>(sum))
                            + boost::numeric_cast<AccumulatorType>(outputOffset);
                        // NOTE(review): clamp is hard-coded to the unsigned 8-bit range [0, 255];
                        // this path therefore assumes a uint8 output type.
                        sum = std::min<AccumulatorType>(std::max<AccumulatorType>(sum, 0), 255);
                    }

                    output.Get(batchIdx, cOutput, yOutput, xOutput) = boost::numeric_cast<InputType>(sum);
                }
            }
        }
    }
}
237
238} //namespace armnn