blob: 0b9f8f70c801d693b6798842325374fd78bf3a94 [file] [log] [blame]
//
// Copyright © 2017 Arm Ltd. All rights reserved.
// SPDX-License-Identifier: MIT
//

#pragma once

#include "RefWorkloadUtils.hpp"
#include "TensorBufferArrayView.hpp"

#include <armnn/Tensor.hpp>

#include <backendsCommon/DataLayoutIndexed.hpp>

#include <boost/assert.hpp>
#include <boost/numeric/conversion/cast.hpp>

#include <algorithm>
#include <cmath>
#include <limits>

namespace armnn
{

/// Performs multiplication of an integer with a multiplier which is less than one,
/// using quantized integer arithmetic which is consistent with AndroidNN's CPU executor.
/// Member functions are defined out-of-line (see the corresponding .cpp).
struct QuantizedMultiplierSmallerThanOne
{
public:
    /// Constructs a QuantizedMultiplierSmallerThanOne which will multiply by the given multiplier.
    /// This stores the appropriate integer quantities (derived from the given multiplier) for later use.
    /// The implementation of this function is adapted from Android NN's QuantizeMultiplierSmallerThanOne().
    /// NOTE(review): presumably requires 0.0f < multiplier < 1.0f, per the AndroidNN original — confirm in the .cpp.
    QuantizedMultiplierSmallerThanOne(float multiplier);

    /// Multiplies rhs by the stored multiplier using integer-only arithmetic.
    /// The implementation of this function is adapted from Android NN's MultiplyByQuantizedMultiplierSmallerThanOne().
    int32_t operator*(int32_t rhs) const;

private:
    /// The implementation of this function is adapted from gemmlowp's SaturatingRoundingDoublingHighMul().
    static int32_t SaturatingRoundingDoublingHighMul(int32_t a, int32_t b);

    /// The implementation of this function is adapted from gemmlowp's RoundingDivideByPOT().
    static int32_t RoundingDivideByPOT(int32_t x, int exponent);

    int32_t m_Multiplier;   ///< Integer quantity derived from the float multiplier in the constructor.
    int32_t m_RightShift;   ///< Right-shift amount derived from the float multiplier in the constructor.
};
47
telsoa01c577f2c2018-08-31 09:22:23 +010048/// An implementation shared by normal and depthwise convolution.
telsoa014fcda012018-03-09 14:13:49 +000049template<typename ConvData, typename InputType, typename BiasType, typename AccumulatorType>
50static void ConvImpl(ConvData data,
51 const InputType* inputData,
52 float inputScale,
53 int32_t inputOffset,
54 const InputType* filterData,
55 float filterScale,
56 int32_t filterOffset,
57 const BiasType* biasData,
58 InputType* outputData,
59 float outputScale,
60 int32_t outputOffset,
telsoa01c577f2c2018-08-31 09:22:23 +010061 const TensorInfo& filterInfo,
telsoa014fcda012018-03-09 14:13:49 +000062 bool depthwise = false)
63{
surmeh013537c2c2018-05-18 16:31:43 +010064 if (data.m_Parameters.m_BiasEnabled && !biasData)
65 {
66 throw InvalidArgumentException("Bias is enabled but the bias data is invalid");
67 }
68
Nikhil Raje4dfd6e2018-10-18 10:11:04 +010069 const TensorInfo& inputInfo0 = GetTensorInfo(data.m_Inputs[0]);
telsoa014fcda012018-03-09 14:13:49 +000070 const TensorInfo& outputInfo0 = GetTensorInfo(data.m_Outputs[0]);
telsoa014fcda012018-03-09 14:13:49 +000071
narpra015f703182018-10-26 16:24:58 +010072 TensorBufferArrayView<InputType> output(outputInfo0.GetShape(),
73 GetOutputTensorData<InputType>(0, data),
74 data.m_Parameters.m_DataLayout);
75
Nikhil Raje4dfd6e2018-10-18 10:11:04 +010076 const DataLayoutIndexed dataLayoutIndexed(data.m_Parameters.m_DataLayout);
77 const unsigned int channelsIndex = dataLayoutIndexed.GetChannelsIndex();
78 const unsigned int heightIndex = dataLayoutIndexed.GetHeightIndex();
79 const unsigned int widthIndex = dataLayoutIndexed.GetWidthIndex();
80
telsoa014fcda012018-03-09 14:13:49 +000081 unsigned int depthMult = depthwise ? filterInfo.GetShape()[0] : 1;
Nikhil Raje4dfd6e2018-10-18 10:11:04 +010082 unsigned int channelsInput = filterInfo.GetShape()[channelsIndex];
telsoa014fcda012018-03-09 14:13:49 +000083 unsigned int channelsOutput = depthwise ? channelsInput * depthMult : filterInfo.GetShape()[0];
84
telsoa014fcda012018-03-09 14:13:49 +000085 unsigned int batchSize = outputInfo0.GetShape()[0];
Nikhil Raje4dfd6e2018-10-18 10:11:04 +010086 unsigned int heightOutput = outputInfo0.GetShape()[heightIndex];
87 unsigned int widthOutput = outputInfo0.GetShape()[widthIndex];
88 unsigned int heightInput = inputInfo0.GetShape()[heightIndex];
89 unsigned int widthInput = inputInfo0.GetShape()[widthIndex];
telsoa014fcda012018-03-09 14:13:49 +000090
Nikhil Raje4dfd6e2018-10-18 10:11:04 +010091 unsigned int heightFilter = filterInfo.GetShape()[heightIndex];
92 unsigned int widthFilter = filterInfo.GetShape()[widthIndex];
telsoa014fcda012018-03-09 14:13:49 +000093
94 unsigned int paddingTop = data.m_Parameters.m_PadTop;
95 unsigned int paddingLeft = data.m_Parameters.m_PadLeft;
96 unsigned int hStride = data.m_Parameters.m_StrideY;
97 unsigned int xStride = data.m_Parameters.m_StrideX;
98
telsoa01c577f2c2018-08-31 09:22:23 +010099 // The world's least efficient convolution.
telsoa014fcda012018-03-09 14:13:49 +0000100 for (unsigned int batchIdx = 0; batchIdx < batchSize; batchIdx++)
101 {
102 for (unsigned int cOutput = 0; cOutput < channelsOutput; cOutput++)
103 {
104 for (unsigned int yOutput = 0; yOutput < heightOutput; yOutput++)
105 {
106 for (unsigned int xOutput = 0; xOutput < widthOutput; xOutput++)
107 {
telsoa01c577f2c2018-08-31 09:22:23 +0100108 // This loop goes over each output element.
telsoa014fcda012018-03-09 14:13:49 +0000109 AccumulatorType sum = AccumulatorType();
110
telsoa01c577f2c2018-08-31 09:22:23 +0100111 // For depthwise, each output channel corresponds to exactly one input channel.
112 // For normal, must loop over each input channel.
telsoa014fcda012018-03-09 14:13:49 +0000113 for (unsigned int cInput = 0; cInput < (depthwise ? 1 : channelsInput); cInput++)
114 {
115 unsigned int depthwiseMultiplierIdx = 0;
116 if (depthwise)
117 {
118 cInput = cOutput / depthMult;
119 depthwiseMultiplierIdx = cOutput % depthMult;
120 }
121
122 for (unsigned int yFilter = 0; yFilter < heightFilter; yFilter++)
123 {
124 for (unsigned int xFilter = 0; xFilter < widthFilter; xFilter++)
125 {
telsoa01c577f2c2018-08-31 09:22:23 +0100126 // This loop goes over each input element for each output element.
telsoa014fcda012018-03-09 14:13:49 +0000127
128 unsigned int filterIndex;
129
telsoa01c577f2c2018-08-31 09:22:23 +0100130 // Since dimensionality of kernel depends on depthwiseness, so does index.
telsoa014fcda012018-03-09 14:13:49 +0000131 if (depthwise)
132 {
narpra015f703182018-10-26 16:24:58 +0100133 if (data.m_Parameters.m_DataLayout == DataLayout::NHWC)
134 {
135 filterIndex = depthwiseMultiplierIdx * heightFilter * widthFilter
136 * channelsInput +
137 yFilter * widthFilter * channelsInput +
138 xFilter * channelsInput +
139 cInput;
140 }
141 else
142 {
143 filterIndex = depthwiseMultiplierIdx * widthFilter * heightFilter
144 * channelsInput +
145 cInput * widthFilter * heightFilter +
146 yFilter * widthFilter +
147 xFilter;
148 }
telsoa014fcda012018-03-09 14:13:49 +0000149 }
150 else
151 {
narpra015f703182018-10-26 16:24:58 +0100152 if (data.m_Parameters.m_DataLayout == DataLayout::NHWC)
153 {
154 filterIndex = cOutput * heightFilter * widthFilter * channelsInput +
155 yFilter * widthFilter * channelsInput +
156 xFilter * channelsInput +
157 cInput;
158 }
159 else
160 {
161 filterIndex = cOutput * widthFilter * heightFilter * channelsInput +
162 cInput * widthFilter * heightFilter +
163 yFilter * widthFilter +
164 xFilter;
165 }
telsoa014fcda012018-03-09 14:13:49 +0000166 }
narpra015f703182018-10-26 16:24:58 +0100167
telsoa014fcda012018-03-09 14:13:49 +0000168 AccumulatorType filterValue = filterData[filterIndex] -
169 boost::numeric_cast<AccumulatorType>(filterOffset);
170
171 unsigned int yInput = yOutput * hStride + yFilter;
172 unsigned int xInput = xOutput * xStride + xFilter;
173
174 AccumulatorType inputValue;
175
telsoa01c577f2c2018-08-31 09:22:23 +0100176 // Check if we're in the padding.
telsoa014fcda012018-03-09 14:13:49 +0000177 if (yInput < paddingTop || yInput >= heightInput + paddingTop ||
178 xInput < paddingLeft || xInput >= widthInput + paddingLeft )
179 {
180 inputValue = AccumulatorType();
181 }
182 else
183 {
narpra015f703182018-10-26 16:24:58 +0100184 unsigned int inputIndex;
185
186 if (data.m_Parameters.m_DataLayout == DataLayout::NHWC)
187 {
188 inputIndex = batchIdx * heightInput * widthInput * channelsInput +
189 (yInput - paddingTop) * widthInput * channelsInput +
190 (xInput - paddingLeft) * channelsInput +
191 cInput;
192
193 }
194 else
195 {
196 inputIndex = batchIdx * widthInput * heightInput * channelsInput +
197 widthInput * heightInput * cInput +
198 widthInput * (yInput - paddingTop) +
199 xInput - paddingLeft;
200 }
201
202 inputValue = inputData[inputIndex] -
203 boost::numeric_cast<AccumulatorType>(inputOffset);
204
telsoa014fcda012018-03-09 14:13:49 +0000205 }
206 sum += filterValue * inputValue;
207 }
208 }
209 }
210
211 if (data.m_Parameters.m_BiasEnabled)
212 {
213 sum += biasData[cOutput];
214 }
215
216 if (outputScale != 0.0f)
217 {
218 float multiplier = (inputScale * filterScale) / outputScale;
219 // Apply the multiplier to sum, but do so using some quantized arithmetic which is consistent
220 // with the AndroidNN CPU implementation. This should be (roughly) equivalent to:
221 // sum = std::round(multiplier * sum + outputOffset);
222 sum = boost::numeric_cast<AccumulatorType>(
223 QuantizedMultiplierSmallerThanOne(multiplier) * boost::numeric_cast<int32_t>(sum))
224 + boost::numeric_cast<AccumulatorType>(outputOffset);
225 sum = std::min<AccumulatorType>(std::max<AccumulatorType>(sum, 0), 255);
226 }
227
narpra015f703182018-10-26 16:24:58 +0100228 output.Get(batchIdx, cOutput, yOutput, xOutput) = boost::numeric_cast<InputType>(sum);
telsoa014fcda012018-03-09 14:13:49 +0000229 }
230 }
231 }
232 }
233}

} // namespace armnn