//
// Copyright © 2017 Arm Ltd. All rights reserved.
// SPDX-License-Identifier: MIT
//

#pragma once

#include "RefWorkloadUtils.hpp"
#include "TensorBufferArrayView.hpp"

#include <armnn/Tensor.hpp>

#include <boost/assert.hpp>
#include <boost/numeric/conversion/cast.hpp>

#include <cmath>
#include <limits>

namespace armnn
{

/// Performs multiplication of an integer with a multiplier which is less than one,
/// using quantized integer arithmetic which is consistent with AndroidNN's CPU executor.
struct QuantizedMultiplierSmallerThanOne
{
public:
    /// Constructs a QuantizedMultiplierSmallerThanOne which will multiply by the given multiplier.
    /// This stores the appropriate integer quantities (derived from the given multiplier) for later use.
    /// The implementation of this function is adapted from Android NN's QuantizeMultiplierSmallerThanOne().
    QuantizedMultiplierSmallerThanOne(float multiplier);

    /// The implementation of this function is adapted from Android NN's MultiplyByQuantizedMultiplierSmallerThanOne().
    int32_t operator*(int32_t rhs) const;

private:
    /// The implementation of this function is adapted from gemmlowp's SaturatingRoundingDoublingHighMul().
    static int32_t SaturatingRoundingDoublingHighMul(int32_t a, int32_t b);

    /// The implementation of this function is adapted from gemmlowp's RoundingDivideByPOT().
    static int32_t RoundingDivideByPOT(int32_t x, int exponent);

    int32_t m_Multiplier;
    int32_t m_RightShift;
};
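// A minimal sketch, assuming the AndroidNN-style decomposition the comments above refer to,
// of what the constructor stores (the real implementation lives in the corresponding .cpp
// file and may differ in detail):
//
//     int   exponent;
//     float q      = std::frexp(multiplier, &exponent);             // q is in [0.5, 1)
//     auto  qFixed = static_cast<int64_t>(std::round(q * (1ll << 31)));
//     if (qFixed == (1ll << 31)) { qFixed /= 2; ++exponent; }        // keep the value in int32 range
//     m_Multiplier = static_cast<int32_t>(qFixed);
//     m_RightShift = -exponent;
//
// operator*() then applies SaturatingRoundingDoublingHighMul() with m_Multiplier followed by
// RoundingDivideByPOT() with m_RightShift, so overall it approximates std::round(multiplier * rhs).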

/// An implementation shared by normal and depthwise convolution.
template<typename ConvData, typename InputType, typename BiasType, typename AccumulatorType>
static void ConvImpl(ConvData data,
                     const InputType* inputData,
                     float inputScale,
                     int32_t inputOffset,
                     const InputType* filterData,
                     float filterScale,
                     int32_t filterOffset,
                     const BiasType* biasData,
                     InputType* outputData,
                     float outputScale,
                     int32_t outputOffset,
                     const TensorInfo& filterInfo,
                     bool depthwise = false)
{
    if (data.m_Parameters.m_BiasEnabled && !biasData)
    {
        throw InvalidArgumentException("Bias is enabled but the bias data is invalid");
    }

    const TensorInfo& inputInfo0 = GetTensorInfo(data.m_Inputs[0]);
    const TensorInfo& outputInfo0 = GetTensorInfo(data.m_Outputs[0]);

    TensorBufferArrayView<InputType> output(outputInfo0.GetShape(),
                                            GetOutputTensorData<InputType>(0, data),
                                            data.m_Parameters.m_DataLayout);

    const DataLayoutIndexed dataLayoutIndexed(data.m_Parameters.m_DataLayout);
    const unsigned int channelsIndex = dataLayoutIndexed.GetChannelsIndex();
    const unsigned int heightIndex = dataLayoutIndexed.GetHeightIndex();
    const unsigned int widthIndex = dataLayoutIndexed.GetWidthIndex();

    unsigned int depthMult = depthwise ? filterInfo.GetShape()[0] : 1;
    unsigned int channelsInput = filterInfo.GetShape()[channelsIndex];
    unsigned int channelsOutput = depthwise ? channelsInput * depthMult : filterInfo.GetShape()[0];
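    // For example (illustrative): with an NCHW data layout, a depthwise filter of shape
    // [ 2, 3, Hf, Wf ] gives depthMult = 2 and channelsInput = 3, so channelsOutput = 6;
    // for a normal convolution, channelsOutput is simply the filter's first dimension.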

    unsigned int batchSize = outputInfo0.GetShape()[0];
    unsigned int heightOutput = outputInfo0.GetShape()[heightIndex];
    unsigned int widthOutput = outputInfo0.GetShape()[widthIndex];
    unsigned int heightInput = inputInfo0.GetShape()[heightIndex];
    unsigned int widthInput = inputInfo0.GetShape()[widthIndex];

    unsigned int heightFilter = filterInfo.GetShape()[heightIndex];
    unsigned int widthFilter = filterInfo.GetShape()[widthIndex];

    unsigned int paddingTop = data.m_Parameters.m_PadTop;
    unsigned int paddingLeft = data.m_Parameters.m_PadLeft;
    unsigned int hStride = data.m_Parameters.m_StrideY;
    unsigned int xStride = data.m_Parameters.m_StrideX;

    // The world's least efficient convolution.
    for (unsigned int batchIdx = 0; batchIdx < batchSize; batchIdx++)
    {
        for (unsigned int cOutput = 0; cOutput < channelsOutput; cOutput++)
        {
            for (unsigned int yOutput = 0; yOutput < heightOutput; yOutput++)
            {
                for (unsigned int xOutput = 0; xOutput < widthOutput; xOutput++)
                {
                    // This loop goes over each output element.
                    AccumulatorType sum = AccumulatorType();

                    // For depthwise, each output channel corresponds to exactly one input channel.
                    // For a normal convolution, we must loop over every input channel.
                    for (unsigned int cInput = 0; cInput < (depthwise ? 1 : channelsInput); cInput++)
                    {
                        unsigned int depthwiseMultiplierIdx = 0;
                        if (depthwise)
                        {
                            cInput = cOutput / depthMult;
                            depthwiseMultiplierIdx = cOutput % depthMult;
                        }
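                        // For example (illustrative): with depthMult = 2, output channel 5 maps to
                        // input channel 5 / 2 = 2 and depthwise multiplier index 5 % 2 = 1.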

                        for (unsigned int yFilter = 0; yFilter < heightFilter; yFilter++)
                        {
                            for (unsigned int xFilter = 0; xFilter < widthFilter; xFilter++)
                            {
                                // This loop goes over each input element for each output element.

                                unsigned int filterIndex;

                                // The dimensionality of the kernel depends on whether the convolution
                                // is depthwise, so the flattened index is computed differently.
                                if (depthwise)
                                {
                                    if (data.m_Parameters.m_DataLayout == DataLayout::NHWC)
                                    {
                                        filterIndex = depthwiseMultiplierIdx * heightFilter * widthFilter
                                                      * channelsInput +
                                                      yFilter * widthFilter * channelsInput +
                                                      xFilter * channelsInput +
                                                      cInput;
                                    }
                                    else
                                    {
                                        filterIndex = depthwiseMultiplierIdx * widthFilter * heightFilter
                                                      * channelsInput +
                                                      cInput * widthFilter * heightFilter +
                                                      yFilter * widthFilter +
                                                      xFilter;
                                    }
                                }
                                else
                                {
                                    if (data.m_Parameters.m_DataLayout == DataLayout::NHWC)
                                    {
                                        filterIndex = cOutput * heightFilter * widthFilter * channelsInput +
                                                      yFilter * widthFilter * channelsInput +
                                                      xFilter * channelsInput +
                                                      cInput;
                                    }
                                    else
                                    {
                                        filterIndex = cOutput * widthFilter * heightFilter * channelsInput +
                                                      cInput * widthFilter * heightFilter +
                                                      yFilter * widthFilter +
                                                      xFilter;
                                    }
                                }
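                                // Worked example (illustrative, NHWC, normal convolution): with
                                // heightFilter = widthFilter = 3 and channelsInput = 8, the weight at
                                // cOutput = 0, yFilter = 1, xFilter = 2, cInput = 5 sits at
                                // flattened index 0*3*3*8 + 1*3*8 + 2*8 + 5 = 45.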

                                AccumulatorType filterValue = filterData[filterIndex] -
                                    boost::numeric_cast<AccumulatorType>(filterOffset);

                                unsigned int yInput = yOutput * hStride + yFilter;
                                unsigned int xInput = xOutput * xStride + xFilter;

                                AccumulatorType inputValue;

                                // Check if we're in the padding.
                                if (yInput < paddingTop || yInput >= heightInput + paddingTop ||
                                    xInput < paddingLeft || xInput >= widthInput + paddingLeft)
                                {
                                    inputValue = AccumulatorType();
                                }
                                else
                                {
                                    unsigned int inputIndex;

                                    if (data.m_Parameters.m_DataLayout == DataLayout::NHWC)
                                    {
                                        inputIndex = batchIdx * heightInput * widthInput * channelsInput +
                                                     (yInput - paddingTop) * widthInput * channelsInput +
                                                     (xInput - paddingLeft) * channelsInput +
                                                     cInput;
                                    }
                                    else
                                    {
                                        inputIndex = batchIdx * widthInput * heightInput * channelsInput +
                                                     widthInput * heightInput * cInput +
                                                     widthInput * (yInput - paddingTop) +
                                                     xInput - paddingLeft;
                                    }

                                    inputValue = inputData[inputIndex] -
                                        boost::numeric_cast<AccumulatorType>(inputOffset);
                                }
                                sum += filterValue * inputValue;
                            }
                        }
                    }

                    if (data.m_Parameters.m_BiasEnabled)
                    {
                        sum += biasData[cOutput];
                    }

                    if (outputScale != 0.0f)
                    {
                        float multiplier = (inputScale * filterScale) / outputScale;
                        // Apply the multiplier to sum, but do so using some quantized arithmetic which is consistent
                        // with the AndroidNN CPU implementation. This should be (roughly) equivalent to:
                        //  sum = std::round(multiplier * sum + outputOffset);
                        sum = boost::numeric_cast<AccumulatorType>(
                            QuantizedMultiplierSmallerThanOne(multiplier) * boost::numeric_cast<int32_t>(sum))
                            + boost::numeric_cast<AccumulatorType>(outputOffset);
                        // Clamp to the representable range of an unsigned 8-bit value.
                        sum = std::min<AccumulatorType>(std::max<AccumulatorType>(sum, 0), 255);
                    }
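                    // For example (illustrative): inputScale = 0.5f, filterScale = 0.5f and
                    // outputScale = 1.0f give multiplier = 0.25f, so an accumulated sum of 100
                    // becomes roughly round(100 * 0.25) + outputOffset = 25 + outputOffset,
                    // before being clamped to [0, 255].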

                    output.Get(batchIdx, cOutput, yOutput, xOutput) = boost::numeric_cast<InputType>(sum);
                }
            }
        }
    }
}
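// A rough usage sketch (illustrative; the parameter sourcing below is assumed, not taken
// from this file): an 8-bit asymmetric-quantized convolution workload would be expected to
// instantiate this roughly as
//
//     ConvImpl<Convolution2dQueueDescriptor, uint8_t, int32_t, int32_t>(
//         data,
//         inputData,  inputInfo.GetQuantizationScale(),  inputInfo.GetQuantizationOffset(),
//         weightData, weightInfo.GetQuantizationScale(), weightInfo.GetQuantizationOffset(),
//         biasData,
//         outputData, outputInfo.GetQuantizationScale(), outputInfo.GetQuantizationOffset(),
//         weightInfo);
//
// while a float32 workload would presumably pass an outputScale of 0.0f, which skips the
// requantization step above.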

} //namespace armnn