//
// Copyright © 2017 Arm Ltd. All rights reserved.
// SPDX-License-Identifier: MIT
//

#pragma once

#include "RefWorkloadUtils.hpp"
#include "TensorBufferArrayView.hpp"

#include <armnn/Tensor.hpp>

#include <DataLayoutIndexed.hpp>

#include <boost/assert.hpp>
#include <boost/numeric/conversion/cast.hpp>

#include <cmath>
#include <limits>

namespace armnn
{

/// Performs multiplication of an integer with a multiplier which is less than one,
/// using quantized integer arithmetic which is consistent with AndroidNN's CPU executor.
struct QuantizedMultiplierSmallerThanOne
{
public:
    /// Constructs a QuantizedMultiplierSmallerThanOne which will multiply by the given multiplier.
    /// This stores the appropriate integer quantities (derived from the given multiplier) for later use.
    /// The implementation of this function is adapted from Android NN's QuantizeMultiplierSmallerThanOne().
    QuantizedMultiplierSmallerThanOne(float multiplier);

    /// The implementation of this function is adapted from Android NN's MultiplyByQuantizedMultiplierSmallerThanOne().
    int32_t operator*(int32_t rhs) const;

private:
    /// The implementation of this function is adapted from gemmlowp's SaturatingRoundingDoublingHighMul().
    static int32_t SaturatingRoundingDoublingHighMul(int32_t a, int32_t b);

    /// The implementation of this function is adapted from gemmlowp's RoundingDivideByPOT().
    static int32_t RoundingDivideByPOT(int32_t x, int exponent);

    int32_t m_Multiplier;
    int32_t m_RightShift;
};
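
// A minimal usage sketch (illustrative only, mirroring how ConvImpl below applies the multiplier):
//
//   float multiplier = (inputScale * filterScale) / outputScale;   // must be less than 1.0f
//   QuantizedMultiplierSmallerThanOne quantizedMultiplier(multiplier);
//   int32_t requantized = quantizedMultiplier * accumulatedSum;    // ~ std::round(multiplier * accumulatedSum)
//
// Here inputScale, filterScale, outputScale and accumulatedSum are assumed to come from the
// caller's quantization parameters and accumulator, as in ConvImpl below.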

/// An implementation shared by normal and depthwise convolution.
template<typename ConvData, typename InputType, typename BiasType, typename AccumulatorType>
static void ConvImpl(ConvData data,
                     const InputType* inputData,
                     float inputScale,
                     int32_t inputOffset,
                     const InputType* filterData,
                     float filterScale,
                     int32_t filterOffset,
                     const BiasType* biasData,
                     float outputScale,
                     int32_t outputOffset,
                     const TensorInfo& filterInfo,
                     bool depthwise = false)
{
    if (data.m_Parameters.m_BiasEnabled && !biasData)
    {
        throw InvalidArgumentException("Bias is enabled but the bias data is invalid");
    }

    const TensorInfo& inputInfo  = GetTensorInfo(data.m_Inputs[0]);
    const TensorInfo& outputInfo = GetTensorInfo(data.m_Outputs[0]);

    TensorBufferArrayView<InputType> output(outputInfo.GetShape(),
                                            GetOutputTensorData<InputType>(0, data),
                                            data.m_Parameters.m_DataLayout);

    const armnnUtils::DataLayoutIndexed dataLayoutIndexed(data.m_Parameters.m_DataLayout);

    const unsigned int channelsIndex = dataLayoutIndexed.GetChannelsIndex();
    const unsigned int heightIndex   = dataLayoutIndexed.GetHeightIndex();
    const unsigned int widthIndex    = dataLayoutIndexed.GetWidthIndex();

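    // Shape conventions assumed by the index arithmetic below (matching the reads of
    // filterInfo.GetShape() in this function): a depthwise filter is laid out as
    // [depthMultiplier, inputChannels, filterHeight, filterWidth], while a normal filter is
    // [outputChannels, inputChannels, filterHeight, filterWidth] for NCHW or
    // [outputChannels, filterHeight, filterWidth, inputChannels] for NHWC.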
    unsigned int depthMultiplier = depthwise ? filterInfo.GetShape()[0] : 1;
    unsigned int inputChannels   = depthwise ? filterInfo.GetShape()[1] : filterInfo.GetShape()[channelsIndex];
    unsigned int outputChannels  = depthwise ? inputChannels * depthMultiplier : filterInfo.GetShape()[0];

    unsigned int batchSize    = outputInfo.GetShape()[0];
    unsigned int outputHeight = outputInfo.GetShape()[heightIndex];
    unsigned int outputWidth  = outputInfo.GetShape()[widthIndex];
    unsigned int inputHeight  = inputInfo.GetShape()[heightIndex];
    unsigned int inputWidth   = inputInfo.GetShape()[widthIndex];

    unsigned int filterHeight = depthwise ? filterInfo.GetShape()[2] : filterInfo.GetShape()[heightIndex];
    unsigned int filterWidth  = depthwise ? filterInfo.GetShape()[3] : filterInfo.GetShape()[widthIndex];

    unsigned int paddingTop  = data.m_Parameters.m_PadTop;
    unsigned int paddingLeft = data.m_Parameters.m_PadLeft;
    unsigned int xStride     = data.m_Parameters.m_StrideX;
    unsigned int yStride     = data.m_Parameters.m_StrideY;

    // The world's least efficient convolution.
    for (unsigned int batchIdx = 0; batchIdx < batchSize; batchIdx++)
    {
        for (unsigned int cOutput = 0; cOutput < outputChannels; cOutput++)
        {
            for (unsigned int yOutput = 0; yOutput < outputHeight; yOutput++)
            {
                for (unsigned int xOutput = 0; xOutput < outputWidth; xOutput++)
                {
                    // This loop goes over each output element.
                    AccumulatorType sum = AccumulatorType();

                    // For depthwise convolution, each output channel corresponds to exactly one input
                    // channel, so the channel loop runs once; for a normal convolution it runs over
                    // every input channel.
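                    // For example (illustrative): with depthMultiplier == 2, output channels 0 and 1
                    // both read input channel 0, and output channels 2 and 3 both read input channel 1.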
                    for (unsigned int cInput = 0; cInput < (depthwise ? 1 : inputChannels); cInput++)
                    {
                        unsigned int depthwiseMultiplierIdx = 0;
                        if (depthwise)
                        {
                            cInput = cOutput / depthMultiplier;
                            depthwiseMultiplierIdx = cOutput % depthMultiplier;
                        }

                        for (unsigned int yFilter = 0; yFilter < filterHeight; yFilter++)
                        {
                            for (unsigned int xFilter = 0; xFilter < filterWidth; xFilter++)
                            {
                                // This loop goes over each input element for each output element.

                                unsigned int filterIndex = 0;

                                // The kernel's dimension ordering depends on whether the convolution
                                // is depthwise, so the flattened index calculation does too.
                                if (depthwise)
                                {
                                    filterIndex = depthwiseMultiplierIdx * filterWidth * filterHeight * inputChannels +
                                                  cInput * filterWidth * filterHeight +
                                                  yFilter * filterWidth +
                                                  xFilter;
                                }
                                else
                                {
                                    if (data.m_Parameters.m_DataLayout == DataLayout::NHWC)
                                    {
                                        filterIndex = cOutput * filterHeight * filterWidth * inputChannels +
                                                      yFilter * filterWidth * inputChannels +
                                                      xFilter * inputChannels +
                                                      cInput;
                                    }
                                    else
                                    {
                                        filterIndex = cOutput * filterWidth * filterHeight * inputChannels +
                                                      cInput * filterWidth * filterHeight +
                                                      yFilter * filterWidth +
                                                      xFilter;
                                    }
                                }

                                AccumulatorType filterValue = filterData[filterIndex] -
                                    boost::numeric_cast<AccumulatorType>(filterOffset);

                                unsigned int yInput = yOutput * yStride + yFilter;
                                unsigned int xInput = xOutput * xStride + xFilter;

                                AccumulatorType inputValue;

                                // Check if we're in the padding.
                                if (yInput < paddingTop || yInput >= inputHeight + paddingTop ||
                                    xInput < paddingLeft || xInput >= inputWidth + paddingLeft)
                                {
                                    inputValue = AccumulatorType();
                                }
                                else
                                {
                                    unsigned int inputIndex;

                                    if (data.m_Parameters.m_DataLayout == DataLayout::NHWC)
                                    {
                                        inputIndex = batchIdx * inputHeight * inputWidth * inputChannels +
                                                     (yInput - paddingTop) * inputWidth * inputChannels +
                                                     (xInput - paddingLeft) * inputChannels +
                                                     cInput;
                                    }
                                    else
                                    {
                                        inputIndex = batchIdx * inputWidth * inputHeight * inputChannels +
                                                     inputWidth * inputHeight * cInput +
                                                     inputWidth * (yInput - paddingTop) +
                                                     xInput - paddingLeft;
                                    }

                                    inputValue = inputData[inputIndex] -
                                        boost::numeric_cast<AccumulatorType>(inputOffset);
                                }
                                sum += filterValue * inputValue;
                            }
                        }
                    }

                    if (data.m_Parameters.m_BiasEnabled)
                    {
                        sum += biasData[cOutput];
                    }

                    if (outputScale != 0.0f)
                    {
                        float multiplier = (inputScale * filterScale) / outputScale;
                        // Apply the multiplier to sum, but do so using some quantized arithmetic which is consistent
                        // with the AndroidNN CPU implementation. This should be (roughly) equivalent to:
                        //  sum = std::round(multiplier * sum + outputOffset);
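                        // For example (illustrative numbers only): with inputScale = filterScale = 0.05f
                        // and outputScale = 0.1f, multiplier = 0.025f, so an accumulated sum of 1000
                        // becomes roughly 0.025f * 1000 + outputOffset = 25 + outputOffset before clamping.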
                        sum = boost::numeric_cast<AccumulatorType>(
                                QuantizedMultiplierSmallerThanOne(multiplier) * boost::numeric_cast<int32_t>(sum))
                                + boost::numeric_cast<AccumulatorType>(outputOffset);
                        sum = std::min<AccumulatorType>(std::max<AccumulatorType>(sum, 0), 255);
                    }

                    output.Get(batchIdx, cOutput, yOutput, xOutput) = boost::numeric_cast<InputType>(sum);
                }
            }
        }
    }
}
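
// A minimal calling sketch (illustrative only; inputData, filterData, biasData and m_Data are
// assumed to be the buffers and queue descriptor of the calling uint8 workload):
//
//   ConvImpl<Convolution2dQueueDescriptor, uint8_t, int32_t, int32_t>(
//       m_Data,
//       inputData,  inputInfo.GetQuantizationScale(),  inputInfo.GetQuantizationOffset(),
//       filterData, filterInfo.GetQuantizationScale(), filterInfo.GetQuantizationOffset(),
//       biasData,
//       outputInfo.GetQuantizationScale(), outputInfo.GetQuantizationOffset(),
//       filterInfo);
//
// For float workloads an outputScale of 0.0f skips the requantization and clamping step above.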

} //namespace armnn