//
// Copyright © 2017 Arm Ltd. All rights reserved.
// SPDX-License-Identifier: MIT
//

#pragma once

#include "RefWorkloadUtils.hpp"

#include <armnn/Tensor.hpp>

#include <boost/assert.hpp>
#include <boost/numeric/conversion/cast.hpp>

#include <cmath>
#include <limits>

namespace armnn
{

/// Performs multiplication of an integer with a multiplier which is less than one,
/// using quantized integer arithmetic which is consistent with AndroidNN's CPU executor.
struct QuantizedMultiplierSmallerThanOne
{
public:
    /// Constructs a QuantizedMultiplierSmallerThanOne which will multiply by the given multiplier.
    /// This stores the appropriate integer quantities (derived from the given multiplier) for later use.
    /// The implementation of this function is adapted from Android NN's QuantizeMultiplierSmallerThanOne().
    QuantizedMultiplierSmallerThanOne(float multiplier);

    /// The implementation of this function is adapted from Android NN's MultiplyByQuantizedMultiplierSmallerThanOne().
    int32_t operator*(int32_t rhs) const;

private:
    /// The implementation of this function is adapted from gemmlowp's SaturatingRoundingDoublingHighMul().
    static int32_t SaturatingRoundingDoublingHighMul(int32_t a, int32_t b);

    /// The implementation of this function is adapted from gemmlowp's RoundingDivideByPOT().
    static int32_t RoundingDivideByPOT(int32_t x, int exponent);

    int32_t m_Multiplier;
    int32_t m_RightShift;
};
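
// The following is a minimal illustrative sketch of the arithmetic performed by the helpers
// declared above, adapted from the public Android NN / gemmlowp reference code. Everything in
// this block is hypothetical exposition, not part of the ArmNN API: the real definitions live
// in the corresponding .cpp file and may differ in detail.
namespace example_sketch
{
// Decompose a real multiplier in [0, 1) into a Q0.31 fixed-point integer and a right shift,
// so that multiplier ~= quantizedMultiplier * 2^-31 * 2^-rightShift.
inline void QuantizeMultiplierSmallerThanOne(float multiplier, int32_t& quantizedMultiplier, int& rightShift)
{
    int shift = 0;
    const double q = std::frexp(static_cast<double>(multiplier), &shift); // q in [0.5, 1).
    rightShift = -shift;
    int64_t qFixed = static_cast<int64_t>(std::round(q * (1ll << 31)));
    if (qFixed == (1ll << 31)) // Rounding can push q up to exactly 1.0 in Q0.31; renormalize.
    {
        qFixed /= 2;
        --rightShift;
    }
    quantizedMultiplier = static_cast<int32_t>(qFixed);
}

// Returns the high 32 bits of 2*a*b with round-to-nearest, saturating the single overflow
// case (a == b == INT32_MIN), following gemmlowp's SaturatingRoundingDoublingHighMul().
inline int32_t SaturatingRoundingDoublingHighMul(int32_t a, int32_t b)
{
    const bool overflow = (a == b) && (a == std::numeric_limits<int32_t>::min());
    const int64_t ab64 = static_cast<int64_t>(a) * static_cast<int64_t>(b);
    const int32_t nudge = (ab64 >= 0) ? (1 << 30) : (1 - (1 << 30));
    const int32_t abX2High32 = static_cast<int32_t>((ab64 + nudge) / (1ll << 31));
    return overflow ? std::numeric_limits<int32_t>::max() : abX2High32;
}

// Divides by 2^exponent, rounding to nearest with halves away from zero, following
// gemmlowp's RoundingDivideByPOT().
inline int32_t RoundingDivideByPOT(int32_t x, int exponent)
{
    const int32_t mask = (1 << exponent) - 1;
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
    return (x >> exponent) + ((remainder > threshold) ? 1 : 0);
}
} // namespace example_sketch
//
// With these helpers, operator*(rhs) amounts to
//     RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(rhs, m_Multiplier), m_RightShift).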

/// An implementation shared by normal and depthwise convolution.
template<typename ConvData, typename InputType, typename BiasType, typename AccumulatorType>
static void ConvImpl(ConvData data,
                     const InputType* inputData,
                     float inputScale,
                     int32_t inputOffset,
                     const InputType* filterData,
                     float filterScale,
                     int32_t filterOffset,
                     const BiasType* biasData,
                     InputType* outputData,
                     float outputScale,
                     int32_t outputOffset,
                     const TensorInfo& filterInfo,
                     bool depthwise = false)
{
    if (data.m_Parameters.m_BiasEnabled && !biasData)
    {
        throw InvalidArgumentException("Bias is enabled but the bias data is invalid");
    }

    const TensorInfo& inputInfo0 = GetTensorInfo(data.m_Inputs[0]);
    const TensorInfo& outputInfo0 = GetTensorInfo(data.m_Outputs[0]);

    const DataLayoutIndexed dataLayoutIndexed(data.m_Parameters.m_DataLayout);
    const unsigned int channelsIndex = dataLayoutIndexed.GetChannelsIndex();
    const unsigned int heightIndex = dataLayoutIndexed.GetHeightIndex();
    const unsigned int widthIndex = dataLayoutIndexed.GetWidthIndex();

    unsigned int depthMult = depthwise ? filterInfo.GetShape()[0] : 1;
    unsigned int channelsInput = filterInfo.GetShape()[channelsIndex];
    unsigned int channelsOutput = depthwise ? channelsInput * depthMult : filterInfo.GetShape()[0];

    unsigned int batchSize = outputInfo0.GetShape()[0];
    unsigned int heightOutput = outputInfo0.GetShape()[heightIndex];
    unsigned int widthOutput = outputInfo0.GetShape()[widthIndex];
    unsigned int heightInput = inputInfo0.GetShape()[heightIndex];
    unsigned int widthInput = inputInfo0.GetShape()[widthIndex];

    unsigned int heightFilter = filterInfo.GetShape()[heightIndex];
    unsigned int widthFilter = filterInfo.GetShape()[widthIndex];

    unsigned int paddingTop = data.m_Parameters.m_PadTop;
    unsigned int paddingLeft = data.m_Parameters.m_PadLeft;
    unsigned int yStride = data.m_Parameters.m_StrideY;
    unsigned int xStride = data.m_Parameters.m_StrideX;
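    // Note: the output dimensions are read from the output tensor rather than recomputed here.
    // For a correctly configured workload they are assumed to satisfy the usual relation, e.g.
    //     heightOutput == (heightInput + padTop + padBottom - heightFilter) / strideY + 1,
    // so that every filter tap lands either in the input or in the zero padding handled below.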

    // The world's least efficient convolution.
    for (unsigned int batchIdx = 0; batchIdx < batchSize; batchIdx++)
    {
        for (unsigned int cOutput = 0; cOutput < channelsOutput; cOutput++)
        {
            for (unsigned int yOutput = 0; yOutput < heightOutput; yOutput++)
            {
                for (unsigned int xOutput = 0; xOutput < widthOutput; xOutput++)
                {
                    // This loop goes over each output element.
                    AccumulatorType sum = AccumulatorType();

                    // For depthwise convolution, each output channel corresponds to exactly one
                    // input channel; for normal convolution we must loop over every input channel.
                    for (unsigned int cInput = 0; cInput < (depthwise ? 1 : channelsInput); cInput++)
                    {
                        unsigned int depthwiseMultiplierIdx = 0;
                        if (depthwise)
                        {
                            cInput = cOutput / depthMult;
                            depthwiseMultiplierIdx = cOutput % depthMult;
                        }
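                        // For example, with depthMult == 2, output channels 0 and 1 both read
                        // input channel 0, with multiplier indices 0 and 1 respectively.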

                        for (unsigned int yFilter = 0; yFilter < heightFilter; yFilter++)
                        {
                            for (unsigned int xFilter = 0; xFilter < widthFilter; xFilter++)
                            {
                                // This loop goes over each input element for each output element.

                                unsigned int filterIndex;

                                // The filter layout, and hence the index, depends on whether the
                                // convolution is depthwise: the filter is laid out as
                                // [depthMult, channelsInput, heightFilter, widthFilter] for depthwise
                                // and [channelsOutput, channelsInput, heightFilter, widthFilter] otherwise.
                                if (depthwise)
                                {
                                    filterIndex = depthwiseMultiplierIdx * widthFilter * heightFilter * channelsInput +
                                                  cInput * widthFilter * heightFilter +
                                                  yFilter * widthFilter +
                                                  xFilter;
                                }
                                else
                                {
                                    filterIndex = cOutput * widthFilter * heightFilter * channelsInput +
                                                  cInput * widthFilter * heightFilter +
                                                  yFilter * widthFilter +
                                                  xFilter;
                                }
                                AccumulatorType filterValue = filterData[filterIndex] -
                                    boost::numeric_cast<AccumulatorType>(filterOffset);

                                unsigned int yInput = yOutput * yStride + yFilter;
                                unsigned int xInput = xOutput * xStride + xFilter;

                                AccumulatorType inputValue;

                                // Check if we're in the padding.
                                if (yInput < paddingTop || yInput >= heightInput + paddingTop ||
                                    xInput < paddingLeft || xInput >= widthInput + paddingLeft)
                                {
                                    inputValue = AccumulatorType();
                                }
                                else
                                {
                                    inputValue = inputData[batchIdx * widthInput * heightInput * channelsInput +
                                                           widthInput * heightInput * cInput +
                                                           widthInput * (yInput - paddingTop) +
                                                           xInput - paddingLeft] -
                                        boost::numeric_cast<AccumulatorType>(inputOffset);
                                }
                                sum += filterValue * inputValue;
                            }
                        }
                    }

                    if (data.m_Parameters.m_BiasEnabled)
                    {
                        sum += biasData[cOutput];
                    }

                    if (outputScale != 0.0f)
                    {
                        float multiplier = (inputScale * filterScale) / outputScale;
                        // Apply the multiplier to sum, but do so using some quantized arithmetic which is consistent
                        // with the AndroidNN CPU implementation. This should be (roughly) equivalent to:
                        //     sum = std::round(multiplier * sum + outputOffset);
                        sum = boost::numeric_cast<AccumulatorType>(
                                  QuantizedMultiplierSmallerThanOne(multiplier) * boost::numeric_cast<int32_t>(sum))
                              + boost::numeric_cast<AccumulatorType>(outputOffset);
                        sum = std::min<AccumulatorType>(std::max<AccumulatorType>(sum, 0), 255);
                    }

                    outputData[batchIdx * widthOutput * heightOutput * channelsOutput +
                               widthOutput * heightOutput * cOutput +
                               widthOutput * yOutput +
                               xOutput] = boost::numeric_cast<InputType>(sum);
                }
            }
        }
    }
}
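
// Usage note (illustrative, not a guarantee of this header's contract): the uint8 reference
// workloads are expected to instantiate this along the lines of
//
//     ConvImpl<Convolution2dQueueDescriptor, uint8_t, int32_t, int32_t>(
//         m_Data,
//         inputData,  inputInfo.GetQuantizationScale(),  inputInfo.GetQuantizationOffset(),
//         weightData, weightInfo.GetQuantizationScale(), weightInfo.GetQuantizationOffset(),
//         biasData,
//         outputData, outputInfo.GetQuantizationScale(), outputInfo.GetQuantizationOffset(),
//         weightInfo);
//
// whereas float32 workloads pass scales of 0.0f, so the requantization branch above is skipped
// and the accumulator is written out directly.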

} //namespace armnn