telsoa01 | 4fcda01 | 2018-03-09 14:13:49 +0000 | [diff] [blame] | 1 | // |
| 2 | // Copyright © 2017 Arm Ltd. All rights reserved. |
David Beck | ecb56cd | 2018-09-05 12:52:57 +0100 | [diff] [blame] | 3 | // SPDX-License-Identifier: MIT |
telsoa01 | 4fcda01 | 2018-03-09 14:13:49 +0000 | [diff] [blame] | 4 | // |
| 5 | |
| 6 | #pragma once |
| 7 | |
#include "RefWorkloadUtils.hpp"

#include <armnn/Tensor.hpp>

#include <boost/assert.hpp>
#include <boost/numeric/conversion/cast.hpp>

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>
| 17 | |
| 18 | namespace armnn |
| 19 | { |
| 20 | |
/// Multiplies an integer by a real-valued factor that is less than one, using only quantized
/// integer arithmetic so that the result is consistent with AndroidNN's CPU executor.
struct QuantizedMultiplierSmallerThanOne
{
    /// Creates an object that will multiply by the given multiplier.
    /// The integer quantities derived from the multiplier are computed here and stored for later use.
    /// The implementation is adapted from Android NN's QuantizeMultiplierSmallerThanOne().
    QuantizedMultiplierSmallerThanOne(float multiplier);

    /// Multiplies rhs by the multiplier this object was constructed with.
    /// The implementation is adapted from Android NN's MultiplyByQuantizedMultiplierSmallerThanOne().
    int32_t operator*(int32_t rhs) const;

private:
    /// The implementation is adapted from gemmlowp's RoundingDivideByPOT().
    static int32_t RoundingDivideByPOT(int32_t x, int exponent);

    /// The implementation is adapted from gemmlowp's SaturatingRoundingDoublingHighMul().
    static int32_t SaturatingRoundingDoublingHighMul(int32_t a, int32_t b);

    // Integer quantities derived from the float multiplier by the constructor.
    int32_t m_Multiplier;
    int32_t m_RightShift;
};
| 44 | |
telsoa01 | c577f2c | 2018-08-31 09:22:23 +0100 | [diff] [blame] | 45 | /// An implementation shared by normal and depthwise convolution. |
telsoa01 | 4fcda01 | 2018-03-09 14:13:49 +0000 | [diff] [blame] | 46 | template<typename ConvData, typename InputType, typename BiasType, typename AccumulatorType> |
| 47 | static void ConvImpl(ConvData data, |
| 48 | const InputType* inputData, |
| 49 | float inputScale, |
| 50 | int32_t inputOffset, |
| 51 | const InputType* filterData, |
| 52 | float filterScale, |
| 53 | int32_t filterOffset, |
| 54 | const BiasType* biasData, |
| 55 | InputType* outputData, |
| 56 | float outputScale, |
| 57 | int32_t outputOffset, |
telsoa01 | c577f2c | 2018-08-31 09:22:23 +0100 | [diff] [blame] | 58 | const TensorInfo& filterInfo, |
telsoa01 | 4fcda01 | 2018-03-09 14:13:49 +0000 | [diff] [blame] | 59 | bool depthwise = false) |
| 60 | { |
surmeh01 | 3537c2c | 2018-05-18 16:31:43 +0100 | [diff] [blame] | 61 | if (data.m_Parameters.m_BiasEnabled && !biasData) |
| 62 | { |
| 63 | throw InvalidArgumentException("Bias is enabled but the bias data is invalid"); |
| 64 | } |
| 65 | |
telsoa01 | 4fcda01 | 2018-03-09 14:13:49 +0000 | [diff] [blame] | 66 | const TensorInfo& inputInfo0 = GetTensorInfo(data.m_Inputs[0]); |
| 67 | const TensorInfo& outputInfo0 = GetTensorInfo(data.m_Outputs[0]); |
telsoa01 | 4fcda01 | 2018-03-09 14:13:49 +0000 | [diff] [blame] | 68 | |
| 69 | unsigned int depthMult = depthwise ? filterInfo.GetShape()[0] : 1; |
| 70 | unsigned int channelsInput = filterInfo.GetShape()[1]; |
| 71 | unsigned int channelsOutput = depthwise ? channelsInput * depthMult : filterInfo.GetShape()[0]; |
| 72 | |
telsoa01 | 4fcda01 | 2018-03-09 14:13:49 +0000 | [diff] [blame] | 73 | unsigned int batchSize = outputInfo0.GetShape()[0]; |
| 74 | unsigned int heightOutput = outputInfo0.GetShape()[2]; |
| 75 | unsigned int widthOutput = outputInfo0.GetShape()[3]; |
| 76 | unsigned int heightInput = inputInfo0.GetShape()[2]; |
| 77 | unsigned int widthInput = inputInfo0.GetShape()[3]; |
| 78 | |
| 79 | unsigned int heightFilter = filterInfo.GetShape()[2]; |
| 80 | unsigned int widthFilter = filterInfo.GetShape()[3]; |
| 81 | |
| 82 | unsigned int paddingTop = data.m_Parameters.m_PadTop; |
| 83 | unsigned int paddingLeft = data.m_Parameters.m_PadLeft; |
| 84 | unsigned int hStride = data.m_Parameters.m_StrideY; |
| 85 | unsigned int xStride = data.m_Parameters.m_StrideX; |
| 86 | |
telsoa01 | c577f2c | 2018-08-31 09:22:23 +0100 | [diff] [blame] | 87 | // The world's least efficient convolution. |
telsoa01 | 4fcda01 | 2018-03-09 14:13:49 +0000 | [diff] [blame] | 88 | for (unsigned int batchIdx = 0; batchIdx < batchSize; batchIdx++) |
| 89 | { |
| 90 | for (unsigned int cOutput = 0; cOutput < channelsOutput; cOutput++) |
| 91 | { |
| 92 | for (unsigned int yOutput = 0; yOutput < heightOutput; yOutput++) |
| 93 | { |
| 94 | for (unsigned int xOutput = 0; xOutput < widthOutput; xOutput++) |
| 95 | { |
telsoa01 | c577f2c | 2018-08-31 09:22:23 +0100 | [diff] [blame] | 96 | // This loop goes over each output element. |
telsoa01 | 4fcda01 | 2018-03-09 14:13:49 +0000 | [diff] [blame] | 97 | AccumulatorType sum = AccumulatorType(); |
| 98 | |
telsoa01 | c577f2c | 2018-08-31 09:22:23 +0100 | [diff] [blame] | 99 | // For depthwise, each output channel corresponds to exactly one input channel. |
| 100 | // For normal, must loop over each input channel. |
telsoa01 | 4fcda01 | 2018-03-09 14:13:49 +0000 | [diff] [blame] | 101 | for (unsigned int cInput = 0; cInput < (depthwise ? 1 : channelsInput); cInput++) |
| 102 | { |
| 103 | unsigned int depthwiseMultiplierIdx = 0; |
| 104 | if (depthwise) |
| 105 | { |
| 106 | cInput = cOutput / depthMult; |
| 107 | depthwiseMultiplierIdx = cOutput % depthMult; |
| 108 | } |
| 109 | |
| 110 | for (unsigned int yFilter = 0; yFilter < heightFilter; yFilter++) |
| 111 | { |
| 112 | for (unsigned int xFilter = 0; xFilter < widthFilter; xFilter++) |
| 113 | { |
telsoa01 | c577f2c | 2018-08-31 09:22:23 +0100 | [diff] [blame] | 114 | // This loop goes over each input element for each output element. |
telsoa01 | 4fcda01 | 2018-03-09 14:13:49 +0000 | [diff] [blame] | 115 | |
| 116 | unsigned int filterIndex; |
| 117 | |
telsoa01 | c577f2c | 2018-08-31 09:22:23 +0100 | [diff] [blame] | 118 | // Since dimensionality of kernel depends on depthwiseness, so does index. |
telsoa01 | 4fcda01 | 2018-03-09 14:13:49 +0000 | [diff] [blame] | 119 | if (depthwise) |
| 120 | { |
| 121 | filterIndex = depthwiseMultiplierIdx * widthFilter * heightFilter * channelsInput + |
| 122 | cInput * widthFilter * heightFilter + |
| 123 | yFilter * widthFilter + |
| 124 | xFilter; |
| 125 | } |
| 126 | else |
| 127 | { |
| 128 | filterIndex = cOutput * widthFilter * heightFilter * channelsInput + |
| 129 | cInput * widthFilter * heightFilter + |
| 130 | yFilter * widthFilter + |
| 131 | xFilter; |
| 132 | } |
| 133 | AccumulatorType filterValue = filterData[filterIndex] - |
| 134 | boost::numeric_cast<AccumulatorType>(filterOffset); |
| 135 | |
| 136 | unsigned int yInput = yOutput * hStride + yFilter; |
| 137 | unsigned int xInput = xOutput * xStride + xFilter; |
| 138 | |
| 139 | AccumulatorType inputValue; |
| 140 | |
telsoa01 | c577f2c | 2018-08-31 09:22:23 +0100 | [diff] [blame] | 141 | // Check if we're in the padding. |
telsoa01 | 4fcda01 | 2018-03-09 14:13:49 +0000 | [diff] [blame] | 142 | if (yInput < paddingTop || yInput >= heightInput + paddingTop || |
| 143 | xInput < paddingLeft || xInput >= widthInput + paddingLeft ) |
| 144 | { |
| 145 | inputValue = AccumulatorType(); |
| 146 | } |
| 147 | else |
| 148 | { |
| 149 | inputValue = inputData[batchIdx * widthInput * heightInput * channelsInput + |
| 150 | widthInput * heightInput * cInput + |
| 151 | widthInput * (yInput - paddingTop) + |
| 152 | xInput - paddingLeft] - |
| 153 | boost::numeric_cast<AccumulatorType>(inputOffset); |
| 154 | } |
| 155 | sum += filterValue * inputValue; |
| 156 | } |
| 157 | } |
| 158 | } |
| 159 | |
| 160 | if (data.m_Parameters.m_BiasEnabled) |
| 161 | { |
| 162 | sum += biasData[cOutput]; |
| 163 | } |
| 164 | |
| 165 | if (outputScale != 0.0f) |
| 166 | { |
| 167 | float multiplier = (inputScale * filterScale) / outputScale; |
| 168 | // Apply the multiplier to sum, but do so using some quantized arithmetic which is consistent |
| 169 | // with the AndroidNN CPU implementation. This should be (roughly) equivalent to: |
| 170 | // sum = std::round(multiplier * sum + outputOffset); |
| 171 | sum = boost::numeric_cast<AccumulatorType>( |
| 172 | QuantizedMultiplierSmallerThanOne(multiplier) * boost::numeric_cast<int32_t>(sum)) |
| 173 | + boost::numeric_cast<AccumulatorType>(outputOffset); |
| 174 | sum = std::min<AccumulatorType>(std::max<AccumulatorType>(sum, 0), 255); |
| 175 | } |
| 176 | |
| 177 | outputData[batchIdx * widthOutput * heightOutput * channelsOutput + |
| 178 | widthOutput * heightOutput * cOutput + |
| 179 | widthOutput * yOutput + |
| 180 | xOutput] = boost::numeric_cast<InputType>(sum); |
| 181 | } |
| 182 | } |
| 183 | } |
| 184 | } |
| 185 | } |
| 186 | |
| 187 | } //namespace armnn |