//
// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//

#pragma once

#include "Optimization.hpp"

#include <armnnUtils/DataLayoutIndexed.hpp>

#include <armnn/utility/Assert.hpp>
#include <armnn/utility/PolymorphicDowncast.hpp>

#include <ResolveType.hpp>
namespace armnn
{
namespace optimizations
{

template <typename ConvLayer, armnn::DataType ArmnnType,
          typename T = armnn::ResolveType<ArmnnType>>
class FuseBatchNorm
{
public:
    /// Run for every exclusive connection between a base Convolution (or DepthwiseConvolution) layer
    /// and a child BatchNormalization layer, for non-quantized layers.
    /// The child layer will be removed, and the base layer will be removed if it is left unconnected.
    /// A new Convolution layer is added in their place; its weights and bias are calculated from the
    /// weights and bias of the base Convolution layer combined with the parameters of the child
    /// BatchNormalization layer.
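    ///
    /// Worked math (this follows directly from the definitions used below): batch normalization computes
    ///     y = gamma * (x - mean) / sqrt(variance + epsilon) + beta,
    /// so substituting the convolution output x = weights * input + bias gives the fused parameters
    ///     fusedWeights = (gamma / sqrt(variance + epsilon)) * weights
    ///     fusedBias    = (gamma * (bias - mean)) / sqrt(variance + epsilon) + beta
    /// computed per output channel.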
    void Run(Graph& graph, InputSlot& connection) const
    {
        Layer& base  = connection.GetConnectedOutputSlot()->GetOwningLayer();
        Layer& child = connection.GetOwningLayer();

        bool depthwise = (base.GetType() == LayerType::DepthwiseConvolution2d);

        ARMNN_ASSERT(base.GetType() == LayerType::Convolution2d || depthwise);
        ARMNN_ASSERT(child.GetType() == LayerType::BatchNormalization);

        if (base.GetDataType() == ArmnnType && child.GetDataType() == ArmnnType)
        {
            OutputSlot* parentOut = base.GetInputSlot(0).GetConnectedOutputSlot();

            auto convLayer      = PolymorphicDowncast<ConvLayer*>(&base);
            auto batchNormLayer = PolymorphicDowncast<BatchNormalizationLayer*>(&child);

            // Read convolution and batch norm parameters
            BatchNormalizationDescriptor batchNormDescriptor = batchNormLayer->GetParameters();
            auto epsilon = batchNormDescriptor.m_Eps;
            ConstTensor betaTensor (batchNormLayer->m_Beta->GetTensorInfo(),     batchNormLayer->m_Beta->Map(true));
            ConstTensor gammaTensor(batchNormLayer->m_Gamma->GetTensorInfo(),    batchNormLayer->m_Gamma->Map(true));
            ConstTensor meanTensor (batchNormLayer->m_Mean->GetTensorInfo(),     batchNormLayer->m_Mean->Map(true));
            ConstTensor varTensor  (batchNormLayer->m_Variance->GetTensorInfo(), batchNormLayer->m_Variance->Map(true));

            auto convDescriptor = convLayer->GetParameters();
            auto weightsInfo(convLayer->m_Weight->GetTensorInfo());
            ConstTensor weightsTensor(weightsInfo, convLayer->m_Weight->Map(true));

            armnnUtils::DataLayoutIndexed dataLayout(convDescriptor.m_DataLayout);
            auto weightsShape = weightsInfo.GetShape();
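            // Weight layouts assumed by the index arithmetic below: regular convolution weights are
            // [ O, I, H, W ] for NCHW and [ O, H, W, I ] for NHWC; depthwise weights are
            // [ M, I, H, W ] (M = depth multiplier) irrespective of the descriptor's data layout.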
            const unsigned int depthMultiplier = depthwise ? weightsShape[0] : 1;

            const unsigned int inputChannels  = depthwise ? weightsShape[1] :
                                                            weightsShape[dataLayout.GetChannelsIndex()];
            const unsigned int outputChannels = depthwise ? inputChannels * depthMultiplier :
                                                            weightsShape[0];

            const unsigned int weightsHeight  = depthwise ? weightsShape[2] :
                                                            weightsShape[dataLayout.GetHeightIndex()];
            const unsigned int weightsWidth   = depthwise ? weightsShape[3] :
                                                            weightsShape[dataLayout.GetWidthIndex()];

            const auto* weightsBuffer = static_cast<const T*>(weightsTensor.GetMemoryArea());
            const auto* betaBuffer    = static_cast<const T*>(betaTensor.GetMemoryArea());
            const auto* gammaBuffer   = static_cast<const T*>(gammaTensor.GetMemoryArea());
            const auto* meanBuffer    = static_cast<const T*>(meanTensor.GetMemoryArea());
            const auto* varBuffer     = static_cast<const T*>(varTensor.GetMemoryArea());

            std::vector<T> weightsVector (weightsBuffer, weightsBuffer + weightsTensor.GetNumElements());
            std::vector<T> betaVector    (betaBuffer,    betaBuffer    + betaTensor.GetNumElements());
            std::vector<T> gammaVector   (gammaBuffer,   gammaBuffer   + gammaTensor.GetNumElements());
            std::vector<T> meanVector    (meanBuffer,    meanBuffer    + meanTensor.GetNumElements());
            std::vector<T> varianceVector(varBuffer,     varBuffer     + varTensor.GetNumElements());

            // fusedWeights = (gamma * weights) / sqrt(variance + epsilon)
            std::vector<T> fusedWeightsVector(weightsVector.size());
            unsigned int depthwiseMultiplierIdx = 0;

            for (unsigned int cInput = 0; cInput < inputChannels; ++cInput)
            {
                for (unsigned int cOut = 0; cOut < outputChannels; ++cOut)
                {
                    T mult = gammaVector[cOut] / static_cast<T>(sqrtf(varianceVector[cOut] + epsilon));
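                    // For depthwise convolution the inner cOut loop already visits every weight once:
                    // cInput is overwritten from cOut here, so the outer cInput loop ends after one pass.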
                    if (depthwise)
                    {
                        cInput                 = cOut / depthMultiplier;
                        depthwiseMultiplierIdx = cOut % depthMultiplier;
                    }

                    for (unsigned int h = 0; h < weightsHeight; ++h)
                    {
                        for (unsigned int w = 0; w < weightsWidth; ++w)
                        {
                            unsigned int weightsIdx = 0;

                            if (depthwise)
                            {
                                weightsIdx = depthwiseMultiplierIdx * weightsWidth * weightsHeight * inputChannels +
                                             cInput * weightsWidth * weightsHeight +
                                             h * weightsWidth +
                                             w;
                            }
                            else if (convDescriptor.m_DataLayout == DataLayout::NHWC)
                            {
                                weightsIdx = cOut * weightsHeight * weightsWidth * inputChannels +
                                             h * weightsWidth * inputChannels +
                                             w * inputChannels +
                                             cInput;
                            }
                            else
                            {
                                weightsIdx = cOut * weightsWidth * weightsHeight * inputChannels +
                                             cInput * weightsWidth * weightsHeight +
                                             h * weightsWidth +
                                             w;
                            }

                            fusedWeightsVector[weightsIdx] = mult * weightsVector[weightsIdx];
                        }
                    }
                }
            }

            ConstTensor fusedWeightsTensor(weightsInfo, fusedWeightsVector);

            // fusedBias = (gamma * (bias - mean)) / sqrt(variance + epsilon) + beta
            std::vector<T> fusedBiasVector(outputChannels);
            if (convDescriptor.m_BiasEnabled)
            {
                ARMNN_ASSERT_MSG(convLayer->m_Bias != nullptr,
                                 "FuseBatchNorm: Bias data should not be null if bias is enabled.");

                ConstTensor biasTensor(convLayer->m_Bias->GetTensorInfo(), convLayer->m_Bias->Map(true));
                const auto* biasBuffer = static_cast<const T*>(biasTensor.GetMemoryArea());
                std::vector<T> biasVector(biasBuffer, biasBuffer + biasTensor.GetNumElements());

                for (unsigned int cOut = 0; cOut < outputChannels; ++cOut)
                {
                    fusedBiasVector[cOut] = ((gammaVector[cOut] * (biasVector[cOut] - meanVector[cOut])) /
                                             sqrtf(varianceVector[cOut] + epsilon)) + betaVector[cOut];
                }
            }
            else
            {
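                // The base layer had no bias, but the fused layer needs one: with a zero convolution
                // bias the fused bias still evaluates to beta - (gamma * mean) / sqrt(variance + epsilon),
                // which is generally non-zero.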
                convDescriptor.m_BiasEnabled = true;
                std::vector<T> biasVector(outputChannels, T(0));

                for (unsigned int cOut = 0; cOut < outputChannels; ++cOut)
                {
                    fusedBiasVector[cOut] = ((gammaVector[cOut] * (biasVector[cOut] - meanVector[cOut])) /
                                             sqrtf(varianceVector[cOut] + epsilon)) + betaVector[cOut];
                }
            }
            ConstTensor fusedBiasTensor(TensorInfo({outputChannels}, ArmnnType), fusedBiasVector);

            // Insert the new convolution layer that has the batch norm parameters fused into it
            const std::string name = std::string("fused-") + child.GetName() + std::string("-into-") + base.GetName();
            auto& newConv2dLayer = *graph.InsertNewLayer<ConvLayer>(base.GetInputSlot(0),
                                                                    convDescriptor,
                                                                    name.c_str());
            newConv2dLayer.m_Weight = std::make_unique<ScopedCpuTensorHandle>(fusedWeightsTensor);
            newConv2dLayer.m_Bias   = std::make_unique<ScopedCpuTensorHandle>(ConstTensor(fusedBiasTensor));

            // Reconnect with the original parent.
            newConv2dLayer.GetOutputSlot().MoveAllConnections(*parentOut);

            // The parent is now the new convolution layer.
            parentOut = &newConv2dLayer.GetOutputSlot();

            // Move connections from the child's output to the new parent layer. The child layer is
            // removed as it is left unconnected; the base layer is removed if left unconnected.
            child.GetOutputSlot().MoveAllConnections(*parentOut);
        }
    }

protected:
    FuseBatchNorm()  = default;
    ~FuseBatchNorm() = default;
};

using FuseBatchNormIntoConvolution2DFloat32 =
    OptimizeForExclusiveConnection<Convolution2dLayer,
                                   BatchNormalizationLayer,
                                   FuseBatchNorm<Convolution2dLayer, armnn::DataType::Float32>>;

using FuseBatchNormIntoConvolution2DFloat16 =
    OptimizeForExclusiveConnection<Convolution2dLayer,
                                   BatchNormalizationLayer,
                                   FuseBatchNorm<Convolution2dLayer, armnn::DataType::Float16>>;

using FuseBatchNormIntoDepthwiseConvolution2DFloat32 =
    OptimizeForExclusiveConnection<DepthwiseConvolution2dLayer,
                                   BatchNormalizationLayer,
                                   FuseBatchNorm<DepthwiseConvolution2dLayer, armnn::DataType::Float32>>;

using FuseBatchNormIntoDepthwiseConvolution2DFloat16 =
    OptimizeForExclusiveConnection<DepthwiseConvolution2dLayer,
                                   BatchNormalizationLayer,
                                   FuseBatchNorm<DepthwiseConvolution2dLayer, armnn::DataType::Float16>>;
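
// Illustrative usage sketch, assuming armnn's Optimizer::Pass and MakeOptimizations entry points
// and a Graph in which a Float32 Convolution2d feeds a BatchNormalization through an exclusive
// connection:
//
//     Optimizer::Pass(graph, MakeOptimizations(FuseBatchNormIntoConvolution2DFloat32()));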
} // namespace optimizations
} // namespace armnn