//
// Copyright © 2023-2024 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
#pragma once

#include <aclCommon/ArmComputeSubgraphUtils.hpp>

namespace armnn
{
// Collapses shapes of the form [1, 1, ..., W] to [W].
// Returns false if any dimension other than the last is not 1.
inline bool CollapseLeadingUnitDimensions(const TensorInfo& in, TensorInfo& out)
{
    unsigned int numDimensions = in.GetNumDimensions();
    for (unsigned int i = 0; i < (numDimensions - 1); ++i)
    {
        if (in.GetShape()[i] != 1)
        {
            return false;
        }
    }

    unsigned int w = in.GetShape()[numDimensions - 1];
    out = in;
    out.SetShape({w});

    return true;
}
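
// Example (illustrative sketch): collapsing a broadcastable bias shape.
// Given a TensorInfo with shape [1, 1, 1, 32], CollapseLeadingUnitDimensions
// produces a TensorInfo with shape [32] and returns true; a shape such as
// [1, 2, 32] is left untouched and the function returns false.
//
//     TensorInfo in({1, 1, 1, 32}, DataType::Float32);
//     TensorInfo out;
//     if (CollapseLeadingUnitDimensions(in, out))
//     {
//         // out.GetShape() is now [32]
//     }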
//
// Build slot and tensor info lists for Add/Mul/Add replacement
//
template<typename SlotListType>
void BuildAddMulAddSlotLists(bool handleReLu,
                             bool multipleOutputs,
                             std::vector<SlotListType>& inputLayersSlotLists,
                             std::vector<SlotListType>& outputLayersSlotLists)
{
    // Build input slot list
    inputLayersSlotLists.push_back({0, 1});   // Add
    inputLayersSlotLists.push_back({1});      // Mul
    inputLayersSlotLists.push_back({1});      // Add
    if (handleReLu)
    {
        inputLayersSlotLists.push_back({});   // Relu
    }

    // Build output slot list
    if (multipleOutputs)
    {
        outputLayersSlotLists.push_back({0}); // Add
    }
    else
    {
        outputLayersSlotLists.push_back({});  // Add
    }
    outputLayersSlotLists.push_back({});      // Mul
    if (handleReLu)
    {
        outputLayersSlotLists.push_back({});  // Add
        outputLayersSlotLists.push_back({0}); // Relu
    }
    else
    {
        outputLayersSlotLists.push_back({0}); // Add
    }
}
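
// Example (illustrative): for handleReLu == true and multipleOutputs == false,
// the lists built above come out as
//
//     inputLayersSlotLists  = { {0, 1}, {1}, {1}, {} }   // Add, Mul, Add, Relu
//     outputLayersSlotLists = { {},     {},  {},  {0} }  // Add, Mul, Add, Relu
//
// i.e. both inputs of the first Add, slot 1 of the Mul and of the second Add
// (their inputs not fed by the previous layer in the chain), and output slot 0
// of the final Relu are exposed as the slots of the fused layer.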
inline void GetFusedName(Layer* layerList[4], std::string& fusedName)
{
    // Build the fused name string
    fusedName = "fused";
    for (unsigned int layerIdx = 0; layerIdx < 4; ++layerIdx)
    {
        if (!layerList[layerIdx])
        {
            break;
        }
        fusedName += "-";
        fusedName += layerList[layerIdx]->GetNameStr();
    }
}
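
// Example (illustrative; the layer names are assumptions): for a chain of
// layers named "add0", "mul0" and "add1" with no Relu, iteration stops at the
// null fourth entry:
//
//     Layer* layers[4] = { add0, mul0, add1, nullptr };
//     std::string name;
//     GetFusedName(layers, name);   // name == "fused-add0-mul0-add1"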
template<typename Type>
bool BuildAddMulAddTensorInfoLists(Type* layerList[4],
                                   unsigned int& numInputs,
                                   unsigned int& numOutputs,
                                   std::vector<TensorInfo>& inputInfos,
                                   std::vector<TensorInfo>& outputInfos,
                                   const ActivationDescriptor*& activationDescriptor,
                                   bool& fuseReLu)
{
    ARMNN_THROW_INVALIDARG_IF_FALSE(layerList[0]);
    ARMNN_THROW_INVALIDARG_IF_FALSE(layerList[1]);
    ARMNN_THROW_INVALIDARG_IF_FALSE(layerList[2]);

    ARMNN_THROW_INVALIDARG_IF_FALSE(IsSequenceLayerType(*layerList[0], BinaryOperation::Add));
    ARMNN_THROW_INVALIDARG_IF_FALSE(IsSequenceLayerType(*layerList[1], BinaryOperation::Mul));
    ARMNN_THROW_INVALIDARG_IF_FALSE(IsSequenceLayerType(*layerList[2], BinaryOperation::Add));

    // A tensor counts as 1D if it is already 1D, or if all of its leading dimensions are 1
    auto is1D = [](const TensorInfo& expanded)
    {
        TensorInfo collapsed;
        if (CollapseLeadingUnitDimensions(expanded, collapsed))
        {
            return (collapsed.GetNumDimensions() == 1);
        }
        else
        {
            return (expanded.GetNumDimensions() == 1);
        }
    };
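
    // Example (illustrative): a [1, 1, 4] tensor collapses to [4] and so counts
    // as 1D, whereas a [1, 2, 4] tensor cannot be collapsed and is rejected.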
    // At least one of the two inputs of the Mul, and of the second Add, must be 1D,
    // as required by the backend kernel
    // ref: clframework/src/cpu/kernels/CpuAddMulAddKernel.cpp
    auto& mulLayer = *(PolymorphicDowncast<ElementwiseBinaryLayer*>(layerList[1]));
    auto& add2Layer = *(PolymorphicDowncast<ElementwiseBinaryLayer*>(layerList[2]));

    Layer& mulInput0 = mulLayer.GetInputSlot(0).GetConnectedOutputSlot()->GetOwningLayer();
    Layer& mulInput1 = mulLayer.GetInputSlot(1).GetConnectedOutputSlot()->GetOwningLayer();
    Layer& add2Input0 = add2Layer.GetInputSlot(0).GetConnectedOutputSlot()->GetOwningLayer();
    Layer& add2Input1 = add2Layer.GetInputSlot(1).GetConnectedOutputSlot()->GetOwningLayer();

    if (!is1D(mulInput0.GetOutputSlot(0).GetTensorInfo()) && !is1D(mulInput1.GetOutputSlot(0).GetTensorInfo()))
    {
        return false;
    }
    if (!is1D(add2Input0.GetOutputSlot(0).GetTensorInfo()) && !is1D(add2Input1.GetOutputSlot(0).GetTensorInfo()))
    {
        return false;
    }

    fuseReLu = (layerList[3] != nullptr);
    if (fuseReLu)
    {
        activationDescriptor = &PolymorphicDowncast<ActivationLayer*>(layerList[3])->GetParameters();
        ARMNN_THROW_INVALIDARG_IF_FALSE((activationDescriptor->m_Function == ActivationFunction::ReLu) ||
                                        (activationDescriptor->m_Function == ActivationFunction::BoundedReLu));
    }
    numInputs = 0;
    numOutputs = 0;

    // Ensure that there are 6 input slots in the Add/Mul/Add layers we are going to replace
    unsigned int layerIdx = 0;
    unsigned int inputSlotCount = 0;
    for (layerIdx = 0; layerIdx < 3; ++layerIdx)
    {
        for (unsigned int slotIdx = 0; slotIdx < layerList[layerIdx]->GetNumInputSlots(); ++slotIdx)
        {
            InputSlot* inputSlot = &layerList[layerIdx]->GetInputSlot(slotIdx);
            OutputSlot* outputSlot = inputSlot->GetConnectedOutputSlot();
            if (outputSlot)
            {
                if (layerIdx == 0)
                {
                    // Always count the input connections of the first Add
                    inputInfos.push_back(inputSlot->GetTensorInfo());
                    numInputs++;
                }
                else
                {
                    // For subsequent layers, skip the connection coming from the previous
                    // layer in the chain when counting external inputs
                    if (&outputSlot->GetOwningLayer() != layerList[layerIdx - 1])
                    {
                        TensorInfo inputSlotInfo = inputSlot->GetTensorInfo();
                        if (numInputs == 2 || numInputs == 3)
                        {
                            // Work around the broadcast optimization by collapsing shapes
                            // such as [1, 1, 1, 2] to [2], as required by the backend
                            if (CollapseLeadingUnitDimensions(inputSlot->GetTensorInfo(), inputSlotInfo))
                            {
                                OutputSlot* previousLayerSlot = inputSlot->GetConnectedOutputSlot();
                                if (previousLayerSlot)
                                {
                                    if (previousLayerSlot->GetOwningLayer().GetType() == LayerType::Constant)
                                    {
                                        // First update the TensorInfo in the constant owning layer
                                        previousLayerSlot->SetTensorInfo(inputSlotInfo);

                                        // Then update the TensorInfo in the workload for the owning layer
                                        ConstantLayer* layer = PolymorphicDowncast<ConstantLayer*>(
                                            &previousLayerSlot->GetOwningLayer());
                                        layer->m_LayerOutput = std::make_unique<ScopedTensorHandle>(
                                            ConstTensor(inputSlotInfo,
                                                        layer->m_LayerOutput->GetConstTensor<void>()));
                                    }
                                }
                            }
                        }
                        inputInfos.push_back(inputSlotInfo);
                        numInputs++;
                    }
                }
                inputSlotCount++;
            }
        }
    }

    // Check the input counts: 6 connected slots in total, of which 4 come from outside the chain
    bool validInputCount = (inputSlotCount == 6) && (inputInfos.size() == 4);
    if (!validInputCount)
    {
        return false;
    }
    const unsigned int maxIdx = (fuseReLu) ? 4 : 3;
    for (layerIdx = 0; layerIdx < maxIdx; ++layerIdx)
    {
        for (unsigned int slotIdx = 0; slotIdx < layerList[layerIdx]->GetNumOutputSlots(); ++slotIdx)
        {
            OutputSlot* outputSlot = &layerList[layerIdx]->GetOutputSlot(slotIdx);
            for (unsigned int connectionIdx = 0; connectionIdx < outputSlot->GetNumConnections(); ++connectionIdx)
            {
                InputSlot* inputSlot = outputSlot->GetConnection(connectionIdx);
                if (layerIdx < (maxIdx - 1))
                {
                    // For intermediate layers, only count connections that leave the chain
                    if (&inputSlot->GetOwningLayer() != layerList[layerIdx + 1])
                    {
                        outputInfos.push_back(outputSlot->GetTensorInfo());
                        numOutputs++;
                    }
                }
                else if (layerList[layerIdx] != nullptr)
                {
                    // All connections of the last layer in the chain become outputs of the fused layer
                    outputInfos.push_back(outputSlot->GetTensorInfo());
                    numOutputs++;
                }
            }
        }
    }

    // Check the output count
    bool validOutputCount = (!outputInfos.empty());
    if (!validOutputCount)
    {
        return false;
    }

    return true;
}
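
// Example usage (illustrative sketch; the layer variables and the surrounding
// optimization pass are assumptions, not part of this header). A backend
// substitution pass that has identified an Add -> Mul -> Add [-> Relu] chain
// could validate and describe it like this:
//
//     Layer* layers[4] = { addLayer1, mulLayer, addLayer2, reluLayer };
//     unsigned int numInputs = 0;
//     unsigned int numOutputs = 0;
//     std::vector<TensorInfo> inputInfos;
//     std::vector<TensorInfo> outputInfos;
//     const ActivationDescriptor* activationDescriptor = nullptr;
//     bool fuseReLu = false;
//
//     if (BuildAddMulAddTensorInfoLists(layers, numInputs, numOutputs,
//                                       inputInfos, outputInfos,
//                                       activationDescriptor, fuseReLu))
//     {
//         std::string fusedName;
//         GetFusedName(layers, fusedName);
//         // ... replace the chain with a single fused layer named fusedName
//     }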

} // namespace armnn