src/backends/backendsCommon/WorkloadUtils.hpp - ml/armnn - Gitiles

 //
 // Copyright © 2017-2024 Arm Ltd. All rights reserved.
 // SPDX-License-Identifier: MIT
 //

 #pragma once

 #include <armnn/backends/ITensorHandle.hpp>
 #include <armnn/backends/TensorHandle.hpp>
 #include <armnn/Tensor.hpp>
 #include <armnn/utility/PolymorphicDowncast.hpp>
 #include <armnnUtils/Permute.hpp>

 #include <Half.hpp>
 #include <Profiling.hpp>


 namespace armnn
 {
 namespace
 {

 template <typename ArrayType, typename Arg>
 void AssignValues(unsigned int num, unsigned int& idx, const ArrayType& array, Arg& arg)
 {
     if (idx >= num)
     {
         return;
     }

     arg = array[(num - 1) - idx];
     idx++;
 }

 template <typename T, typename ArrayType, typename... Args>
 void AssignValues(unsigned int num, unsigned int idx, const ArrayType& array, T& assignee, Args&... args)
 {
     AssignValues(num, idx, array, assignee);

     AssignValues(num, idx, array, args...);
 }

 }    // anonymous namespace

 template <typename CopyFunc>
 void CopyTensorContentsGeneric(const ITensorHandle* srcTensor, ITensorHandle* dstTensor, CopyFunc copy)
 {
     // For ease of understanding, names are assigned to the dimensions
     // of the tensor as if NHWC, however this routine works with any 5D tensor
     static_assert(MaxNumOfTensorDimensions == 5, "Please update CopyTensorContents");

     TensorShape srcStrides      = srcTensor->GetStrides();
     const TensorShape& srcShape = srcTensor->GetShape();
     const auto srcSize          = srcTensor->GetStrides()[0] * srcShape[0];
     TensorShape dstStrides      = dstTensor->GetStrides();
     const TensorShape& dstShape = dstTensor->GetShape();
     const auto dstSize          = dstTensor->GetStrides()[0] * dstShape[0];

     size_t srcDepth    = 1;
     size_t srcBatches  = 1;
     size_t srcHeight   = 1;
     size_t srcWidth    = 1;
     size_t srcChannels = 1;
     AssignValues(srcShape.GetNumDimensions(),
                  0,
                  srcShape,
                  srcChannels,
                  srcWidth,
                  srcHeight,
                  srcBatches,
                  srcDepth);

     size_t srcDepthStride   = 0;
     size_t srcBatchStride   = 0;
     size_t srcHeightStride  = 0;
     size_t srcWidthStride   = 0;
     size_t srcChannelStride = 0;
     AssignValues(srcStrides.GetNumDimensions(),
                  0,
                  srcStrides,
                  srcChannelStride,
                  srcWidthStride,
                  srcHeightStride,
                  srcBatchStride,
                  srcDepthStride);

     size_t dstDepth    = 1;
     size_t dstBatches  = 1;
     size_t dstHeight   = 1;
     size_t dstWidth    = 1;
     size_t dstChannels = 1;
     AssignValues(dstShape.GetNumDimensions(),
                  0,
                  dstShape,
                  dstChannels,
                  dstWidth,
                  dstHeight,
                  dstBatches,
                  dstDepth);

     size_t dstDepthStride   = 0;
     size_t dstBatchStride   = 0;
     size_t dstHeightStride  = 0;
     size_t dstWidthStride   = 0;
     size_t dstChannelStride = 0;
     AssignValues(dstStrides.GetNumDimensions(),
                  0,
                  dstStrides,
                  dstChannelStride,
                  dstWidthStride,
                  dstHeightStride,
                  dstBatchStride,
                  dstDepthStride);

     const unsigned char* srcDataStart;
     unsigned char* dstDataStart;
     {
         ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "Synchronize buffers");
         srcDataStart = static_cast<const uint8_t*>(srcTensor->Map());
         dstDataStart = static_cast<uint8_t*>(dstTensor->Map());
     }
     if (srcDataStart == nullptr)
     {
         throw MemoryValidationException("The source tensor is null.");
     }
     if (dstDataStart == nullptr)
     {
         throw MemoryValidationException("The destination tensor is null.");
     }

     size_t copyLength  = std::min(srcChannels * srcChannelStride, dstChannels * dstChannelStride);
     size_t copyWidth   = std::min(srcWidth, dstWidth);
     size_t copyHeight  = std::min(srcHeight, dstHeight);
     size_t copyBatches = std::min(srcBatches, dstBatches);
     size_t copyDepth   = std::min(srcDepth, dstDepth);

     // Coalesce inner dimensions where possible
     // to reduce overheard calling copy() and to
     // allow for memory bandwidth optimisations
     if (copyLength == srcWidthStride &&
         copyLength == dstWidthStride)
     {
         // There is no special padding between rows,
         // and sizes are compatible, so copy whole rows
         copyLength *= copyWidth;
         copyWidth = 1;

         if (copyLength == srcHeightStride &&
             copyLength == dstHeightStride)
         {
             // There is no special padding between batches
             // and sizes are compatible so copy whole batches
             copyLength *= copyHeight;
             copyHeight = 1;
         }
     }

     const unsigned char* srcData = srcDataStart;
     unsigned char* dstData = dstDataStart;
     for (unsigned int d = 0; d < copyDepth; ++d)
     {
         auto srcPtrDepth = srcData;
         auto dstPtrDepth = dstData;
         for (unsigned int b = 0; b < copyBatches; ++b)
         {
             auto srcPtrBatch = srcData;
             auto dstPtrBatch = dstData;
             for (unsigned int h = 0; h < copyHeight; ++h)
             {
                 auto srcPtrChannel = srcData;
                 auto dstPtrChannel = dstData;
                 for (unsigned int w = 0; w < copyWidth; ++w)
                 {
                     // Sanity check the memory area we've been asked to copy from and to.
                     if (copyLength > srcSize)
                     {
                         throw MemoryValidationException(
                             "The source tensor size does not match the size of the allocated tensor.");
                     }
                     if (copyLength > dstSize)
                     {
                         throw MemoryValidationException(
                             "The destination tensor size will overrun the destination tensor.");
                     }
                     copy(dstData, srcData, copyLength);
                     dstData += dstWidthStride;
                     srcData += srcWidthStride;
                 }
                 dstData += (static_cast<long>(dstHeightStride) - (dstData - dstPtrChannel));
                 srcData += (static_cast<long>(srcHeightStride) - (srcData - srcPtrChannel));
             }
             dstData += (static_cast<long>(dstBatchStride) - (dstData - dstPtrBatch));
             srcData += (static_cast<long>(srcBatchStride) - (srcData - srcPtrBatch));
         }
         dstData += (static_cast<long>(dstDepthStride) - (dstData - dstPtrDepth));
         srcData += (static_cast<long>(srcDepthStride) - (srcData - srcPtrDepth));
     }

     srcTensor->Unmap();
     dstTensor->Unmap();
 }

 template <typename SrcTensorHandleType, typename DstTensorHandleType, typename DescriptorType>
 void GatherTensorHandlePairs(const DescriptorType& descriptor,
                              std::vector<std::pair<SrcTensorHandleType*, DstTensorHandleType*>>& tensorHandlePairs)
 {
     const unsigned int numInputs = static_cast<unsigned int>(descriptor.m_Inputs.size());
     tensorHandlePairs.reserve(numInputs);

     for (unsigned int i = 0; i < numInputs; ++i)
     {
         SrcTensorHandleType* const srcTensorHandle =
             PolymorphicDowncast<SrcTensorHandleType*>(descriptor.m_Inputs[i]);
         DstTensorHandleType* const dstTensorHandle =
             PolymorphicDowncast<DstTensorHandleType*>(descriptor.m_Outputs[i]);

         tensorHandlePairs.emplace_back(srcTensorHandle, dstTensorHandle);
     }
 }

 int32_t ConvertMaskToACLFormat(int32_t mask, int32_t numDim);

 armnn::ConstTensor PermuteTensor(const ConstTensorHandle* tensor,
                                  const PermutationVector& permutationVector,
                                  void* permuteBuffer);

 void ReshapeWeightsForAcl(TensorInfo& weightInfo, DataLayout dataLayout);

 TensorInfo ConvertWeightTensorInfoFromArmnnToAcl(const TensorInfo& weightInfo, DataLayout dataLayout);

 /// Weights for depthwise have a datalayout of [1,H,W,O] = [1,H,W,I*M]
 /// This function coverts a TensorInfo from [1,H,W,I*M] to [1,I*M,H,W] (if NCHW) or keeps it at [1,H,W,I*M] (if NHWC)
 /// as required by the compute library
 /// Returns a tuple of converted weights tensor info and depth multiplier
 std::tuple<TensorInfo, unsigned int> Convert1HWOTensorInfoToAcl(const TensorInfo& weightInfo,
                                                                 const TensorInfo& inputInfo,
                                                                 const DataLayout dataLayout);

 armnn::ConstTensor ConvertWeightTensorFromArmnnToAcl(const ConstTensorHandle* weightTensor,
                                                      DataLayout dataLayout,
                                                      void* permuteBuffer);

 /// Weights for depthwise have a datalayout of [1,H,W,O] = [1,H,W,I*M]
 /// This function coverts a ConstCpuTensorHandle from [1,H,W,I*M] to [1,I*M,H,W] (if NCHW) or
 /// keeps it at [1,H,W,I*M] (if NHWC) as required by the compute library
 ///
 /// \param weightTensor - ConstTensorHandle of weights tensor
 /// \param inputInfo - TensorInfo of input tensor
 /// \param dataLayout - DataLayout of the input tensor
 /// \param permuteBuffer - Pointer to memory with the size of tensor. Used for the permutation
 /// \return tuple of transformed weights-ConstTensor and depthwise multiplier
 std::tuple<ConstTensor, unsigned int> Convert1HWOTensorToAcl(const ConstTensorHandle* weightTensor,
                                                              const TensorInfo& inputInfo,
                                                              const DataLayout dataLayout,
                                                              void* permuteBuffer);

 /// Converts a (weights) tensor from [1, H, W, I*M] = [1, H, W, O] to [M, I, H, W]
 ///
 /// \param weightTensor - ConstTensorHandle of the weight tensor that should be converted
 /// \param inputInfo - TensorInfo of the corresponding input tensor
 /// \param dataLayout - DataLayout of the input tensor e.g. NHWC or NCHW
 /// \param permuteBuffer - Memory location with the same size as the weight tensor to write converted data to
 /// \return - A tuple of ConstTensor and unsigned int which is the converted weightTensor and the depthMultiplier
 std::tuple<ConstTensor, unsigned int> Convert1HWOtoMIHW(const ConstTensorHandle* weightTensor,
                                                         const TensorInfo& inputInfo,
                                                         const DataLayout& dataLayout,
                                                         void* permuteBuffer);

 /// Calculates the key index values needed for GatherNd: N, ND, K, W, C (N is always 1)
 ///
 /// \param inputInfo0 - TensorInfo of the corresponding input tensor: params
 /// \param inputInfo1 - TensorInfo of the corresponding input tensor: indices
 /// \return - A map with names and values for  N, ND, K, W, C
 std::map<std::string, unsigned int> CalculateGatherNdKeyIndices(TensorInfo inputInfo0, TensorInfo inputInfo1);

 /// Generates a permutation vector of size rank that permutes the 2 most right dimensions
 ///
 /// \param rank - Tensor rank, i.e. number of dimensions in the tensors
 /// \return - A permutation vector that permutes the 2 last dimensions
 armnn::PermutationVector GeneratePermutationVectorOnLastTwoDimensions(unsigned int rank);

 /// Calculates the axis values for split operation.
 ///
 /// \param desc - Splitter Descriptor
 /// \param input - Input tensor shape
 /// \return - A set containing axis values of slitter operation
     std::set<unsigned int> ComputeSplitAxis(const armnn::SplitterDescriptor& desc, const TensorShape& input);

 }  //namespace armnn
	//
	// Copyright © 2017-2024 Arm Ltd. All rights reserved.
	// SPDX-License-Identifier: MIT
	//

	#pragma once

	#include <armnn/backends/ITensorHandle.hpp>
	#include <armnn/backends/TensorHandle.hpp>
	#include <armnn/Tensor.hpp>
	#include <armnn/utility/PolymorphicDowncast.hpp>
	#include <armnnUtils/Permute.hpp>

	#include <Half.hpp>
	#include <Profiling.hpp>


	namespace armnn
	{
	namespace
	{

	template <typename ArrayType, typename Arg>
	void AssignValues(unsigned int num, unsigned int& idx, const ArrayType& array, Arg& arg)
	{
	if (idx >= num)
	{
	return;
	}

	arg = array[(num - 1) - idx];
	idx++;
	}

	template <typename T, typename ArrayType, typename... Args>
	void AssignValues(unsigned int num, unsigned int idx, const ArrayType& array, T& assignee, Args&... args)
	{
	AssignValues(num, idx, array, assignee);

	AssignValues(num, idx, array, args...);
	}

	} // anonymous namespace

	template <typename CopyFunc>
	void CopyTensorContentsGeneric(const ITensorHandle* srcTensor, ITensorHandle* dstTensor, CopyFunc copy)
	{
	// For ease of understanding, names are assigned to the dimensions
	// of the tensor as if NHWC, however this routine works with any 5D tensor
	static_assert(MaxNumOfTensorDimensions == 5, "Please update CopyTensorContents");

	TensorShape srcStrides = srcTensor->GetStrides();
	const TensorShape& srcShape = srcTensor->GetShape();
	const auto srcSize = srcTensor->GetStrides()[0] * srcShape[0];
	TensorShape dstStrides = dstTensor->GetStrides();
	const TensorShape& dstShape = dstTensor->GetShape();
	const auto dstSize = dstTensor->GetStrides()[0] * dstShape[0];

	size_t srcDepth = 1;
	size_t srcBatches = 1;
	size_t srcHeight = 1;
	size_t srcWidth = 1;
	size_t srcChannels = 1;
	AssignValues(srcShape.GetNumDimensions(),
	0,
	srcShape,
	srcChannels,
	srcWidth,
	srcHeight,
	srcBatches,
	srcDepth);

	size_t srcDepthStride = 0;
	size_t srcBatchStride = 0;
	size_t srcHeightStride = 0;
	size_t srcWidthStride = 0;
	size_t srcChannelStride = 0;
	AssignValues(srcStrides.GetNumDimensions(),
	0,
	srcStrides,
	srcChannelStride,
	srcWidthStride,
	srcHeightStride,
	srcBatchStride,
	srcDepthStride);

	size_t dstDepth = 1;
	size_t dstBatches = 1;
	size_t dstHeight = 1;
	size_t dstWidth = 1;
	size_t dstChannels = 1;
	AssignValues(dstShape.GetNumDimensions(),
	0,
	dstShape,
	dstChannels,
	dstWidth,
	dstHeight,
	dstBatches,
	dstDepth);

	size_t dstDepthStride = 0;
	size_t dstBatchStride = 0;
	size_t dstHeightStride = 0;
	size_t dstWidthStride = 0;
	size_t dstChannelStride = 0;
	AssignValues(dstStrides.GetNumDimensions(),
	0,
	dstStrides,
	dstChannelStride,
	dstWidthStride,
	dstHeightStride,
	dstBatchStride,
	dstDepthStride);

	const unsigned char* srcDataStart;
	unsigned char* dstDataStart;
	{
	ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "Synchronize buffers");
	srcDataStart = static_cast<const uint8_t*>(srcTensor->Map());
	dstDataStart = static_cast<uint8_t*>(dstTensor->Map());
	}
	if (srcDataStart == nullptr)
	{
	throw MemoryValidationException("The source tensor is null.");
	}
	if (dstDataStart == nullptr)
	{
	throw MemoryValidationException("The destination tensor is null.");
	}

	size_t copyLength = std::min(srcChannels * srcChannelStride, dstChannels * dstChannelStride);
	size_t copyWidth = std::min(srcWidth, dstWidth);
	size_t copyHeight = std::min(srcHeight, dstHeight);
	size_t copyBatches = std::min(srcBatches, dstBatches);
	size_t copyDepth = std::min(srcDepth, dstDepth);

	// Coalesce inner dimensions where possible
	// to reduce overheard calling copy() and to
	// allow for memory bandwidth optimisations
	if (copyLength == srcWidthStride &&
	copyLength == dstWidthStride)
	{
	// There is no special padding between rows,
	// and sizes are compatible, so copy whole rows
	copyLength *= copyWidth;
	copyWidth = 1;

	if (copyLength == srcHeightStride &&
	copyLength == dstHeightStride)
	{
	// There is no special padding between batches
	// and sizes are compatible so copy whole batches
	copyLength *= copyHeight;
	copyHeight = 1;
	}
	}

	const unsigned char* srcData = srcDataStart;
	unsigned char* dstData = dstDataStart;
	for (unsigned int d = 0; d < copyDepth; ++d)
	{
	auto srcPtrDepth = srcData;
	auto dstPtrDepth = dstData;
	for (unsigned int b = 0; b < copyBatches; ++b)
	{
	auto srcPtrBatch = srcData;
	auto dstPtrBatch = dstData;
	for (unsigned int h = 0; h < copyHeight; ++h)
	{
	auto srcPtrChannel = srcData;
	auto dstPtrChannel = dstData;
	for (unsigned int w = 0; w < copyWidth; ++w)
	{
	// Sanity check the memory area we've been asked to copy from and to.
	if (copyLength > srcSize)
	{
	throw MemoryValidationException(
	"The source tensor size does not match the size of the allocated tensor.");
	}
	if (copyLength > dstSize)
	{
	throw MemoryValidationException(
	"The destination tensor size will overrun the destination tensor.");
	}
	copy(dstData, srcData, copyLength);
	dstData += dstWidthStride;
	srcData += srcWidthStride;
	}
	dstData += (static_cast<long>(dstHeightStride) - (dstData - dstPtrChannel));
	srcData += (static_cast<long>(srcHeightStride) - (srcData - srcPtrChannel));
	}
	dstData += (static_cast<long>(dstBatchStride) - (dstData - dstPtrBatch));
	srcData += (static_cast<long>(srcBatchStride) - (srcData - srcPtrBatch));
	}
	dstData += (static_cast<long>(dstDepthStride) - (dstData - dstPtrDepth));
	srcData += (static_cast<long>(srcDepthStride) - (srcData - srcPtrDepth));
	}

	srcTensor->Unmap();
	dstTensor->Unmap();
	}

	template <typename SrcTensorHandleType, typename DstTensorHandleType, typename DescriptorType>
	void GatherTensorHandlePairs(const DescriptorType& descriptor,
	std::vector<std::pair<SrcTensorHandleType, DstTensorHandleType>>& tensorHandlePairs)
	{
	const unsigned int numInputs = static_cast<unsigned int>(descriptor.m_Inputs.size());
	tensorHandlePairs.reserve(numInputs);

	for (unsigned int i = 0; i < numInputs; ++i)
	{
	SrcTensorHandleType* const srcTensorHandle =
	PolymorphicDowncast<SrcTensorHandleType*>(descriptor.m_Inputs[i]);
	DstTensorHandleType* const dstTensorHandle =
	PolymorphicDowncast<DstTensorHandleType*>(descriptor.m_Outputs[i]);

	tensorHandlePairs.emplace_back(srcTensorHandle, dstTensorHandle);
	}
	}

	int32_t ConvertMaskToACLFormat(int32_t mask, int32_t numDim);

	armnn::ConstTensor PermuteTensor(const ConstTensorHandle* tensor,
	const PermutationVector& permutationVector,
	void* permuteBuffer);

	void ReshapeWeightsForAcl(TensorInfo& weightInfo, DataLayout dataLayout);

	TensorInfo ConvertWeightTensorInfoFromArmnnToAcl(const TensorInfo& weightInfo, DataLayout dataLayout);

	/// Weights for depthwise have a datalayout of [1,H,W,O] = [1,H,W,I*M]
	/// This function coverts a TensorInfo from [1,H,W,IM] to [1,IM,H,W] (if NCHW) or keeps it at [1,H,W,I*M] (if NHWC)
	/// as required by the compute library
	/// Returns a tuple of converted weights tensor info and depth multiplier
	std::tuple<TensorInfo, unsigned int> Convert1HWOTensorInfoToAcl(const TensorInfo& weightInfo,
	const TensorInfo& inputInfo,
	const DataLayout dataLayout);

	armnn::ConstTensor ConvertWeightTensorFromArmnnToAcl(const ConstTensorHandle* weightTensor,
	DataLayout dataLayout,
	void* permuteBuffer);

	/// Weights for depthwise have a datalayout of [1,H,W,O] = [1,H,W,I*M]
	/// This function coverts a ConstCpuTensorHandle from [1,H,W,IM] to [1,IM,H,W] (if NCHW) or
	/// keeps it at [1,H,W,I*M] (if NHWC) as required by the compute library
	///
	/// \param weightTensor - ConstTensorHandle of weights tensor
	/// \param inputInfo - TensorInfo of input tensor
	/// \param dataLayout - DataLayout of the input tensor
	/// \param permuteBuffer - Pointer to memory with the size of tensor. Used for the permutation
	/// \return tuple of transformed weights-ConstTensor and depthwise multiplier
	std::tuple<ConstTensor, unsigned int> Convert1HWOTensorToAcl(const ConstTensorHandle* weightTensor,
	const TensorInfo& inputInfo,
	const DataLayout dataLayout,
	void* permuteBuffer);

	/// Converts a (weights) tensor from [1, H, W, I*M] = [1, H, W, O] to [M, I, H, W]
	///
	/// \param weightTensor - ConstTensorHandle of the weight tensor that should be converted
	/// \param inputInfo - TensorInfo of the corresponding input tensor
	/// \param dataLayout - DataLayout of the input tensor e.g. NHWC or NCHW
	/// \param permuteBuffer - Memory location with the same size as the weight tensor to write converted data to
	/// \return - A tuple of ConstTensor and unsigned int which is the converted weightTensor and the depthMultiplier
	std::tuple<ConstTensor, unsigned int> Convert1HWOtoMIHW(const ConstTensorHandle* weightTensor,
	const TensorInfo& inputInfo,
	const DataLayout& dataLayout,
	void* permuteBuffer);

	/// Calculates the key index values needed for GatherNd: N, ND, K, W, C (N is always 1)
	///
	/// \param inputInfo0 - TensorInfo of the corresponding input tensor: params
	/// \param inputInfo1 - TensorInfo of the corresponding input tensor: indices
	/// \return - A map with names and values for N, ND, K, W, C
	std::map<std::string, unsigned int> CalculateGatherNdKeyIndices(TensorInfo inputInfo0, TensorInfo inputInfo1);

	/// Generates a permutation vector of size rank that permutes the 2 most right dimensions
	///
	/// \param rank - Tensor rank, i.e. number of dimensions in the tensors
	/// \return - A permutation vector that permutes the 2 last dimensions
	armnn::PermutationVector GeneratePermutationVectorOnLastTwoDimensions(unsigned int rank);

	/// Calculates the axis values for split operation.
	///
	/// \param desc - Splitter Descriptor
	/// \param input - Input tensor shape
	/// \return - A set containing axis values of slitter operation
	std::set<unsigned int> ComputeSplitAxis(const armnn::SplitterDescriptor& desc, const TensorShape& input);

	} //namespace armnn