//
// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
#include "BatchMatMulImpl.hpp"
#include <armnn/backends/WorkloadData.hpp>
#include <armnn/Logging.hpp>
#include <armnnUtils/Permute.hpp>
#include <cmath>
#include <limits>
namespace armnn
{
BatchMatMul::BatchMatMul(const BatchMatMulDescriptor& params,
const TensorInfo& inputXInfo,
const TensorInfo& inputYInfo,
const TensorInfo& outputInfo,
Decoder<float>& inputXDecoder,
Decoder<float>& inputYDecoder,
Encoder<float>& outputEncoder)
: params(params),
inputXInfo(inputXInfo),
inputYInfo(inputYInfo),
outputInfo(outputInfo),
inputXDecoder(inputXDecoder),
inputYDecoder(inputYDecoder),
outputEncoder(outputEncoder)
{
inputXData = this->inputXDecoder.DecodeTensor(inputXInfo.GetShape());
inputYData = this->inputYDecoder.DecodeTensor(inputYInfo.GetShape());
// At this point, we don't touch the input decoders - just the resultant vectors
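    // ApplyParams() first applies any requested transpose/adjoint to the decoded data in
    // place; ApplyBatchMatMul() then walks the output tensor and writes each element
    // through the encoder.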
ApplyParams();
ApplyBatchMatMul();
}
void BatchMatMul::ApplyBatchMatMul()
{
auto axesXToMul = BatchMatMulDescriptor::GetAxesToMul(params.m_DataLayoutX,
inputXInfo.GetShape());
auto axesYToMul = BatchMatMulDescriptor::GetAxesToMul(params.m_DataLayoutY,
inputYInfo.GetShape());
AdjustAxesToMulForUnequalRanks(axesXToMul, axesYToMul);
unsigned int inputXColDim = axesXToMul.second;
unsigned int inputYRowDim = axesYToMul.first;
unsigned int inputYRowSize = inputYInfo.GetShape()[inputYRowDim];
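    // For each output index, take the dot product of the corresponding row of X and
    // column of Y along the shared axis. E.g. with the default data layouts, for X of
    // shape [2,3,4] and Y of shape [2,4,5] the output is [2,3,5] and inputYRowSize
    // (== inputXColSize) is 4. Batch-dimension broadcasting is handled by
    // AdjustToSafeIdx() when the inputs are read.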
auto batchMatMulOperation = [&](const std::vector<unsigned int>& curIdx)
{
float sum = 0.0f;
        // inputYRowSize is synonymous with inputXColSize
        for (unsigned int inputYRowIdx = 0; inputYRowIdx < inputYRowSize; inputYRowIdx++)
        {
auto xIdx = curIdx;
xIdx[inputXColDim] = inputYRowIdx;
auto yIdx = curIdx;
yIdx[inputYRowDim] = inputYRowIdx;
sum += (GetValueAt(DataSlot::InputX, xIdx) * GetValueAt(DataSlot::InputY, yIdx));
}
SetValueAt(sum, DataSlot::Output, curIdx);
};
auto startIdx = std::vector<unsigned int>(outputInfo.GetNumDimensions(), 0);
RecurseTensor(outputInfo,
batchMatMulOperation,
startIdx,
0);
}
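// Applies the descriptor's pre-multiplication options to each input independently.
// Transpose takes precedence if both transpose and adjoint are somehow set for the same
// input; validation elsewhere is assumed to reject such descriptors.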
void BatchMatMul::ApplyParams()
{
if(params.m_TransposeX)
{
Transpose(DataSlot::InputX);
}
else if(params.m_AdjointX)
{
Adjoint(DataSlot::InputX);
}
if(params.m_TransposeY)
{
Transpose(DataSlot::InputY);
}
else if(params.m_AdjointY)
{
Adjoint(DataSlot::InputY);
}
}
void BatchMatMul::Transpose(DataSlot type)
{
// AKA the permute of the tensor
// This modifies the tensor's info.
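    // The permute vector swaps the row and column axes of the relevant input,
    // e.g. a shape of [..., M, N] becomes [..., N, M]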
switch(type)
{
case DataSlot::InputX:
{
auto permuteVec = BatchMatMulDescriptor::GetPermuteVec(params.m_DataLayoutX,
inputXInfo.GetShape());
inputXInfo = armnnUtils::Permuted(inputXInfo, permuteVec);
std::vector<float> temp(inputXData.size());
armnnUtils::Permute(inputXInfo.GetShape(),
permuteVec,
inputXData.data(),
temp.data(),
sizeof(float));
inputXData = temp;
break;
}
case DataSlot::InputY:
{
auto permuteVec = BatchMatMulDescriptor::GetPermuteVec(params.m_DataLayoutY,
inputYInfo.GetShape());
inputYInfo = armnnUtils::Permuted(inputYInfo, permuteVec);
std::vector<float> temp(inputYData.size());
armnnUtils::Permute(inputYInfo.GetShape(),
permuteVec,
inputYData.data(),
temp.data(),
sizeof(float));
inputYData = temp;
break;
}
case DataSlot::Output: // We needn't transpose the output tensor
default:
break;
}
}
void BatchMatMul::Adjoint(DataSlot type)
{
// Finding the adjoint of a square matrix:
// Calculate the cofactor of each element (using Gauss elimination here)
// Apply a transpose to it (this also modifies the tensor's info)
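    // Illustrative 2x2 example: for [[a, b], [c, d]] the cofactor matrix is
    // [[d, -c], [-b, a]] and its transpose (the adjoint/adjugate) is [[d, -b], [-c, a]]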
TensorInfo& inputInfo = (type == DataSlot::InputX) ? inputXInfo : inputYInfo;
const auto& dataLayout = (type == DataSlot::InputX) ? params.m_DataLayoutX : params.m_DataLayoutY;
    const auto axesToAdjoint = BatchMatMulDescriptor::GetAxesToMul(dataLayout, inputInfo.GetShape());
ARMNN_ASSERT(inputInfo.GetShape()[axesToAdjoint.first] == inputInfo.GetShape()[axesToAdjoint.second]);
// We grab a copy of the tensor data to prevent overwriting
std::vector<float> inputDataClone = (type == DataSlot::InputX) ? inputXData : inputYData;
    // The sub-matrix is the matrix left over when the row and column of the current index are removed
unsigned int subMatAxisSize = inputInfo.GetShape()[axesToAdjoint.first] - 1;
std::vector<std::vector<float>> subMat(subMatAxisSize,
std::vector<float>(subMatAxisSize));
// Lambdas for each sub-step of the cofactor operation
    auto almostEquals = [&](const float& a, const float& b, float unitsInLastPlace = 2.0f)
    {
        float diff = std::fabs(a - b);
        // Scale the tolerance by the magnitude of the operands rather than by the
        // difference itself, with an absolute fallback for subnormal values near zero
        float bound = std::fabs(a + b) * std::numeric_limits<float>::epsilon() * unitsInLastPlace;
        return (diff <= bound) || (diff < std::numeric_limits<float>::min());
    };
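    // almostEquals is used below so that the elimination never pivots on (or divides by)
    // a numerically-vanishing element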
float swapMultiplier = std::numeric_limits<float>::max();
auto swapRows = [&](unsigned int rowIdxA, unsigned int rowIdxB)
{
        // Every row swap flips the sign of swapMultiplier (it is reset to 1 at the start of each cofactor op run)
for(unsigned int colIdx = 0; colIdx < subMatAxisSize; colIdx++)
{
float tmp = subMat[rowIdxA][colIdx];
subMat[rowIdxA][colIdx] = subMat[rowIdxB][colIdx];
subMat[rowIdxB][colIdx] = tmp;
}
swapMultiplier *= -1.0f;
};
auto findNextValidPivotRowIdx = [&](unsigned int colIdx)
{
unsigned int result = std::numeric_limits<unsigned int>::max();
// The original diagonal has been checked and is invalid
for(unsigned int rowIdx = colIdx+1; rowIdx < subMatAxisSize; rowIdx++)
{
if(!almostEquals(subMat[rowIdx][colIdx], 0.0f))
{
result = rowIdx;
break;
}
}
return result;
};
auto eliminate = [&](const float& pivot, unsigned int pivotPos)
{
for(unsigned int rowIdx = pivotPos+1; rowIdx < subMatAxisSize; rowIdx++)
{
float multiplierNumerator = subMat[rowIdx][pivotPos];
if(almostEquals(multiplierNumerator, 0.0f))
{
continue;
}
float multiplier = multiplierNumerator / pivot; // Susceptible to floating point inaccuracies
// Hence the almostEquals usage to counteract this
for(unsigned int colIdx = pivotPos; colIdx < subMatAxisSize; colIdx++)
{
// We start at col=pivotPos as we have assumed that all elements
// to our left have been eliminated to zero already
// We subtract based on the element directly above us in our pivot row
subMat[rowIdx][colIdx] -= multiplier * subMat[pivotPos][colIdx];
}
}
};
auto cofactorOperation = [&](const std::vector<unsigned int>& curIdx)
{
auto row = curIdx[axesToAdjoint.first];
auto col = curIdx[axesToAdjoint.second];
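        // Sign of the cofactor: (-1)^(row + col) (the 1-based indices used here give the same parity)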
float minorMultiplier = static_cast<float>(std::pow(-1, (row + 1 + col + 1)));
for(unsigned int subRow = 0; subRow < subMatAxisSize; subRow++)
{
for(unsigned int subCol = 0; subCol < subMatAxisSize; subCol++)
{
                unsigned int outerRow = (subRow >= row) ? subRow + 1 : subRow;
                unsigned int outerCol = (subCol >= col) ? subCol + 1 : subCol;
                auto cloneIdx = curIdx;
                cloneIdx[axesToAdjoint.first] = outerRow;
                cloneIdx[axesToAdjoint.second] = outerCol;
                subMat[subRow][subCol] = GetValueAt(type, cloneIdx, inputDataClone);
}
}
float determinant = 1.0f;
// Cover the edge cases and simple base cases before resorting to Gauss elimination for larger matrices
switch(subMatAxisSize)
{
case 0:
{
determinant = GetValueAt(type, curIdx, inputDataClone);
break;
}
case 1:
{
// If the resultant sub-matrix is just one element - that's the determinant
determinant = subMat[0][0];
break;
}
case 2:
{
// For a 2x2 sub-matrix, the determinant is just a*d-b*c
determinant = subMat[0][0] * subMat[1][1] -
subMat[0][1] * subMat[1][0];
break;
}
default:
{
// Gaussian elimination to find the determinant of this sub-matrix
swapMultiplier = 1.0f;
// March diagonally down the pivots and if it's invalid (a zero), swap the row with the
// nearest non-zero down within the column
for(unsigned int pivotRow = 0, pivotCol = 0;
pivotRow < subMatAxisSize;
pivotRow++, pivotCol++)
{
float& pivot = subMat[pivotRow][pivotCol];
if(almostEquals(pivot, 0.0f))
{
unsigned int nextValidPivotRowIdx = findNextValidPivotRowIdx(pivotCol);
if(nextValidPivotRowIdx == std::numeric_limits<unsigned int>::max())
{
                            // No valid pivot down this column, which means that this pivot remains a zero.
                            // That makes the determinant of this entire sub-matrix zero.
determinant = 0.0f;
break;
}
swapRows(pivotRow, nextValidPivotRowIdx);
}
determinant *= pivot;
// The actual elimination bit (which will update/propagate to the pivots down the line)
eliminate(pivot, pivotRow); // Synonymous with pivotCol
}
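                // After elimination the matrix is upper triangular, so the determinant is
                // the product of the pivots accumulated above; each row swap flipped its
                // sign, which swapMultiplier accounts for here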
determinant *= swapMultiplier;
break;
}
}
float cofactor = minorMultiplier * determinant;
SetValueAt(cofactor, type, curIdx);
};
auto startIdx = std::vector<unsigned int>(inputInfo.GetNumDimensions(), 0);
RecurseTensor(inputInfo,
cofactorOperation,
startIdx,
0);
Transpose(type);
}
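// Visits every element of the given tensor in row-major order, calling the supplied
// operation with the full multidimensional index of each element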
void BatchMatMul::RecurseTensor(const TensorInfo& tensorInfo,
const std::function<void(const std::vector<unsigned int>&)>& operation,
std::vector<unsigned int>& curIdx,
unsigned int curDim)
{
if(!(curDim < tensorInfo.GetNumDimensions()))
{
// We're at the leaf level of this call tree, so we operate here (each leaf is a data point)
operation(curIdx);
return;
}
for(unsigned int i = 0; i < tensorInfo.GetShape()[curDim]; i++)
{
curIdx[curDim] = i;
RecurseTensor(tensorInfo,
operation,
curIdx,
curDim + 1);
}
}
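// When the input ranks differ, shifts the row/column axes of the smaller-ranked input so
// that they line up with output-based indexing, e.g. (assuming the default data layouts)
// for X of shape [3, 4] and Y of shape [2, 4, 5], axesXToMul moves from (0, 1) to (1, 2)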
void BatchMatMul::AdjustAxesToMulForUnequalRanks(std::pair<unsigned int, unsigned int>& axesXToMul,
std::pair<unsigned int, unsigned int>& axesYToMul)
{
int rankDiff = static_cast<int>(inputXInfo.GetNumDimensions()) -
static_cast<int>(inputYInfo.GetNumDimensions());
if(rankDiff == 0)
{
return;
}
else if(rankDiff < 0)
{
// Y is the larger one
        axesXToMul.first += static_cast<unsigned int>(std::abs(rankDiff));
        axesXToMul.second += static_cast<unsigned int>(std::abs(rankDiff));
}
else if(rankDiff > 0)
{
// X is the larger one
        axesYToMul.first += static_cast<unsigned int>(std::abs(rankDiff));
        axesYToMul.second += static_cast<unsigned int>(std::abs(rankDiff));
}
}
float BatchMatMul::GetValueAt(DataSlot type, std::vector<unsigned int> idx, const std::vector<float>& customData)
{
    // For the inputs this reads from the locally-held data vectors (or the supplied
    // customData clone), not from the decoders; for the output it reads back through the
    // encoder itself.
AdjustToSafeIdx(type, idx);
unsigned int flatIdx = CalcFlatIdx(type, idx);
float value = 0.0f;
switch(type)
{
case DataSlot::InputX:
value = customData.empty() ? inputXData[flatIdx] : customData[flatIdx];
break;
case DataSlot::InputY:
value = customData.empty() ? inputYData[flatIdx] : customData[flatIdx];
break;
case DataSlot::Output:
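            // operator[] repositions the encoder at flatIdx; Get() then reads that element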
outputEncoder[flatIdx];
value = outputEncoder.Get();
break;
default:
break;
}
return value;
}
void BatchMatMul::SetValueAt(float value, DataSlot type, std::vector<unsigned int> idx)
{
AdjustToSafeIdx(type, idx);
unsigned int flatIdx = CalcFlatIdx(type, idx);
switch(type)
{
case DataSlot::InputX:
inputXData[flatIdx] = value;
break;
case DataSlot::InputY:
inputYData[flatIdx] = value;
break;
case DataSlot::Output:
outputEncoder[flatIdx];
outputEncoder.Set(value);
break;
default:
break;
}
}
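// Zeroes out any index component that the given tensor cannot address: dimensions the
// input lacks relative to the output, and positions beyond that dimension's extent
// (e.g. broadcast dimensions of size 1)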
void BatchMatMul::AdjustToSafeIdx(DataSlot type, std::vector<unsigned int>& idx)
{
for(unsigned int dim = 0; dim < idx.size(); dim++)
{
switch(type)
{
case DataSlot::InputX:
{
auto xRank = inputXInfo.GetNumDimensions();
auto xDiff = outputInfo.GetNumDimensions() - xRank;
if (dim < xDiff ||
idx[dim] > inputXInfo.GetShape()[dim-xDiff]-1)
{
idx[dim] = 0; // Broadcasting
}
break;
}
case DataSlot::InputY:
{
auto yRank = inputYInfo.GetNumDimensions();
auto yDiff = outputInfo.GetNumDimensions() - yRank;
if (dim < yDiff ||
idx[dim] > inputYInfo.GetShape()[dim-yDiff]-1)
{
idx[dim] = 0;
}
break;
}
case DataSlot::Output:
{
// Our indices are based off the output
break;
}
default:
break;
}
}
}
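// Flattens a multidimensional index into an offset into the relevant tensor's data using
// row-major strides; the offset accounts for inputs of smaller rank than the output.
// Illustrative example: for the output with shape [2, 3, 4], idx {1, 2, 3} maps to
// 1*12 + 2*4 + 3 = 23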
unsigned int BatchMatMul::CalcFlatIdx(DataSlot type, const std::vector<unsigned int>& idx)
{
unsigned int result = idx[idx.size()-1];
unsigned int dimMultiplier = 1;
    unsigned int offset = 0;
// -2 because final dim is already accounted for in the multiplier (last dim is just a multiplier of 1x)
for(unsigned int i = static_cast<unsigned int>(idx.size()-2); static_cast<int>(i) >= 0; i--)
{
switch(type)
{
case DataSlot::InputX:
offset = outputInfo.GetNumDimensions() - inputXInfo.GetNumDimensions();
dimMultiplier *= inputXInfo.GetShape()[i + 1 - offset];
break;
case DataSlot::InputY:
offset = outputInfo.GetNumDimensions() - inputYInfo.GetNumDimensions();
dimMultiplier *= inputYInfo.GetShape()[i + 1 - offset];
break;
case DataSlot::Output:
dimMultiplier *= outputInfo.GetShape()[i+1];
break;
default:
break;
}
result += (idx[i] * dimMultiplier);
}
return result;
}
} // namespace armnn