IVGCVSW-2354 Caffe SqueezeNet through armnn Cl and Neon

 * Compute Softmax Acl axis for Cl and Neon
 * Add unittests for Softmax in 3D and 4D
 * Correct input and output layer names in CaffeSqueezeNet inference test

Change-Id: I2d369d9a2db19c40f2af3341039dd33f0c5637b1
Signed-off-by: Narumol Prangnawarat <narumol.prangnawarat@arm.com>
diff --git a/src/backends/aclCommon/ArmComputeUtils.hpp b/src/backends/aclCommon/ArmComputeUtils.hpp
index 3d5b9ca..b4673f7 100644
--- a/src/backends/aclCommon/ArmComputeUtils.hpp
+++ b/src/backends/aclCommon/ArmComputeUtils.hpp
@@ -120,4 +120,14 @@
     return fc_info;
 }
 
+inline unsigned int ComputeSoftmaxAclAxis(const armnn::TensorInfo& tensor)
+{
+    unsigned int dim = tensor.GetNumDimensions();
+
+    BOOST_ASSERT(dim != 0);
+
+    // Currently ArmNN support axis 1.
+    return dim - 1;
+}
+
 } // namespace armnn
diff --git a/src/backends/backendsCommon/test/LayerTests.cpp b/src/backends/backendsCommon/test/LayerTests.cpp
index ce02fed..a088aaa 100644
--- a/src/backends/backendsCommon/test/LayerTests.cpp
+++ b/src/backends/backendsCommon/test/LayerTests.cpp
@@ -907,6 +907,22 @@
     return SimpleSoftmaxTestImpl<armnn::DataType::Float32>(workloadFactory, memoryManager, beta);
 }
 
+LayerTestResult<float,3> Simple3dSoftmaxTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float beta)
+{
+    return Simple3dSoftmaxTestImpl<armnn::DataType::Float32>(workloadFactory, memoryManager, beta);
+}
+
+LayerTestResult<float,4> Simple4dSoftmaxTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float beta)
+{
+    return Simple4dSoftmaxTestImpl<armnn::DataType::Float32>(workloadFactory, memoryManager, beta);
+}
+
 LayerTestResult<uint8_t,2> SimpleSoftmaxUint8Test(
     armnn::IWorkloadFactory& workloadFactory,
     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
@@ -915,6 +931,22 @@
     return SimpleSoftmaxTestImpl<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager, beta);
 }
 
+LayerTestResult<uint8_t,3> Simple3dSoftmaxUint8Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float beta)
+{
+    return Simple3dSoftmaxTestImpl<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager, beta);
+}
+
+LayerTestResult<uint8_t,4> Simple4dSoftmaxUint8Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float beta)
+{
+    return Simple4dSoftmaxTestImpl<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager, beta);
+}
+
 LayerTestResult<float,4> CompareNormalizationTest(
     armnn::IWorkloadFactory& workloadFactory,
     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
diff --git a/src/backends/backendsCommon/test/LayerTests.hpp b/src/backends/backendsCommon/test/LayerTests.hpp
index 587ffe9..a8cb553 100644
--- a/src/backends/backendsCommon/test/LayerTests.hpp
+++ b/src/backends/backendsCommon/test/LayerTests.hpp
@@ -335,11 +335,31 @@
     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
     float beta);
 
+LayerTestResult<float, 3> Simple3dSoftmaxTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float beta);
+
+LayerTestResult<float, 4> Simple4dSoftmaxTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float beta);
+
 LayerTestResult<uint8_t, 2> SimpleSoftmaxUint8Test(
     armnn::IWorkloadFactory& workloadFactory,
     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
     float beta);
 
+LayerTestResult<uint8_t,3> Simple3dSoftmaxUint8Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float beta);
+
+LayerTestResult<uint8_t,4> Simple4dSoftmaxUint8Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float beta);
+
 LayerTestResult<float, 4> SimpleSigmoidTest(
     armnn::IWorkloadFactory& workloadFactory,
     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
diff --git a/src/backends/backendsCommon/test/SoftmaxTestImpl.hpp b/src/backends/backendsCommon/test/SoftmaxTestImpl.hpp
index 25ceda1..8081950 100644
--- a/src/backends/backendsCommon/test/SoftmaxTestImpl.hpp
+++ b/src/backends/backendsCommon/test/SoftmaxTestImpl.hpp
@@ -19,34 +19,35 @@
 
 #include <algorithm>
 
-template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
-LayerTestResult<T, 2> SimpleSoftmaxTestImpl(
+template<armnn::DataType ArmnnType, std::size_t n, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, n> SimpleSoftmaxBaseTestImpl(
     armnn::IWorkloadFactory& workloadFactory,
     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
-    float beta)
+    float beta,
+    const armnn::TensorShape& inputShape,
+    const std::vector<float>& outputData)
 {
     using std::exp;
 
+    const float qScale = 1.f / 256.f;
+    const int qOffset = 0;
+
     armnn::TensorInfo inputTensorInfo;
     armnn::TensorInfo outputTensorInfo;
 
-    unsigned int inputShape[] = { 2, 4 };
-
-    inputTensorInfo = armnn::TensorInfo(2, inputShape, ArmnnType);
-    float qScale = 1.f / 256.f;
-    int qOffset = 0;
+    inputTensorInfo = armnn::TensorInfo(inputShape, ArmnnType);
     inputTensorInfo.SetQuantizationScale(qScale);
     inputTensorInfo.SetQuantizationOffset(qOffset);
 
-    outputTensorInfo = armnn::TensorInfo(2, inputShape, ArmnnType);
+    outputTensorInfo = armnn::TensorInfo(inputShape, ArmnnType);
     outputTensorInfo.SetQuantizationScale(qScale);
     outputTensorInfo.SetQuantizationOffset(qOffset);
 
-    LayerTestResult<T, 2> ret(outputTensorInfo);
+    LayerTestResult<T, n> ret(outputTensorInfo);
 
     // Each row is independently softmax'd.
-    auto input = MakeTensor<T, 2>(inputTensorInfo, std::vector<T>(
-        QuantizedVector<T>(qScale, 0, {
+    auto input = MakeTensor<T, n>(inputTensorInfo, std::vector<T>(
+        QuantizedVector<T>(qScale, qOffset, {
             0.f, 1.f, 0.f, 0.f,
             .5f, 0.f, 0.f, 0.f,
         })));
@@ -65,35 +66,76 @@
 
     inputHandle->Allocate();
     outputHandle->Allocate();
-    CopyDataToITensorHandle(inputHandle.get(), &input[0][0]);
+    CopyDataToITensorHandle(inputHandle.get(), input.origin());
+
+    BOOST_ASSERT(workload);
 
     ExecuteWorkload(*workload, memoryManager);
 
-    CopyDataFromITensorHandle(&ret.output[0][0], outputHandle.get());
+    CopyDataFromITensorHandle(ret.output.origin(), outputHandle.get());
 
-    float x0[4] = { exp((0.f - 1.0f) * beta), exp((1.0f - 1.0f) * beta),
-        exp((0.0f - 1.0f) * beta), exp((0.0f - 1.0f) * beta) };
-    float sum0 = x0[0] + x0[1] + x0[2] + x0[3];
-    float x1[4] = { exp((0.5f - 0.5f) * beta), exp((0.0f - 0.5f) * beta),
-        exp((0.0f - 0.5f) * beta), exp((0.0f - 0.5f) * beta) };
-    float sum1 = x1[0] + x1[1] + x1[2] + x1[3];
-
-    ret.outputExpected = MakeTensor<T, 2>(outputTensorInfo, std::vector<T>(
-        QuantizedVector<T>(qScale, qOffset, {
-        x0[0] / sum0, x0[1] / sum0, x0[2] / sum0, x0[3] / sum0,
-        x1[0] / sum1, x1[1] / sum1, x1[2] / sum1, x1[3] / sum1
-        })));
+    std::vector<T> expectedOutput = std::vector<T>(
+            QuantizedVector<T>(qScale, qOffset, outputData));
+    ret.outputExpected = MakeTensor<T, n>(outputTensorInfo, expectedOutput);
 
     return ret;
 }
 
 template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
-LayerTestResult<T, 2> CompareSoftmaxTestImpl(
+LayerTestResult<T, 2> SimpleSoftmaxTestImpl(
     armnn::IWorkloadFactory& workloadFactory,
     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
-    armnn::IWorkloadFactory& refWorkloadFactory,
     float beta)
 {
+    using std::exp;
+    const armnn::TensorShape inputShape{ 2, 4 };
+
+    float x0[4] = { exp((0.f - 1.0f) * beta), exp((1.0f - 1.0f) * beta),
+                    exp((0.0f - 1.0f) * beta), exp((0.0f - 1.0f) * beta) };
+    float sum0 = x0[0] + x0[1] + x0[2] + x0[3];
+    float x1[4] = { exp((0.5f - 0.5f) * beta), exp((0.0f - 0.5f) * beta),
+                    exp((0.0f - 0.5f) * beta), exp((0.0f - 0.5f) * beta) };
+    float sum1 = x1[0] + x1[1] + x1[2] + x1[3];
+
+    const std::vector<float> outputData = { x0[0] / sum0, x0[1] / sum0, x0[2] / sum0, x0[3] / sum0,
+                                            x1[0] / sum1, x1[1] / sum1, x1[2] / sum1, x1[3] / sum1 };
+
+    return SimpleSoftmaxBaseTestImpl<ArmnnType, 2>(workloadFactory, memoryManager, beta, inputShape, outputData);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 3> Simple3dSoftmaxTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float beta)
+{
+    const armnn::TensorShape inputShape{ 1, 8, 1 };
+    const std::vector<float> outputData = { 0.0964599f, 0.26220518f, 0.0964599f, 0.0964599f,
+                                            0.15903549f, 0.0964599f, 0.0964599f, 0.0964599f };
+
+    return SimpleSoftmaxBaseTestImpl<ArmnnType, 3>(workloadFactory, memoryManager, beta, inputShape, outputData);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> Simple4dSoftmaxTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float beta)
+{
+    const armnn::TensorShape inputShape{ 1, 8, 1, 1 };
+    const std::vector<float> outputData = { 0.0964599f, 0.26220518f, 0.0964599f, 0.0964599f,
+                                            0.15903549f, 0.0964599f, 0.0964599f, 0.0964599f };
+
+    return SimpleSoftmaxBaseTestImpl<ArmnnType, 4>(workloadFactory, memoryManager, beta, inputShape, outputData);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 2> CompareSoftmaxTestImpl(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        armnn::IWorkloadFactory& refWorkloadFactory,
+        float beta)
+{
 
     const int batchSize = 20;
     const int channels = 30;
diff --git a/src/backends/cl/test/ClLayerTests.cpp b/src/backends/cl/test/ClLayerTests.cpp
index 23b4dc4..58b9ba7 100644
--- a/src/backends/cl/test/ClLayerTests.cpp
+++ b/src/backends/cl/test/ClLayerTests.cpp
@@ -34,11 +34,6 @@
 // Activation
 ARMNN_AUTO_TEST_CASE(ConstantLinearActivation, ConstantLinearActivationTest)
 
-ARMNN_AUTO_TEST_CASE(SimpleSoftmaxBeta1, SimpleSoftmaxTest, 1.0f)
-ARMNN_AUTO_TEST_CASE(SimpleSoftmaxBeta2, SimpleSoftmaxTest, 2.0f)
-ARMNN_AUTO_TEST_CASE(SimpleSoftmaxBeta1Uint8, SimpleSoftmaxUint8Test, 1.0f)
-ARMNN_AUTO_TEST_CASE(SimpleSoftmaxBeta2Uint8, SimpleSoftmaxUint8Test, 2.0f)
-
 ARMNN_AUTO_TEST_CASE(ReLu1Uint8, BoundedReLuUint8UpperAndLowerBoundTest)
 ARMNN_AUTO_TEST_CASE(ReLu6Uint8, BoundedReLuUint8UpperBoundOnlyTest)
 
@@ -377,6 +372,22 @@
 ARMNN_AUTO_TEST_CASE(GreaterBroadcast1ElementUint8, GreaterBroadcast1ElementUint8Test)
 ARMNN_AUTO_TEST_CASE(GreaterBroadcast1DVectorUint8, GreaterBroadcast1DVectorUint8Test)
 
+// Softmax
+ARMNN_AUTO_TEST_CASE(SimpleSoftmaxBeta1, SimpleSoftmaxTest, 1.0f)
+ARMNN_AUTO_TEST_CASE(SimpleSoftmaxBeta2, SimpleSoftmaxTest, 2.0f)
+ARMNN_AUTO_TEST_CASE(SimpleSoftmaxBeta1Uint8, SimpleSoftmaxUint8Test, 1.0f)
+ARMNN_AUTO_TEST_CASE(SimpleSoftmaxBeta2Uint8, SimpleSoftmaxUint8Test, 2.0f)
+
+ARMNN_AUTO_TEST_CASE(Simple3dSoftmax, Simple3dSoftmaxTest, 1.0f)
+ARMNN_AUTO_TEST_CASE(Simple3dSoftmaxUint8, Simple3dSoftmaxUint8Test, 1.0f)
+
+ARMNN_AUTO_TEST_CASE(Simple4dSoftmax, Simple4dSoftmaxTest, 1.0f)
+ARMNN_AUTO_TEST_CASE(Simple4dSoftmaxUint8, Simple4dSoftmaxUint8Test, 1.0f)
+
+ARMNN_COMPARE_REF_AUTO_TEST_CASE(CompareSoftmaxBeta1WithReference, CompareSoftmaxTest, 1.0f)
+ARMNN_COMPARE_REF_AUTO_TEST_CASE(CompareSoftmaxBeta2WithReference, CompareSoftmaxTest, 2.0f)
+ARMNN_COMPARE_REF_AUTO_TEST_CASE(CompareSoftmaxUint8, CompareSoftmaxUint8Test, 1.0f)
+
 // Space To Batch Nd
 ARMNN_AUTO_TEST_CASE(SpaceToBatchNdSimpleFloat32, SpaceToBatchNdSimpleFloat32Test)
 ARMNN_AUTO_TEST_CASE(SpaceToBatchNdMultiChannelsFloat32, SpaceToBatchNdMultiChannelsFloat32Test)
@@ -445,10 +456,6 @@
                                  armnn::NormalizationAlgorithmChannel::Across,
                                  armnn::NormalizationAlgorithmMethod::LocalBrightness)
 
-ARMNN_COMPARE_REF_AUTO_TEST_CASE(CompareSoftmaxBeta1WithReference, CompareSoftmaxTest, 1.0f)
-ARMNN_COMPARE_REF_AUTO_TEST_CASE(CompareSoftmaxBeta2WithReference, CompareSoftmaxTest, 2.0f)
-ARMNN_COMPARE_REF_AUTO_TEST_CASE(CompareSoftmaxUint8, CompareSoftmaxUint8Test, 1.0f)
-
 ARMNN_COMPARE_REF_AUTO_TEST_CASE(CompareMaxPooling2dWithRef, ComparePooling2dTest, armnn::PoolingAlgorithm::Max)
 
 ARMNN_COMPARE_REF_AUTO_TEST_CASE(CompareAveragePooling2dWithRef, ComparePooling2dTest, armnn::PoolingAlgorithm::Average)
diff --git a/src/backends/cl/workloads/ClSoftmaxFloatWorkload.cpp b/src/backends/cl/workloads/ClSoftmaxFloatWorkload.cpp
index bb56802..c78ab03 100644
--- a/src/backends/cl/workloads/ClSoftmaxFloatWorkload.cpp
+++ b/src/backends/cl/workloads/ClSoftmaxFloatWorkload.cpp
@@ -4,11 +4,12 @@
 //
 
 #include "ClSoftmaxFloatWorkload.hpp"
+#include "ClWorkloadUtils.hpp"
+
+#include <aclCommon/ArmComputeUtils.hpp>
 #include <cl/ClTensorHandle.hpp>
 #include <backendsCommon/CpuTensorHandle.hpp>
 
-#include "ClWorkloadUtils.hpp"
-
 namespace armnn
 {
 
@@ -21,7 +22,9 @@
 
     arm_compute::ICLTensor& input  = static_cast<ClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ICLTensor& output = static_cast<ClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
-    m_SoftmaxLayer.configure(&input, &output, m_Data.m_Parameters.m_Beta);
+
+    unsigned int aclAxis = ComputeSoftmaxAclAxis(info.m_InputTensorInfos[0]);
+    m_SoftmaxLayer.configure(&input, &output, m_Data.m_Parameters.m_Beta, aclAxis);
 }
 
 void ClSoftmaxFloatWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClSoftmaxUint8Workload.cpp b/src/backends/cl/workloads/ClSoftmaxUint8Workload.cpp
index c386e38..086f375 100644
--- a/src/backends/cl/workloads/ClSoftmaxUint8Workload.cpp
+++ b/src/backends/cl/workloads/ClSoftmaxUint8Workload.cpp
@@ -4,11 +4,12 @@
 //
 
 #include "ClSoftmaxUint8Workload.hpp"
+#include "ClWorkloadUtils.hpp"
+
+#include <aclCommon/ArmComputeUtils.hpp>
 #include <cl/ClTensorHandle.hpp>
 #include <backendsCommon/CpuTensorHandle.hpp>
 
-#include "ClWorkloadUtils.hpp"
-
 namespace armnn
 {
 
@@ -30,7 +31,8 @@
             "Invalid quantization for output. Only scale = 1.0f / 256.0f and offset = 0 supported");
     }
 
-    m_SoftmaxLayer.configure(&input, &output, descriptor.m_Parameters.m_Beta);
+    unsigned int aclAxis = ComputeSoftmaxAclAxis(info.m_InputTensorInfos[0]);
+    m_SoftmaxLayer.configure(&input, &output, descriptor.m_Parameters.m_Beta, aclAxis);
 }
 
 void ClSoftmaxUint8Workload::Execute() const
diff --git a/src/backends/neon/test/NeonLayerTests.cpp b/src/backends/neon/test/NeonLayerTests.cpp
index 01773db..b34e2dd 100644
--- a/src/backends/neon/test/NeonLayerTests.cpp
+++ b/src/backends/neon/test/NeonLayerTests.cpp
@@ -275,14 +275,21 @@
 // Activation
 ARMNN_AUTO_TEST_CASE(ConstantLinearActivation, ConstantLinearActivationTest)
 
+// ReLu
+ARMNN_AUTO_TEST_CASE(ReLu1Uint8, BoundedReLuUint8UpperAndLowerBoundTest)
+ARMNN_AUTO_TEST_CASE(ReLu6Uint8, BoundedReLuUint8UpperBoundOnlyTest)
+
+// Softmax
 ARMNN_AUTO_TEST_CASE(SimpleSoftmaxBeta1, SimpleSoftmaxTest, 1.0f)
 ARMNN_AUTO_TEST_CASE(SimpleSoftmaxBeta2, SimpleSoftmaxTest, 2.0f)
-
 ARMNN_AUTO_TEST_CASE(SimpleSoftmaxBeta1Uint8, SimpleSoftmaxUint8Test, 1.0f)
 ARMNN_AUTO_TEST_CASE(SimpleSoftmaxBeta2Uint8, SimpleSoftmaxUint8Test, 2.0f)
 
-ARMNN_AUTO_TEST_CASE(ReLu1Uint8, BoundedReLuUint8UpperAndLowerBoundTest)
-ARMNN_AUTO_TEST_CASE(ReLu6Uint8, BoundedReLuUint8UpperBoundOnlyTest)
+ARMNN_AUTO_TEST_CASE(Simple3dSoftmaxBeta1, Simple3dSoftmaxTest, 1.0f)
+ARMNN_AUTO_TEST_CASE(Simple3dSoftmaxBeta1Uint8, Simple3dSoftmaxUint8Test, 1.0f)
+
+ARMNN_AUTO_TEST_CASE(Simple4dSoftmaxBeta1, Simple4dSoftmaxTest, 1.0f)
+ARMNN_AUTO_TEST_CASE(Simple4dSoftmaxBeta1Uint8, Simple4dSoftmaxUint8Test, 1.0f)
 
 // Splitter
 ARMNN_AUTO_TEST_CASE(SimpleSplitter, SplitterTest)
diff --git a/src/backends/neon/workloads/NeonSoftmaxFloatWorkload.cpp b/src/backends/neon/workloads/NeonSoftmaxFloatWorkload.cpp
index d9c78bb..afc6135 100644
--- a/src/backends/neon/workloads/NeonSoftmaxFloatWorkload.cpp
+++ b/src/backends/neon/workloads/NeonSoftmaxFloatWorkload.cpp
@@ -7,6 +7,7 @@
 
 #include "NeonWorkloadUtils.hpp"
 
+#include <aclCommon/ArmComputeUtils.hpp>
 #include <arm_compute/runtime/NEON/functions/NESoftmaxLayer.h>
 
 namespace armnn
@@ -22,8 +23,9 @@
     arm_compute::ITensor& input = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ITensor& output = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
+    unsigned int aclAxis = ComputeSoftmaxAclAxis(info.m_InputTensorInfos[0]);
     auto layer = std::make_unique<arm_compute::NESoftmaxLayer>(memoryManager);
-    layer->configure(&input, &output, m_Data.m_Parameters.m_Beta);
+    layer->configure(&input, &output, m_Data.m_Parameters.m_Beta, aclAxis);
     m_SoftmaxLayer.reset(layer.release());
 }
 
diff --git a/src/backends/neon/workloads/NeonSoftmaxUint8Workload.cpp b/src/backends/neon/workloads/NeonSoftmaxUint8Workload.cpp
index f780589..7f295d6 100644
--- a/src/backends/neon/workloads/NeonSoftmaxUint8Workload.cpp
+++ b/src/backends/neon/workloads/NeonSoftmaxUint8Workload.cpp
@@ -4,9 +4,10 @@
 //
 
 #include "NeonSoftmaxUint8Workload.hpp"
-
 #include "NeonWorkloadUtils.hpp"
 
+#include <aclCommon/ArmComputeUtils.hpp>
+
 #include <arm_compute/runtime/NEON/functions/NESoftmaxLayer.h>
 
 namespace armnn
@@ -29,9 +30,10 @@
         throw InvalidArgumentException(
             "Invalid quantization for output. Only scale = 1.0f / 256.0f and offset = 0 supported");
     }
+    unsigned int aclAxis = ComputeSoftmaxAclAxis(info.m_InputTensorInfos[0]);
 
     auto layer = std::make_unique<arm_compute::NESoftmaxLayer>(memoryManager);
-    layer->configure(&input, &output, descriptor.m_Parameters.m_Beta);
+    layer->configure(&input, &output, descriptor.m_Parameters.m_Beta, aclAxis);
     m_SoftmaxLayer.reset(layer.release());
 }
 
diff --git a/src/backends/reference/test/RefLayerTests.cpp b/src/backends/reference/test/RefLayerTests.cpp
index 4be917f..13ea82d 100644
--- a/src/backends/reference/test/RefLayerTests.cpp
+++ b/src/backends/reference/test/RefLayerTests.cpp
@@ -176,11 +176,19 @@
 ARMNN_AUTO_TEST_CASE(SimpleNormalizationWithin, SimpleNormalizationWithinTest)
 ARMNN_AUTO_TEST_CASE(SimpleNormalizationAcrossNhwc, SimpleNormalizationAcrossNhwcTest)
 
+// Softmax
 ARMNN_AUTO_TEST_CASE(SimpleSoftmaxBeta1, SimpleSoftmaxTest, 1.0f)
 ARMNN_AUTO_TEST_CASE(SimpleSoftmaxBeta2, SimpleSoftmaxTest, 2.0f)
 ARMNN_AUTO_TEST_CASE(SimpleSoftmaxBeta1Uint8, SimpleSoftmaxUint8Test, 1.0f)
 ARMNN_AUTO_TEST_CASE(SimpleSoftmaxBeta2Uint8, SimpleSoftmaxUint8Test, 2.0f)
 
+ARMNN_AUTO_TEST_CASE(Simple3dSoftmax, Simple3dSoftmaxTest, 1.0f)
+ARMNN_AUTO_TEST_CASE(Simple3dSoftmaxUint8, Simple3dSoftmaxUint8Test, 1.0f)
+
+ARMNN_AUTO_TEST_CASE(Simple4dSoftmax, Simple4dSoftmaxTest, 1.0f)
+ARMNN_AUTO_TEST_CASE(Simple4dSoftmaxUint8, Simple4dSoftmaxUint8Test, 1.0f)
+
+// Sigmoid
 ARMNN_AUTO_TEST_CASE(SimpleSigmoid, SimpleSigmoidTest)
 ARMNN_AUTO_TEST_CASE(SimpleSigmoidUint8, SimpleSigmoidUint8Test)
 
diff --git a/tests/CaffeSqueezeNet1_0-Armnn/CaffeSqueezeNet1_0-Armnn.cpp b/tests/CaffeSqueezeNet1_0-Armnn/CaffeSqueezeNet1_0-Armnn.cpp
index 5eaf2ef..f4c16fe 100644
--- a/tests/CaffeSqueezeNet1_0-Armnn/CaffeSqueezeNet1_0-Armnn.cpp
+++ b/tests/CaffeSqueezeNet1_0-Armnn/CaffeSqueezeNet1_0-Armnn.cpp
@@ -8,8 +8,13 @@
 
 int main(int argc, char* argv[])
 {
-    return armnn::test::ClassifierInferenceTestMain<CaffePreprocessor, armnnCaffeParser::ICaffeParser>(
+    using DataType = float;
+    using DatabaseType = CaffePreprocessor;
+    using ParserType = armnnCaffeParser::ICaffeParser;
+    using ModelType = InferenceModel<ParserType, DataType>;
+
+    return armnn::test::ClassifierInferenceTestMain<DatabaseType, ParserType>(
         argc, argv, "squeezenet.caffemodel", true,
-        "data", "output", { 0 },
-        [](const char* dataDir) { return CaffePreprocessor(dataDir); });
+        "input", "prob", { 0 },
+        [](const char* dataDir, const ModelType &) { return CaffePreprocessor(dataDir); });
 }