[COMPMID-386] Github: Support SoftmaxLayer on different number of dimensions?

Change-Id: I7422b977538ff29930a90f078badc2edee78af93
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/146638
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h
index d72547e..cb04182 100644
--- a/arm_compute/core/utils/misc/ShapeCalculator.h
+++ b/arm_compute/core/utils/misc/ShapeCalculator.h
@@ -275,6 +275,38 @@
     return output_shape;
 }
 
+inline TensorShape compute_softmax_shape(const ITensorInfo *input, size_t axis = 1)
+{
+    // The output shape will be a 2D version of the input. For instance:
+    // - [x,y,z] and axis 1 will return [x, y*z]
+    // - [x,y,z,w] and axis 2 will return [x*y, z*w]
+    // - [x,y,z,w] and axis 3 will return [x*y*z, w]
+    TensorShape shape2D = input->tensor_shape();
+
+    if(axis < input->num_dimensions())
+    {
+        // Collapse from axis onward (this changes the shape)
+        shape2D.collapse_from(axis);
+
+    // Collapse the leading dimensions into one (collapse() merges the given number of dimensions, starting from the first)
+        shape2D.collapse(shape2D.num_dimensions() - 1);
+    }
+    else
+    {
+        // Collapse everything
+        shape2D.collapse(shape2D.num_dimensions());
+    }
+
+    if(axis == 0)
+    {
+        // If axis is zero there are no dimensions below the axis, so the output's
+        // first dimension must be one; shift right to insert it
+        shape2D.shift_right(1);
+    }
+
+    return shape2D;
+}
+
 inline TensorShape compute_interleave_custom_shape(const TensorShape &input, const int x_interleave, const int y_interleave)
 {
     TensorShape output_shape{ input };
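
For context, a minimal usage sketch of the new helper, not part of the patch itself;
the shapes and axis values below are illustrative, and TensorShape lists dimensions
starting from the innermost one:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/utils/misc/ShapeCalculator.h"

    using namespace arm_compute;

    void softmax_shape_examples()
    {
        // A 4D input [x,y,z,w] = [4,5,6,7]
        const TensorInfo info(TensorShape(4U, 5U, 6U, 7U), 1, DataType::F32);

        // axis = 1 -> [4, 5*6*7] = [4, 210]
        const TensorShape s1 = misc::shape_calculator::compute_softmax_shape(&info, 1);
        // axis = 2 -> [4*5, 6*7] = [20, 42]
        const TensorShape s2 = misc::shape_calculator::compute_softmax_shape(&info, 2);
        // axis = 3 -> [4*5*6, 7] = [120, 7]
        const TensorShape s3 = misc::shape_calculator::compute_softmax_shape(&info, 3);
        (void)s1; (void)s2; (void)s3; // silence unused-variable warnings
    }
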
diff --git a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
index 90c99d6..8d2c03f 100644
--- a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
+++ b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
@@ -58,16 +58,22 @@
      * @param[in]  input  Source tensor. Data types supported: QASYMM8/F16/F32
      * @param[out] output Destination tensor. Data types supported: same as @p input
      * @param[in]  beta   (Optional) A scaling factor for the exponent. Defaults to 1.f
+     * @param[in]  axis   (Optional) Reduction axis. It has the purpose of squashing the first @p axis
+     *                    dimensions together. For instance, given a [4x4x4x4] image,
+     *                    when @p axis is 2, the Softmax reduction will be applied on each of the [4x4] planes of the input image.
      */
-    void configure(const ICLTensor *input, ICLTensor *output, float beta = 1.0f);
+    void configure(const ICLTensor *input, ICLTensor *output, float beta = 1.0f, size_t axis = 1);
     /** Static function to check if given info will lead to a valid configuration of @ref CLSoftmaxLayer
      *
      * @param[in] input  Source tensor. Data types supported: QASYMM8/F16/F32
      * @param[in] output Destination tensor. Data types supported: same as @p input
-     *
+     * @param[in] beta   (Optional) A scaling factor for the exponent. Defaults to 1.f
+     * @param[in] axis   (Optional) Reduction axis. It has the purpose of squashing the first @p axis
+     *                    dimensions together. For instance, given a [4x4x4x4] image,
+     *                    when @p axis is 2, the Softmax reduction will be applied on each of the [4x4] planes of the input image.
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, float beta = 1.0f, size_t axis = 1);
 
     // Inherited methods overridden:
     void run() override;
@@ -82,19 +88,22 @@
      *
      * @param[in] input  Original source tensor.
      * @param[in] output Original destination tensor.
+     * @param[in] axis   (Optional) Reduction axis. It has the purpose of squashing the first @p axis
+     *                    dimensions together. For instance, given a [4x4x4x4] image,
+     *                    when @p axis is 2, the Softmax reduction will be applied on each of the [4x4] planes of the input image.
      */
-    void configure_flatten_kernel(const ICLTensor *input, const ICLTensor *output);
+    void configure_reshape_input_kernel(const ICLTensor *input, const ICLTensor *output, size_t axis);
 
     CLMemoryGroup                  _memory_group;
     CLLogits1DMaxShiftExpSumKernel _max_shift_exp_sum_kernel;
     CLLogits1DNormKernel           _norm_kernel;
-    CLFlattenLayerKernel           _flatten_kernel;
+    std::unique_ptr<ICLKernel>     _flatten_kernel_ptr;
     CLReshapeLayerKernel           _reshape_kernel;
     CLTensor                       _max;
     CLTensor                       _sum;
     CLTensor                       _tmp;
-    CLTensor                       _input_flat;
-    CLTensor                       _output_flat;
+    CLTensor                       _input_flattened;
+    CLTensor                       _output_flattened;
     bool                           _needs_flattening;
 };
 }
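
A hedged end-to-end sketch of the extended CL interface, assuming the usual CL
runtime setup (the function below is illustrative, not part of the patch):

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"

    using namespace arm_compute;

    void run_softmax_axis2()
    {
        CLScheduler::get().default_init();

        CLTensor input, output;
        input.allocator()->init(TensorInfo(TensorShape(4U, 4U, 4U, 4U), 1, DataType::F32));
        output.allocator()->init(TensorInfo(TensorShape(4U, 4U, 4U, 4U), 1, DataType::F32));

        // axis = 2: each of the 16 [4x4] planes of the input is normalized independently
        CLSoftmaxLayer softmax;
        softmax.configure(&input, &output, 1.0f /* beta */, 2 /* axis */);

        input.allocator()->allocate();
        output.allocator()->allocate();
        // ... fill the input here ...

        softmax.run();
        CLScheduler::get().sync();
    }
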
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h
index 1011c9a..f6c6edb 100644
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h
@@ -52,9 +52,14 @@
      *
      * @param[in]  input  Source tensor. Data types supported: F16/F32
      * @param[out] output Destination tensor. Data types supported: same as @p input
-     * @param[in]  beta   (Optional) A scaling factor for the exponent. Only beta = 1 is supported.
+     * @param[in]  beta   (Optional) A scaling factor for the exponent. Only beta = 1 is supported
+     * @param[in]  axis   (Optional) Reduction axis. It has the purpose of squashing the first @p axis
+     *                    dimensions together. For instance, given a [4x4x4x4] image,
+     *                    when @p axis is 2, the Softmax reduction will be applied on each of the [4x4] planes of the input image.
+     *
+     * @note The value of @p axis must always be 1 for GLES
      */
-    void configure(const IGCTensor *input, IGCTensor *output, float beta = 1.0f);
+    void configure(const IGCTensor *input, IGCTensor *output, float beta = 1.0f, size_t axis = 1);
 
     // Inherited methods overridden:
     void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h
index 61f4600..3f5ec8e 100644
--- a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h
+++ b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h
@@ -56,17 +56,27 @@
      *                       last value of each row to the nearest multiple.
      * @param[out]    output Destination tensor. Data types supported: same as @p input.
      * @param[in]     beta   (Optional) A scaling factor for the exponent.
+     * @param[in]     axis   (Optional) Reduction axis. It has the purpose of squashing the first @p axis
+     *                       dimensions together. For instance, given a [4x4x4x4] image,
+     *                       when @p axis is 2, the Softmax reduction will be applied on each of the [4x4] planes of the input image.
+     *
+     * @note The value of @p axis must always be 1 for NEON
      */
-    void configure(ITensor *input, ITensor *output, float beta = 1.0f);
+    void configure(ITensor *input, ITensor *output, float beta = 1.0f, size_t axis = 1);
     /** Static function to check if given info will lead to a valid configuration of @ref NESoftmaxLayer
      *
      * @param[in] input  Source tensor. Data types supported: QASYMM8/F16/F32.
      * @param[in] output Destination tensor. Data types supported: same as @p input
      * @param[in] beta   (Optional) A scaling factor for the exponent.
+     * @param[in] axis   (Optional) Reduction axis. It has the purpose of squashing the first @p axis
+     *                   dimensions together. For instance, given a [4x4x4x4] image,
+     *                   when @p axis is 2, the Softmax reduction will be applied on each of the [4x4] planes of the input image.
+     *
+     * @note The value of @p axis must always be 1 for NEON
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, float beta = 1.0f);
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, float beta = 1.0f, size_t axis = 1);
 
     // Inherited methods overridden:
     void run() override;
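
Since NEON still rejects any axis other than 1, callers can probe support up front
through the new validate() overload; a small sketch, with a hypothetical helper name:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"

    using namespace arm_compute;

    // Returns true when NESoftmaxLayer would accept this configuration
    bool neon_softmax_supported(const TensorInfo &in, const TensorInfo &out, float beta, size_t axis)
    {
        // For axis != 1 this yields the "Axis must be 1 for NEON" error status
        // instead of asserting later inside configure()
        return bool(NESoftmaxLayer::validate(&in, &out, beta, axis));
    }
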
diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
index 3a7d6c7..d671846 100644
--- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp
+++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
@@ -36,34 +36,48 @@
 namespace arm_compute
 {
 CLSoftmaxLayer::CLSoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _max_shift_exp_sum_kernel(), _norm_kernel(), _flatten_kernel(), _reshape_kernel(), _max(), _sum(), _tmp(), _input_flat(), _output_flat(),
+    : _memory_group(std::move(memory_manager)), _max_shift_exp_sum_kernel(), _norm_kernel(), _flatten_kernel_ptr(), _reshape_kernel(), _max(), _sum(), _tmp(), _input_flattened(), _output_flattened(),
       _needs_flattening(false)
 {
 }
 
-void CLSoftmaxLayer::configure_flatten_kernel(const ICLTensor *input, const ICLTensor *output)
+void CLSoftmaxLayer::configure_reshape_input_kernel(const ICLTensor *input, const ICLTensor *output, size_t axis)
 {
     // Flatten the input
-    const TensorShape shape_flatten = misc::shape_calculator::compute_flatten_shape(input->info());
+    const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input->info(), axis);
 
     // Initialize the flat input
-    _input_flat.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten));
+    _input_flattened.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten));
 
-    // Configure the flatten_kernel
-    _flatten_kernel.configure(input, &_input_flat);
+    // If we need to flatten the input, we can use either CLFlattenLayerKernel or CLReshapeLayerKernel:
+    // when flattening on the third axis we use CLFlattenLayerKernel,
+    // in all other cases we use CLReshapeLayerKernel
+    if(axis != 3)
+    {
+        auto reshape_kernel_ptr = support::cpp14::make_unique<CLReshapeLayerKernel>();
+        reshape_kernel_ptr->configure(input, &_input_flattened);
+        _flatten_kernel_ptr = std::move(reshape_kernel_ptr);
+    }
+    else
+    {
+        auto flatten_kernel_ptr = support::cpp14::make_unique<CLFlattenLayerKernel>();
+        flatten_kernel_ptr->configure(input, &_input_flattened);
+        _flatten_kernel_ptr = std::move(flatten_kernel_ptr);
+    }
 
     // We need to init the output tensor here. Indeed, the reshape kernel expects
     // both tensors to be already initialized
     auto_init_if_empty(*output->info(), *input->info()->clone());
 }
 
-void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output, float beta)
+void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output, float beta, size_t axis)
 {
     // Perform validation step
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(CLSoftmaxLayer::validate(input->info(), output->info()));
+    ARM_COMPUTE_ERROR_THROW_ON(CLSoftmaxLayer::validate(input->info(), output->info(), beta, axis));
 
-    _needs_flattening = input->info()->num_dimensions() > 2;
+    // Flattening is only needed when the reduction axis is not 1; for axis == 1
+    // the kernels already operate row-wise on tensors of any rank
+    _needs_flattening = axis != 1;
 
     // If we are dealing with a 4D tensor, we will:
     // - Flatten the input, so that we end up with a [width*height*depth] * batches 2D tensor
@@ -71,16 +85,16 @@
     // - Reshape the flattened output into the real output
     if(_needs_flattening)
     {
-        // Add to the memory manager _input_flat
-        _memory_group.manage(&_input_flat);
+        // Add to the memory manager _input_flattened
+        _memory_group.manage(&_input_flattened);
 
-        // Cofigure  _flatten_kernel and _input_flat
-        configure_flatten_kernel(input, output);
+        // Configure _flatten_kernel_ptr and _input_flattened
+        configure_reshape_input_kernel(input, output, axis);
     }
 
     // We want to deal with a 2D input. Either it is the flattened version of the original input (4D case)
     // or it is the original input case (2D case)
-    const ICLTensor *input_2D = (_needs_flattening ? &_input_flat : input);
+    const ICLTensor *input_2D = (_needs_flattening ? &_input_flattened : input);
 
     // Create intermediate tensors shapes
     TensorInfo input_info    = input_2D->info()->clone()->reset_padding().set_is_resizable(true);
@@ -106,18 +120,18 @@
 
     if(_needs_flattening)
     {
-        // Add to the memory manager _output_flat
-        _memory_group.manage(&_output_flat);
+        // Add to the memory manager _output_flattened
+        _memory_group.manage(&_output_flattened);
 
         // The normalization kernel stores the result in a flat output tensor
-        _norm_kernel.configure(&_tmp, &_sum, &_output_flat, beta);
+        _norm_kernel.configure(&_tmp, &_sum, &_output_flattened, beta);
 
         // Reshape the flat output into the requested (4D) output
-        _reshape_kernel.configure(&_output_flat, output);
+        _reshape_kernel.configure(&_output_flattened, output);
 
         // Allocate the intermediate flat tensors
-        _input_flat.allocator()->allocate();
-        _output_flat.allocator()->allocate();
+        _input_flattened.allocator()->allocate();
+        _output_flattened.allocator()->allocate();
     }
     else
     {
@@ -131,10 +145,11 @@
     _sum.allocator()->allocate();
 }
 
-Status CLSoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+Status CLSoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, size_t axis)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4, "Only up to 4 dimensions are supported");
+    ARM_COMPUTE_UNUSED(beta);
 
     // Create intermediate tensor info
     DataType   tmp_data_type = is_data_type_quantized_asymmetric(input->data_type()) ? DataType::S32 : input->data_type();
@@ -145,26 +160,42 @@
     TensorInfo tensor_info_max(input->clone()->set_tensor_shape(max_sum_shape).set_is_resizable(true));
     TensorInfo tensor_info_sum(input->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(QuantizationInfo()).set_is_resizable(true));
 
-    const TensorShape shape_flatten = misc::shape_calculator::compute_flatten_shape(input);
-    TensorInfo        tensor_info_flat(input->clone()->set_tensor_shape(shape_flatten).set_is_resizable(true));
+    const bool needs_flattening = (axis != 1);
 
-    if(input->num_dimensions() > 2) // needs flattening
+    if(needs_flattening)
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLFlattenLayerKernel::validate(input, &tensor_info_flat));
+        const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input, axis);
+        TensorInfo        tensor_info_flat(input->clone()->set_tensor_shape(shape_flatten).set_is_resizable(true));
+
+        if(axis != 3)
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(input, &tensor_info_flat));
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(CLFlattenLayerKernel::validate(input, &tensor_info_flat));
+        }
     }
 
     ARM_COMPUTE_RETURN_ON_ERROR(CLLogits1DMaxShiftExpSumKernel::validate(input, &tensor_info_max, &tensor_info_tmp, &tensor_info_sum));
     ARM_COMPUTE_RETURN_ON_ERROR(CLLogits1DNormKernel::validate(&tensor_info_tmp, &tensor_info_sum, output));
 
     return Status{};
 }
 
 void CLSoftmaxLayer::run()
 {
     _memory_group.acquire();
+
     if(_needs_flattening)
     {
-        CLScheduler::get().enqueue(_flatten_kernel, false);
+        CLScheduler::get().enqueue(*_flatten_kernel_ptr, false);
     }
 
     CLScheduler::get().enqueue(_max_shift_exp_sum_kernel, false);
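
The reshape-versus-flatten dispatch above boils down to the pattern below; this is
a condensed restatement for illustration only (make_flatten_kernel is hypothetical,
not a library symbol):

    #include "arm_compute/core/CL/ICLKernel.h"
    #include "arm_compute/core/CL/kernels/CLFlattenLayerKernel.h"
    #include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
    #include "support/ToolchainSupport.h"

    #include <memory>

    namespace arm_compute
    {
    // Flattening on the third axis maps directly onto CLFlattenLayerKernel;
    // every other axis goes through a plain reshape to the 2D softmax shape
    std::unique_ptr<ICLKernel> make_flatten_kernel(const ICLTensor *input, ICLTensor *flat, size_t axis)
    {
        if(axis == 3)
        {
            auto k = support::cpp14::make_unique<CLFlattenLayerKernel>();
            k->configure(input, flat);
            return std::move(k);
        }
        auto k = support::cpp14::make_unique<CLReshapeLayerKernel>();
        k->configure(input, flat);
        return std::move(k);
    }
    } // namespace arm_compute

Holding the chosen kernel behind std::unique_ptr<ICLKernel> is what lets run()
enqueue either variant through the common interface.
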
diff --git a/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
index 0c8769b..dad42cd 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
@@ -34,12 +34,13 @@
 {
 }
 
-void GCSoftmaxLayer::configure(const IGCTensor *input, IGCTensor *output, float beta)
+void GCSoftmaxLayer::configure(const IGCTensor *input, IGCTensor *output, float beta, size_t axis)
 {
-    ARM_COMPUTE_UNUSED(beta);
+    ARM_COMPUTE_UNUSED(beta, axis);
 
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON(beta != 1.0f);
+    ARM_COMPUTE_ERROR_ON_MSG(axis != 1, "Axis must be 1 for GLES");
 
     // Create intermediate tensors shapes
     _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type()));
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index 3a73f1e..9be9e68 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -36,9 +36,10 @@
 {
 }
 
-void NESoftmaxLayer::configure(ITensor *input, ITensor *output, float beta)
+void NESoftmaxLayer::configure(ITensor *input, ITensor *output, float beta, size_t axis)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_UNUSED(axis);
 
     // Configure Kernels
     _max_kernel.configure(input, &_max);
@@ -58,8 +59,10 @@
     _tmp.allocator()->allocate();
 }
 
-Status NESoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float beta)
+Status NESoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, size_t axis)
 {
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis != 1, "Axis must be 1 for NEON");
+
     // Perform validation step
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 2, "Only 2D inputs are supported");
diff --git a/tests/validation/CL/SoftmaxLayer.cpp b/tests/validation/CL/SoftmaxLayer.cpp
index 7dab626..c9ef35d 100644
--- a/tests/validation/CL/SoftmaxLayer.cpp
+++ b/tests/validation/CL/SoftmaxLayer.cpp
@@ -134,23 +134,26 @@
 
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLSoftmaxLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SoftmaxLayerSmallShapes(),
-                                                                                                           framework::dataset::make("DataType", DataType::F16)),
-                                                                                                   framework::dataset::make("Beta", { 1.0f, 2.0f })))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLSoftmaxLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
+                                                                                                                   framework::dataset::make("DataType", DataType::F16)),
+                                                                                                           framework::dataset::make("Beta", { 1.0f, 2.0f })),
+                                                                                                   framework::dataset::make("Axis", { 1, 2 })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLSoftmaxLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SoftmaxLayerLargeShapes(),
-                                                                                                               framework::dataset::make("DataType", DataType::F16)),
-                                                                                                       framework::dataset::make("Beta", { 1.0f, 2.0f })))
+FIXTURE_DATA_TEST_CASE(RunLarge, CLSoftmaxLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
+                                                                                                                       framework::dataset::make("DataType", DataType::F16)),
+                                                                                                               framework::dataset::make("Beta", { 1.0f, 2.0f })),
+                                                                                                       framework::dataset::make("Axis", { 1, 2 })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(Run4D, CLSoftmaxLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SoftmaxLayer4DShapes(),
-                                                                                                            framework::dataset::make("DataType", DataType::F16)),
-                                                                                                    framework::dataset::make("Beta", { 1.0f, 2.0f })))
+FIXTURE_DATA_TEST_CASE(Run4D, CLSoftmaxLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayer4DShapes(),
+                                                                                                                    framework::dataset::make("DataType", DataType::F16)),
+                                                                                                            framework::dataset::make("Beta", { 1.0f, 2.0f })),
+                                                                                                    framework::dataset::make("Axis", { 1, 2, 3 })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16);
@@ -158,23 +161,26 @@
 TEST_SUITE_END()
 
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLSoftmaxLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SoftmaxLayerSmallShapes(),
-                                                                                                            framework::dataset::make("DataType", DataType::F32)),
-                                                                                                    framework::dataset::make("Beta", { 1.0f, 2.0f })))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLSoftmaxLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
+                                                                                                                    framework::dataset::make("DataType", DataType::F32)),
+                                                                                                            framework::dataset::make("Beta", { 1.0f, 2.0f })),
+                                                                                                    framework::dataset::make("Axis", { 1, 2 })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLSoftmaxLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SoftmaxLayerLargeShapes(),
-                                                                                                                framework::dataset::make("DataType", DataType::F32)),
-                                                                                                        framework::dataset::make("Beta", { 1.0f, 2.0f })))
+FIXTURE_DATA_TEST_CASE(RunLarge, CLSoftmaxLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
+                                                                                                                        framework::dataset::make("DataType", DataType::F32)),
+                                                                                                                framework::dataset::make("Beta", { 1.0f, 2.0f })),
+                                                                                                        framework::dataset::make("Axis", { 1, 2 })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(Run4D, CLSoftmaxLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SoftmaxLayer4DShapes(),
-                                                                                                             framework::dataset::make("DataType", DataType::F32)),
-                                                                                                     framework::dataset::make("Beta", { 1.0f, 2.0f })))
+FIXTURE_DATA_TEST_CASE(Run4D, CLSoftmaxLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayer4DShapes(),
+                                                                                                                     framework::dataset::make("DataType", DataType::F32)),
+                                                                                                             framework::dataset::make("Beta", { 1.0f, 2.0f })),
+                                                                                                     framework::dataset::make("Axis", { 1, 2, 3 })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
@@ -187,26 +193,29 @@
 
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLSoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(datasets::SoftmaxLayerSmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLSoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
                                                                                                                        framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                                                                               combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
-                                                                                                                       framework::dataset::make("Beta", { 1.0f, 2.f }))))
+                                                                                                                       combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
+                                                                                                                               framework::dataset::make("Beta", { 1.0f, 2.f }))),
+                                                                                                               framework::dataset::make("Axis", { 1, 2 })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLSoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SoftmaxLayerLargeShapes(),
+FIXTURE_DATA_TEST_CASE(RunLarge, CLSoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
                                                                                                                    framework::dataset::make("DataType", DataType::QASYMM8)),
                                                                                                                    combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
-                                                                                                                           framework::dataset::make("Beta", { 1.0f, 2.0f }))))
+                                                                                                                           framework::dataset::make("Beta", { 1.0f, 2.0f }))),
+                                                                                                                   framework::dataset::make("Axis", { 1 })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(Run4D, CLSoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SoftmaxLayer4DShapes(),
+FIXTURE_DATA_TEST_CASE(Run4D, CLSoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayer4DShapes(),
                                                                                                                         framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                                                                                combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
-                                                                                                                        framework::dataset::make("Beta", { 1.0f, 2.0f }))))
+                                                                                                                        combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
+                                                                                                                                framework::dataset::make("Beta", { 1.0f, 2.0f }))),
+                                                                                                                framework::dataset::make("Axis", { 1, 2, 3 })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
diff --git a/tests/validation/GLES_COMPUTE/SoftmaxLayer.cpp b/tests/validation/GLES_COMPUTE/SoftmaxLayer.cpp
index abc277a..3b55717 100644
--- a/tests/validation/GLES_COMPUTE/SoftmaxLayer.cpp
+++ b/tests/validation/GLES_COMPUTE/SoftmaxLayer.cpp
@@ -86,16 +86,18 @@
 
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, GCSoftmaxLayerFixture<half_float::half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SoftmaxLayerSmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunSmall, GCSoftmaxLayerFixture<half_float::half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
                                                                                                                      framework::dataset::make("DataType", DataType::F16)),
-                                                                                                                     framework::dataset::make("Beta", 1.0f)))
+                                                                                                                     framework::dataset::make("Beta", 1.0f)),
+                                                                                                                     framework::dataset::make("Axis", 1)))
 {
     // Validate output
     validate(GCAccessor(_target), _reference, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, GCSoftmaxLayerFixture<half_float::half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SoftmaxLayerLargeShapes(),
+FIXTURE_DATA_TEST_CASE(RunLarge, GCSoftmaxLayerFixture<half_float::half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
                                                                                                                    framework::dataset::make("DataType", DataType::F16)),
-                                                                                                                   framework::dataset::make("Beta", 1.0f)))
+                                                                                                                   framework::dataset::make("Beta", 1.0f)),
+                                                                                                                   framework::dataset::make("Axis", 1)))
 {
     // Validate output
     validate(GCAccessor(_target), _reference, tolerance_f16);
@@ -103,16 +105,18 @@
 TEST_SUITE_END()
 
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, GCSoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SoftmaxLayerSmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunSmall, GCSoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
                                                                                                                   framework::dataset::make("DataType", DataType::F32)),
-                                                                                                          framework::dataset::make("Beta", 1.0f)))
+                                                                                                                  framework::dataset::make("Beta", 1.0f)),
+                                                                                                          framework::dataset::make("Axis", 1)))
 {
     // Validate output
     validate(GCAccessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, GCSoftmaxLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SoftmaxLayerLargeShapes(),
-                                                                                                                framework::dataset::make("DataType", DataType::F32)),
-                                                                                                        framework::dataset::make("Beta", 1.0f)))
+FIXTURE_DATA_TEST_CASE(RunLarge, GCSoftmaxLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
+                                                                                                                        framework::dataset::make("DataType", DataType::F32)),
+                                                                                                                framework::dataset::make("Beta", 1.0f)),
+                                                                                                        framework::dataset::make("Axis", 1)))
 {
     // Validate output
     validate(GCAccessor(_target), _reference, tolerance_f32);
diff --git a/tests/validation/NEON/SoftmaxLayer.cpp b/tests/validation/NEON/SoftmaxLayer.cpp
index a5d6344..21c77e7 100644
--- a/tests/validation/NEON/SoftmaxLayer.cpp
+++ b/tests/validation/NEON/SoftmaxLayer.cpp
@@ -118,16 +118,18 @@
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SoftmaxLayerSmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunSmall, NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
                                                                                                                  framework::dataset::make("DataType", DataType::F16)),
-                                                                                                         framework::dataset::make("Beta", { 1.0f, 2.0f })))
+                                                                                                                 framework::dataset::make("Beta", { 1.0f, 2.0f })),
+                                                                                                         framework::dataset::make("Axis", { 1 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SoftmaxLayerSmallShapes(),
-                                                                                                               framework::dataset::make("DataType", DataType::F16)),
-                                                                                                       framework::dataset::make("Beta", { 1.0f, 2.0f })))
+FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
+                                                                                                                       framework::dataset::make("DataType", DataType::F16)),
+                                                                                                               framework::dataset::make("Beta", { 1.0f, 2.0f })),
+                                                                                                       framework::dataset::make("Axis", { 1 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
@@ -136,16 +138,18 @@
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NESoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SoftmaxLayerSmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunSmall, NESoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
                                                                                                                   framework::dataset::make("DataType", DataType::F32)),
-                                                                                                          framework::dataset::make("Beta", { 1.0f, 2.0f })))
+                                                                                                                  framework::dataset::make("Beta", { 1.0f, 2.0f })),
+                                                                                                          framework::dataset::make("Axis", { 1 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SoftmaxLayerLargeShapes(),
-                                                                                                                framework::dataset::make("DataType", DataType::F32)),
-                                                                                                        framework::dataset::make("Beta", { 1.0f, 2.0f })))
+FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
+                                                                                                                        framework::dataset::make("DataType", DataType::F32)),
+                                                                                                                framework::dataset::make("Beta", { 1.0f, 2.0f })),
+                                                                                                        framework::dataset::make("Axis", { 1 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
@@ -158,22 +162,25 @@
 
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(datasets::SoftmaxLayerSmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunSmall, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
                                                                                                                        framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                                                                               combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
-                                                                                                                       framework::dataset::make("Beta", { 1.0f, 2.0f }))))
+                                                                                                                       combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
+                                                                                                                               framework::dataset::make("Beta", { 1.0f, 2.f }))),
+                                                                                                               framework::dataset::make("Axis", { 1 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SoftmaxLayerLargeShapes(),
+FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
                                                                                                                    framework::dataset::make("DataType", DataType::QASYMM8)),
                                                                                                                    combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
-                                                                                                                           framework::dataset::make("Beta", { 1.0f, 2.0f }))))
+                                                                                                                           framework::dataset::make("Beta", { 1.0f, 2.0f }))),
+                                                                                                                   framework::dataset::make("Axis", { 1 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
+
 TEST_SUITE_END()
 TEST_SUITE_END()
 
diff --git a/tests/validation/fixtures/SoftmaxLayerFixture.h b/tests/validation/fixtures/SoftmaxLayerFixture.h
index 99c0710..e39ee74 100644
--- a/tests/validation/fixtures/SoftmaxLayerFixture.h
+++ b/tests/validation/fixtures/SoftmaxLayerFixture.h
@@ -47,12 +47,12 @@
 {
 public:
     template <typename...>
-    void setup(TensorShape shape, DataType data_type, QuantizationInfo quantization_info, float beta)
+    void setup(TensorShape shape, DataType data_type, QuantizationInfo quantization_info, float beta, size_t axis)
     {
         _quantization_info = quantization_info;
 
-        _target    = compute_target(shape, data_type, quantization_info, beta);
-        _reference = compute_reference(shape, data_type, quantization_info, beta);
+        _target    = compute_target(shape, data_type, quantization_info, beta, axis);
+        _reference = compute_reference(shape, data_type, quantization_info, beta, axis);
     }
 
 protected:
@@ -72,7 +72,7 @@
     }
 
     TensorType compute_target(const TensorShape &shape, DataType data_type,
-                              QuantizationInfo quantization_info, float beta)
+                              QuantizationInfo quantization_info, float beta, size_t axis)
     {
         // Create tensors
         TensorType src = create_tensor<TensorType>(shape, data_type, 1, quantization_info);
@@ -80,7 +80,7 @@
 
         // Create and configure function
         FunctionType smx_layer;
-        smx_layer.configure(&src, &dst, beta);
+        smx_layer.configure(&src, &dst, beta, axis);
 
         ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
         ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
@@ -102,7 +102,7 @@
     }
 
     SimpleTensor<T> compute_reference(const TensorShape &shape, DataType data_type,
-                                      QuantizationInfo quantization_info, float beta)
+                                      QuantizationInfo quantization_info, float beta, size_t axis)
     {
         // Create reference
         SimpleTensor<T> src{ shape, data_type, 1, quantization_info };
@@ -110,7 +110,7 @@
         // Fill reference
         fill(src);
 
-        return reference::softmax_layer<T>(src, beta);
+        return reference::softmax_layer<T>(src, beta, axis);
     }
 
     TensorType       _target{};
@@ -123,12 +123,13 @@
 {
 public:
     template <typename...>
-    void setup(TensorShape shape, DataType data_type, float beta)
+    void setup(TensorShape shape, DataType data_type, float beta, size_t axis)
     {
         SoftmaxValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape,
                                                                                           data_type,
                                                                                           QuantizationInfo(),
-                                                                                          beta);
+                                                                                          beta,
+                                                                                          axis);
     }
 };
 
@@ -137,12 +138,13 @@
 {
 public:
     template <typename...>
-    void setup(TensorShape shape, DataType data_type, QuantizationInfo quantization_info, float beta)
+    void setup(TensorShape shape, DataType data_type, QuantizationInfo quantization_info, float beta, size_t axis)
     {
         SoftmaxValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape,
                                                                                           data_type,
                                                                                           quantization_info,
-                                                                                          beta);
+                                                                                          beta,
+                                                                                          axis);
     }
 };
 } // namespace validation
diff --git a/tests/validation/reference/SoftmaxLayer.cpp b/tests/validation/reference/SoftmaxLayer.cpp
index 7f2c36e..f1b94c0 100644
--- a/tests/validation/reference/SoftmaxLayer.cpp
+++ b/tests/validation/reference/SoftmaxLayer.cpp
@@ -34,18 +34,26 @@
 namespace reference
 {
 template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type>
-SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta)
+SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, size_t axis)
 {
     // Create reference
     SimpleTensor<T> dst{ src.shape(), src.data_type(), 1 };
 
-    const bool is_4D_input = (src.shape().num_dimensions() > 2);
+    // Compute reference. The lower dims are the first 'axis' dimensions collapsed
+    // together (i.e. the flattened row that each softmax normalizes over), while the
+    // upper dims count the independent rows the normalization is repeated over
 
-    // Compute reference. Lower dims are
-    // - the number of columns for the 2D case
-    // - the collapsing of the first three dimensions (i.e., the flattened dimension of each batch) in the 4D case
-    const int lower_dims = (is_4D_input ? src.shape()[2] * src.shape()[1] * src.shape()[0] : src.shape()[0]);
-    const int upper_dims = src.num_elements() / lower_dims;
+    int lower_dims = 1;
+    for(size_t i = 0; i < axis; i++)
+    {
+        lower_dims *= src.shape()[i];
+    }
+
+    int upper_dims = 1;
+    for(size_t i = axis; i < TensorShape::num_max_dimensions; i++)
+    {
+        upper_dims *= src.shape()[i];
+    }
 
     for(int r = 0; r < upper_dims; ++r)
     {
@@ -75,20 +83,20 @@
 }
 
 template <typename T, typename std::enable_if<std::is_same<T, uint8_t>::value, int>::type>
-SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta)
+SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, size_t axis)
 {
     // Note: Output quantization info should always have scale = 1/256 and offset = 0
     const QuantizationInfo output_quantization_info = QuantizationInfo(1.f / 256, 0);
 
     SimpleTensor<float> src_tmp = convert_from_asymmetric(src);
-    SimpleTensor<float> dst_tmp = softmax_layer<float>(src_tmp, beta);
+    SimpleTensor<float> dst_tmp = softmax_layer<float>(src_tmp, beta, axis);
     SimpleTensor<T>     dst     = convert_to_asymmetric(dst_tmp, output_quantization_info);
     return dst;
 }
 
-template SimpleTensor<float> softmax_layer(const SimpleTensor<float> &src, float beta);
-template SimpleTensor<half> softmax_layer(const SimpleTensor<half> &src, float beta);
-template SimpleTensor<uint8_t> softmax_layer(const SimpleTensor<uint8_t> &src, float beta);
+template SimpleTensor<float> softmax_layer(const SimpleTensor<float> &src, float beta, size_t axis);
+template SimpleTensor<half> softmax_layer(const SimpleTensor<half> &src, float beta, size_t axis);
+template SimpleTensor<uint8_t> softmax_layer(const SimpleTensor<uint8_t> &src, float beta, size_t axis);
 } // namespace reference
 } // namespace validation
 } // namespace test
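
To make the lower_dims/upper_dims split concrete, a standalone example with
illustrative values (SimpleTensor shapes are padded with 1s up to the maximum rank):

    #include <cstddef>
    #include <iostream>

    int main()
    {
        const std::size_t shape[6] = { 4, 5, 6, 7, 1, 1 }; // [x,y,z,w] padded to max rank
        const std::size_t axis     = 2;

        std::size_t lower_dims = 1; // length of the row a single softmax normalizes over
        for(std::size_t i = 0; i < axis; ++i)
        {
            lower_dims *= shape[i];
        }

        std::size_t upper_dims = 1; // number of independent rows
        for(std::size_t i = axis; i < 6; ++i)
        {
            upper_dims *= shape[i];
        }

        std::cout << lower_dims << " x " << upper_dims << std::endl; // prints "20 x 42"
        return 0;
    }
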
diff --git a/tests/validation/reference/SoftmaxLayer.h b/tests/validation/reference/SoftmaxLayer.h
index 21dca1e..d21ca2b 100644
--- a/tests/validation/reference/SoftmaxLayer.h
+++ b/tests/validation/reference/SoftmaxLayer.h
@@ -36,10 +36,10 @@
 namespace reference
 {
 template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0>
-SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta);
+SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, size_t axis = 1);
 
 template <typename T, typename std::enable_if<std::is_same<T, uint8_t>::value, int>::type = 0>
-SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta);
+SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, size_t axis = 1);
 } // namespace reference
 } // namespace validation
 } // namespace test