Parallelize CPU depthwise over batch if only 1 row

This patch also fixes a bug where the split dimension was wrong in
CpuDepthwiseConv2dAssemblyDispatch::run. It was set to DimY, which is
cols, but it should have been DimZ. This was rarely an issue in practice
because typically the number of cols is greater than the number of
threads anyway.

Relates to: ONCPUML-1443
Co-authored-by: Milos Puzovic <Milos.Puzovic@arm.com>

Change-Id: Ifed2fce22ddeb7cd77e6a6ae1083694427f91e04
Signed-off-by: Jonathan Deakin <jonathan.deakin@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/11083
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Jakub Sujak <jakub.sujak@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox
index 40ad09f..b1a4395 100644
--- a/docs/user_guide/release_version_and_change_log.dox
+++ b/docs/user_guide/release_version_and_change_log.dox
@@ -1,5 +1,5 @@
 ///
-/// Copyright (c) 2017-2023 Arm Limited.
+/// Copyright (c) 2017-2024 Arm Limited.
 ///
 /// SPDX-License-Identifier: MIT
 ///
@@ -41,6 +41,10 @@
 
 @section S2_2_changelog Changelog
 
+v24.02 Public major release
+ - Performance optimizations:
+   - Parallelize @ref NEDepthwiseConvolutionLayer over batches if there is only 1 row
+
 v24.01 Public major release
  - Remove the legacy 'libarm_compute_core' library. This library is an artifact of Compute Library's legacy library architecture and no longer serves any purpose.
   You should link only to the main `libarm_compute` library for core functionality.
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp
index 592ee72..95ece8c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -173,12 +173,30 @@
 
     const auto n_output_channels = args.input_channels * args.channel_multiplier;
 
-    for (unsigned int batch = 0; batch < args.n_batches; batch++)
+    // By default we parallelize over the rows, but if there's only 1 row, we
+    // try to parallelize over batches
+    auto thread_id_for_rows = thread_id;
+    auto n_threads_for_rows = n_threads;
+    auto thread_id_for_batches = 0;
+    auto n_threads_for_batches = 1;
+    if (args.output_rows == 1) {
+      thread_id_for_rows = 0;
+      n_threads_for_rows = 1;
+      thread_id_for_batches = thread_id;
+      n_threads_for_batches = n_threads;
+    }
+
+    // Progress the pointers for the first batch.
+    input_tensor.base += ld_input_batch*thread_id_for_batches;
+    output_tensor.base += ld_output_batch*thread_id_for_batches;
+    for (unsigned int batch = thread_id_for_batches;
+          batch < args.n_batches;
+          batch += n_threads_for_batches)
     {
       // Iterate over rows of the output tensor; we stripe over the tiles.
-      for (unsigned int start_output_i = thread_id * m_strat->get_output_rows();
+      for (unsigned int start_output_i = thread_id_for_rows * m_strat->get_output_rows();
            start_output_i < args.output_rows;
-           start_output_i += n_threads * m_strat->get_output_rows())
+           start_output_i += n_threads_for_rows * m_strat->get_output_rows())
       {
         // Determine what (if any padding) is required on the top/bottom of
         // this row of the convolution.
@@ -264,8 +282,8 @@
       }
 
       // Progress the pointers for the next batch.
-      input_tensor.base += ld_input_batch;
-      output_tensor.base += ld_output_batch;
+      input_tensor.base += ld_input_batch*n_threads_for_batches;
+      output_tensor.base += ld_output_batch*n_threads_for_batches;
     }
   }
 
diff --git a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
index 8d3741d..38092ad 100644
--- a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
+++ b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023 Arm Limited.
+ * Copyright (c) 2019-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -108,7 +108,11 @@
 
     prepare(tensors);
 
-    NEScheduler::get().schedule_op(_pImpl->asm_kernel.get(), Window::DimY, _pImpl->asm_kernel->window(), tensors);
+    // Split over rows (z) if there's more than 1, otherwise batches (w). This logic
+    // corresponds to the threading strategy in DepthFirstDriver::execute_internal
+    auto split_dimension = _pImpl->asm_kernel->window().num_iterations(Window::DimZ) == 1 ? Window::DimW : Window::DimZ;
+
+    NEScheduler::get().schedule_op(_pImpl->asm_kernel.get(), split_dimension, _pImpl->asm_kernel->window(), tensors);
 }
 
 void CpuDepthwiseConv2dAssemblyDispatch::prepare(ITensorPack &tensors)