Depthwise channel pre-multiplication

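What "channel pre-multiplication" means here, as a minimal sketch assumed from the commit
title (the function name premultiply_input, the channel-last layout, and the loop structure
below are illustrative only, not the code added by this patch): each input channel is
replicated channel_multiplier times before the depthwise kernel runs, so the kernels
themselves only ever see a channel multiplier of 1. The extra boolean passed to
interleaves::PackingArguments and the simplified get_working_size() signature in the diff
presumably exist to support that path.

    // Illustrative sketch only (not ComputeLibrary code): expand each input
    // channel 'multiplier' times so that depthwise kernels written for a
    // channel multiplier of 1 can be reused unchanged.
    #include <cstddef>
    #include <vector>

    template <typename T>
    std::vector<T> premultiply_input(const T *src, size_t n_points,
                                     size_t n_channels, size_t multiplier)
    {
      std::vector<T> dst(n_points * n_channels * multiplier);
      for (size_t p = 0; p < n_points; ++p)        // spatial points, channel-last layout
      {
        for (size_t c = 0; c < n_channels; ++c)    // original input channels
        {
          const T value = src[p * n_channels + c];
          for (size_t m = 0; m < multiplier; ++m)  // replicate into 'multiplier' output channels
          {
            dst[(p * n_channels + c) * multiplier + m] = value;
          }
        }
      }
      return dst;
    }
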
Resolves: COMPMID-6337
Change-Id: Ie9097b3f56e8071426c621386a5988bd7f7e8ef2
Signed-off-by: Michael Tyler <michael.tyler@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9852
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp
index 567eab1..c3daaf0 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp
@@ -153,7 +153,7 @@
   {
     return interleaves::PackingArguments(
       m_kernel_rows, m_kernel_cols, sizeof(TWeight),
-      false, sizeof(TAccum),  // Don't pack the bias
+      false, sizeof(TAccum), true,  // Don't pack the bias
       m_vl_type, sizeof(TAccum), 1,  // Accumulator depth of 1 TODO
       [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool
       { return this->get_kernel_packing_point(idx, x, y); }
@@ -276,7 +276,7 @@
     depthwise_depthfirst::stash_bias(this->m_os, biases);
   }
 
-  size_t get_working_size(unsigned int n_threads, unsigned int) const override
+  size_t get_working_size(unsigned int n_threads) const override
   {
     return this->get_working_size_per_thread() * n_threads;
   }