COMPMID-1975: Update depthwise convolution.

Rework the FP32 depthwise convolution kernel around the templated
DepthwiseConvolution interface: the per-tile entry point becomes
execute_tile, biases and weights are read from a single packed
parameter buffer, and the activation (ReLU / ReLU6) is fused into the
kernel as a compile-time template parameter. The tile-level padding
arguments are dropped; padding is now passed to the base class
constructor instead.
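
For reference, a minimal sketch of the parameter layout that the new
execute_tile reads (not part of this patch; the helper name and the
channel-minor weight layout are assumptions for illustration only):
each group of four channels contributes four bias values followed by
four weight values per kernel position, and the scalar tail repeats the
same bias-then-weights order one channel at a time.

#include <vector>

// Hypothetical helper, for illustration only: packs biases and weights into
// the layout consumed by execute_tile (4-channel groups, then a scalar tail).
std::vector<float> pack_fp32_params_sketch(
  const float *weights,   // assumed layout: [kernel_rows][kernel_cols][n_channels]
  const float *biases,    // [n_channels]
  unsigned int kernel_rows,
  unsigned int kernel_cols,
  unsigned int n_channels)
{
  std::vector<float> packed;
  packed.reserve((kernel_rows * kernel_cols + 1) * n_channels);

  unsigned int ch = 0;
  const auto pack_group = [&](unsigned int group_size)
  {
    // Bias values for the channel group come first ...
    for (unsigned int c = 0; c < group_size; c++)
    {
      packed.push_back(biases[ch + c]);
    }
    // ... followed by the weights for every kernel position, channel-minor.
    for (unsigned int i = 0; i < kernel_rows; i++)
    {
      for (unsigned int j = 0; j < kernel_cols; j++)
      {
        for (unsigned int c = 0; c < group_size; c++)
        {
          packed.push_back(weights[(i * kernel_cols + j) * n_channels + ch + c]);
        }
      }
    }
    ch += group_size;
  };

  while (ch + 4 <= n_channels) { pack_group(4); }  // vectorised groups of four
  while (ch < n_channels)      { pack_group(1); }  // scalar tail, one at a time

  return packed;
}
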
Change-Id: Iad58672be35710a7ec2e918653d6d529709387e8
Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-on: https://review.mlplatform.org/c/898
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Giuseppe Rossini <giuseppe.rossini@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
index 840086f..10d110f 100644
--- a/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,107 +35,60 @@
 
 #pragma once
 
+using namespace neon_convolution_kernels;
+
 namespace depthwise
 {
-// Partial specialisation for FP32 to FP32
-template <int OutputTileRows, int OutputTileCols,
-          int KernelRows, int KernelCols,
-          int StrideRows, int StrideCols>
-struct DepthwiseConvolutionImpl<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols, float, float>
-{
-  typedef DepthwiseConvolution<
-    OutputTileRows, OutputTileCols,
-    KernelRows, KernelCols,
-    StrideRows, StrideCols,
-    float, float
-  > DWC;
 
-  template <
-    bool Specialize=false,  // Specialize (or not) the method
-    int InPadTop=0,         // If specialized, top padding
-    int InPadLeft=0,        // If specialized, left padding
-    int InPadBottom=0,      // If specialized, bottom padding
-    int InPadRight=0,       // If specialized, right padding
-    int OutPadBottom=0,     // If specialized, bottom output padding
-    int OutPadRight=0       // If specialized, bottom right padding
-  >
-  static void process_tile(
-    const int n_channels,
-    const float* const weights,
-    const int weight_row_stride,
-    const int weight_col_stride,
-    const float* const inptr,
-    const int in_row_stride,
-    const int in_col_stride,
-    float* const outptr,
-    const int out_row_stride,
-    const int out_col_stride,
-    const int in_pad_top=0,
-    const int in_pad_left=0,
-    const int in_pad_bottom=0,
-    const int in_pad_right=0,
-    const int out_pad_bottom=0,
-    const int out_pad_right=0,
-    const int input_offset=0,
-    const int weights_offset=0
-  );
-};
-
-
-template <int OTR, int OTC, int KR, int KC, int SR, int SC>
 template <
-  bool Specialize,
-  int InPadTop, int InPadLeft, int InPadBottom, int InPadRight,
-  int OutPadBottom, int OutPadRight
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols
 >
-void DepthwiseConvolutionImpl<OTR, OTC, KR, KC, SR, SC, float, float>::process_tile(
-  const int n_channels,
-  const float *__restrict__ const weights,
-  const int weight_row_stride,
-  const int weight_col_stride,
-  const float *__restrict__ const inptr,
-  const int in_row_stride,
-  const int in_col_stride,
-  float *__restrict__ const outptr,
-  const int out_row_stride,
-  const int out_col_stride,
-  const int _in_pad_top,
-  const int _in_pad_left,
-  const int _in_pad_bottom,
-  const int _in_pad_right,
-  const int _out_pad_bottom,
-  const int _out_pad_right,
-  const int _input_offset,
-  const int _weights_offset
+DepthwiseConvolution<
+  OutputTileRows, OutputTileCols,
+  KernelRows, KernelCols, StrideRows, StrideCols,
+  float, float, float
+>::DepthwiseConvolution(
+  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+  ActivationFunction activation,
+  unsigned int padding_top,
+  unsigned int padding_left,
+  unsigned int padding_bottom,
+  unsigned int padding_right
+) : Base(
+      n_batches, n_input_rows, n_input_cols, n_channels, activation,
+      padding_top, padding_left, padding_bottom, padding_right
+    )
+{
+}
+
+
+template <
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols
+>
+template <ActivationFunction Activation>
+void DepthwiseConvolution<
+  OutputTileRows, OutputTileCols,
+  KernelRows, KernelCols, StrideRows, StrideCols,
+  float, float, float
+>::execute_tile(
+  int n_channels,
+  const void *weights_biases_ptr,
+  const float *input,
+  const unsigned int in_row_stride,
+  const unsigned int in_col_stride,
+  float *output,
+  const unsigned int out_row_stride,
+  const unsigned int out_col_stride
 )
 {
-  constexpr auto inner_tile_rows = DWC::inner_tile_rows;
-  constexpr auto inner_tile_cols = DWC::inner_tile_cols;
-  constexpr auto kernel_rows = DWC::kernel_rows;
-  constexpr auto kernel_cols = DWC::kernel_cols;
-  constexpr auto output_tile_rows = DWC::output_tile_rows;
-  constexpr auto output_tile_cols = DWC::output_tile_cols;
-  constexpr auto stride_rows = DWC::stride_rows;
-  constexpr auto stride_cols = DWC::stride_cols;
-
-  // Extract parameters
-  const int in_pad_top = Specialize ? InPadTop : _in_pad_top;
-  const int in_pad_left = Specialize ? InPadLeft : _in_pad_left;
-  const int in_pad_bottom = Specialize ? InPadBottom : _in_pad_bottom;
-  const int in_pad_right = Specialize ? InPadRight : _in_pad_right;
-  const int out_pad_bottom = Specialize ? OutPadBottom : _out_pad_bottom;
-  const int out_pad_right = Specialize ? OutPadRight : _out_pad_right;
-
-  // Compute valid ranges of the tile
-  const int in_cells_i = inner_tile_rows - in_pad_bottom;
-  const int in_cells_j = inner_tile_cols - in_pad_right;
-  const int out_cells_i = output_tile_rows - out_pad_bottom;
-  const int out_cells_j = output_tile_cols - out_pad_right;
-
   // Instantiate pointers
-  const float* __restrict__ inptr_base = inptr;
-  const float* __restrict__ wptr_base = weights;
-  float* __restrict__ outptr_base = outptr;
+  const float* __restrict__ inptr_base = input;
+  float* __restrict__ outptr_base = output;
+  const float* __restrict__ params = static_cast<const float*>(weights_biases_ptr);
 
   // Perform the depthwise convolution
   int channels_remaining = n_channels;
@@ -143,74 +96,74 @@
   for (; channels_remaining >= 4; channels_remaining -= 4)
   {
     // Load input tile
-    float32x4_t u[inner_tile_rows][inner_tile_cols];
-    for (int i = 0; i < inner_tile_rows; i++)
+    float32x4_t u[Base::inner_tile_rows][Base::inner_tile_cols];
+    for (int i = 0; i < Base::inner_tile_rows; i++)
     {
-      const float* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
-      for (int j = 0; j < inner_tile_cols; j++)
+      const float* const inptr_row = inptr_base + i*in_row_stride;
+      for (int j = 0; j < Base::inner_tile_cols; j++)
       {
-        if (i < in_pad_top || in_cells_i <= i ||
-            j < in_pad_left || in_cells_j <= j)
-        {
-          u[i][j] = vdupq_n_f32(0.0f);
-        }
-        else
-        {
-          u[i][j] = vld1q_f32(inptr_row + (j - in_pad_left)*in_col_stride);
-        }
+        u[i][j] = vld1q_f32(inptr_row + j*in_col_stride);
       }
     }
     inptr_base += 4;
 
     // Load weights tile
-    float32x4_t w[kernel_rows][kernel_cols];
-    for (int i = 0; i < kernel_rows; i++)
+    float32x4_t vbias = vld1q_f32(params);
+    params += 4;
+
+    float32x4_t w[KernelRows][KernelCols];
+    for (unsigned int i = 0; i < KernelRows; i++)
     {
-      const float* const wptr_row = wptr_base + i*weight_row_stride;
-      for (int j = 0; j < kernel_cols; j++)
+      for (unsigned int j = 0; j < KernelCols; j++)
       {
-        w[i][j] = vld1q_f32(wptr_row + j*weight_col_stride);
+        w[i][j] = vld1q_f32(params);
+        params += 4;
       }
     }
-    wptr_base += 4;
 
     // Perform the convolution
-    float32x4_t v[output_tile_rows][output_tile_cols];
-    for (int out_i = 0; out_i < out_cells_i; out_i++)
+    float32x4_t v[OutputTileRows][OutputTileCols];
+    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
     {
-      for (int out_j = 0; out_j < out_cells_j; out_j++)
+      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
       {
+        v[out_i][out_j] = vbias;
+
         // Base co-ordinate
-        const int base_i = out_i * stride_rows;
-        const int base_j = out_j * stride_cols;
+        const int base_i = out_i * StrideRows;
+        const int base_j = out_j * StrideCols;
 
         // Fill the accumulator
-        for (int in_i = 0; in_i < kernel_rows; in_i++)
+        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
         {
-          const int i = base_i + in_i;
-          for (int in_j = 0; in_j < kernel_cols; in_j++)
+          const unsigned int i = base_i + in_i;
+          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
           {
-            const int j = base_j + in_j;
-            if (in_i == 0 && in_j == 0)
-            {
-              // v[out_i][out_j] = w[in_i][in_j] * u[i][j];
-              v[out_i][out_j] = vmulq_f32(w[in_i][in_j], u[i][j]);
-            }
-            else
-            {
-              // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
-              v[out_i][out_j] = vmlaq_f32(v[out_i][out_j], w[in_i][in_j], u[i][j]);
-            }
+            const unsigned int j = base_j + in_j;
+
+            // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+            v[out_i][out_j] = vmlaq_f32(v[out_i][out_j], w[in_i][in_j], u[i][j]);
           }
         }
+
+        // Apply the activation function
+        if (Activation == ActivationFunction::ReLU ||
+            Activation == ActivationFunction::ReLU6)
+        {
+          v[out_i][out_j] = vmaxq_f32(v[out_i][out_j], vdupq_n_f32(0.0f));
+        }
+        if (Activation == ActivationFunction::ReLU6)
+        {
+          v[out_i][out_j] = vminq_f32(v[out_i][out_j], vdupq_n_f32(6.0f));
+        }
       }
     }
 
     // Store the output tile
-    for (int i = 0; i < out_cells_i; i++)
+    for (unsigned int i = 0; i < OutputTileRows; i++)
     {
       float* const outptr_row = outptr_base + i*out_row_stride;
-      for (int j = 0; j < out_cells_j; j++)
+      for (unsigned int j = 0; j < OutputTileCols; j++)
       {
         vst1q_f32(outptr_row + j*out_col_stride, v[i][j]);
       }
@@ -221,68 +174,70 @@
   for (; channels_remaining; channels_remaining--)
   {
     // Load input tile
-    float u[inner_tile_rows][inner_tile_cols];
-    for (int i = 0; i < inner_tile_rows; i++)
+    float u[Base::inner_tile_rows][Base::inner_tile_cols];
+    for (int i = 0; i < Base::inner_tile_rows; i++)
     {
-      const float* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
-      for (int j = 0; j < inner_tile_cols; j++)
+      const float* const inptr_row = inptr_base + i*in_row_stride;
+      for (int j = 0; j < Base::inner_tile_cols; j++)
       {
-        if (i < in_pad_top || in_cells_i <= i ||
-            j < in_pad_left || in_cells_j <= j)
-        {
-          u[i][j] = static_cast<float>(0);
-        }
-        else
-        {
-          u[i][j] = *(inptr_row + (j - in_pad_left)*in_col_stride);
-        }
+        u[i][j] = *(inptr_row + j*in_col_stride);
       }
     }
     inptr_base++;
 
     // Load weights tile
-    float w[kernel_rows][kernel_cols];
-    for (int i = 0; i < kernel_rows; i++)
+    float bias = *(params++);
+    float w[KernelRows][KernelCols];
+    for (unsigned int i = 0; i < KernelRows; i++)
     {
-      const float* const wptr_row = wptr_base + i*weight_row_stride;
-      for (int j = 0; j < kernel_cols; j++)
+      for (unsigned int j = 0; j < KernelCols; j++)
       {
-        w[i][j] = *(wptr_row + j*weight_col_stride);
+        w[i][j] = *(params++);
       }
     }
-    wptr_base++;
 
     // Perform the convolution
-    float v[output_tile_rows][output_tile_cols];
-    for (int out_i = 0; out_i < out_cells_i; out_i++)
+    float v[OutputTileRows][OutputTileCols];
+    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
     {
-      for (int out_j = 0; out_j < out_cells_j; out_j++)
+      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
       {
         // Clear the accumulator
-        v[out_i][out_j] = static_cast<float>(0);
+        v[out_i][out_j] = bias;
 
         // Base co-ordinate
-        const int base_i = out_i * stride_rows;
-        const int base_j = out_j * stride_cols;
+        const int base_i = out_i * StrideRows;
+        const int base_j = out_j * StrideCols;
 
         // Fill the accumulator
-        for (int in_i = 0; in_i < kernel_rows; in_i++)
+        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
         {
-          const int i = base_i + in_i;
-          for (int in_j = 0; in_j < kernel_cols; in_j++)
+          const unsigned int i = base_i + in_i;
+          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
           {
             const int j = base_j + in_j;
             v[out_i][out_j] += w[in_i][in_j] * u[i][j];
           }
         }
+
+        // Apply the activation function
+        if (Activation == ActivationFunction::ReLU ||
+            Activation == ActivationFunction::ReLU6)
+        {
+          v[out_i][out_j] = std::max(0.0f, v[out_i][out_j]);
+        }
+        if (Activation == ActivationFunction::ReLU6)
+        {
+          v[out_i][out_j] = std::min(6.0f, v[out_i][out_j]);
+        }
       }
     }
 
     // Store the output tile
-    for (int i = 0; i < out_cells_i; i++)
+    for (unsigned int i = 0; i < OutputTileRows; i++)
     {
       float* const outptr_row = outptr_base + i*out_row_stride;
-      for (int j = 0; j < out_cells_j; j++)
+      for (unsigned int j = 0; j < OutputTileCols; j++)
       {
         *(outptr_row + j*out_col_stride) = v[i][j];
       }