COMPMID-1975: Update depthwise convolution.

Change-Id: Iad58672be35710a7ec2e918653d6d529709387e8
Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-on: https://review.mlplatform.org/c/898
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Giuseppe Rossini <giuseppe.rossini@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h
index 64f10b4..87ca4da 100644
--- a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h
+++ b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,12 +25,10 @@
 #define __ARM_COMPUTE_NEDEPTHWISECONVOLUTIONKERNEL3x3_H__
 
 #include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp"
-
-#include <memory>
 
 namespace arm_compute
 {
+// Forward declarations
 class ITensor;
 
 /** Interface for the kernel to run a 3x3 depthwise convolution on a tensor. */
@@ -60,23 +58,8 @@
      * @param[out] output           Destination tensor. Data type supported: Same as @p input.
      * @param[in]  conv_info        Padding and stride information to use for the convolution.
      * @param[in]  depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
-     * @param[in]  data_layout      (Optional) Data layout of the input and weights tensor
      */
-    void configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, DataLayout data_layout = DataLayout::NCHW);
-    /** Static method that checks if optimized execution is supported for the given parameters
-     *
-     * @param[in] input_shape      Input shape
-     * @param[in] conv_info        Padding and stride information to use for the convolution.
-     * @param[in] dt               Data type of the input and weights
-     * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
-     * @param[in] data_layout      (Optional) Data layout of the input and weights tensor
-     *
-     * @return True if the optimized kernels can be executed else false
-     */
-    static bool is_optimized_execution_possible(TensorShape input_shape, PadStrideInfo conv_info, DataType dt, unsigned int depth_multiplier = 1, DataLayout data_layout = DataLayout::NCHW);
-    /** Generates the convolver object */
-    void generate_convolver();
-
+    void configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1);
     /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayer3x3Kernel
      *
      * @note Supported data layouts: NCHW and NHWC
@@ -96,40 +79,13 @@
     BorderSize border_size() const override;
 
 private:
-    void configure_generic();
-    void configure_optimized();
-
-    void run_generic(const Window &window, const ThreadInfo &info);
-    void run_optimized(const Window &window, const ThreadInfo &info);
-    /** Creates an optimized backend convolver object
-     *
-     * @note Convolver of strides 1,2 and convolution size of 3 is currently supported
-     *
-     * @param[in] conv_info     Padding and stride information to use for the convolution
-     * @param[in] w             Weights tensor
-     * @param[in] in            Input tensor
-     * @param[in] out           Output tensor
-     * @param[in] setup_strides (Optional) Boolean to enable setting the strides of the tensors
-     *                           in the convolver in case of padding. Defaults to false
-     *
-     * @return  A convolver object or nullptr if the configuration is not supported
-     */
-    std::unique_ptr<depthwise::IDepthwiseConvolution> create_convolver_object(PadStrideInfo  conv_info,
-                                                                              const ITensor *w,
-                                                                              const ITensor *in,
-                                                                              ITensor       *out,
-                                                                              bool           setup_strides = false);
-
-private:
-    BorderSize                                        _border_size;
-    const ITensor                                    *_input;
-    ITensor                                          *_output;
-    const ITensor                                    *_weights;
-    PadStrideInfo                                     _conv_info;
-    std::unique_ptr<depthwise::IDepthwiseConvolution> _convolver;
-    unsigned int                                      _num_elems_written_per_iteration;
-    bool                                              _run_optimized;
-    unsigned int                                      _depth_multiplier;
+    BorderSize     _border_size;
+    const ITensor *_input;
+    ITensor       *_output;
+    const ITensor *_weights;
+    PadStrideInfo  _conv_info;
+    unsigned int   _num_elems_written_per_iteration;
+    unsigned int   _depth_multiplier;
 };
 } // namespace arm_compute
 #endif /* __ARM_COMPUTE_NEDEPTHWISECONVOLUTIONKERNEL3x3_H__ */
diff --git a/arm_compute/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h b/arm_compute/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h
new file mode 100644
index 0000000..def395c
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H__
+#define __ARM_COMPUTE_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp"
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** This class is a wrapper for the depthwise convolution assembly kernels.  */
+class NEDepthwiseConvolutionAssemblyKernelWrapper final : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEDepthwiseConvolutionAssemblyKernelWrapper";
+    }
+
+    /** Default constructor */
+    NEDepthwiseConvolutionAssemblyKernelWrapper()
+        : _kernel(nullptr)
+    {
+    }
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDepthwiseConvolutionAssemblyKernelWrapper(const NEDepthwiseConvolutionAssemblyKernelWrapper &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDepthwiseConvolutionAssemblyKernelWrapper &operator=(const NEDepthwiseConvolutionAssemblyKernelWrapper &) = delete;
+    /** Default Move Constructor. */
+    NEDepthwiseConvolutionAssemblyKernelWrapper(NEDepthwiseConvolutionAssemblyKernelWrapper &&) = default;
+    /** Default move assignment operator */
+    NEDepthwiseConvolutionAssemblyKernelWrapper &operator=(NEDepthwiseConvolutionAssemblyKernelWrapper &&) = default;
+
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in] kernel Pointer to an assembly kernel implementation.
+     */
+    void configure(depthwise::IDepthwiseConvolution *kernel)
+    {
+        ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(kernel)));
+        _kernel = kernel;
+        Window win;
+        win.set(Window::DimX, Window::Dimension(0, _kernel->get_window(), 1));
+        INEKernel::configure(win);
+    }
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override
+    {
+        ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(_kernel)));
+        ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+        auto first = window.x().start();
+        auto last  = window.x().end();
+        _kernel->run(first, last, info.thread_id);
+    }
+
+private:
+    depthwise::IDepthwiseConvolution *_kernel;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/convolution/common/activation.hpp b/arm_compute/core/NEON/kernels/convolution/common/activation.hpp
new file mode 100644
index 0000000..091b165
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/convolution/common/activation.hpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+namespace neon_convolution_kernels
+{
+
+enum class ActivationFunction
+{
+  None,
+  ReLU,
+  ReLU6,
+};
+
+}
diff --git a/arm_compute/core/NEON/kernels/convolution/common/padding.hpp b/arm_compute/core/NEON/kernels/convolution/common/padding.hpp
new file mode 100644
index 0000000..33f77d7
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/convolution/common/padding.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include <cstddef>
+
+// Utilities for copying tensor tiles and adding/removing padding.
+namespace padding
+{
+
+/* Copy a tile and apply padding to the output copy.
+ */
+template <typename T>
+void copy_and_pad_tile(
+  unsigned int tile_rows,
+  unsigned int tile_cols,
+  unsigned int n_channels,
+  const T *inptr,
+  unsigned int in_row_stride,
+  unsigned int in_col_stride,
+  T* outptr,
+  unsigned int out_row_stride,
+  unsigned int out_col_stride,
+  unsigned int pad_top,
+  unsigned int pad_left,
+  unsigned int pad_bottom,
+  unsigned int pad_right,
+  T pad_value=static_cast<T>(0)
+);
+
+/** Copy a tile and remove padding elements in the output.
+ */
+template <unsigned int TileRows, unsigned int TileCols>
+class CopyCropped
+{
+  public:
+    static void execute(
+      size_t size,  // Amount of data to copy
+      const void *inptr,
+      size_t in_row_stride,
+      size_t in_col_stride,
+      void *outptr,
+      size_t out_row_stride,
+      size_t out_col_stride,
+      unsigned int pad_top,
+      unsigned int pad_left,
+      unsigned int pad_bottom,
+      unsigned int pad_right
+    );
+};
+
+}
diff --git a/arm_compute/core/NEON/kernels/convolution/common/qasymm8.hpp b/arm_compute/core/NEON/kernels/convolution/common/qasymm8.hpp
new file mode 100644
index 0000000..6029cb6
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/convolution/common/qasymm8.hpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+#include <cstdint>
+
+namespace qasymm8
+{
+
+struct QAsymm8Params
+{
+  uint8_t quantize(float value) const;
+  float dequantize(uint8_t value) const;
+
+  uint8_t offset;
+  float scale;
+};
+
+struct QAsymm8RescaleParams
+{
+  static QAsymm8RescaleParams make_rescale_params(
+    const QAsymm8Params& weight_quant,
+    const QAsymm8Params& input_quant,
+    const QAsymm8Params& output_quant
+  );
+
+  QAsymm8RescaleParams(int32_t shift, int32_t multiplier, float rescale);
+
+  const int32_t shift, multiplier;
+  const float rescale;
+};
+
+}
diff --git a/arm_compute/core/NEON/kernels/convolution/common/tensor.hpp b/arm_compute/core/NEON/kernels/convolution/common/tensor.hpp
index 6567eeb..ad0a677 100644
--- a/arm_compute/core/NEON/kernels/convolution/common/tensor.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/common/tensor.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -54,6 +54,18 @@
   {
   }
 
+  inline int index(const int n, const int i, const int j, const int c) const
+  {
+    if (this->ordering == NHWC)
+    {
+      return ((n*this->n_rows + i)*this->n_cols + j)*this->n_channels + c;
+    }
+    else  // NCHW
+    {
+      return ((n*this->n_channels + c)*this->n_rows + i)*this->n_cols + j;
+    }
+  }
+
   inline int size() const
   {
     return n_batches * n_rows * n_cols * n_channels;
@@ -94,6 +106,18 @@
   {
   }
 
+  inline int index(int oc, int i, int j, int ic) const
+  {
+    if (this->ordering == HWIO)
+    {
+      return ((i*this->n_cols + j)*this->n_input_channels + ic)*this->n_output_channels + oc;
+    }
+    else  // OIHW
+    {
+      return ((oc*this->n_input_channels + ic)*this->n_rows + i)*this->n_cols + j;
+    }
+  }
+
   inline int size(void) const
   {
     return n_output_channels * n_rows * n_cols * n_input_channels;
@@ -127,7 +151,16 @@
       return shape.size() * sizeof(T);
     }
 
-    inline T& element(int, int, int, int) const;
+    /* Extract an element of the tensor.
+     *
+     * If the shape is a Tensor4DShape then the index is given as batch, row,
+     * column and channel.  If the shape is a KernelShape then the index is
+     * given as output channel, row, column and input channel.
+     */
+    inline T& element(const int a, const int b, const int c, const int d) const
+    {
+      return _data[shape.index(a, b, c, d)];
+    }
 
     inline void Clear() {
       Fill(static_cast<T>(0));
@@ -143,35 +176,3 @@
   private:
     T* const _data;
 };
-
-
-template <>
-inline float& Tensor4D<Tensor4DShape, float>::element(int n, int i, int j, int c) const
-{
-  int index;
-  if (shape.ordering == NHWC)
-  {
-    index = ((n*shape.n_rows + i)*shape.n_cols + j)*shape.n_channels + c;
-  }
-  else  // NCHW
-  {
-    index = ((n*shape.n_channels + c)*shape.n_rows + i)*shape.n_cols + j;
-  }
-  return _data[index];
-}
-
-
-template <>
-inline float& Tensor4D<KernelShape, float>::element(int oc, int i, int j, int ic) const
-{
-  int index;
-  if (shape.ordering == HWIO)
-  {
-    index = ((i*shape.n_cols + j)*shape.n_input_channels + ic)*shape.n_output_channels + oc;
-  }
-  else  // OIHW
-  {
-    index = ((oc*shape.n_input_channels + ic)*shape.n_rows + i)*shape.n_cols + j;
-  }
-  return _data[index];
-}
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp
index 6d9cb18..45e8da0 100644
--- a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp
@@ -24,42 +24,84 @@
 
 #pragma once
 
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/common/activation.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/common/padding.hpp"
+
 namespace depthwise
 {
 
+namespace nck = neon_convolution_kernels;
+
 class IDepthwiseConvolution
 {
   public:
     virtual ~IDepthwiseConvolution() = default;
-    virtual int output_size(const int dim_size, const bool padding_same) const = 0;
+
     virtual int output_size(
       int dim_size,
       unsigned int padding_before,
       unsigned int padding_after
     ) const = 0;
 
+    /* Set input tensor and stride. */
+    virtual void set_input(const void *inptr) = 0;
+    virtual void set_input(const void *inptr, int column_stride) = 0;
+    virtual void set_input(const void *inptr, int row_stride, int column_stride) = 0;
+    virtual void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) = 0;
+
+    /* Set output tensor and stride. */
+    virtual void set_output(void *outptr) = 0;
+    virtual void set_output(void *outptr, int column_stride) = 0;
+    virtual void set_output(void *outptr, int row_stride, int column_stride) = 0;
+    virtual void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) = 0;
+
+    /* Weights and biases are re-ordered to improve memory access patterns. Use
+     * these methods to determine the size of the re-pack buffer and to set the
+     * address (and implicitly reorder the weights and biases into) the buffer.
+     */
+    virtual size_t get_packed_params_size(void) const = 0;
+    virtual void set_packed_params_buffer(void *) = 0;
+
+    virtual void pack_params(const void *weights, const void *biases=nullptr) const = 0;
+    virtual void pack_params(void *buffer, const void *weights, const void *biases=nullptr) const = 0;
+    virtual void pack_params(
+      void *buffer,
+      const void* weights,
+      unsigned int weight_row_stride,
+      unsigned int weight_col_stride,
+      const void *biases=nullptr
+    ) const = 0;
+
+    /* Working space is used to pad tensors on the fly. Before running any
+     * inference check the amount of space required, allocate and provide a
+     * pointer to the convolution engine.
+     */
+    virtual size_t get_working_space_size(unsigned int nthreads=1) const = 0;
+    virtual void set_working_space(void *) = 0;
+
     virtual unsigned int get_window(void) const = 0;
-    virtual void set_offsets(int input_offset, int weights_offset) = 0;
-    virtual void run(const unsigned int start, const unsigned int stop) = 0;
+    virtual void run(
+      unsigned int start,
+      unsigned int stop,
+      unsigned int threadid=0
+    ) = 0;
 };
 
 template <
-  int OutputTileRows,
-  int OutputTileCols,
-  int KernelRows,
-  int KernelCols,
-  int StrideRows,
-  int StrideCols,
-  typename TIn,
-  typename TOut
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols,
+  typename TIn, typename TBias, typename TOut,
+  typename Derived
 >
-class DepthwiseConvolution : public IDepthwiseConvolution
+class DepthwiseConvolutionBase : public IDepthwiseConvolution
 {
   public:
-    typedef TIn InputType;
-    typedef TOut OutputType;
-
     // Information about the specific convolution instance
+    using InputType = TIn;
+    using BiasType = TBias;
+    using OutputType = TOut;
     static constexpr int output_tile_rows = OutputTileRows;
     static constexpr int output_tile_cols = OutputTileCols;
     static constexpr int kernel_rows = KernelRows;
@@ -71,260 +113,84 @@
 
     /** Create a new depthwise convolution engine.
      *
-     * @param[in]  n_batches Number of batches tensors.
-     * @param[in]  n_input_rows Number of rows in input tensor.
-     * @param[in]  n_input_cols Number of columns in input tensor.
-     * @param[in]  n_channels Number of channels in input and output tensors.
-     * @param[in]  padding_same True if padding is SAME, else VALID.
-     * @param[in]  weights Pointer to Height x Width x Channel ordered weights.
-     * @param[in]  input Pointer to NHWC ordered input tensor.
-     * @param[out] output Pointer to NHWC ordered output tensor.
+     * @param[in] n_batches Number of batches tensors.
+     * @param[in] n_input_rows Number of rows in input tensor.
+     * @param[in] n_input_cols Number of columns in input tensor.
+     * @param[in] n_channels Number of channels in input and output tensors.
      */
-    DepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols,
-      int n_channels, bool padding_same,
-      const TIn* const weights,
-      const TIn* const input,
-      TOut* const output
-    ) : DepthwiseConvolution(
-      n_batches, n_input_rows, n_input_cols, n_channels, padding_same,
-      weights, input, output, 0 /* column stride = default */
-    )
-    {
-    }
-
-    /** Create a new depthwise convolution engine.
-     *
-     * @param[in]  n_batches Number of batches tensors.
-     * @param[in]  n_input_rows Number of rows in input tensor.
-     * @param[in]  n_input_cols Number of columns in input tensor.
-     * @param[in]  n_channels Number of channels in input and output tensors.
-     * @param[in]  padding_top Padding to apply to top of input.
-     * @param[in]  padding_left Padding to apply to left of input.
-     * @param[in]  padding_bottom Padding to apply to bottom of input.
-     * @param[in]  padding_right Padding to apply to right of input.
-     * @param[in]  weights Pointer to Height x Width x Channel ordered weights.
-     * @param[in]  input Pointer to NHWC ordered input tensor.
-     * @param[out] output Pointer to NHWC ordered output tensor.
-     */
-    DepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols,
-      int n_channels,
+    DepthwiseConvolutionBase(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      nck::ActivationFunction activation,
       unsigned int padding_top,
       unsigned int padding_left,
       unsigned int padding_bottom,
-      unsigned int padding_right,
-      const TIn* const weights,
-      const TIn* const input,
-      TOut* const output
-    ) : DepthwiseConvolution(
-      n_batches, n_input_rows, n_input_cols, n_channels,
-      padding_top, padding_left, padding_bottom, padding_right,
-      weights, input, output, 0 /* column stride = default */
-    )
-    {
-    }
-
-    /** Create a new depthwise convolution engine with a specified column stride.
-     *
-     * @param[in]  n_batches Number of batches tensors.
-     * @param[in]  n_input_rows Number of rows in input tensor.
-     * @param[in]  n_input_cols Number of columns in input tensor.
-     * @param[in]  n_channels Number of channels in input and output tensors.
-     * @param[in]  padding_same True if padding is SAME, else VALID.
-     * @param[in]  weights Pointer to Height x Width x Channel ordered weights.
-     * @param[in]  input Pointer to NHWC ordered input tensor.
-     * @param[out] output Pointer to NHWC ordered output tensor.
-     * @param[in]  col_stride Stride between columns of the weights, inputs and output tensors.
-     */
-    DepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols,
-      int n_channels, bool padding_same,
-      const TIn* const weights,
-      const TIn* const input,
-      TOut* const output,
-      const int col_stride
-    ) : DepthwiseConvolution(
-      n_batches, n_input_rows, n_input_cols, n_channels, padding_same,
-      weights, input, output,
-      col_stride, 0,    /* Weight row stride = default */
-      col_stride, 0, 0, /* Input row stride, batch stride = default */
-      col_stride, 0, 0  /* Output row stride, batch stride = default */
-    )
-    {
-    }
-
-    /** Create a new depthwise convolution engine with a specified column stride.
-     *
-     * @param[in]  n_batches Number of batches tensors.
-     * @param[in]  n_input_rows Number of rows in input tensor.
-     * @param[in]  n_input_cols Number of columns in input tensor.
-     * @param[in]  n_channels Number of channels in input and output tensors.
-     * @param[in]  padding_top Padding to apply to top of input.
-     * @param[in]  padding_left Padding to apply to left of input.
-     * @param[in]  padding_bottom Padding to apply to bottom of input.
-     * @param[in]  padding_right Padding to apply to right of input.
-     * @param[in]  weights Pointer to Height x Width x Channel ordered weights.
-     * @param[in]  input Pointer to NHWC ordered input tensor.
-     * @param[out] output Pointer to NHWC ordered output tensor.
-     * @param[in]  col_stride Stride between columns of the weights, inputs and output tensors.
-     */
-    DepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols,
-      int n_channels,
-      unsigned int padding_top,
-      unsigned int padding_left,
-      unsigned int padding_bottom,
-      unsigned int padding_right,
-      const TIn* const weights,
-      const TIn* const input,
-      TOut* const output,
-      const int col_stride
-    ) : DepthwiseConvolution(
-      n_batches, n_input_rows, n_input_cols, n_channels,
-      padding_top, padding_left, padding_bottom, padding_right,
-      weights, input, output,
-      col_stride, 0,    /* Weight row stride = default */
-      col_stride, 0, 0, /* Input row stride, batch stride = default */
-      col_stride, 0, 0  /* Output row stride, batch stride = default */
-    )
-    {
-    }
-
-    /** Create a new depthwise convolution engine.
-     *
-     * @param[in]  n_batches Number of batches tensors.
-     * @param[in]  n_input_rows Number of rows in input tensor.
-     * @param[in]  n_input_cols Number of columns in input tensor.
-     * @param[in]  n_channels Number of channels in input and output tensors.
-     * @param[in]  padding_same True if padding is SAME, else VALID.
-     * @param[in]  weights Pointer to Height x Width x Channel ordered weights.
-     * @param[in]  input Pointer to NHWC ordered input tensor.
-     * @param[out] output Pointer to NHWC ordered output tensor.
-     * @param[in]  weight_col_stride Stride between columns of the weights (if 0, defaults appropriately).
-     * @param[in]  weight_row_stride Stride between rows of the weights (if 0, defaults appropriately).
-     * @param[in]  input_col_stride Stride between columns of the input tensor (if 0, defaults appropriately).
-     * @param[in]  input_row_stride Stride between rows of the input tensor (if 0, defaults appropriately).
-     * @param[in]  input_batch_stride Stride between batches of the input tensor (if 0, defaults appropriately).
-     * @param[in]  output_col_stride Stride between columns of the output tensor (if 0, defaults appropriately).
-     * @param[in]  output_row_stride Stride between rows of the output tensor (if 0, defaults appropriately).
-     * @param[in]  output_batch_stride Stride between batches of the output tensor (if 0, defaults appropriately).
-     */
-    DepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols,
-      int n_channels, bool padding_same,
-      const TIn* const weights,
-      const TIn* const input,
-      TOut* const output,
-      int weight_col_stride,
-      int weight_row_stride,
-      int input_col_stride,
-      int input_row_stride,
-      int input_batch_stride,
-      int output_col_stride,
-      int output_row_stride,
-      int output_batch_stride
-    );
-
-    /** Create a new depthwise convolution engine.
-     *
-     * @param[in]  n_batches Number of batches tensors.
-     * @param[in]  n_input_rows Number of rows in input tensor.
-     * @param[in]  n_input_cols Number of columns in input tensor.
-     * @param[in]  n_channels Number of channels in input and output tensors.
-     * @param[in]  padding_top Padding to apply to top of input.
-     * @param[in]  padding_left Padding to apply to left of input.
-     * @param[in]  padding_bottom Padding to apply to bottom of input.
-     * @param[in]  padding_right Padding to apply to right of input.
-     * @param[in]  weights Pointer to Height x Width x Channel ordered weights.
-     * @param[in]  input Pointer to NHWC ordered input tensor.
-     * @param[out] output Pointer to NHWC ordered output tensor.
-     * @param[in]  weight_col_stride Stride between columns of the weights (if 0, defaults appropriately).
-     * @param[in]  weight_row_stride Stride between rows of the weights (if 0, defaults appropriately).
-     * @param[in]  input_col_stride Stride between columns of the input tensor (if 0, defaults appropriately).
-     * @param[in]  input_row_stride Stride between rows of the input tensor (if 0, defaults appropriately).
-     * @param[in]  input_batch_stride Stride between batches of the input tensor (if 0, defaults appropriately).
-     * @param[in]  output_col_stride Stride between columns of the output tensor (if 0, defaults appropriately).
-     * @param[in]  output_row_stride Stride between rows of the output tensor (if 0, defaults appropriately).
-     * @param[in]  output_batch_stride Stride between batches of the output tensor (if 0, defaults appropriately).
-     */
-    DepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols,
-      int n_channels,
-      unsigned int padding_top,
-      unsigned int padding_left,
-      unsigned int padding_bottom,
-      unsigned int padding_right,
-      const TIn* const weights,
-      const TIn* const input,
-      TOut* const output,
-      int weight_col_stride,
-      int weight_row_stride,
-      int input_col_stride,
-      int input_row_stride,
-      int input_batch_stride,
-      int output_col_stride,
-      int output_row_stride,
-      int output_batch_stride
+      unsigned int padding_right
     );
 
     // Cannot copy or move a DepthwiseConvolution.
-    DepthwiseConvolution(DepthwiseConvolution&) = delete;
-    DepthwiseConvolution operator=(DepthwiseConvolution&) = delete;
+    DepthwiseConvolutionBase(DepthwiseConvolutionBase&) = delete;
+    DepthwiseConvolutionBase operator=(DepthwiseConvolutionBase&) = delete;
+
+    /* Set input tensor and strides. */
+    void set_input(const void *inptr) override;
+    void set_input(const void *inptr, int column_stride) override;
+    void set_input(const void *inptr, int row_stride, int column_stride) override;
+    void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) override;
+
+    /* Set output tensor and strides. */
+    void set_output(void *outptr) override;
+    void set_output(void *outptr, int column_stride) override;
+    void set_output(void *outptr, int row_stride, int column_stride) override;
+    void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) override;
 
     /** Get the number of output rows/columns.
      *
      * @param[in] dim_size Number of elements in the dimension (rows/columns)
      * @param[in] same_padding True if the padding is SAME, otherwise false.
      */
-    static int get_output_size(int dim_size, bool padding_same);
     static int get_output_size(
-      int dim_size,
-      unsigned int padding_before,
-      unsigned int padding_after
+      int dim_size, unsigned int padding_before, unsigned int padding_after
     );
 
-    /** Get the number of output rows/columns.
-     *
-     * @param[in] dim_size Number of elements in the dimension (rows/columns)
-     * @param[in] same_padding True if the padding is SAME, otherwise false.
-     */
-    int output_size(int dim_size, bool padding_same) const override
-    {
-      return DepthwiseConvolution<
-        OutputTileRows,
-        OutputTileCols,
-        KernelRows,
-        KernelCols,
-        StrideRows,
-        StrideCols,
-        TIn, TOut
-      >::get_output_size(dim_size, padding_same);
-    }
-
     int output_size(
-        int dim_size,
-        unsigned int padding_before,
-        unsigned int padding_after
-    ) const override
-    {
-      return DepthwiseConvolution<
-        OutputTileRows,
-        OutputTileCols,
-        KernelRows,
-        KernelCols,
-        StrideRows,
-        StrideCols,
-        TIn, TOut
-      >::get_output_size(dim_size, padding_before, padding_after);
-    }
+      int dim_size, unsigned int padding_before, unsigned int padding_after
+    ) const override;
 
-    /** Sets quantization offsets
-     *
-     * @param[in] input_offset   Input offset
-     * @param[in] weights_offset Weights offset
+    /* Determine how much memory is required to store the packed weights and
+     * biases.
      */
-     void set_offsets(int input_offset, int weights_offset) override;
+    size_t get_packed_params_size(void) const override;
+
+    /* Set the buffer for the packed weights and biases, and perform the
+     * packing.
+     */
+    void set_packed_params_buffer(void *buffer) override;
+
+    void pack_params(const void *weights, const void *biases=nullptr) const override;
+
+    void pack_params(
+      void *buffer,
+      const void *weights,
+      const void *biases=nullptr
+    ) const override;
+
+    void pack_params(
+      void *buffer,
+      const void *weights,
+      unsigned int weight_row_stride,
+      unsigned int weight_col_stride,
+      const void *biases=nullptr
+    ) const override;
+
+    /** Query the amount of working space required.
+     * @param[in] n_threads The largest number of threads which will be used
+     *            to execute the kernel.
+     */
+    size_t get_working_space_size(unsigned int n_threads=1) const override;
+
+    /** Set the working space buffer.
+     */
+    void set_working_space(void *buffer) override;
 
     /** Get the window of work to be performed by an instance of the operator.
      */
@@ -336,122 +202,282 @@
      *
      * @param[in] start Start of the window of work to perform.
      * @param[in] stop End of the work to perform.
+     * @param[in] threadid ID of the thread performing the work.
      */
-    void run(unsigned int start, unsigned int stop) override;
+    void run(
+      unsigned int start,
+      unsigned int stop,
+      unsigned int threadid=0
+    ) override;
 
   protected:
+    /** Get the value to use to pad the tensor.
+     */
+    TIn _input_padding_value(void) const;
+
+    /** Implementation of the parameter packing.
+     */
+    void _pack_params(
+      void *buffer,
+      const void *weights,
+      unsigned int weight_row_stride,
+      unsigned int weight_col_stride,
+      const void *biases=nullptr
+    ) const;
+
     /** Process a tile-row of the tensors.
      */
-    static void process_tile_row(
+    void process_tile_row(
+      unsigned int threadid,
       int n_channels,
-      const TIn* const weights,
-      const int weight_row_stride,
-      const int weight_col_stride,
-      const TIn* const inptr,
-      int in_row_stride,
-      int in_col_stride,
-      TOut* const outptr,
-      int out_row_stride,
-      int out_col_stride,
+      const void* packed_params,
+      const InputType* inptr,
+      OutputType* outptr,
       int row_pad_in_top,
       int row_pad_in_left,
       int row_pad_in_bottom,
       int row_pad_out_bottom,
       int n_tiles,
       int n_input_cols,
-      int n_output_cols,
-      int input_offset,
-      int weights_offset
+      int n_output_cols
     );
 
-    // Determine the maximum (and minimum) padding values which can be applied
-    // to tiles of the tensors involved in this class of convolution.
-    static constexpr int max_in_pad_top = (kernel_rows - 1) / 2;
-    static constexpr int min_in_pad_top = (kernel_rows - stride_rows) / 2;
-
-    static constexpr int max_in_pad_left = (kernel_cols - 1) / 2;
-    static constexpr int min_in_pad_left = (kernel_cols - stride_cols) / 2;
-
-    static constexpr int max_in_pad_bottom = inner_tile_rows;
-    static constexpr int max_in_pad_right = inner_tile_cols;
-    static constexpr int max_out_pad_bottom = output_tile_rows;
-    static constexpr int max_out_pad_right = output_tile_cols;
-
-    static constexpr int n_in_pad_top_fns = (max_in_pad_top - min_in_pad_top) + 1;
-    static constexpr int n_in_pad_left_fns = (max_in_pad_left - min_in_pad_left) + 1;
-    static constexpr int n_in_pad_bottom_fns = max_in_pad_bottom + 1;
-    static constexpr int n_in_pad_right_fns = max_in_pad_right + 1;
-    static constexpr int n_out_pad_bottom_fns = max_out_pad_bottom + 1;
-    static constexpr int n_out_pad_right_fns = max_out_pad_right + 1;
-
-    /** Pointer to a function which will process a tile.
+    /** Process a single tile of the tensor.
      *
-     * @param[in] n_channels Number of channels.
-     * @param[in] weights Pointer to Height x Width x Channels ordered weights.
-     * @param[in] inptr Pointer to the top-left unpadded value of the tile.
-     * @param[in] in_row_stride Stride between rows of the input tensor.
-     * @param[in] in_col_stride Stride between columns of the input tensor.
-     * @param[out] outptr Pointer to the top-left output value for the tile.
-     * @param[in] out_row_stride Stride between rows of the output tensor.
-     * @param[in] out_col_stride Stride between columns of the output tensor.
-     *
-     * The following parameters may be ignored if the function has been
-     * specialised for specific padding constraints.
-     *
-     * @param[in] _in_pad_top Padding to apply to top of input tile.
-     * @param[in] _in_pad_left Padding to apply to left of input tile.
-     * @param[in] _in_pad_bottom Padding to apply to bottom of input tile.
-     * @param[in] _in_pad_right Padding to apply to right of input tile.
-     * @param[in] _out_pad_bottom Null cells at bottom of output tile.
-     * @param[in] _out_pad_right Null cells at right of output tile.
+     * This method will apply input/output padding (if required) and call the
+     * depthwise tile implementation.
      */
-    typedef void (*TileFn)(
+    void process_tile(
+      unsigned int threadid,
       int n_channels,
-      const TIn* const weights,
-      int weight_row_stride,
-      int weight_col_stride,
-      const TIn* const inptr,
-      int in_row_stride,
-      int in_col_stride,
-      TOut* const outptr,
-      int out_row_stride,
-      int out_col_stride,
-      int _in_pad_top,
-      int _in_pad_left,
-      int _in_pad_bottom,
-      int _in_pad_right,
-      int _out_pad_bottom,
-      int _out_pad_right,
-      int _input_offset,
-      int _weights_offset
+      const void* packed_params,
+      const InputType* inptr,
+      OutputType* outptr,
+      int pad_in_top,
+      int pad_in_left,
+      int pad_in_bottom,
+      int pad_in_right,
+      int pad_out_bottom,
+      int pad_out_right
     );
 
-    /* Arrays of methods to process tensor tiles.
-     *
-     * Allows dynamic dispatch to specialized implementations based on
-     * different padding configurations.
+    /** Perform depthwise convolution on a single tile.
      */
-    static const TileFn tilefn_unpadded;
-    static const TileFn tilefn_top[n_in_pad_top_fns];
-    static const TileFn tilefn_left[n_in_pad_left_fns];
-    static const TileFn tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns];
-    static const TileFn tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns];
-    static const TileFn tilefn_generic;
+    template <nck::ActivationFunction Activation>
+    void execute_tile(
+      int n_channels,
+      const void* packed_params,
+      const InputType* inptr,
+      unsigned int in_row_stride,
+      unsigned int in_col_stride,
+      OutputType* outptr,
+      unsigned int out_row_stride,
+      unsigned int out_col_stride
+    );
+
+    int n_channels(void) const;
 
   private:
     // Member variables of instances of a convolution engine.
-    const TIn* const _weights;
-    const TIn* const _input;
-    TOut* const _output;
+    const InputType* _input;
+    OutputType* _output;
+    void* _packed_parameters;
+    void* _working_space;  // Per-thread working space
     const int _n_batches, _n_input_rows, _n_input_cols, _n_channels,
               _n_output_rows, _n_output_cols, _n_tile_rows, _n_tile_cols;
     const unsigned int _padding_top, _padding_left, _padding_bottom, _padding_right;
+    const nck::ActivationFunction _activation;
 
     // Stride information for a convolution instance
-    const int _weight_col_stride, _weight_row_stride;
-    const int _input_col_stride, _input_row_stride, _input_batch_stride;
-    const int _output_col_stride, _output_row_stride, _output_batch_stride;
-    int _input_offset, _weights_offset;
+    int _input_col_stride, _input_row_stride, _input_batch_stride;
+    const int _input_ws_col_stride, _input_ws_row_stride;
+    int _output_col_stride, _output_row_stride, _output_batch_stride;
+    const int _output_ws_col_stride, _output_ws_row_stride;
+
+    // Methods for getting access to working space
+    size_t _get_input_working_space_size(void) const;
+    size_t _get_output_working_space_size(void) const;
+
+    void *_get_input_working_space(unsigned int threadid) const;
+    void *_get_output_working_space(unsigned int threadid) const;
 };
 
+
+template <
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols,
+  typename TIn, typename TBias, typename TOut
+>
+class DepthwiseConvolution : public DepthwiseConvolutionBase<
+  OutputTileRows, OutputTileCols,
+  KernelRows, KernelCols,
+  StrideRows, StrideCols,
+  TIn, TBias, TOut,
+  DepthwiseConvolution<
+    OutputTileRows, OutputTileCols,
+    KernelRows, KernelCols,
+    StrideRows, StrideCols,
+    TIn, TBias, TOut
+  >
+>
+{
+  using Base = DepthwiseConvolutionBase<
+    OutputTileRows, OutputTileCols,
+    KernelRows, KernelCols,
+    StrideRows, StrideCols,
+    TIn, TBias, TOut,
+    DepthwiseConvolution<
+      OutputTileRows, OutputTileCols,
+      KernelRows, KernelCols,
+      StrideRows, StrideCols,
+      TIn, TBias, TOut
+  > >;
+  friend Base;
+  using InputType = typename Base::InputType;
+  using OutputType = typename Base::OutputType;
+
+  public:
+    using Base::DepthwiseConvolutionBase;
+
+  protected:
+    template <nck::ActivationFunction Activation>
+    void execute_tile(
+      int n_channels,
+      const void* packed_params,
+      const TIn* inptr,
+      unsigned int in_row_stride,
+      unsigned int in_col_stride,
+      TOut* outptr,
+      unsigned int out_row_stride,
+      unsigned int out_col_stride
+    );
+};
+
+
+template <
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols
+>
+class DepthwiseConvolution<
+  OutputTileRows, OutputTileCols,
+  KernelRows, KernelCols,
+  StrideRows, StrideCols,
+  float, float, float
+> : public DepthwiseConvolutionBase<
+  OutputTileRows, OutputTileCols,
+  KernelRows, KernelCols,
+  StrideRows, StrideCols,
+  float, float, float,
+  DepthwiseConvolution<
+    OutputTileRows, OutputTileCols,
+    KernelRows, KernelCols,
+    StrideRows, StrideCols,
+    float, float, float
+  >
+>
+{
+  using Base = DepthwiseConvolutionBase<
+    OutputTileRows, OutputTileCols,
+    KernelRows, KernelCols,
+    StrideRows, StrideCols,
+    float, float, float,
+    DepthwiseConvolution<
+      OutputTileRows, OutputTileCols,
+      KernelRows, KernelCols,
+      StrideRows, StrideCols,
+      float, float, float
+  > >;
+  friend Base;
+  using InputType = typename Base::InputType;
+  using OutputType = typename Base::OutputType;
+
+  public:
+    DepthwiseConvolution(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      nck::ActivationFunction activation,
+      unsigned int padding_top,
+      unsigned int padding_left,
+      unsigned int padding_bottom,
+      unsigned int padding_right
+    );
+
+  protected:
+    template <nck::ActivationFunction Activation>
+    void execute_tile(
+      int n_channels,
+      const void* packed_params,
+      const float* inptr,
+      unsigned int in_row_stride,
+      unsigned int in_col_stride,
+      float* outptr,
+      unsigned int out_row_stride,
+      unsigned int out_col_stride
+    );
+};
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols
+>
+class DepthwiseConvolution<
+  OutputTileRows, OutputTileCols,
+  KernelRows, KernelCols,
+  StrideRows, StrideCols,
+  float16_t, float16_t, float16_t
+> : public DepthwiseConvolutionBase<
+  OutputTileRows, OutputTileCols,
+  KernelRows, KernelCols,
+  StrideRows, StrideCols,
+  float16_t, float16_t, float16_t,
+  DepthwiseConvolution<
+    OutputTileRows, OutputTileCols,
+    KernelRows, KernelCols,
+    StrideRows, StrideCols,
+    float16_t, float16_t, float16_t
+  >
+>
+{
+  using Base = DepthwiseConvolutionBase<
+    OutputTileRows, OutputTileCols,
+    KernelRows, KernelCols,
+    StrideRows, StrideCols,
+    float16_t, float16_t, float16_t,
+    DepthwiseConvolution<
+      OutputTileRows, OutputTileCols,
+      KernelRows, KernelCols,
+      StrideRows, StrideCols,
+      float16_t, float16_t, float16_t
+  > >;
+  friend Base;
+  using InputType = typename Base::InputType;
+  using OutputType = typename Base::OutputType;
+
+  public:
+    DepthwiseConvolution(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      nck::ActivationFunction activation,
+      unsigned int padding_top,
+      unsigned int padding_left,
+      unsigned int padding_bottom,
+      unsigned int padding_right
+    );
+
+  protected:
+    template <nck::ActivationFunction Activation>
+    void execute_tile(
+      int n_channels,
+      const void* packed_params,
+      const float16_t* inptr,
+      unsigned int in_row_stride,
+      unsigned int in_col_stride,
+      float16_t* outptr,
+      unsigned int out_row_stride,
+      unsigned int out_col_stride
+    );
+};
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 }  // namespace depthwise
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
new file mode 100644
index 0000000..4c1d883
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/common/qasymm8.hpp"
+
+namespace depthwise
+{
+
+namespace nck = neon_convolution_kernels;
+
+template <
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols
+>
+class QAsymm8DepthwiseConvolution : public DepthwiseConvolutionBase<
+  OutputTileRows, OutputTileCols,
+  KernelRows, KernelCols,
+  StrideRows, StrideCols,
+  uint8_t, int32_t, uint8_t,
+  QAsymm8DepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols>
+>
+{
+  using Base = DepthwiseConvolutionBase<
+    OutputTileRows, OutputTileCols,
+    KernelRows, KernelCols,
+    StrideRows, StrideCols,
+    uint8_t, int32_t, uint8_t,
+    QAsymm8DepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols>
+  >;
+  friend Base;
+  using InputType = typename Base::InputType;
+  using OutputType = typename Base::OutputType;
+
+  public:
+    QAsymm8DepthwiseConvolution(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      nck::ActivationFunction activation,
+      const qasymm8::QAsymm8Params& weight_quantisation,
+      const qasymm8::QAsymm8Params& input_quantisation,
+      const qasymm8::QAsymm8Params& output_quantisation,
+      unsigned int padding_top,
+      unsigned int padding_left,
+      unsigned int padding_bottom,
+      unsigned int padding_right
+    );
+
+    QAsymm8DepthwiseConvolution(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      nck::ActivationFunction activation,
+      const qasymm8::QAsymm8Params& weight_quantisation,
+      const qasymm8::QAsymm8Params& input_quantisation,
+      const qasymm8::QAsymm8Params& output_quantisation,
+      const qasymm8::QAsymm8RescaleParams& rescale_parameters,
+      unsigned int padding_top,
+      unsigned int padding_left,
+      unsigned int padding_bottom,
+      unsigned int padding_right
+    );
+
+  protected:
+    static nck::ActivationFunction get_activation_fn(
+      nck::ActivationFunction activation,
+      const qasymm8::QAsymm8Params& output_quantisation
+    );
+
+    uint8_t _input_padding_value(void) const;
+
+    void _pack_params(
+      void *buffer,
+      const void *weights,
+      unsigned int weight_row_stride,
+      unsigned int weight_col_stride,
+      const void *biases=nullptr
+    ) const;
+
+    template <nck::ActivationFunction Activation>
+    void execute_tile(
+      int n_channels,
+      const void* packed_params,
+      const uint8_t* inptr,
+      unsigned int in_row_stride,
+      unsigned int in_col_stride,
+      uint8_t* outptr,
+      unsigned int out_row_stride,
+      unsigned int out_col_stride
+    );
+
+  private:
+    // Quantization parameters
+    const qasymm8::QAsymm8Params _weights_quant, _inputs_quant, _output_quant;
+    const qasymm8::QAsymm8RescaleParams rescale_parameters;
+};
+
+}  // namespace depthwise
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp
index b33f276..674fc4d 100644
--- a/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp
@@ -31,101 +31,73 @@
  */
 
 #include <algorithm>
+#include <cstdint>
 #include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/common/padding.hpp"
 #include "arm_compute/core/NEON/kernels/convolution/common/utils.hpp"
 
 #pragma once
 
+#define MEMBERFN(TOUT) template <\
+  unsigned int OutputTileRows, unsigned int OutputTileColumns,\
+  unsigned int KernelRows, unsigned int KernelColumns,\
+  unsigned int StrideRows, unsigned int StrideColumns,\
+  typename TIn, typename TBias, typename TOut,\
+  typename Derived\
+> TOUT DepthwiseConvolutionBase<\
+  OutputTileRows, OutputTileColumns,\
+  KernelRows, KernelColumns,\
+  StrideRows, StrideColumns,\
+  TIn, TBias, TOut, Derived\
+>
+
+using namespace neon_convolution_kernels;
+
 namespace depthwise
 {
 
+template <unsigned int KernelRows, unsigned int KernelColumns, size_t WeightSize, size_t BiasSize>
+struct PackParameters
+{
+  static void execute(
+    unsigned int n_channels,
+    void *buffer,
+    const void *weights,
+    unsigned int weight_row_stride,
+    unsigned int weight_col_stride,
+    const void *biases
+  );
+};
+
 const unsigned int CHANNEL_BLOCK = 16;
 
-namespace
-{
-  inline int pad_along_dim(
-    const bool padding_same,
-    const int kernel_dim,
-    const int stride_dim,
-    const int input_dim
-  )
-  {
-    if (!padding_same)
-      return 0;
-    if (input_dim % stride_dim)
-      return std::max(kernel_dim - (input_dim % stride_dim), 0);
-    else
-      return std::max(kernel_dim - stride_dim, 0);
-  }
-}  // namespace
-
-template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
-int DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::get_output_size(
-  const int dim_size, const bool same_padding
-)
-{
-  return iceildiv(dim_size - (same_padding ? 0 : (KC - 1)), SR);
-}
-
-template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
-int DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::get_output_size(
+MEMBERFN(int)::get_output_size(
   const int dim_size, const unsigned int padding_before, const unsigned int padding_after
 )
 {
-  return iceildiv(dim_size + padding_before + padding_after - KR + 1, SR);
+  return iceildiv(dim_size + padding_before + padding_after - KernelRows + 1, StrideRows);
 }
 
-template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
-DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::DepthwiseConvolution(
-  const int n_batches, const int n_input_rows, const int n_input_cols,
-  const int n_channels, const bool padding_same,
-  const TIn* const weights,
-  const TIn* const input,
-  TOut* const output,
-  const int weight_col_stride,
-  const int weight_row_stride,
-  const int input_col_stride,
-  const int input_row_stride,
-  const int input_batch_stride,
-  const int output_col_stride,
-  const int output_row_stride,
-  const int output_batch_stride
-) : DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>(
-  n_batches, n_input_rows, n_input_cols,
-  n_channels,
-  pad_along_dim(padding_same, KR, SR, n_input_rows) / 2,  /* top padding */
-  pad_along_dim(padding_same, KC, SC, n_input_cols) / 2,  /* left padding */
-  iceildiv(pad_along_dim(padding_same, KR, SR, n_input_rows), 2),  /* bottom padding */
-  iceildiv(pad_along_dim(padding_same, KC, SC, n_input_cols), 2),  /* right padding */
-  weights, input, output,
-  weight_col_stride, weight_row_stride,
-  input_col_stride, input_row_stride, input_batch_stride,
-  output_col_stride, output_row_stride, output_batch_stride
-)
+MEMBERFN(int)::output_size(
+  const int dim_size, const unsigned int padding_before, const unsigned int padding_after
+) const
 {
+  return get_output_size(dim_size, padding_before, padding_after);
 }
 
-
-template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
-DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::DepthwiseConvolution(
-  const int n_batches, const int n_input_rows, const int n_input_cols,
+MEMBERFN()::DepthwiseConvolutionBase(
+  const int n_batches,
+  const int n_input_rows,
+  const int n_input_cols,
   const int n_channels,
+  ActivationFunction activation,
   const unsigned int padding_top,
   const unsigned int padding_left,
   const unsigned int padding_bottom,
-  const unsigned int padding_right,
-  const TIn* const weights,
-  const TIn* const input,
-  TOut* const output,
-  const int weight_col_stride,
-  const int weight_row_stride,
-  const int input_col_stride,
-  const int input_row_stride,
-  const int input_batch_stride,
-  const int output_col_stride,
-  const int output_row_stride,
-  const int output_batch_stride
-) : _weights(weights), _input(input), _output(output),
+  const unsigned int padding_right
+) : _input(nullptr), _output(nullptr),
+    _packed_parameters(nullptr),
+    _working_space(nullptr),
     _n_batches(n_batches),
     _n_input_rows(n_input_rows),
     _n_input_cols(n_input_cols),
@@ -138,37 +110,157 @@
     _padding_left(padding_left),
     _padding_bottom(padding_bottom),
     _padding_right(padding_right),
-    _weight_col_stride(weight_col_stride ? weight_col_stride : _n_channels),
-    _weight_row_stride(weight_row_stride ? weight_row_stride : KC * _weight_col_stride),
-    _input_col_stride(input_col_stride ? input_col_stride : _n_channels),
-    _input_row_stride(input_row_stride ? input_row_stride : _n_input_cols * _input_col_stride),
-    _input_batch_stride(input_batch_stride ? input_batch_stride : _n_input_rows * _input_row_stride),
-    _output_col_stride(output_col_stride ? output_col_stride : _n_channels),
-    _output_row_stride(output_row_stride ? output_row_stride : _n_output_cols * _output_col_stride),
-    _output_batch_stride(output_batch_stride ? output_batch_stride : _n_output_rows * _output_row_stride),
-    _input_offset(0), _weights_offset(0)
+    _activation(activation),
+    _input_col_stride(0), _input_row_stride(0), _input_batch_stride(0),
+    _input_ws_col_stride(_n_channels),
+    _input_ws_row_stride(_input_ws_col_stride * inner_tile_cols),
+    _output_col_stride(0), _output_row_stride(0), _output_batch_stride(0),
+    _output_ws_col_stride(_n_channels),
+    _output_ws_row_stride(_output_ws_col_stride * OutputTileColumns)
 {
 }
 
+MEMBERFN(void)::set_input(const void* const inptr)
+{
+  set_input(inptr, _n_channels);
+}
 
-template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
-unsigned int DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::get_window() const
+MEMBERFN(void)::set_input(const void* const inptr, const int ld_col)
+{
+  set_input(inptr, _n_input_cols * ld_col, ld_col);
+}
+
+MEMBERFN(void)::set_input(const void* const inptr, const int ld_row, const int ld_col)
+{
+  set_input(inptr, _n_input_rows * ld_row, ld_row, ld_col);
+}
+
+MEMBERFN(void)::set_input(const void* const inptr, const int ld_batch, const int ld_row, const int ld_col)
+{
+  _input = static_cast<const TIn *>(inptr);
+  _input_batch_stride = ld_batch;
+  _input_row_stride = ld_row;
+  _input_col_stride = ld_col;
+}
+
+MEMBERFN(void)::set_output(void* const outptr)
+{
+  set_output(outptr, _n_channels);
+}
+
+MEMBERFN(void)::set_output(void* const outptr, const int ld_col)
+{
+  set_output(outptr, _n_output_cols * ld_col, ld_col);
+}
+
+MEMBERFN(void)::set_output(void* const outptr, const int ld_row, const int ld_col)
+{
+  set_output(outptr, _n_output_rows * ld_row, ld_row, ld_col);
+}
+
+MEMBERFN(void)::set_output(void* const outptr, const int ld_batch, const int ld_row, const int ld_col)
+{
+  _output = static_cast<TOut *>(outptr);
+  _output_batch_stride = ld_batch;
+  _output_row_stride = ld_row;
+  _output_col_stride = ld_col;
+}
+
+MEMBERFN(size_t)::get_packed_params_size(void) const
+{
+  return _n_channels * (sizeof(TIn)*KernelRows*KernelColumns + sizeof(TBias));
+}
+
+MEMBERFN(void)::set_packed_params_buffer(void *buffer)
+{
+  _packed_parameters = buffer;
+}
+
+MEMBERFN(void)::pack_params(const void *weights, const void *biases) const
+{
+  static_cast<const Derived *>(this)->pack_params(_packed_parameters, weights, biases);
+}
+
+MEMBERFN(void)::pack_params(void *buffer, const void *weights, const void *biases) const
+{
+  const unsigned int weight_col_stride = _n_channels;
+  const unsigned int weight_row_stride = KernelColumns * weight_col_stride;
+  static_cast<const Derived *>(this)->pack_params(
+    buffer, weights, weight_row_stride, weight_col_stride, biases
+  );
+}
+
+MEMBERFN(void)::pack_params(
+  void * const buffer,
+  const void * const weights,
+  const unsigned int weight_row_stride,
+  const unsigned int weight_col_stride,
+  const void * const biases
+) const
+{
+  static_cast<const Derived *>(this)->_pack_params(
+    buffer, weights, weight_row_stride, weight_col_stride, biases
+  );
+}
+
+MEMBERFN(void)::_pack_params(
+  void * const buffer,
+  const void * const weights,
+  const unsigned int weight_row_stride,
+  const unsigned int weight_col_stride,
+  const void * const biases
+) const
+{
+  // Default implementation
+  PackParameters<KernelRows, KernelColumns, sizeof(TIn), sizeof(TOut)>::execute(
+    _n_channels, buffer, weights, weight_row_stride, weight_col_stride, biases
+  );
+}
+
+MEMBERFN(size_t)::get_working_space_size(const unsigned int nthreads) const
+{
+  return nthreads * (
+    _get_input_working_space_size() + _get_output_working_space_size()
+  );
+}
+
+MEMBERFN(void)::set_working_space(void *buffer)
+{
+  _working_space = buffer;
+}
+
+MEMBERFN(size_t)::_get_input_working_space_size(void) const
+{
+  return sizeof(TIn) * inner_tile_rows * inner_tile_cols * _n_channels;
+}
+
+MEMBERFN(size_t)::_get_output_working_space_size(void) const
+{
+  return sizeof(TOut) * OutputTileRows * OutputTileColumns * _n_channels;
+}
+
+MEMBERFN(void *)::_get_input_working_space(const unsigned int threadid) const
+{
+  return static_cast<uint8_t*>(_working_space) + threadid * (
+    _get_input_working_space_size() + _get_output_working_space_size()
+  );
+}
+
+MEMBERFN(void *)::_get_output_working_space(const unsigned int threadid) const
+{
+  return static_cast<uint8_t*>(_get_input_working_space(threadid)) + _get_input_working_space_size();
+}
+
+MEMBERFN(unsigned int)::get_window() const
 {
   // Parallelise over blocks of channels.
   return iceildiv(_n_channels, CHANNEL_BLOCK);
 }
 
-template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
-void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::set_offsets(int input_offset, int weights_offset)
-{
-    _input_offset = input_offset;
-    _weights_offset = weights_offset;
-}
-
-template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
-void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::run(
+MEMBERFN(void)::run(
   const unsigned int start,
-  const unsigned int stop
+  const unsigned int stop,
+  const unsigned int threadid
 )
 {
   // Parallelise over blocks of channels
@@ -205,43 +297,38 @@
       const int output_row_bottom = (tile_i + 1)*output_tile_rows;
       const int output_row_pad_bottom = std::max(0, output_row_bottom - _n_output_rows);
 
+      // Get the offset into the packed parameters
+      const auto params_ptr = static_cast<const uint8_t*>(_packed_parameters) +
+        start_channel*(sizeof(TIn)*KernelRows*KernelColumns + sizeof(TBias));
+
       // Process the row
       process_tile_row(
+        threadid,
         stop_channel - start_channel,
-        _weights + start_channel, _weight_row_stride, _weight_col_stride,
-        inptr_row + start_channel, _input_row_stride, _input_col_stride,
-        outptr_row + start_channel, _output_row_stride, _output_col_stride,
+        params_ptr,
+        inptr_row + start_channel,
+        outptr_row + start_channel,
         input_row_pad_top, input_pad_left, input_row_pad_bottom,
         output_row_pad_bottom,
-        _n_tile_cols, _n_input_cols, _n_output_cols,
-        _input_offset, _weights_offset
+        _n_tile_cols, _n_input_cols, _n_output_cols
       );
     }
   }
 }
 
-
-template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
-void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::process_tile_row(
+MEMBERFN(void)::process_tile_row(
+  const unsigned int threadid,
   const int n_channels,
-  const TIn* const weights,
-  const int weight_row_stride,
-  const int weight_col_stride,
+  const void* const packed_params,
   const TIn* const inptr,
-  const int in_row_stride,
-  const int in_col_stride,
   TOut* const outptr,
-  const int out_row_stride,
-  const int out_col_stride,
   const int row_pad_in_top,
   const int row_pad_in_left,
   const int row_pad_in_bottom,
   const int row_pad_out_bottom,
   const int n_tiles,
   const int n_input_cols,
-  const int n_output_cols,
-  const int input_offset,
-  const int weights_offset
+  const int n_output_cols
 )
 {
   constexpr int tile_overlap = kernel_cols - stride_cols;
@@ -261,264 +348,97 @@
 
     // Get pointers into the inputs and outputs
     const int col_offset = (tile_j == 0) ? 0 : row_pad_in_left;
-    const TIn* const inptr_col = (inptr + ((inner_tile_cols - tile_overlap)*tile_j - col_offset)*in_col_stride);
-    TOut* const outptr_col = outptr + tile_j * output_tile_cols * out_col_stride;
+    const TIn* const inptr_col = (inptr + ((inner_tile_cols - tile_overlap)*tile_j - col_offset)*_input_col_stride);
+    TOut* const outptr_col = outptr + tile_j * output_tile_cols * _output_col_stride;
 
-    // Apply the specific tile processing function
-    const bool pad_top = row_pad_in_top > 0;
-    const bool pad_left = t_pad_in_left > 0;
-    const bool pad_bottom = row_pad_in_bottom || row_pad_out_bottom;
-    const bool pad_right = t_pad_in_right || t_pad_out_right;
-
-    const TileFn tilefn = [&] () {
-      if (!pad_top && !pad_left && !pad_bottom && !pad_right)
-      {
-        // No padding
-        return tilefn_unpadded;
-      }
-      else if (pad_top && !pad_left && !pad_bottom && !pad_right)
-      {
-        // Padding on the top only, subtract off the minimum expected padding in
-        // order to index into the array of specialised methods.
-        const int index = row_pad_in_top - min_in_pad_top;
-        return tilefn_top[index];
-      }
-      else if (!pad_top && pad_left && !pad_bottom && !pad_right)
-      {
-        // Padding on the left only, subtract off the minimum expected padding in
-        // order to index into the array of specialised methods.
-        const int index = t_pad_in_left - min_in_pad_left;
-        return tilefn_left[index];
-      }
-      else if (!pad_top && !pad_left && pad_bottom && !pad_right)
-      {
-        // Padding on the bottom only
-        return tilefn_bottom[row_pad_in_bottom][row_pad_out_bottom];
-      }
-      else if (!pad_top && !pad_left && !pad_bottom && pad_right)
-      {
-        // Padding on the right only
-        return tilefn_right[t_pad_in_right][t_pad_out_right];
-      }
-      else
-      {
-        // Otherwise use generic tile processing method.
-        return tilefn_generic;
-      }
-    }();
-
-    tilefn(
-      n_channels,
-      weights, weight_row_stride, weight_col_stride,
-      inptr_col, in_row_stride, in_col_stride,
-      outptr_col, out_row_stride, out_col_stride,
-      row_pad_in_top, t_pad_in_left, row_pad_in_bottom, t_pad_in_right,
-      row_pad_out_bottom, t_pad_out_right, input_offset, weights_offset
+    // Process just this tile
+    process_tile(
+      threadid, n_channels, packed_params, inptr_col, outptr_col,
+      row_pad_in_top, t_pad_in_left, row_pad_in_bottom, t_pad_in_right,  // Input paddings
+      row_pad_out_bottom, t_pad_out_right  // Output paddings
     );
   }
 }
 
-
-// New templated struct used solely as a way to provide tile processing
-// specialisations.
-template <int OutputTileRows, int OutputTileCols,
-          int KernelRows, int KernelCols,
-          int StrideRows, int StrideCols,
-          typename TIn, typename TOut>
-struct DepthwiseConvolutionImpl : public DepthwiseConvolution<
-    OutputTileRows, OutputTileCols,
-    KernelRows, KernelCols,
-    StrideRows, StrideCols, TIn, TOut
->
+MEMBERFN(TIn)::_input_padding_value(void) const
 {
-  typedef DepthwiseConvolution<
-    OutputTileRows, OutputTileCols,
-    KernelRows, KernelCols,
-    StrideRows, StrideCols,
-    TIn, TOut
-  > DWC;
+  return static_cast<TIn>(0);
+}
 
-  /** Perform the depthwise convolution of a tile.
-   *
-   * @param[in] n_channels Number of channels.
-   * @param[in] weights Pointer to Height x Width x Channels ordered weights.
-   * @param[in] inptr Pointer to the top-left unpadded value of the tile.
-   * @param[in] in_row_stride Stride between rows of the input tensor.
-   * @param[in] in_col_stride Stride between columns of the input tensor.
-   * @param[out] outptr Pointer to the top-left output value for the tile.
-   * @param[in] out_row_stride Stride between rows of the output tensor.
-   * @param[in] out_col_stride Stride between columns of the output tensor.
-   *
-   * The following parameters may be ignored if the function has been
-   * specialised for specific padding constraints.
-   *
-   * @param[in] _in_pad_top Padding to apply to top of input tile.
-   * @param[in] _in_pad_left Padding to apply to left of input tile.
-   * @param[in] _in_pad_bottom Padding to apply to bottom of input tile.
-   * @param[in] _in_pad_right Padding to apply to right of input tile.
-   * @param[in] _out_pad_bottom Null cells at bottom of output tile.
-   * @param[in] _out_pad_right Null cells at right of output tile.
-   */
-  template <
-    bool Specialize=false,  // Specialize (or not) the method
-    int InPadTop=0,         // If specialized, top padding
-    int InPadLeft=0,        // If specialized, left padding
-    int InPadBottom=0,      // If specialized, bottom padding
-    int InPadRight=0,       // If specialized, right padding
-    int OutPadBottom=0,     // If specialized, bottom output padding
-    int OutPadRight=0       // If specialized, bottom right padding
-  >
-  static void process_tile(
-    const int n_channels,
-    const TIn* const weights,
-    const int weight_row_stride,
-    const int weight_col_stride,
-    const TIn* const inptr,
-    const int in_row_stride,
-    const int in_col_stride,
-    TOut* const outptr,
-    const int out_row_stride,
-    const int out_col_stride,
-    const int in_pad_top=0,
-    const int in_pad_left=0,
-    const int in_pad_bottom=0,
-    const int in_pad_right=0,
-    const int out_pad_bottom=0,
-    const int out_pad_right=0,
-    const int input_offset=0,
-    const int weights_offset=0
-  );
-};
-
-
-template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
-template <
-  bool Specialize,
-  int InPadTop, int InPadLeft, int InPadBottom, int InPadRight,
-  int OutPadBottom, int OutPadRight
->
-void DepthwiseConvolutionImpl<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::process_tile(
+MEMBERFN(void)::process_tile(
+  const unsigned int threadid,
   const int n_channels,
-  const TIn *__restrict__ const weights,
-  const int weight_row_stride,
-  const int weight_col_stride,
-  const TIn *__restrict__ const inptr,
-  const int in_row_stride,
-  const int in_col_stride,
-  TOut *__restrict__ const outptr,
-  const int out_row_stride,
-  const int out_col_stride,
-  const int _in_pad_top,
-  const int _in_pad_left,
-  const int _in_pad_bottom,
-  const int _in_pad_right,
-  const int _out_pad_bottom,
-  const int _out_pad_right,
-  const int _input_offset,
-  const int _weights_offset
+  const void* const packed_params,
+  const TIn* const inptr,
+  TOut* const outptr,
+  const int pad_in_top,
+  const int pad_in_left,
+  const int pad_in_bottom,
+  const int pad_in_right,
+  const int pad_out_bottom,
+  const int pad_out_right
 )
 {
-  constexpr auto inner_tile_rows = DWC::inner_tile_rows;
-  constexpr auto inner_tile_cols = DWC::inner_tile_cols;
-  constexpr auto kernel_rows = DWC::kernel_rows;
-  constexpr auto kernel_cols = DWC::kernel_cols;
-  constexpr auto output_tile_rows = DWC::output_tile_rows;
-  constexpr auto output_tile_cols = DWC::output_tile_cols;
-  constexpr auto stride_rows = DWC::stride_rows;
-  constexpr auto stride_cols = DWC::stride_cols;
+  const bool pad_input = pad_in_top || pad_in_left || pad_in_bottom || pad_in_right;
+  const bool pad_output = pad_out_bottom || pad_out_right;
 
-  // Extract parameters
-  const int in_pad_top = Specialize ? InPadTop : _in_pad_top;
-  const int in_pad_left = Specialize ? InPadLeft : _in_pad_left;
-  const int in_pad_bottom = Specialize ? InPadBottom : _in_pad_bottom;
-  const int in_pad_right = Specialize ? InPadRight : _in_pad_right;
-  const int out_pad_bottom = Specialize ? OutPadBottom : _out_pad_bottom;
-  const int out_pad_right = Specialize ? OutPadRight : _out_pad_right;
-
-  // Compute valid ranges of the tile
-  const int in_cells_i = inner_tile_rows - in_pad_bottom;
-  const int in_cells_j = inner_tile_cols - in_pad_right;
-  const int out_cells_i = output_tile_rows - out_pad_bottom;
-  const int out_cells_j = output_tile_cols - out_pad_right;
-
-  // Instantiate pointers
-  const TIn* __restrict__ inptr_base = inptr;
-  const TIn* __restrict__ wptr_base = weights;
-  TOut* __restrict__ outptr_base = outptr;
-
-  // Perform the depthwise convolution
-  int channels_remaining = n_channels;
-  for (; channels_remaining; channels_remaining--)
+  if (pad_input)
   {
-    // Load input tile
-    TIn u[inner_tile_rows][inner_tile_cols];
-    for (int i = 0; i < inner_tile_rows; i++)
-    {
-      const TIn* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
-      for (int j = 0; j < inner_tile_cols; j++)
-      {
-        if (i < in_pad_top || in_cells_i <= i ||
-            j < in_pad_left || in_cells_j <= j)
-        {
-          u[i][j] = static_cast<TIn>(0);
-        }
-        else
-        {
-          u[i][j] = *(inptr_row + (j - in_pad_left)*in_col_stride);
-        }
-      }
-    }
-    inptr_base++;
-
-    // Load weights tile
-    TIn w[kernel_rows][kernel_cols];
-    for (int i = 0; i < kernel_rows; i++)
-    {
-      const TIn* const wptr_row = wptr_base + i*weight_row_stride;
-      for (int j = 0; j < kernel_cols; j++)
-      {
-        w[i][j] = *(wptr_row + j*weight_col_stride);
-      }
-    }
-    wptr_base++;
-
-    // Perform the convolution
-    TOut v[output_tile_rows][output_tile_cols];
-    for (int out_i = 0; out_i < out_cells_i; out_i++)
-    {
-      for (int out_j = 0; out_j < out_cells_j; out_j++)
-      {
-        // Clear the accumulator
-        v[out_i][out_j] = static_cast<TOut>(0);
-
-        // Base co-ordinate
-        const int base_i = out_i * stride_rows;
-        const int base_j = out_j * stride_cols;
-
-        // Fill the accumulator
-        for (int in_i = 0; in_i < kernel_rows; in_i++)
-        {
-          const int i = base_i + in_i;
-          for (int in_j = 0; in_j < kernel_cols; in_j++)
-          {
-            const int j = base_j + in_j;
-            v[out_i][out_j] += w[in_i][in_j] * u[i][j];
-          }
-        }
-      }
-    }
-
-    // Store the output tile
-    for (int i = 0; i < out_cells_i; i++)
-    {
-      TOut* __restrict__ const outptr_row = outptr_base + i*out_row_stride;
-      for (int j = 0; j < out_cells_j; j++)
-      {
-        *(outptr_row + j*out_col_stride) = v[i][j];
-      }
-    }
-    outptr_base++;
+    // Copy the input into the temporary buffer, applying padding
+    padding::copy_and_pad_tile<TIn>(
+      inner_tile_rows, inner_tile_cols, n_channels,
+      inptr, _input_row_stride, _input_col_stride,
+      static_cast<TIn *>(_get_input_working_space(threadid)), _input_ws_row_stride, _input_ws_col_stride,
+      pad_in_top, pad_in_left, pad_in_bottom, pad_in_right,
+      static_cast<Derived *>(this)->_input_padding_value()
+    );
   }
+
+  // Execute the kernel
+  const TIn * const tile_inptr = !pad_input ? inptr : static_cast<const TIn *>(_get_input_working_space(threadid));
+  const int in_row_stride = !pad_input ? _input_row_stride : _input_ws_row_stride;
+  const int in_col_stride = !pad_input ? _input_col_stride : _input_ws_col_stride;
+
+  TOut * const tile_outptr = !pad_output ? outptr : static_cast<TOut *>(_get_output_working_space(threadid));
+  const int out_row_stride = !pad_output ? _output_row_stride : _output_ws_row_stride;
+  const int out_col_stride = !pad_output ? _output_col_stride : _output_ws_col_stride;
+
+  Derived * dthis = static_cast<Derived *>(this);
+
+  switch(_activation)
+  {
+    case ActivationFunction::ReLU:
+      dthis->template execute_tile<ActivationFunction::ReLU>(
+        n_channels, packed_params, tile_inptr, in_row_stride, in_col_stride, tile_outptr, out_row_stride, out_col_stride
+      );
+      break;
+    case ActivationFunction::ReLU6:
+      dthis->template execute_tile<ActivationFunction::ReLU6>(
+        n_channels, packed_params, tile_inptr, in_row_stride, in_col_stride, tile_outptr, out_row_stride, out_col_stride
+      );
+      break;
+    default:
+      dthis->template execute_tile<ActivationFunction::None>(
+        n_channels, packed_params, tile_inptr, in_row_stride, in_col_stride, tile_outptr, out_row_stride, out_col_stride
+      );
+      break;
+  }
+
+  if (pad_output)
+  {
+    // Copy the output from the temporary buffer, removing unnecessary values
+    padding::CopyCropped<OutputTileRows, OutputTileColumns>::execute(
+      n_channels * sizeof(TOut),
+      _get_output_working_space(threadid), _output_ws_row_stride * sizeof(TOut), _output_ws_col_stride * sizeof(TOut),
+      outptr, _output_row_stride * sizeof(TOut), _output_col_stride * sizeof(TOut),
+      0, 0, pad_out_bottom, pad_out_right
+    );
+  }
+}
+
+MEMBERFN(int)::n_channels(void) const
+{
+  return _n_channels;
 }
 
 }  // namespace depthwise
diff --git a/arm_compute/core/utils/misc/InfoHelpers.h b/arm_compute/core/utils/misc/InfoHelpers.h
new file mode 100644
index 0000000..704e178
--- /dev/null
+++ b/arm_compute/core/utils/misc/InfoHelpers.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_MISC_INFO_HELPERS_H__
+#define __ARM_COMPUTE_MISC_INFO_HELPERS_H__
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+namespace utils
+{
+namespace info_helpers
+{
+/** Checks if activation information corresponds to a relu activation function
+ *
+ * @param[in] activation_info Activation metadata
+ *
+ * @return True if activation metadata corresponds to a relu activation else false
+ */
+inline bool is_relu(ActivationLayerInfo activation_info)
+{
+    return activation_info.enabled() && activation_info.activation() == ActivationLayerInfo::ActivationFunction::RELU;
+}
+
+/** Checks if activation information corresponds to a relu6 activation function
+ *
+ * @param[in] activation_info Activation metadata
+ *
+ * @return True if activation metadata corresponds to a relu6 activation else false
+ */
+inline bool is_relu6(ActivationLayerInfo activation_info)
+{
+    return activation_info.enabled()
+           && activation_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
+           && activation_info.a() == 6.f && activation_info.b() == 0.f;
+}
+} // namespace info_helpers
+} // namespace utils
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_MISC_INFO_HELPERS_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
index e2fe11e..28f0560 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -37,6 +37,7 @@
 #include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEPermute.h"
+#include "arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h"
 #include "arm_compute/runtime/Tensor.h"
 
 namespace arm_compute
@@ -53,7 +54,15 @@
 {
 public:
     /** Default constructor */
-    NEDepthwiseConvolutionLayer3x3();
+    NEDepthwiseConvolutionLayer3x3(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDepthwiseConvolutionLayer3x3(const NEDepthwiseConvolutionLayer3x3 &) = delete;
+    /** Default move constructor */
+    NEDepthwiseConvolutionLayer3x3(NEDepthwiseConvolutionLayer3x3 &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDepthwiseConvolutionLayer3x3 &operator=(const NEDepthwiseConvolutionLayer3x3 &) = delete;
+    /** Default move assignment operator */
+    NEDepthwiseConvolutionLayer3x3 &operator=(NEDepthwiseConvolutionLayer3x3 &&) = default;
     /** Initialize the function's source, destination, kernels and border_size.
      *
      * @param[in, out] input            Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
@@ -86,9 +95,44 @@
 
     // Inherited methods overriden:
     void run() override;
+    void prepare() override;
 
 private:
+    /** Configure the kernels/functions for the generic pipeline.
+     *
+     * @param[in, out] input            Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
+     * @param[in]      weights          Weights tensor. These are 3D tensors with shape [3, 3, IFM]. Data type supported: Same as @p input.
+     * @param[in]      biases           (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+     *                                  Data type supported: Same as @p input.
+     * @param[out]     output           Destination tensor. Data type supported: same as @p input.
+     * @param[in]      conv_info        Padding and stride information to use for the convolution.
+     * @param[in]      depth_multiplier Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+     * @param[in]      act_info         Activation layer information in case of a fused activation.
+     */
+    void configure_generic(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
+                           unsigned int depth_multiplier, const ActivationLayerInfo &act_info);
+    /** Configure the kernels/functions for the optimized pipeline.
+     *
+     * @param[in]  input            Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
+     * @param[in]  weights          Weights tensor. These are 3D tensors with shape [3, 3, IFM]. Data type supported: Same as @p input.
+     * @param[in]  biases           (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+     *                              Data type supported: Same as @p input.
+     * @param[out] output           Destination tensor. Data type supported: same as @p input.
+     * @param[in]  conv_info        Padding and stride information to use for the convolution.
+     * @param[in]  depth_multiplier Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+     * @param[in]  act_info         Activation layer information in case of a fused activation.
+     */
+    void configure_optimized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
+                             unsigned int depth_multiplier, const ActivationLayerInfo &act_info);
+    /** Run generic kernel */
+    void run_generic();
+    /** Run optimized function */
+    void run_optimized();
+
+private:
+    MemoryGroup                               _memory_group;
     NEDepthwiseConvolutionLayer3x3Kernel      _dwc_kernel;
+    NEDepthwiseConvolutionAssemblyDispatch    _dwc_optimized_func;
     NEDirectConvolutionLayerOutputStageKernel _output_stage_kernel;
     NEFillBorderKernel                        _border_handler;
     NEPermute                                 _permute_input;
@@ -99,14 +143,14 @@
     Tensor                                    _permuted_input;
     Tensor                                    _permuted_weights;
     Tensor                                    _permuted_output;
+    const ITensor                            *_original_weights;
     bool                                      _has_bias;
     bool                                      _is_quantized;
     bool                                      _is_optimized;
-    bool                                      _are_weights_reshaped;
     bool                                      _is_nchw;
-    bool                                      _is_first_run;
     bool                                      _permute;
     bool                                      _is_activationlayer_enabled;
+    bool                                      _is_prepared;
 };
 
 /** Basic function to execute a generic depthwise convolution. This function calls the following NEON kernels:
diff --git a/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h b/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h
new file mode 100644
index 0000000..df8f29d
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDEPTHWISECONVOLUTIONASSEMBLYDISPATCH_H__
+#define __ARM_COMPUTE_NEDEPTHWISECONVOLUTIONASSEMBLYDISPATCH_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include "arm_compute/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h"
+#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp"
+
+namespace arm_compute
+{
+/** Depthwise convolution assembly kernel glue */
+class NEDepthwiseConvolutionAssemblyDispatch : public IFunction
+{
+public:
+    /** Default constructor
+     *
+     * @param[in,out] memory_manager Memory manager to use
+     */
+    NEDepthwiseConvolutionAssemblyDispatch(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDepthwiseConvolutionAssemblyDispatch(const NEDepthwiseConvolutionAssemblyDispatch &) = delete;
+    /** Default move constructor */
+    NEDepthwiseConvolutionAssemblyDispatch(NEDepthwiseConvolutionAssemblyDispatch &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDepthwiseConvolutionAssemblyDispatch &operator=(const NEDepthwiseConvolutionAssemblyDispatch &) = delete;
+    /** Default move assignment operator */
+    NEDepthwiseConvolutionAssemblyDispatch &operator=(NEDepthwiseConvolutionAssemblyDispatch &&) = default;
+    /** Initialize the function's source, destination, kernels and border_size.
+     *
+     * @note Supports only NHWC format
+     *
+     * @param[in]  input            Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
+     * @param[in]  weights          Weights tensor. These are 3D tensors with shape [3, 3, IFM]. Data type supported: Same as @p input.
+     * @param[in]  bias             (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+     *                              Data type supported: Same as @p input.
+     * @param[out] output           Destination tensor. Data type supported: same as @p input.
+     * @param[in]  conv_info        Padding and stride information to use for the convolution.
+     * @param[in]  depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+     * @param[in]  act_info         (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(const ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output,
+                   const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionAssemblyDispatch
+     *
+     * @note Supports only NHWC format
+     *
+     * @param[in]  input            Source tensor info. Data type supported: QASYMM8/F16/F32.
+     * @param[in]  weights          Weights tensor. These are 3D tensors with shape [3, 3, IFM]. Data type supported: Same as @p input.
+     * @param[in]  bias             (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+     *                              Data type supported: Same as @p input.
+     * @param[in]  output           Destination tensor info. Data type supported: same as @p input.
+     * @param[in]  conv_info        Padding and stride information to use for the convolution.
+     * @param[in]  depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+     * @param[in]  act_info         (Optional) Activation layer information in case of a fused activation.
+     *
+     * @return An error status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output,
+                           const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Check if the optimized kernel can be used for the given kernel sizes and strides
+     *
+     * @warning Even if this return true the inputs and outputs might need to get permuted as the only layout supported is NHWC
+     *
+     * @param[in] input            Input tensor info.
+     * @param[in] weights          Weights tensor info.
+     * @param[in] conv_info        Convolution layer metadata.
+     * @param[in] depth_multiplier (Optional) Depth multiplier to be used.
+     *
+     * @return True if the assembly kernel could be used else false. Note that transformations of input/output could be needed.
+     */
+    static bool is_optimized_supported(const ITensorInfo *input, const ITensorInfo *weights, PadStrideInfo conv_info, unsigned int depth_multiplier = 1);
+
+    // Inherited methods overridden:
+    void run() override;
+    void prepare() override;
+
+private:
+    MemoryGroup                                       _memory_group;
+    const ITensor                                    *_input;
+    const ITensor                                    *_weights;
+    const ITensor                                    *_bias;
+    ITensor                                          *_output;
+    Tensor                                            _packed_weights;
+    Tensor                                            _workspace;
+    bool                                              _is_prepared;
+    std::unique_ptr<depthwise::IDepthwiseConvolution> _dwc_assembly_kernel;
+    NEDepthwiseConvolutionAssemblyKernelWrapper       _dwc_acl_kernel;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEDEPTHWISECONVOLUTIONASSEMBLYDISPATCH_H__ */