Remove padding from direct convolution - OpenCL

- Refactor direct convolution for NHWC
- Remove old kernels for NHWC
- Change the heuristic in CLConvolutionLayer.cpp. The new direct
  convolution implementation is faster than FFT

Resolves COMPMID-3908

Change-Id: Iee15ce7b04e21847b6eaae5c6d3c1b18180e7efc
Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4876
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
index 8884521..91ff35b 100644
--- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -53,8 +53,6 @@
     const int        channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != weights->dimension(height_idx), "Weights should have same width and height");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5 && weights->dimension(width_idx) != 9,
-                                    "Kernel sizes other than 1x1, 3x3, 5x5 or 9x9 are not supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != input->dimension(channel_idx),
                                     "Weights feature map dimension should match the respective input's one");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional");
@@ -63,6 +61,20 @@
                                     && std::get<0>(conv_info.stride()) > 2,
                                     "Strides larger than 2 not supported for 3x3, 5x5, 9x9 convolution.");
 
+    if(data_layout == DataLayout::NCHW)
+    {
+        if(is_data_type_quantized(input->data_type()))
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5 && weights->dimension(width_idx) != 9,
+                                            "Kernel sizes other than 1x1, 3x3, 5x5 or 9x9 are not supported with quantised data types");
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5,
+                                            "Kernel sizes other than 1x1, 3x3 or 5x5 are not supported with float data types");
+        }
+    }
+
     if(biases != nullptr)
     {
         if(is_data_type_quantized_asymmetric(input->data_type()))
@@ -102,8 +114,8 @@
     return Status{};
 }
 
-inline bool can_run_optimized_kernel_for_bifrost(GPUTarget gpu_target, unsigned int conv_stride_x, unsigned int conv_stride_y, unsigned int kernel_size,
-                                                 DataType data_type, DataLayout data_layout)
+inline bool can_run_optimized_kernel_for_bifrost_nchw(GPUTarget gpu_target, unsigned int conv_stride_x, unsigned int conv_stride_y, unsigned int kernel_size,
+                                                      DataType data_type, DataLayout data_layout)
 {
     return gpu_target_is_in(gpu_target,
                             GPUTarget::G71, GPUTarget::G72, GPUTarget::G76,
@@ -115,29 +127,16 @@
            && (data_layout == DataLayout::NCHW);
 }
 
-inline bool can_run_optimized_kernel_for_bifrost_nhwc(GPUTarget gpu_target, unsigned int conv_stride_x, unsigned int conv_stride_y, unsigned int kernel_size,
-                                                      DataType data_type, DataLayout data_layout)
-{
-    return gpu_target_is_in(gpu_target,
-                            GPUTarget::G71, GPUTarget::G72, GPUTarget::G76,
-                            GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT,
-                            GPUTarget::G52, GPUTarget::G52LIT)
-           && (kernel_size == 9)
-           && (conv_stride_x == 1) && (conv_stride_y == 1)
-           && (data_type == DataType::F32)
-           && (data_layout == DataLayout::NHWC);
-}
-
-inline void setup_num_elems(unsigned int &num_elems_read_per_iteration_x, unsigned int &num_elems_read_per_iteration_y,
-                            unsigned int &num_elems_written_per_iteration_x, unsigned int &num_elems_written_per_iteration_y,
-                            unsigned int kernel_size, const PadStrideInfo &conv_info, const GPUTarget target, ITensorInfo *input)
+inline void setup_num_elems_nchw(unsigned int &num_elems_read_per_iteration_x, unsigned int &num_elems_read_per_iteration_y,
+                                 unsigned int &num_elems_written_per_iteration_x, unsigned int &num_elems_written_per_iteration_y,
+                                 unsigned int kernel_size, const PadStrideInfo &conv_info, const GPUTarget target, ITensorInfo *input)
 {
     const DataType   data_type     = input->data_type();
     const DataLayout data_layout   = input->data_layout();
     unsigned int     conv_stride_x = std::get<0>(conv_info.stride());
     unsigned int     conv_stride_y = std::get<1>(conv_info.stride());
 
-    const bool run_optimized_bifrost = can_run_optimized_kernel_for_bifrost(target, conv_stride_x, conv_stride_y, kernel_size, data_type, data_layout);
+    const bool run_optimized_bifrost = can_run_optimized_kernel_for_bifrost_nchw(target, conv_stride_x, conv_stride_y, kernel_size, data_type, data_layout);
 
     if(run_optimized_bifrost)
     {
@@ -174,7 +173,7 @@
             }
         }
     }
-    else if(data_layout == DataLayout::NCHW)
+    else
     {
         num_elems_read_per_iteration_y    = kernel_size;
         num_elems_written_per_iteration_x = 8;
@@ -253,97 +252,13 @@
                 ARM_COMPUTE_ERROR("Invalid direct convolution size");
         }
     }
-    else // data_layout == NHWC
-    {
-        const bool run_optimized_bifrost_nhwc = can_run_optimized_kernel_for_bifrost_nhwc(target, conv_stride_x, conv_stride_y, kernel_size, data_type, data_layout);
-
-        num_elems_written_per_iteration_x = 1;
-
-        if(run_optimized_bifrost_nhwc)
-        {
-            num_elems_read_per_iteration_x = 4;
-        }
-        else
-        {
-            num_elems_read_per_iteration_x = 1;
-        }
-
-        switch(kernel_size)
-        {
-            case 1:
-                switch(conv_stride_x)
-                {
-                    case 1:
-                        num_elems_read_per_iteration_y    = 8;
-                        num_elems_written_per_iteration_y = 8;
-                        break;
-                    case 2:
-                        num_elems_read_per_iteration_y    = 16;
-                        num_elems_written_per_iteration_y = 8;
-                        break;
-                    default:
-                        ARM_COMPUTE_ERROR("Invalid convolution stride X");
-                }
-                break;
-            case 3:
-                switch(conv_stride_x)
-                {
-                    case 1:
-                        num_elems_read_per_iteration_y    = 10;
-                        num_elems_written_per_iteration_y = 8;
-                        break;
-                    case 2:
-                        num_elems_read_per_iteration_y    = 17;
-                        num_elems_written_per_iteration_y = 8;
-                        break;
-                    default:
-                        ARM_COMPUTE_ERROR("Invalid convolution stride X");
-                }
-                break;
-            case 5:
-                switch(conv_stride_x)
-                {
-                    case 1:
-                        num_elems_read_per_iteration_y    = 12;
-                        num_elems_written_per_iteration_y = 8;
-                        break;
-                    case 2:
-                        num_elems_read_per_iteration_y    = 20;
-                        num_elems_written_per_iteration_y = 8;
-                        break;
-                    default:
-                        ARM_COMPUTE_ERROR("Invalid convolution stride X");
-                }
-                break;
-            case 9:
-                switch(conv_stride_x)
-                {
-                    case 1:
-                        num_elems_read_per_iteration_y    = 16;
-                        num_elems_written_per_iteration_y = 8;
-                        break;
-                    case 2:
-                        num_elems_read_per_iteration_y    = 24;
-                        num_elems_written_per_iteration_y = 8;
-                        break;
-                    default:
-                        ARM_COMPUTE_ERROR("Invalid convolution stride X");
-                }
-                break;
-            default:
-                ARM_COMPUTE_ERROR("Not implemented.");
-                break;
-        }
-    }
 }
 
 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, const GPUTarget target)
 {
-    const DataLayout   data_layout = input->data_layout();
-    const int          width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const unsigned int kernel_size = weights->dimension(width_idx);
+    const DataLayout data_layout = input->data_layout();
 
-    // Get convolved dimensions
+    // Get output shape
     TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*input, *weights, conv_info);
 
     // Output auto inizialitation if not yet initialized
@@ -352,38 +267,39 @@
                        input->data_type(),
                        input->quantization_info());
 
-    unsigned int num_elems_read_per_iteration_x    = 0;
-    unsigned int num_elems_read_per_iteration_y    = 0;
-    unsigned int num_elems_written_per_iteration_x = 0;
-    unsigned int num_elems_written_per_iteration_y = 0;
-
-    unsigned int conv_pad_left = conv_info.pad_left();
-    unsigned int conv_pad_top  = conv_info.pad_top();
-    unsigned int conv_stride_x = std::get<0>(conv_info.stride());
-    unsigned int conv_stride_y = std::get<1>(conv_info.stride());
-
-    setup_num_elems(num_elems_read_per_iteration_x, num_elems_read_per_iteration_y,
-                    num_elems_written_per_iteration_x, num_elems_written_per_iteration_y,
-                    kernel_size, conv_info, target, input);
-
-    // Create window and update padding
-    bool   window_changed = false;
-    Window win            = calculate_max_window(*output, Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));
-
     if(data_layout == DataLayout::NHWC)
     {
-        AccessWindowStatic input_access(input, 0, -conv_pad_left,
-                                        ceil_to_multiple(input->dimension(0), num_elems_read_per_iteration_x),
-                                        ceil_to_multiple(input->dimension(1) + conv_info.pad_right(), num_elems_read_per_iteration_y));
-        AccessWindowStatic    weights_access(weights, 0, 0, ceil_to_multiple(weights->dimension(0), num_elems_read_per_iteration_x), weights->dimension(1));
-        AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);
-        window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
-        output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-        Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+        const unsigned int vec_size = std::min(static_cast<unsigned int>(output->tensor_shape()[0]), 4u);
+
+        // Create window and update padding
+        Window win = calculate_max_window(*output, Steps(vec_size, 1U));
+        output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+        Status err = Status{};
         return std::make_pair(err, win);
     }
     else if(data_layout == DataLayout::NCHW)
     {
+        const int          width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+        const unsigned int kernel_size = weights->dimension(width_idx);
+
+        unsigned int num_elems_read_per_iteration_x    = 0;
+        unsigned int num_elems_read_per_iteration_y    = 0;
+        unsigned int num_elems_written_per_iteration_x = 0;
+        unsigned int num_elems_written_per_iteration_y = 0;
+
+        unsigned int conv_pad_left = conv_info.pad_left();
+        unsigned int conv_pad_top  = conv_info.pad_top();
+        unsigned int conv_stride_x = std::get<0>(conv_info.stride());
+        unsigned int conv_stride_y = std::get<1>(conv_info.stride());
+
+        setup_num_elems_nchw(num_elems_read_per_iteration_x, num_elems_read_per_iteration_y,
+                             num_elems_written_per_iteration_x, num_elems_written_per_iteration_y,
+                             kernel_size, conv_info, target, input);
+
+        // Create window and update padding
+        bool   window_changed = false;
+        Window win            = calculate_max_window(*output, Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));
+
         AccessWindowRectangle input_access(input, -conv_pad_left, -conv_pad_top, num_elems_read_per_iteration_x, num_elems_read_per_iteration_y, conv_stride_x, conv_stride_y);
         AccessWindowStatic    weights_access(weights, 0, 0, kernel_size, kernel_size);
         AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);
@@ -419,25 +335,7 @@
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
 
-    _data_layout          = input->info()->data_layout();
-    const int width_idx   = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
-    const int height_idx  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
-    const int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
-
-    const unsigned int kernel_size = weights->info()->dimension(width_idx);
-    const DataType     data_type   = input->info()->data_type();
-
-    // Get convolved dimensions
-    TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*input->info(), *weights->info(), conv_info);
-
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(),
-                       output_shape,
-                       1,
-                       input->info()->data_type(),
-                       input->info()->quantization_info());
-
-    // Perform validation step
+    // Perform validation
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
                                                   weights->info(),
                                                   (biases != nullptr) ? biases->info() : nullptr,
@@ -446,72 +344,64 @@
 
     _conv_stride_x = std::get<0>(conv_info.stride());
     _conv_stride_y = std::get<1>(conv_info.stride());
+    _data_layout   = input->info()->data_layout();
+    _input         = input;
+    _weights       = weights;
+    _output        = output;
+    _biases        = biases;
 
-    if(_data_layout == DataLayout::NHWC)
-    {
-        _border_size = BorderSize(conv_info.pad_left(), 0, conv_info.pad_right(), 0);
-    }
-    else if(_data_layout == DataLayout::NCHW)
-    {
-        _border_size = BorderSize(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left());
-    }
-    else
-    {
-        ARM_COMPUTE_ERROR("Not supported");
-    }
-
-    _input   = input;
-    _weights = weights;
-    _output  = output;
-    _biases  = biases;
+    const unsigned int width_idx   = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+    const unsigned int height_idx  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+    const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
+    const unsigned int kernel_size = weights->info()->dimension(width_idx);
+    const DataType     data_type   = input->info()->data_type();
 
     const GPUTarget gpu_target = get_target();
 
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), weights->info(), output->info(), conv_info, gpu_target);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure_internal(win_config.second);
+
     std::stringstream kernel_name;
-    kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size;
+    CLBuildOptions    build_options;
+
     if(_data_layout == DataLayout::NHWC)
     {
-        kernel_name << "_" << lower_string(string_from_data_layout(_data_layout));
-    }
+        _border_size = BorderSize();
 
-    CLBuildOptions build_options;
-    build_options.add_option_if(_biases != nullptr, std::string("-DHAS_BIAS"));
+        kernel_name << "direct_convolution_nhwc";
 
-    const bool run_optimized_for_bifrost = can_run_optimized_kernel_for_bifrost(gpu_target, _conv_stride_x, _conv_stride_y, kernel_size, data_type, _data_layout);
+        const unsigned int n0               = win_config.second.x().step();
+        const unsigned int m0               = win_config.second.y().step();
+        const unsigned int k0               = std::min(static_cast<unsigned int>(_input->info()->dimension(channel_idx)), 16u);
+        const unsigned int partial_store_n0 = _output->info()->dimension(channel_idx) % n0;
+        const unsigned int partial_store_m0 = (_output->info()->dimension(width_idx) * _output->info()->dimension(height_idx)) % m0; // M0 tiles the window Y dimension, which run() sets to width * height for NHWC
+        const unsigned int pad_left         = conv_info.pad_left();
+        const unsigned int pad_top          = conv_info.pad_top();
 
-    if(run_optimized_for_bifrost)
-    {
-        build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(channel_idx))));
-
-        kernel_name << "_f32_bifrost";
-        _kernel = create_kernel(compile_context, kernel_name.str(), build_options.options());
-    }
-    else
-    {
-        build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)));
-        build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type)));
-        build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(channel_idx))));
-        build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(_conv_stride_x)));
-        if(_data_layout == DataLayout::NHWC)
-        {
-            const bool run_optimized_for_bifrost_nhwc = can_run_optimized_kernel_for_bifrost_nhwc(gpu_target, _conv_stride_x, _conv_stride_y, kernel_size, data_type, _data_layout);
-            build_options.add_option(std::string("-DDATA_LAYOUT_NHWC=1"));
-            build_options.add_option(std::string("-DDST_HEIGHT=" + support::cpp11::to_string(_output->info()->dimension(height_idx))));
-            build_options.add_option(std::string("-DDST_WIDTH=" + support::cpp11::to_string(_output->info()->dimension(width_idx))));
-            build_options.add_option(std::string("-DSRC_HEIGHT=" + support::cpp11::to_string(_input->info()->dimension(height_idx))));
-            build_options.add_option(std::string("-DSRC_WIDTH=" + support::cpp11::to_string(_input->info()->dimension(width_idx))));
-            build_options.add_option(std::string("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left())));
-            build_options.add_option(std::string("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top())));
-            build_options.add_option(std::string("-DPAD_BOTTOM=" + support::cpp11::to_string(conv_info.pad_bottom())));
-            build_options.add_option(std::string("-DSTRIDE_Y=" + support::cpp11::to_string(_conv_stride_y)));
-            if(run_optimized_for_bifrost_nhwc)
-            {
-                const unsigned int num_elems_read_per_iteration_x = 4;
-                _border_size.right                                = num_elems_read_per_iteration_x;
-                build_options.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_read_per_iteration_x));
-            }
-        }
-        build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(data_type)));
+        build_options.add_option_if(_biases != nullptr, std::string("-DHAS_BIAS"));
+        build_options.add_option_if(_biases != nullptr, std::string("-DBIA_DATA_TYPE=") + ((_biases != nullptr) ? get_cl_type_from_data_type(_biases->info()->data_type()) : std::string())); // The option string is built before add_option_if runs, so _biases must be null-checked here too
+        build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(_input->info()->dimension(width_idx)));
+        build_options.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(_input->info()->dimension(height_idx)));
+        build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(_input->info()->dimension(channel_idx)));
+        build_options.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(_input->info()->data_type()));
+        build_options.add_option("-DDST_WIDTH=" + support::cpp11::to_string(_output->info()->dimension(width_idx)));
+        build_options.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(_output->info()->dimension(height_idx)));
+        build_options.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(_output->info()->dimension(channel_idx)));
+        build_options.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(_output->info()->data_type()));
+        build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(_weights->info()->dimension(width_idx)));
+        build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(_weights->info()->dimension(height_idx)));
+        build_options.add_option("-DWEI_DATA_TYPE=" + get_cl_type_from_data_type(_weights->info()->data_type()));
+        build_options.add_option("-DSTRIDE_X=" + support::cpp11::to_string(_conv_stride_x));
+        build_options.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(_conv_stride_y));
+        build_options.add_option("-DPAD_LEFT=" + support::cpp11::to_string(pad_left));
+        build_options.add_option("-DPAD_TOP=" + support::cpp11::to_string(pad_top));
+        build_options.add_option("-DN0=" + support::cpp11::to_string(n0));
+        build_options.add_option("-DM0=" + support::cpp11::to_string(m0));
+        build_options.add_option("-DK0=" + support::cpp11::to_string(k0));
+        build_options.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
+        build_options.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
 
         if(is_data_type_quantized(data_type))
         {
@@ -523,33 +413,74 @@
             int   output_multiplier = 0;
             int   output_shift      = 0;
             quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
-            build_options.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
-            build_options.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
-            build_options.add_option("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size));
-
-            // Create kernel
-            _kernel = create_kernel(compile_context, "direct_convolution_quantized", build_options.options());
-
-            // Set static kernel arguments
-            unsigned int idx = 3 * num_arguments_per_3D_tensor() + ((_biases != nullptr) ? num_arguments_per_1D_tensor() : 0) + 1;
-            _kernel.setArg(idx++, -iqinfo.offset);
-            _kernel.setArg(idx++, -wqinfo.offset);
-            _kernel.setArg(idx++, oqinfo.offset);
+            build_options.add_option("-DIS_QUANTISED");
+            build_options.add_option("-DDST_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
+            build_options.add_option("-DDST_SHIFT=" + support::cpp11::to_string(output_shift));
+            build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(-iqinfo.offset));
+            build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(-wqinfo.offset));
+            build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(oqinfo.offset));
+            build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(DataType::S32));
         }
         else
         {
-            // Create kernel
-            _kernel = create_kernel(compile_context, kernel_name.str(), build_options.options());
+            // Non-quantised path: the accumulator uses the input data type and every offset is zero
+            build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(data_type));
+            build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(0));
+            build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(0));
+            build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(0));
+        }
+    }
+    else
+    {
+        _border_size = BorderSize(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left());
+
+        kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size;
+
+        build_options.add_option_if(_biases != nullptr, std::string("-DHAS_BIAS"));
+
+        const bool run_optimized_for_bifrost = can_run_optimized_kernel_for_bifrost_nchw(gpu_target, _conv_stride_x, _conv_stride_y, kernel_size, data_type, _data_layout);
+
+        if(run_optimized_for_bifrost)
+        {
+            build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(channel_idx))));
+
+            kernel_name << "_f32_bifrost";
+        }
+        else
+        {
+            build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)));
+            build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type)));
+            build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(channel_idx))));
+            build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(_conv_stride_x)));
+            build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(data_type)));
+
+            if(is_data_type_quantized(data_type))
+            {
+                const UniformQuantizationInfo iqinfo = _input->info()->quantization_info().uniform();
+                const UniformQuantizationInfo wqinfo = _weights->info()->quantization_info().uniform();
+                const UniformQuantizationInfo oqinfo = _output->info()->quantization_info().uniform();
+
+                float multiplier        = iqinfo.scale * wqinfo.scale / oqinfo.scale;
+                int   output_multiplier = 0;
+                int   output_shift      = 0;
+                quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
+                build_options.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
+                build_options.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
+                build_options.add_option("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size));
+                build_options.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-iqinfo.offset));
+                build_options.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-wqinfo.offset));
+                build_options.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oqinfo.offset));
+
+                kernel_name.str("direct_convolution_quantized");
+            }
         }
     }
 
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), weights->info(), output->info(), conv_info, gpu_target);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICLKernel::configure_internal(win_config.second);
+    _kernel = create_kernel(compile_context, kernel_name.str(), build_options.options());
 
     // Set config_id for enabling LWS tuning
-    _config_id = "direct_convolution_";
+    _config_id = kernel_name.str();
+    _config_id += "_";
     _config_id += lower_string(string_from_data_type(data_type));
     _config_id += "_";
     _config_id += support::cpp11::to_string(kernel_size);
@@ -588,38 +519,58 @@
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
 
     // Get initial windows
-    Window slice  = window.first_slice_window_3D();
-    Window win_in = window;
+    Window slice = window.first_slice_window_3D();
 
-    win_in.adjust(Window::DimX, -_border_size.left, true);
-    win_in.adjust(Window::DimY, -_border_size.top, true);
-
-    const int width_idx  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
-    const int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
-
-    win_in.set_dimension_step(width_idx, window[width_idx].step() * _conv_stride_x);
-    win_in.set_dimension_step(height_idx, window[height_idx].step() * _conv_stride_y);
-
-    Window       slice_in = win_in.first_slice_window_3D();
-    unsigned int idx1     = 2 * num_arguments_per_3D_tensor();
-    add_3D_tensor_argument(idx1, _weights, slice);
-
-    if(_biases != nullptr)
+    if(_data_layout == DataLayout::NHWC)
     {
-        Window slice_biases;
-        slice_biases.use_tensor_dimensions(_biases->info()->tensor_shape());
-        add_1D_tensor_argument(idx1, _biases, slice_biases);
-    }
+        slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1) * _output->info()->dimension(2), 1));
+        slice.set(Window::DimZ, Window::Dimension(0, _output->info()->dimension(3), 1));
 
-    _kernel.setArg(idx1++, static_cast<unsigned int>(_weights->info()->strides_in_bytes()[3]));
-
-    do
-    {
         unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, slice_in);
+        add_3D_tensor_argument(idx, _input, slice);
         add_3D_tensor_argument(idx, _output, slice);
+        add_3D_tensor_argument(idx, _weights, slice);
+        if(_biases != nullptr)
+        {
+            add_1D_tensor_argument(idx, _biases, slice);
+        }
+        _kernel.setArg(idx++, static_cast<unsigned int>(_weights->info()->strides_in_bytes()[3]));
         enqueue(queue, *this, slice, lws_hint());
     }
-    while(window.slide_window_slice_3D(slice) && win_in.slide_window_slice_3D(slice_in));
+    else
+    {
+        Window win_in = window;
+
+        win_in.adjust(Window::DimX, -_border_size.left, true);
+        win_in.adjust(Window::DimY, -_border_size.top, true);
+
+        const int width_idx  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+        const int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+
+        win_in.set_dimension_step(width_idx, window[width_idx].step() * _conv_stride_x);
+        win_in.set_dimension_step(height_idx, window[height_idx].step() * _conv_stride_y);
+
+        Window       slice_in = win_in.first_slice_window_3D();
+        unsigned int idx1     = 2 * num_arguments_per_3D_tensor();
+        add_3D_tensor_argument(idx1, _weights, slice);
+
+        if(_biases != nullptr)
+        {
+            Window slice_biases;
+            slice_biases.use_tensor_dimensions(_biases->info()->tensor_shape());
+            add_1D_tensor_argument(idx1, _biases, slice_biases);
+        }
+
+        _kernel.setArg(idx1++, static_cast<unsigned int>(_weights->info()->strides_in_bytes()[3]));
+
+        do
+        {
+            unsigned int idx = 0;
+            add_3D_tensor_argument(idx, _input, slice_in);
+            add_3D_tensor_argument(idx, _output, slice);
+            enqueue(queue, *this, slice, lws_hint());
+        }
+        while(window.slide_window_slice_3D(slice) && win_in.slide_window_slice_3D(slice_in));
+    }
 }
 } // namespace arm_compute