COMPMID-617 Add window validation to CLDirectConvolutionLayer
Change-Id: Ia642dc68de6a0afe697bbce392e7ee955fa8944b
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/111460
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com <bsgcomp@arm.com>
diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
index aea0161..df0578b 100644
--- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
@@ -60,269 +60,8 @@
return output_shape;
}
-} // namespace
-CLDirectConvolutionLayerKernel::CLDirectConvolutionLayerKernel()
- : _input(nullptr), _biases(nullptr), _weights(nullptr), _output(nullptr), _border_size(0), _conv_stride_x(0), _conv_stride_y(0)
-{
-}
-
-BorderSize CLDirectConvolutionLayerKernel::border_size() const
-{
- return _border_size;
-}
-
-void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-
- const unsigned int kernel_size = weights->info()->dimension(0);
- const DataType data_type = input->info()->data_type();
-
- // Get convolved dimensions
- TensorShape output_shape = get_output_shape(input->info()->tensor_shape(), weights->info()->tensor_shape(), conv_info);
-
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(),
- output_shape,
- 1,
- input->info()->data_type(),
- input->info()->fixed_point_position(),
- input->info()->quantization_info());
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(CLDirectConvolutionLayerKernel::validate(input->info(),
- weights->info(),
- (biases != nullptr) ? biases->info() : nullptr,
- output->info(),
- conv_info));
-
- _conv_stride_x = std::get<0>(conv_info.stride());
- _conv_stride_y = std::get<1>(conv_info.stride());
-
- _input = input;
- _weights = weights;
- _output = output;
- _biases = biases;
-
- int conv_pad_left = std::min(conv_info.pad_left(), kernel_size / 2);
- int conv_pad_top = std::min(conv_info.pad_top(), kernel_size / 2);
- int conv_pad_right = std::min(conv_info.pad_right(), kernel_size / 2);
- int conv_pad_bottom = std::min(conv_info.pad_bottom(), kernel_size / 2);
- _border_size = BorderSize(conv_pad_top, conv_pad_right, conv_pad_bottom, conv_pad_left);
-
- const GPUTarget gpu_target = get_arch_from_target(get_target());
-
- std::stringstream kernel_name;
- kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size;
-
- CLBuildOptions build_options;
- build_options.add_option_if(_biases != nullptr, std::string("-DHAS_BIAS"));
-
- if((gpu_target == GPUTarget::BIFROST) && (kernel_size <= 5) && (_conv_stride_x == 1) && (_conv_stride_y == 1) && (data_type == DataType::F32))
- {
- build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(2))));
-
- kernel_name << "_f32_bifrost";
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str(), build_options.options()));
-
- // Configure kernel window
- Window win = calculate_max_window(*output->info());
-
- unsigned int num_elems_read_per_iteration_x = 0;
- unsigned int num_elems_read_per_iteration_y = 0;
- unsigned int num_elems_written_per_iteration_x = 0;
- unsigned int num_elems_written_per_iteration_y = 0;
-
- // Through extensive experimentation with over 30 representative tensor
- // shapes, we found a small number of local work size configurations
- // that result in nearly optimal execution times. Selecting the right
- // lws for a given shape, however, required a complex decision tree,
- // until we constructed a simple feature as described below.
- //
- // We started from the number of multiply-accumulate operations for a
- // convolution layer, which is equal to the product of the input
- // dimensions 0..2 and the weights dimensions 0..2. Unfortunately,
- // this resulted in ties between distinct shapes that required distinct
- // lws configurations. Replacing the width of the input with the kernel
- // size, however, resulted in nearly optimal predictions. We use underscores
- // in variable names to indicate when they are intentionally misleading.
- const size_t product_of_weights_dimensions = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2);
- const size_t product_of_input_dimensions_ = input->info()->dimension(0) * weights->info()->dimension(1) * input->info()->dimension(2);
- const float mega_ops_ = 1e-6 * product_of_weights_dimensions * product_of_input_dimensions_;
-
- switch(kernel_size)
- {
- case 1:
- {
- num_elems_read_per_iteration_x = 4;
- num_elems_read_per_iteration_y = 4;
- num_elems_written_per_iteration_x = 4;
- num_elems_written_per_iteration_y = 4;
- if(mega_ops_ < 1.f)
- {
- _lws_hint = cl::NDRange(1, 1, 8);
- }
- else if(mega_ops_ < 7.f)
- {
- _lws_hint = cl::NDRange(1, 1, 4);
- }
- else
- {
- _lws_hint = cl::NDRange(1, 1, 2);
- }
- break;
- }
- case 3:
- {
- num_elems_read_per_iteration_x = 6;
- num_elems_read_per_iteration_y = 5;
- num_elems_written_per_iteration_x = 4;
- num_elems_written_per_iteration_y = 3;
- if(mega_ops_ < 1.f)
- {
- _lws_hint = cl::NDRange(1, 1, 8);
- }
- else if(mega_ops_ < 13.f)
- {
- _lws_hint = cl::NDRange(2, 1, 4);
- }
- else if(mega_ops_ < 50.f)
- {
- _lws_hint = cl::NDRange(3, 1, 4);
- }
- else
- {
- _lws_hint = cl::NDRange(2, 1, 6);
- }
- break;
- }
- case 5:
- {
- num_elems_read_per_iteration_x = 8;
- num_elems_read_per_iteration_y = 6;
- num_elems_written_per_iteration_x = 4;
- num_elems_written_per_iteration_y = 2;
- if(mega_ops_ < 2.f || mega_ops_ > 80.f)
- {
- _lws_hint = cl::NDRange(2, 1, 4);
- }
- else
- {
- _lws_hint = cl::NDRange(2, 1, 8);
- }
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Kernel size not optimized for Bifrost");
- }
- }
-
- // Calculate right and bottom border
- const int input_width = input->info()->dimension(0) - kernel_size / 2 + conv_pad_right;
- const int input_height = input->info()->dimension(1) - kernel_size / 2 + conv_pad_bottom;
-
- // Create window and update padding
- win = calculate_max_window(*output->info(), Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));
-
- AccessWindowStatic input_access(input->info(), -conv_pad_left, -conv_pad_top, input_width + num_elems_read_per_iteration_x, input_height + num_elems_read_per_iteration_y);
- AccessWindowStatic weights_access(weights->info(), 0, 0, kernel_size, kernel_size);
- AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);
-
- update_window_and_padding(win, input_access, weights_access, output_access);
-
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
- ICLKernel::configure(win);
- }
- else
- {
- bool is_quantized_fixed_point = is_data_type_fixed_point(data_type);
- bool is_quantized_asymm = is_data_type_quantized_asymmetric(data_type);
- DataType promoted_type = (is_quantized_fixed_point) ? get_promoted_data_type(data_type) : data_type;
-
- build_options.add_option_if(is_quantized_asymm, std::string("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size)));
- build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)));
- build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type)));
- build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(2))));
- build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(_conv_stride_x)));
- build_options.add_option_if(is_quantized_fixed_point,
- std::string("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position())));
- build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(promoted_type)));
-
- // Create kernel
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(is_quantized_asymm ? "direct_convolution_1x1_3x3_5x5_quantized" : kernel_name.str(),
- build_options.options()));
-
- // Configure kernel window
-
- bool is_stride2 = ((kernel_size != 1) && (_conv_stride_x == 2));
-
- const unsigned int num_elems_read_per_iteration_x = 8 + 2 * (kernel_size / 2) + (is_stride2 ? 6 + kernel_size / 2 : 0);
- const unsigned int num_elems_read_per_iteration_y = kernel_size;
- const unsigned int num_elems_written_per_iteration_x = 8;
- const unsigned int num_elems_written_per_iteration_y = 1;
-
- // Calculate right and bottom border
- const int input_width = input->info()->dimension(0) - kernel_size / 2 + conv_pad_right;
- const int input_height = input->info()->dimension(1) - kernel_size / 2 + conv_pad_bottom;
-
- // Create window and update padding
- Window win = calculate_max_window(*output->info(), Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));
-
- AccessWindowStatic input_access(input->info(), -conv_pad_left, -conv_pad_top, input_width + num_elems_read_per_iteration_x, input_height + num_elems_read_per_iteration_y);
- AccessWindowStatic weights_access(weights->info(), 0, 0, kernel_size, kernel_size);
- AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);
-
- update_window_and_padding(win, input_access, weights_access, output_access);
-
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
- ICLKernel::configure(win);
- }
-
- // Set static kernel arguments
- if(is_data_type_quantized_asymmetric(data_type))
- {
- int output_multiplier = 0;
- int output_shift = 0;
-
- float multiplier = _input->info()->quantization_info().scale * _weights->info()->quantization_info().scale / _output->info()->quantization_info().scale;
- ARM_COMPUTE_THROW_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift));
-
- unsigned int idx = 3 * num_arguments_per_3D_tensor() + ((_biases != nullptr) ? num_arguments_per_1D_tensor() : 0) + 1;
- _kernel.setArg(idx++, -_input->info()->quantization_info().offset);
- _kernel.setArg(idx++, -_weights->info()->quantization_info().offset);
- _kernel.setArg(idx++, _output->info()->quantization_info().offset);
- _kernel.setArg(idx++, output_multiplier);
- _kernel.setArg(idx++, output_shift);
- }
-
- // Set config_id for enabling LWS tuning
- _config_id = "direct_convolution_";
- _config_id += lower_string(string_from_data_type(data_type));
- _config_id += "_";
- _config_id += support::cpp11::to_string(kernel_size);
- _config_id += "_";
- _config_id += support::cpp11::to_string(conv_pad_left);
- _config_id += "_";
- _config_id += support::cpp11::to_string(conv_pad_top);
- _config_id += "_";
- _config_id += support::cpp11::to_string(conv_pad_right);
- _config_id += "_";
- _config_id += support::cpp11::to_string(conv_pad_bottom);
- _config_id += "_";
- _config_id += support::cpp11::to_string(_conv_stride_x);
- _config_id += "_";
- _config_id += support::cpp11::to_string(_conv_stride_y);
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
-}
-
-Error CLDirectConvolutionLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
+Error validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
@@ -369,6 +108,326 @@
return Error{};
}
+std::pair<Error, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, const GPUTarget target)
+{
+ const unsigned int kernel_size = weights->dimension(0);
+ const DataType data_type = input->data_type();
+
+ // Get convolved dimensions
+ TensorShape output_shape = get_output_shape(input->tensor_shape(), weights->tensor_shape(), conv_info);
+
+    // Output auto initialization if not yet initialized
+ // FIXME: input->clone()->set_tensor_shape(output_shape) doesn't work with subtensors for grouped direct convolutions (AlexNet).
+ auto_init_if_empty(*output, output_shape,
+ 1,
+ input->data_type(),
+ input->fixed_point_position(),
+ input->quantization_info());
+
+ unsigned int conv_stride_x = std::get<0>(conv_info.stride());
+ unsigned int conv_stride_y = std::get<1>(conv_info.stride());
+ unsigned int conv_pad_left = std::min(conv_info.pad_left(), kernel_size / 2);
+ unsigned int conv_pad_top = std::min(conv_info.pad_top(), kernel_size / 2);
+ unsigned int conv_pad_right = std::min(conv_info.pad_right(), kernel_size / 2);
+ unsigned int conv_pad_bottom = std::min(conv_info.pad_bottom(), kernel_size / 2);
+
+ unsigned int num_elems_read_per_iteration_x = 0;
+ unsigned int num_elems_read_per_iteration_y = 0;
+ unsigned int num_elems_written_per_iteration_x = 0;
+ unsigned int num_elems_written_per_iteration_y = 0;
+
+ Window win = Window();
+ bool window_changed = false;
+
+ if((target == GPUTarget::BIFROST) && (kernel_size <= 5) && (conv_stride_x == 1) && (conv_stride_y == 1) && (data_type == DataType::F32))
+ {
+ // Configure kernel window
+ win = calculate_max_window(*output);
+
+ switch(kernel_size)
+ {
+ case 1:
+ {
+ num_elems_read_per_iteration_x = 4;
+ num_elems_read_per_iteration_y = 4;
+ num_elems_written_per_iteration_x = 4;
+ num_elems_written_per_iteration_y = 4;
+ break;
+ }
+ case 3:
+ {
+ num_elems_read_per_iteration_x = 6;
+ num_elems_read_per_iteration_y = 5;
+ num_elems_written_per_iteration_x = 4;
+ num_elems_written_per_iteration_y = 3;
+ break;
+ }
+ case 5:
+ {
+ num_elems_read_per_iteration_x = 8;
+ num_elems_read_per_iteration_y = 6;
+ num_elems_written_per_iteration_x = 4;
+ num_elems_written_per_iteration_y = 2;
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Kernel size not optimized for Bifrost");
+ }
+ }
+ }
+ else
+ {
+ bool is_stride2 = ((kernel_size != 1) && (conv_stride_x == 2));
+
+ num_elems_read_per_iteration_x = 8 + 2 * (kernel_size / 2) + (is_stride2 ? 6 + kernel_size / 2 : 0);
+ num_elems_read_per_iteration_y = kernel_size;
+ num_elems_written_per_iteration_x = 8;
+ num_elems_written_per_iteration_y = 1;
+ }
+
+ // Calculate right and bottom border
+ int input_width = input->dimension(0) - kernel_size / 2 + conv_pad_right;
+ int input_height = input->dimension(1) - kernel_size / 2 + conv_pad_bottom;
+
+    // Add the extra read padding only when actually needed; adding it unconditionally would always force window_changed
+ if(input_width % num_elems_read_per_iteration_x > 0)
+ {
+ input_width += num_elems_read_per_iteration_x;
+ }
+ if(input_height % num_elems_read_per_iteration_y > 0)
+ {
+ input_height += num_elems_read_per_iteration_y;
+ }
+
+ // Create window and update padding
+ win = calculate_max_window(*output, Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));
+
+ AccessWindowStatic input_access(input, -conv_pad_left, -conv_pad_top, input_width, input_height);
+ AccessWindowStatic weights_access(weights, 0, 0, kernel_size, kernel_size);
+ AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);
+
+ window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+
+ Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLDirectConvolutionLayerKernel::CLDirectConvolutionLayerKernel()
+ : _input(nullptr), _biases(nullptr), _weights(nullptr), _output(nullptr), _border_size(0), _conv_stride_x(0), _conv_stride_y(0)
+{
+}
+
+BorderSize CLDirectConvolutionLayerKernel::border_size() const
+{
+ return _border_size;
+}
+
+void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+ const unsigned int kernel_size = weights->info()->dimension(0);
+ const DataType data_type = input->info()->data_type();
+
+ // Get convolved dimensions
+ TensorShape output_shape = get_output_shape(input->info()->tensor_shape(), weights->info()->tensor_shape(), conv_info);
+
+    // Output auto initialization if not yet initialized
+ // FIXME: input->clone()->set_tensor_shape(output_shape) doesn't work with subtensors for grouped direct convolutions (AlexNet).
+ auto_init_if_empty(*output->info(),
+ output_shape,
+ 1,
+ input->info()->data_type(),
+ input->info()->fixed_point_position(),
+ input->info()->quantization_info());
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
+ weights->info(),
+ (biases != nullptr) ? biases->info() : nullptr,
+ output->info(),
+ conv_info));
+
+ _conv_stride_x = std::get<0>(conv_info.stride());
+ _conv_stride_y = std::get<1>(conv_info.stride());
+
+ _input = input;
+ _weights = weights;
+ _output = output;
+ _biases = biases;
+
+ int conv_pad_left = std::min(conv_info.pad_left(), kernel_size / 2);
+ int conv_pad_top = std::min(conv_info.pad_top(), kernel_size / 2);
+ int conv_pad_right = std::min(conv_info.pad_right(), kernel_size / 2);
+ int conv_pad_bottom = std::min(conv_info.pad_bottom(), kernel_size / 2);
+ _border_size = BorderSize(conv_pad_top, conv_pad_right, conv_pad_bottom, conv_pad_left);
+
+ const GPUTarget gpu_target = get_arch_from_target(get_target());
+
+ std::stringstream kernel_name;
+ kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size;
+
+ CLBuildOptions build_options;
+ build_options.add_option_if(_biases != nullptr, std::string("-DHAS_BIAS"));
+
+ if((gpu_target == GPUTarget::BIFROST) && (kernel_size <= 5) && (_conv_stride_x == 1) && (_conv_stride_y == 1) && (data_type == DataType::F32))
+ {
+ build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(2))));
+
+ kernel_name << "_f32_bifrost";
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str(), build_options.options()));
+
+ // Through extensive experimentation with over 30 representative tensor
+ // shapes, we found a small number of local work size configurations
+ // that result in nearly optimal execution times. Selecting the right
+ // lws for a given shape, however, required a complex decision tree,
+ // until we constructed a simple feature as described below.
+ //
+ // We started from the number of multiply-accumulate operations for a
+ // convolution layer, which is equal to the product of the input
+ // dimensions 0..2 and the weights dimensions 0..2. Unfortunately,
+ // this resulted in ties between distinct shapes that required distinct
+ // lws configurations. Replacing the width of the input with the kernel
+ // size, however, resulted in nearly optimal predictions. We use underscores
+ // in variable names to indicate when they are intentionally misleading.
+ const size_t product_of_weights_dimensions = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2);
+ const size_t product_of_input_dimensions_ = input->info()->dimension(0) * weights->info()->dimension(1) * input->info()->dimension(2);
+ const float mega_ops_ = 1e-6 * product_of_weights_dimensions * product_of_input_dimensions_;
+
+ switch(kernel_size)
+ {
+ case 1:
+ {
+ if(mega_ops_ < 1.f)
+ {
+ _lws_hint = cl::NDRange(1, 1, 8);
+ }
+ else if(mega_ops_ < 7.f)
+ {
+ _lws_hint = cl::NDRange(1, 1, 4);
+ }
+ else
+ {
+ _lws_hint = cl::NDRange(1, 1, 2);
+ }
+ break;
+ }
+ case 3:
+ {
+ if(mega_ops_ < 1.f)
+ {
+ _lws_hint = cl::NDRange(1, 1, 8);
+ }
+ else if(mega_ops_ < 13.f)
+ {
+ _lws_hint = cl::NDRange(2, 1, 4);
+ }
+ else if(mega_ops_ < 50.f)
+ {
+ _lws_hint = cl::NDRange(3, 1, 4);
+ }
+ else
+ {
+ _lws_hint = cl::NDRange(2, 1, 6);
+ }
+ break;
+ }
+ case 5:
+ {
+ if(mega_ops_ < 2.f || mega_ops_ > 80.f)
+ {
+ _lws_hint = cl::NDRange(2, 1, 4);
+ }
+ else
+ {
+ _lws_hint = cl::NDRange(2, 1, 8);
+ }
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Kernel size not optimized for Bifrost");
+ }
+ }
+ }
+ else
+ {
+ bool is_quantized_fixed_point = is_data_type_fixed_point(data_type);
+ bool is_quantized_asymm = is_data_type_quantized_asymmetric(data_type);
+ DataType promoted_type = (is_quantized_fixed_point) ? get_promoted_data_type(data_type) : data_type;
+
+ build_options.add_option_if(is_quantized_asymm, std::string("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size)));
+ build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)));
+ build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type)));
+ build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(2))));
+ build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(_conv_stride_x)));
+ build_options.add_option_if(is_quantized_fixed_point,
+ std::string("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position())));
+ build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(promoted_type)));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(is_quantized_asymm ? "direct_convolution_1x1_3x3_5x5_quantized" : kernel_name.str(),
+ build_options.options()));
+ }
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), weights->info(), output->info(), conv_info, gpu_target);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure(win_config.second);
+
+ // Set static kernel arguments
+ if(is_data_type_quantized_asymmetric(data_type))
+ {
+ int output_multiplier = 0;
+ int output_shift = 0;
+
+ float multiplier = _input->info()->quantization_info().scale * _weights->info()->quantization_info().scale / _output->info()->quantization_info().scale;
+ ARM_COMPUTE_THROW_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift));
+
+ unsigned int idx = 3 * num_arguments_per_3D_tensor() + ((_biases != nullptr) ? num_arguments_per_1D_tensor() : 0) + 1;
+ _kernel.setArg(idx++, -_input->info()->quantization_info().offset);
+ _kernel.setArg(idx++, -_weights->info()->quantization_info().offset);
+ _kernel.setArg(idx++, _output->info()->quantization_info().offset);
+ _kernel.setArg(idx++, output_multiplier);
+ _kernel.setArg(idx++, output_shift);
+ }
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "direct_convolution_";
+ _config_id += lower_string(string_from_data_type(data_type));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(kernel_size);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(conv_pad_left);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(conv_pad_top);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(conv_pad_right);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(conv_pad_bottom);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(_conv_stride_x);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(_conv_stride_y);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(1));
+}
+
+Error CLDirectConvolutionLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ const GPUTarget target)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), output->clone().get(), conv_info, target).first);
+
+ return Error{};
+}
+
void CLDirectConvolutionLayerKernel::run(const Window &window, cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);