/*
* Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "src/gpu/cl/kernels/ClSoftmaxKernel.h"
#include "arm_compute/core/CL/CLCompileContext.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/CoreTypes.h"
#include "arm_compute/core/Dimensions.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/Steps.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/DataTypeUtils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/StringUtils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/Cast.h"
#include "support/StringSupport.h"
#include <string>
namespace arm_compute
{
namespace opencl
{
namespace kernels
{
ClSoftmaxKernel::ClSoftmaxKernel()
{
}
Status ClSoftmaxKernel::validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info)
{
ARM_COMPUTE_UNUSED(src, dst, info);
ARM_COMPUTE_RETURN_ERROR_ON(src.num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN( //
&src, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst);
ARM_COMPUTE_RETURN_ERROR_ON(info.input_data_type != src.data_type());
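// The softmax axis must lie in the range [-rank, rank) of the source tensor.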
ARM_COMPUTE_RETURN_ERROR_ON(info.axis < static_cast<int32_t>(-src.num_dimensions()) ||
static_cast<int32_t>(src.num_dimensions()) <= info.axis);
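// For quantized types, the scale must be non-negative and the destination must use the
// fixed quantization info mandated for (log-)softmax outputs.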
if (is_data_type_quantized_asymmetric(src.data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON(src.quantization_info().uniform().scale < 0);
ARM_COMPUTE_RETURN_ERROR_ON(dst.quantization_info() !=
get_softmax_output_quantization_info(src.data_type(), info.is_log));
}
return Status{};
}
void ClSoftmaxKernel::configure(const CLCompileContext &compile_context,
const ITensorInfo &src,
ITensorInfo &dst,
const SoftmaxKernelInfo &info)
{
ARM_COMPUTE_UNUSED(compile_context, src, dst, info);
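// Collect the parameters baked into the kernel at compile time: data types, quantization
// info, the wrapped (non-negative) softmax axis and the number of elements along it.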
const auto &dst_shape = dst.tensor_shape();
const auto data_type = src.data_type();
const auto element_size = src.element_size();
const auto is_quantized = data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED;
const auto src_qinfo = src.quantization_info().uniform();
const auto dst_qinfo = dst.quantization_info().uniform();
const auto axis = wrap_around(info.axis, static_cast<int32_t>(src.num_dimensions()));
const auto length = dst_shape[axis];
const auto tmp_data_type = is_quantized ? DataType::F32 : data_type;
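// Each work-item processes up to 16 bytes along X; the vector size is reduced when the X extent is smaller.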
const auto vec_size = adjust_vec_size(16 / element_size, dst_shape[0]);
const auto vec_size_leftover = dst_shape[0] % vec_size;
std::string kernel_name("softmax");
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
build_opts.add_option("-DTMP_DATA_TYPE=" + get_cl_type_from_data_type(tmp_data_type));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size));
build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover));
build_opts.add_option("-DLENGTH=" + support::cpp11::to_string(length));
build_opts.add_option_if(info.is_log, "-DIS_LOG");
build_opts.add_option("-DBETA=" + float_to_string_with_full_precision(info.beta));
build_opts.add_option_if(is_quantized, "-DIS_QUANTIZED");
build_opts.add_option_if(is_quantized, "-DSRC_OFFSET=" + float_to_string_with_full_precision(src_qinfo.offset));
build_opts.add_option_if(is_quantized, "-DSRC_SCALE=" + float_to_string_with_full_precision(src_qinfo.scale));
build_opts.add_option_if(is_quantized, "-DDST_OFFSET=" + float_to_string_with_full_precision(dst_qinfo.offset));
build_opts.add_option_if(is_quantized, "-DDST_SCALE=" + float_to_string_with_full_precision(dst_qinfo.scale));
if (axis == 0)
{
kernel_name += "_x";
build_opts.add_option("-DSOFTMAX_X");
if (is_quantized)
{
_tmp_info = TensorInfo(dst_shape, 1, tmp_data_type);
}
}
else
{
kernel_name += "_non_x";
build_opts.add_option("-DSOFTMAX_NON_X");
TensorShape tmp_shape;
tmp_shape.set(0, length * vec_size, false);
tmp_shape.set(1, dst_shape[0] + (vec_size - vec_size_leftover) % vec_size, false);
for (size_t i = 2; i <= static_cast<size_t>(axis); ++i)
{
tmp_shape.set(i, dst_shape[i - 1], false);
}
for (size_t i = axis + 1; i < dst_shape.num_dimensions(); ++i)
{
tmp_shape.set(i, dst_shape[i], false);
}
_tmp_info = TensorInfo(tmp_shape, 1, tmp_data_type);
}
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// Configure the kernel window. The kernel arguments are set later, in run_op().
Window win = calculate_max_window(src, Steps(vec_size));
bool has_collapsed = true;
win = win.shift_dimensions(1, axis); // Remove this axis from the window/GWS.
win = win.collapse_if_possible(win, 2, has_collapsed);
ARM_COMPUTE_ERROR_ON(!has_collapsed);
ICLKernel::configure_internal(win);
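// Cache the axis for run_op() and build a configuration id used by the CL tuner.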
_axis = axis;
_config_id = "softmax_" + lower_string(string_from_data_type(data_type));
_config_id += "_" + std::to_string(axis);
_config_id += "_" + std::to_string(length);
}
void ClSoftmaxKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
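// A temporary tensor is only expected in the pack when tmp_tensor_info() reports a non-empty buffer.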
const auto src =
utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
ICLTensor *tmp = (_tmp_info.total_size() > 0)
? utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_INT_0))
: nullptr;
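// The stride and offset arguments do not change between runs, so they are set only once, on the first execution.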
if (!_prepared)
{
_prepared = true;
const auto *src_info = src->info();
const auto *dst_info = dst->info();
auto src_strides = src_info->strides_in_bytes();
auto dst_strides = dst_info->strides_in_bytes();
const auto src_stride_axis = src_strides[_axis];
const auto dst_stride_axis = dst_strides[_axis];
// This axis has been removed from the execution window, hence we also remove it from the list of
// strides provided to the kernel.
// When axis > 0, src_stride_axis and dst_stride_axis are passed as dedicated arguments that do not
// depend on the global ID.
src_strides.remove(_axis);
dst_strides.remove(_axis);
// Argument 0: src_ptr.
_kernel.setArg<cl_uint>(1, src_strides[0]);
_kernel.setArg<cl_uint>(2, src_strides[1]);
_kernel.setArg<cl_uint>(3, src_strides[2]);
_kernel.setArg<cl_uint>(4, src_info->offset_first_element_in_bytes());
// Argument 5: dst_ptr.
_kernel.setArg<cl_uint>(6, dst_strides[0]);
_kernel.setArg<cl_uint>(7, dst_strides[1]);
_kernel.setArg<cl_uint>(8, dst_strides[2]);
_kernel.setArg<cl_uint>(9, dst_info->offset_first_element_in_bytes());
if (tmp != nullptr)
{
const auto *tmp_info = tmp->info();
const auto &tmp_strides = tmp_info->strides_in_bytes();
// Argument 10: tmp_ptr.
_kernel.setArg<cl_uint>(11, tmp_strides[1]);
_kernel.setArg<cl_uint>(12, tmp_strides[2]);
_kernel.setArg<cl_uint>(13, tmp_strides[3]);
_kernel.setArg<cl_uint>(14, 0);
}
if (_axis > 0)
{
_kernel.setArg<cl_uint>(15, src_stride_axis);
_kernel.setArg<cl_uint>(16, dst_stride_axis);
}
}
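// The buffer arguments are set on every run, as the underlying cl_buffer may change between executions.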
_kernel.setArg(0, src->cl_buffer());
_kernel.setArg(5, dst->cl_buffer());
if (tmp != nullptr)
{
_kernel.setArg(10, tmp->cl_buffer());
}
enqueue(queue, *this, window, lws_hint());
}
const TensorInfo &ClSoftmaxKernel::tmp_tensor_info() const
{
return _tmp_info;
}
} // namespace kernels
} // namespace opencl
} // namespace arm_compute