/*
* Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "src/gpu/cl/kernels/ClSoftmaxKernel.h"
#include "arm_compute/core/CL/CLCompileContext.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/CoreTypes.h"
#include "arm_compute/core/Dimensions.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/Steps.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/DataTypeUtils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/StringUtils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/Cast.h"
#include "support/StringSupport.h"
#include <string>
namespace arm_compute
{
namespace opencl
{
namespace kernels
{
ClSoftmaxKernel::ClSoftmaxKernel()
{
}
Status ClSoftmaxKernel::validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info)
{
ARM_COMPUTE_UNUSED(src, dst, info);
ARM_COMPUTE_RETURN_ERROR_ON(src.num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN( //
&src, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst);
ARM_COMPUTE_RETURN_ERROR_ON(info.input_data_type != src.data_type());
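// The softmax axis must lie in the range [-rank, rank) of the source tensor.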
ARM_COMPUTE_RETURN_ERROR_ON(info.axis < static_cast<int32_t>(-src.num_dimensions()) ||
static_cast<int32_t>(src.num_dimensions()) <= info.axis);
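// For quantized types, the scale must be non-negative and the destination must use the
// fixed quantization info mandated for (log-)softmax outputs.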
if (is_data_type_quantized_asymmetric(src.data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON(src.quantization_info().uniform().scale < 0);
ARM_COMPUTE_RETURN_ERROR_ON(dst.quantization_info() !=
get_softmax_output_quantization_info(src.data_type(), info.is_log));
}
return Status{};
}
void ClSoftmaxKernel::configure(const CLCompileContext &compile_context,
const ITensorInfo &src,
ITensorInfo &dst,
const SoftmaxKernelInfo &info)
{
ARM_COMPUTE_UNUSED(compile_context, src, dst, info);
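// Collect the parameters baked into the kernel at compile time: data types, quantization
// info, the wrapped (non-negative) softmax axis and the number of elements along it.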
const auto &dst_shape = dst.tensor_shape();
const auto data_type = src.data_type();
const auto element_size = src.element_size();
const auto is_quantized = data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED;
const auto src_qinfo = src.quantization_info().uniform();
const auto dst_qinfo = dst.quantization_info().uniform();
const auto axis = wrap_around(info.axis, static_cast<int32_t>(src.num_dimensions()));
const auto length = dst_shape[axis];
const auto tmp_data_type = is_quantized ? DataType::F32 : data_type;
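// Each work-item processes up to 16 bytes along X; the vector size is reduced when the X extent is smaller.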
const auto vec_size = adjust_vec_size(16 / element_size, dst_shape[0]);
const auto vec_size_leftover = dst_shape[0] % vec_size;
std::string kernel_name("softmax");
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
build_opts.add_option("-DTMP_DATA_TYPE=" + get_cl_type_from_data_type(tmp_data_type));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size));
build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover));
build_opts.add_option("-DLENGTH=" + support::cpp11::to_string(length));
build_opts.add_option_if(info.is_log, "-DIS_LOG");
build_opts.add_option("-DBETA=" + float_to_string_with_full_precision(info.beta));
build_opts.add_option_if(is_quantized, "-DIS_QUANTIZED");
build_opts.add_option_if(is_quantized, "-DSRC_OFFSET=" + float_to_string_with_full_precision(src_qinfo.offset));
build_opts.add_option_if(is_quantized, "-DSRC_SCALE=" + float_to_string_with_full_precision(src_qinfo.scale));
build_opts.add_option_if(is_quantized, "-DDST_OFFSET=" + float_to_string_with_full_precision(dst_qinfo.offset));
build_opts.add_option_if(is_quantized, "-DDST_SCALE=" + float_to_string_with_full_precision(dst_qinfo.scale));
if (axis == 0)
{
kernel_name += "_x";
build_opts.add_option("-DSOFTMAX_X");
if (is_quantized)
{
_tmp_info = TensorInfo(dst_shape, 1, tmp_data_type);
}
}
else
{
kernel_name += "_non_x";
build_opts.add_option("-DSOFTMAX_NON_X");
TensorShape tmp_shape;
tmp_shape.set(0, length * vec_size, false);
tmp_shape.set(1, dst_shape[0] + (vec_size - vec_size_leftover) % vec_size, false);
for (size_t i = 2; i <= static_cast<size_t>(axis); ++i)
{
tmp_shape.set(i, dst_shape[i - 1], false);
}
for (size_t i = axis + 1; i < dst_shape.num_dimensions(); ++i)
{
tmp_shape.set(i, dst_shape[i], false);
}
_tmp_info = TensorInfo(tmp_shape, 1, tmp_data_type);
}
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// Configure the kernel window. The kernel arguments are set later, in run_op().
Window win = calculate_max_window(src, Steps(vec_size));
bool has_collapsed = true;
win = win.shift_dimensions(1, axis); // Remove this axis from the window/GWS.
win = win.collapse_if_possible(win, 2, has_collapsed);
ARM_COMPUTE_ERROR_ON(!has_collapsed);
ICLKernel::configure_internal(win);
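// Cache the axis for run_op() and build a configuration id used by the CL tuner.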
_axis = axis;
_config_id = "softmax_" + lower_string(string_from_data_type(data_type));
_config_id += "_" + std::to_string(axis);
_config_id += "_" + std::to_string(length);
}
void ClSoftmaxKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
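// A temporary tensor is only expected in the pack when tmp_tensor_info() reports a non-empty buffer.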
const auto src =
utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
ICLTensor *tmp = (_tmp_info.total_size() > 0)
? utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_INT_0))
: nullptr;
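// The stride and offset arguments do not change between runs, so they are set only once, on the first execution.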
if (!_prepared)
{
_prepared = true;
const auto *src_info = src->info();
const auto *dst_info = dst->info();
auto src_strides = src_info->strides_in_bytes();
auto dst_strides = dst_info->strides_in_bytes();
const auto src_stride_axis = src_strides[_axis];
const auto dst_stride_axis = dst_strides[_axis];
// This axis has been removed from the execution window, hence we also remove it from the list of
// strides provided to the kernel.
// When axis > 0, src_stride_axis and dst_stride_axis are passed as dedicated arguments that do not
// depend on the global ID.
src_strides.remove(_axis);
dst_strides.remove(_axis);
// Argument 0: src_ptr.
_kernel.setArg<cl_uint>(1, src_strides[0]);
_kernel.setArg<cl_uint>(2, src_strides[1]);
_kernel.setArg<cl_uint>(3, src_strides[2]);
_kernel.setArg<cl_uint>(4, src_info->offset_first_element_in_bytes());
// Argument 5: dst_ptr.
_kernel.setArg<cl_uint>(6, dst_strides[0]);
_kernel.setArg<cl_uint>(7, dst_strides[1]);
_kernel.setArg<cl_uint>(8, dst_strides[2]);
_kernel.setArg<cl_uint>(9, dst_info->offset_first_element_in_bytes());
if (tmp != nullptr)
{
const auto *tmp_info = tmp->info();
const auto &tmp_strides = tmp_info->strides_in_bytes();
// Argument 10: tmp_ptr.
_kernel.setArg<cl_uint>(11, tmp_strides[1]);
_kernel.setArg<cl_uint>(12, tmp_strides[2]);
_kernel.setArg<cl_uint>(13, tmp_strides[3]);
_kernel.setArg<cl_uint>(14, 0);
}
if (_axis > 0)
{
_kernel.setArg<cl_uint>(15, src_stride_axis);
_kernel.setArg<cl_uint>(16, dst_stride_axis);
}
}
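// The buffer arguments are set on every run, as the underlying cl_buffer may change between executions.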
_kernel.setArg(0, src->cl_buffer());
_kernel.setArg(5, dst->cl_buffer());
if (tmp != nullptr)
{
_kernel.setArg(10, tmp->cl_buffer());
}
enqueue(queue, *this, window, lws_hint());
}
const TensorInfo &ClSoftmaxKernel::tmp_tensor_info() const
{
return _tmp_info;
}
} // namespace kernels
} // namespace opencl
} // namespace arm_compute