// blob: dbc1dde4fa4876b50ff4337990cfc5b69b9af54d [file] [log] [blame]
/*
* Copyright (c) 2018-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#ifndef SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H
#define SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Types.h"
#include "src/core/NEON/NEAsymm.h"
#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
namespace arm_compute
{
namespace cpu
{
/** Apply a unary element-wise operation to a single scalar value.
 *
 * Used for the leftover elements that do not fill a whole vector.
 *
 * @tparam ScalarType Type of the value being processed (also the return type).
 *
 * @param[in] op Operation to apply.
 * @param[in] a  Input value.
 *
 * @return The result of applying @p op to @p a. Operations other than RSQRT, EXP,
 *         NEG, LOG, ABS, ROUND and SIN are reported through ARM_COMPUTE_ERROR.
 */
template <typename ScalarType>
inline ScalarType elementwise_op_scalar_imp(ElementWiseUnary op, const ScalarType &a)
{
    if(op == ElementWiseUnary::RSQRT)
    {
        return 1 / sqrt(a);
    }
    if(op == ElementWiseUnary::EXP)
    {
        return std::exp(a);
    }
    if(op == ElementWiseUnary::NEG)
    {
        return -a;
    }
    if(op == ElementWiseUnary::LOG)
    {
        return std::log(a);
    }
    if(op == ElementWiseUnary::ABS)
    {
        return std::abs(a);
    }
    if(op == ElementWiseUnary::ROUND)
    {
        return support::cpp11::nearbyint(a);
    }
    if(op == ElementWiseUnary::SIN)
    {
        return std::sin(a);
    }

    // Anything else is not handled by this kernel
    ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
}
/** Apply a unary element-wise operation to a whole vector.
 *
 * Each supported operation is forwarded to the matching function in the
 * @c wrapper namespace.
 *
 * @tparam ScalarType Element type. It is not referenced in the body and cannot be
 *                    deduced, so callers pass it explicitly.
 * @tparam VectorType Vector type, deduced from @p a (also the return type).
 *
 * @param[in] op Operation to apply.
 * @param[in] a  Input vector.
 *
 * @return The result of applying @p op to @p a. Operations without a case below
 *         are reported through ARM_COMPUTE_ERROR.
 */
template <typename ScalarType, typename VectorType>
inline VectorType elementwise_op_imp(ElementWiseUnary op, const VectorType &a)
{
    switch(op)
    {
        case ElementWiseUnary::ABS:
        {
            return wrapper::vabs(a);
        }
        case ElementWiseUnary::EXP:
        {
            return wrapper::vexpq(a);
        }
        case ElementWiseUnary::LOG:
        {
            return wrapper::vlog(a);
        }
        case ElementWiseUnary::NEG:
        {
            return wrapper::vneg(a);
        }
        case ElementWiseUnary::ROUND:
        {
            return wrapper::vround(a);
        }
        case ElementWiseUnary::RSQRT:
        {
            return wrapper::vinvsqrt(a);
        }
        case ElementWiseUnary::SIN:
        {
            return wrapper::vsin(a);
        }
        default:
        {
            ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
        }
    }
}
/** Run a unary element-wise operation over a window of a tensor.
 *
 * The X range of @p window is walked manually inside the per-row callback: first in
 * chunks of 16 / sizeof(ScalarType) elements through elementwise_op_imp(), then
 * element by element through elementwise_op_scalar_imp() for whatever is left.
 *
 * @tparam ScalarType Element type of both tensors.
 *
 * @param[in]  in     Source tensor.
 * @param[out] out    Destination tensor.
 * @param[in]  window Region to process.
 * @param[in]  op     Operation to apply.
 */
template <typename ScalarType>
inline void elementwise_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
{
    // Number of ScalarType elements in 16 bytes, i.e. the stride of the vector loop
    const int  window_step_x  = 16 / sizeof(ScalarType);
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    // Reduce X to a single step: the callback below covers [window_start_x, window_end_x) itself
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input(in, win);
    Iterator output(out, win);

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto src = reinterpret_cast<const ScalarType *>(input.ptr());
        auto       dst = reinterpret_cast<ScalarType *>(output.ptr());

        int x = window_start_x;

        // Vector part: whole chunks of window_step_x elements
        while(x <= window_end_x - window_step_x)
        {
            const auto vin = wrapper::vloadq(src + x);
            wrapper::vstore(dst + x, elementwise_op_imp<ScalarType>(op, vin));
            x += window_step_x;
        }

        // Scalar part: remaining elements
        while(x < window_end_x)
        {
            dst[x] = elementwise_op_scalar_imp(op, src[x]);
            ++x;
        }
    },
    input, output);
}
/** Signed 8-bit quantized specialisation of elementwise_op().
 *
 * Every element is de-quantized with the input quantization info, processed in
 * float, and re-quantized with the output quantization info.
 *
 * For LOG and RSQRT, inputs whose de-quantized value is <= 0 do not use the
 * computed result. They are replaced by the real value of an extreme output code:
 * code -128 for LOG and code 127 for RSQRT.
 *
 * @param[in]  in     Source tensor; its uniform quantization info is read.
 * @param[out] out    Destination tensor; its uniform quantization info is read.
 * @param[in]  window Region to process.
 * @param[in]  op     Operation to apply.
 */
template <>
inline void elementwise_op<int8_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
{
    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    const UniformQuantizationInfo qi_in  = in->info()->quantization_info().uniform();
    const UniformQuantizationInfo qi_out = out->info()->quantization_info().uniform();

    // Real values of output codes -128 and 127
    const float min_clamped_f = (-128 - qi_out.offset) * qi_out.scale;
    const float max_clamped_f = (127 - qi_out.offset) * qi_out.scale;

    // Everything below depends only on op and qi_out, so it is computed once here
    // instead of once per row (or once per element) inside the callback.
    const bool  is_log             = (op == ElementWiseUnary::LOG);
    const bool  clamp_non_positive = is_log || (op == ElementWiseUnary::RSQRT);
    const float clamped_f          = is_log ? min_clamped_f : max_clamped_f;
    const auto  clamped_value      = vdupq_n_f32(clamped_f);
    const auto  vconst_0_f32       = vdupq_n_f32(0);

    // Reduce X to a single step: the callback below covers [window_start_x, window_end_x) itself
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input(in, win);
    Iterator output(out, win);

    execute_window_loop(win, [&](const Coordinates &)
    {
        auto       output_ptr = reinterpret_cast<int8_t *>(output.ptr());
        const auto input_ptr  = reinterpret_cast<const int8_t *>(input.ptr());

        int x = window_start_x;
        for(; x <= window_end_x - window_step_x; x += window_step_x)
        {
            const auto vin = wrapper::vloadq(input_ptr + x);

            // De-quantize
            const auto vin_deq = vdequantize(vin, qi_in);

            // Perform activation
            float32x4x4_t vtmp_deq =
            {
                {
                    elementwise_op_imp<float>(op, vin_deq.val[0]),
                    elementwise_op_imp<float>(op, vin_deq.val[1]),
                    elementwise_op_imp<float>(op, vin_deq.val[2]),
                    elementwise_op_imp<float>(op, vin_deq.val[3]),
                }
            };

            if(clamp_non_positive)
            {
                // Lanes whose input is <= 0 take the clamp value instead of the computed result
                for(int i = 0; i < 4; ++i)
                {
                    vtmp_deq.val[i] = vbslq_f32(vcleq_f32(vin_deq.val[i], vconst_0_f32), clamped_value, vtmp_deq.val[i]);
                }
            }

            // Re-quantize to new output space
            const int8x16_t vout = vquantize_signed(vtmp_deq, qi_out);
            wrapper::vstore(output_ptr + x, vout);
        }

        // Left-over elements
        for(; x < window_end_x; ++x)
        {
            const qasymm8_signed_t in_q = *(reinterpret_cast<const qasymm8_signed_t *>(input_ptr + x));
            const float            in_f = dequantize_qasymm8_signed(in_q, qi_in);

            // Same clamping rule as the vector path above
            const float out_f = (clamp_non_positive && in_f <= 0.0f) ? clamped_f : elementwise_op_scalar_imp<float>(op, in_f);

            // Rounding policy TO_ZERO is used to be compatible with vquantize_signed() used above,
            // which follows the same policy for armv7a.
            // For aarch64 a LUT is used and rounding to nearest is used.
            const qasymm8_signed_t out_q = quantize_qasymm8_signed(out_f, qi_out, RoundingPolicy::TO_ZERO);
            *(output_ptr + x) = out_q;
        }
    },
    input, output);
}
/** Unsigned 8-bit quantized specialisation of elementwise_op().
 *
 * Every element is de-quantized with the input quantization info, processed in
 * float, and re-quantized with the output quantization info.
 *
 * For LOG and RSQRT, inputs whose de-quantized value is <= 0 do not use the
 * computed result. They are replaced by the real value of an extreme output code:
 * code 0 for LOG and code 255 for RSQRT.
 *
 * @param[in]  in     Source tensor; its uniform quantization info is read.
 * @param[out] out    Destination tensor; its uniform quantization info is read.
 * @param[in]  window Region to process.
 * @param[in]  op     Operation to apply.
 */
template <>
inline void elementwise_op<uint8_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
{
    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    const UniformQuantizationInfo qi_in  = in->info()->quantization_info().uniform();
    const UniformQuantizationInfo qi_out = out->info()->quantization_info().uniform();

    // Real values of output codes 0 and 255
    const float min_clamped_f = (0 - qi_out.offset) * qi_out.scale;
    const float max_clamped_f = (255 - qi_out.offset) * qi_out.scale;

    // Everything below depends only on op and qi_out, so it is computed once here
    // instead of once per row (or once per element) inside the callback.
    const bool  is_log             = (op == ElementWiseUnary::LOG);
    const bool  clamp_non_positive = is_log || (op == ElementWiseUnary::RSQRT);
    const float clamped_f          = is_log ? min_clamped_f : max_clamped_f;
    const auto  clamped_value      = vdupq_n_f32(clamped_f);
    const auto  vconst_0_f32       = vdupq_n_f32(0);

    // Reduce X to a single step: the callback below covers [window_start_x, window_end_x) itself
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input(in, win);
    Iterator output(out, win);

    execute_window_loop(win, [&](const Coordinates &)
    {
        auto       output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
        const auto input_ptr  = reinterpret_cast<const uint8_t *>(input.ptr());

        int x = window_start_x;
        for(; x <= window_end_x - window_step_x; x += window_step_x)
        {
            const auto vin = wrapper::vloadq(input_ptr + x);

            // De-quantize
            const auto vin_deq = vdequantize(vin, qi_in);

            // Perform activation
            float32x4x4_t vtmp_deq =
            {
                {
                    elementwise_op_imp<float>(op, vin_deq.val[0]),
                    elementwise_op_imp<float>(op, vin_deq.val[1]),
                    elementwise_op_imp<float>(op, vin_deq.val[2]),
                    elementwise_op_imp<float>(op, vin_deq.val[3]),
                }
            };

            if(clamp_non_positive)
            {
                // Lanes whose input is <= 0 take the clamp value instead of the computed result
                for(int i = 0; i < 4; ++i)
                {
                    vtmp_deq.val[i] = vbslq_f32(vcleq_f32(vin_deq.val[i], vconst_0_f32), clamped_value, vtmp_deq.val[i]);
                }
            }

            // Re-quantize to new output space
            const uint8x16_t vout = vquantize(vtmp_deq, qi_out);
            wrapper::vstore(output_ptr + x, vout);
        }

        // Left-over elements
        for(; x < window_end_x; ++x)
        {
            const qasymm8_t in_q = *(reinterpret_cast<const qasymm8_t *>(input_ptr + x));
            const float     in_f = dequantize_qasymm8(in_q, qi_in);

            // Same clamping rule as the vector path above
            const float out_f = (clamp_non_positive && in_f <= 0.0f) ? clamped_f : elementwise_op_scalar_imp<float>(op, in_f);

            const qasymm8_t out_q = quantize_qasymm8(out_f, qi_out, RoundingPolicy::TO_ZERO);
            *(output_ptr + x) = out_q;
        }
    },
    input, output);
}
} // namespace cpu
} // namespace arm_compute
#endif // SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H