Requantization cases for offset changes only

Resolves: [COMPMID-6681]
Signed-off-by: Mohammed Suhail Munshi <MohammedSuhail.Munshi@arm.com>
Change-Id: I325b9d478dd1d04a45533bb7708cf76e98ee0cee
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/11058
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/cpu/kernels/CpuQuantizeKernel.cpp b/src/cpu/kernels/CpuQuantizeKernel.cpp
index 5dde680..d2ac6cf 100644
--- a/src/cpu/kernels/CpuQuantizeKernel.cpp
+++ b/src/cpu/kernels/CpuQuantizeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2022 Arm Limited.
+ * Copyright (c) 2017-2022, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -104,6 +104,18 @@
     return vquantize_signed(qv, qi);
 }
 
+template <typename TOut, typename = typename std::enable_if<std::is_signed<TOut>::value, bool>::type>
+inline int8x16_t recombine_8_16(int16x8_t lower, int16x8_t upper)
+{ // Signed TOut overload: narrow both 16-bit halves to 8 bits and join them (lower first) into one vector.
+    return wrapper::vcombine(wrapper::vqmovn(lower), wrapper::vqmovn(upper)); // presumably saturating - confirm
+}
+
+template <typename TOut, typename = typename std::enable_if<std::is_unsigned<TOut>::value, bool>::type>
+inline uint8x16_t recombine_8_16(int16x8_t lower, int16x8_t upper)
+{ // Unsigned TOut overload: narrow both signed 16-bit halves to unsigned 8 bits and join them (lower first).
+    return wrapper::vcombine(wrapper::vqmovun(lower), wrapper::vqmovun(upper)); // presumably saturating - confirm
+}
+
 } // namespace
 
 void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
@@ -120,6 +132,19 @@
         {"op_QASYMM8_SIGNED_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<int8_t, int8_t>},
         {"op_QASYMM8_SIGNED_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<int8_t>},
 
+        // Functions for offset only requantization
+        {"op_OFFSET_ONLY_QASYMM8_QASYMM8", &CpuQuantizeKernel::run_requantize_offset_only<uint8_t, uint8_t>},
+        {"op_OFFSET_ONLY_QASYMM8_QASYMM8_SIGNED", &CpuQuantizeKernel::run_requantize_offset_only<uint8_t, int8_t>},
+        {"op_OFFSET_ONLY_QASYMM8_SIGNED_QASYMM8", &CpuQuantizeKernel::run_requantize_offset_only<int8_t, uint8_t>},
+        {"op_OFFSET_ONLY_QASYMM8_SIGNED_QASYMM8_SIGNED",
+         &CpuQuantizeKernel::run_requantize_offset_only<int8_t, int8_t>},
+
+        // Functions for offset only uint8 <-> int8 conversion (no scale change, offset of exactly +/-128)
+        {"op_OFFSET_ONLY_CONVERT_QASYMM8_SIGNED_QASYMM8",
+         &CpuQuantizeKernel::run_requantize_offset_only_convert<int8_t, uint8_t>},
+        {"op_OFFSET_ONLY_CONVERT_QASYMM8_QASYMM8_SIGNED",
+         &CpuQuantizeKernel::run_requantize_offset_only_convert<uint8_t, int8_t>},
+
         {"op_F32_QSYMM8", &CpuQuantizeKernel::run_quantize_qsymm8<float, int8_t>},
 
         {"op_F32_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<float, uint8_t>},
@@ -134,6 +159,26 @@
     };
 
     std::string function_to_call("op_");
+
+    // For offset only functions - must be 8-bit and have identical scale values.
+    if (src->quantization_info().scale() == dst->quantization_info().scale() &&
+        (is_data_type_quantized_asymmetric_char(src->data_type()) &&
+         is_data_type_quantized_asymmetric_char(dst->data_type())))
+    {
+        function_to_call += "OFFSET_ONLY_";
+        // Select the optimized 8-bit data type conversion (offset only) functions when the data types differ and
+        // the offset to apply is exactly +128 (QASYMM8_SIGNED -> QASYMM8) or -128 (QASYMM8 -> QASYMM8_SIGNED).
+        auto uqinfo =
+            compute_requantization_scale_offset(src->quantization_info().uniform(), dst->quantization_info().uniform());
+        const auto src_dt = src->data_type();
+        if (src->data_type() != dst->data_type() && ((src_dt == DataType::QASYMM8_SIGNED && uqinfo.offset == 128) ||
+                                                     (src_dt == DataType::QASYMM8 && uqinfo.offset == -128)))
+        {
+            function_to_call += "CONVERT_";
+        }
+    }
+
+    // Specify datatype for function
     function_to_call += string_from_data_type(src->data_type()) + "_";
     function_to_call += string_from_data_type(dst->data_type());
 
@@ -145,9 +190,11 @@
     }
     _func = it->second;
 
-    // Configure kernel window
-    Window win_config = calculate_max_window(*src, Steps());
-    ICpuKernel::configure(win_config);
+    // Calculate window. Squash if possible.
+    Window win;
+    std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*src);
+
+    ICpuKernel::configure(win);
 }
 
 Status CpuQuantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
@@ -164,10 +211,8 @@
 
     const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
     UniformQuantizationInfo       uqinfo    = dst->info()->quantization_info().uniform();
-    if (is_data_type_quantized_asymmetric(src->info()->data_type()))
-    {
-        uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
-    }
+    uqinfo                                  = compute_requantization_scale_offset(uqinfo_in, uqinfo);
+
     // Collapse window and reset first dimension to handle tail calculations manually
     Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
     win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
@@ -195,6 +240,114 @@
 }
 
 template <typename TIn, typename TOut>
+void CpuQuantizeKernel::run_requantize_offset_only_convert(const ITensor *src, ITensor *dst, const Window &window)
+{ // Offset-only requantization from TIn to TOut elements: adds the offset and stores without clamping.
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    // Offset to add to every element; configure() selects this function only when it is +128 or -128.
+    const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
+    UniformQuantizationInfo       uqinfo    = dst->info()->quantization_info().uniform();
+    uqinfo                                  = compute_requantization_scale_offset(uqinfo_in, uqinfo);
+
+    // Collapse the window if possible and make DimX a single step: X is iterated manually below (body + tail).
+    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+
+    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    // Broadcast the offset to all 16 int8 lanes; the int8_t cast keeps the low 8 bits (e.g. +128 becomes -128).
+    const int8x16_t offset = wrapper::vdup_n(static_cast<int8_t>(uqinfo.offset), wrapper::traits::vector_128_tag{});
+
+    Iterator input(src, win_collapsed);
+    Iterator output(dst, win_collapsed);
+    execute_window_loop(
+        win_collapsed,
+        [&](const Coordinates &)
+        {
+            auto input_ptr  = reinterpret_cast<const TIn *>(input.ptr());
+            auto output_ptr = reinterpret_cast<TOut *>(output.ptr());
+            int  x          = window_start_x;
+            for (; x <= (window_end_x - window_step); x += window_step)
+            {
+                const wrapper::traits::neon_vector_t<TIn, window_step> qv =
+                    wrapper::vloadq(input_ptr + x); // load 128 bit vector of 8 bit datatype
+
+                // Lane-wise int8 addition on the raw bits; vaddq_s8 does not saturate, so results wrap modulo 256.
+                auto res = vaddq_s8(reinterpret_cast<int8x16_t>(qv), offset);
+
+                // Reinterpret the result bits as the vector type matching TOut and store them.
+                wrapper::vstore(&output_ptr[x],
+                                reinterpret_cast<wrapper::traits::neon_vector_t<TOut, window_step>>(res));
+            }
+            // Scalar tail: process the elements that do not fill a whole vector.
+            for (; x < window_end_x; ++x)
+            {
+                auto result   = uqinfo.offset + static_cast<int32_t>(input_ptr[x]);
+                output_ptr[x] = static_cast<TOut>(result); // Narrowing cast, no clamping.
+            }
+        },
+        input, output);
+}
+
+template <typename TIn, typename TOut>
+void CpuQuantizeKernel::run_requantize_offset_only(const ITensor *src, ITensor *dst, const Window &window)
+{ // Offset-only requantization from TIn to TOut elements: adds the offset and clamps to the output range.
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
+    UniformQuantizationInfo       uqinfo    = dst->info()->quantization_info().uniform();
+    uqinfo                                  = compute_requantization_scale_offset(uqinfo_in, uqinfo);
+
+    // Collapse the window if possible and make DimX a single step: X is iterated manually below (body + tail).
+    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    // Broadcast the offset to all 8 int16 lanes; the addition is done on inputs widened to 16 bits.
+    const int16x8_t offset = wrapper::vdup_n(static_cast<int16_t>(uqinfo.offset), wrapper::traits::vector_128_tag{});
+    // Value range of the output data type; used to clamp the scalar tail results.
+    const int32_t low_bound   = (dst->info()->data_type() == DataType::QASYMM8) ? 0 : -128;
+    const int32_t upper_bound = (dst->info()->data_type() == DataType::QASYMM8) ? 255 : 127;
+
+    Iterator input(src, win_collapsed);
+    Iterator output(dst, win_collapsed);
+    execute_window_loop(
+        win_collapsed,
+        [&](const Coordinates &)
+        {
+            auto  input_ptr  = reinterpret_cast<const TIn *>(input.ptr());
+            TOut *output_ptr = reinterpret_cast<TOut *>(output.ptr());
+
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step); x += window_step)
+            {
+                const auto qv    = wrapper::vloadq(input_ptr + x); // load 128 bit vector; halves are widened next
+                int16x8_t  lower = reinterpret_cast<int16x8_t>(wrapper::vmovl(wrapper::vgetlow(qv)));
+                int16x8_t  upper = reinterpret_cast<int16x8_t>(wrapper::vmovl(wrapper::vgethigh(qv)));
+
+                // Add the offset in 16 bits (wrapper::vqadd presumably saturates like NEON vqaddq_s16 - confirm).
+                lower = wrapper::vqadd(lower, offset);
+                upper = wrapper::vqadd(upper, offset);
+
+                // Narrow both halves back to 8 bits and combine; recombine_8_16 picks the overload matching TOut.
+                auto res = recombine_8_16<TOut>(lower, upper);
+                wrapper::vstore(&output_ptr[x], res);
+            }
+            // Scalar tail: process the elements that do not fill a whole vector.
+            for (; x < window_end_x; ++x)
+            {
+                // Add offset and clamp result to within the range of the output datatype.
+                int32_t result = uqinfo.offset + static_cast<int32_t>(input_ptr[x]);
+                result         = utility::clamp<int32_t>(result, low_bound, upper_bound);
+
+                // Cast result to output datatype.
+                output_ptr[x] = static_cast<TOut>(result);
+            }
+        },
+        input, output);
+}
+
+template <typename TIn, typename TOut>
 void CpuQuantizeKernel::run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window)
 {
     const auto window_start_x = static_cast<int>(window.x().start());
@@ -302,6 +455,7 @@
 {
     return "CpuQuantizeKernel";
 }
+
 } // namespace kernels
 } // namespace cpu
 } // namespace arm_compute