COMPMID-1694: Fuse offset contribution with the output stage when we use NEGEMMLowpMatrixMultiplyCore

Change-Id: Ic1a681e4cc03e1eba3bf8485d9cdb17b3e926047
Signed-off-by: giuros01 <giuseppe.rossini@arm.com>
Reviewed-on: https://review.mlplatform.org/c/561
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
diff --git a/tests/validation/fixtures/GEMMLowpFixture.h b/tests/validation/fixtures/GEMMLowpFixture.h
index 836f8ed..90a4b5c 100644
--- a/tests/validation/fixtures/GEMMLowpFixture.h
+++ b/tests/validation/fixtures/GEMMLowpFixture.h
@@ -42,88 +42,166 @@
 {
 namespace validation
 {
+namespace
+{
+template <typename U>
+void fill(U &&tensor, int i)
+{
+    // Between 1 and 254 in order to avoid having -128 and 128 for the DOT product path
+    std::uniform_int_distribution<> distribution(1, 254);
+    library->fill(tensor, distribution, i);
+}
+
+template <typename TensorType, typename AccessorType, typename FunctionType, bool reinterpret_input_as_3d, bool reinterpret_output_as_3d, typename OutputType, bool is_fused = false>
+TensorType compute_gemmlowp_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, int32_t a_offset, int32_t b_offset,
+                                   GEMMLowpOutputStageInfo output_stage = GEMMLowpOutputStageInfo())
+{
+    // Create tensors
+    TensorType a      = create_tensor<TensorType>(shape_a, DataType::QASYMM8, 1);
+    TensorType b      = create_tensor<TensorType>(shape_b, DataType::QASYMM8, 1);
+    TensorType output = create_tensor<TensorType>(shape_output, output_stage.type == GEMMLowpOutputStageType::NONE ? DataType::S32 : DataType::QASYMM8, 1);
+
+    a.info()->set_quantization_info(QuantizationInfo(1.0f / 255, a_offset));
+    b.info()->set_quantization_info(QuantizationInfo(1.0f / 255, b_offset));
+
+    TensorType bias;
+    if(is_fused)
+    {
+        TensorShape bias_shape(shape_b[0]);
+        bias = create_tensor<TensorType>(bias_shape, DataType::S32, 1);
+    }
+
+    // Create and configure function
+    // The GEMMinfo includes the values of the depth in case of reinterpreted 3d input/output
+    FunctionType gemmlowp;
+    // TODO (COMPMID-1672) - Extending the test to validate add bias in offset contribution
+    gemmlowp.configure(&a, &b, is_fused ? &bias : nullptr, &output, GEMMInfo(false, false, false, (reinterpret_output_as_3d ? shape_output[2] : 0), reinterpret_input_as_3d, false, output_stage));
+
+    ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(output.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+    // Allocate tensors
+    a.allocator()->allocate();
+    b.allocator()->allocate();
+    output.allocator()->allocate();
+
+    ARM_COMPUTE_EXPECT(!a.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(!b.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(!output.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+    // Fill tensors
+    fill(AccessorType(a), 0);
+    fill(AccessorType(b), 1);
+
+    if(is_fused)
+    {
+        ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+        bias.allocator()->allocate();
+        ARM_COMPUTE_EXPECT(!bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+        fill(AccessorType(bias), 2);
+    }
+
+    // Compute GEMM function
+    gemmlowp.run();
+    return output;
+}
+
+template <bool        reinterpret_input_as_3d>
+SimpleTensor<int32_t> compute_gemmlowp_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, int32_t a_offset, int32_t b_offset)
+{
+    TensorShape shape_a_to_use = shape_a;
+    if(reinterpret_input_as_3d)
+    {
+        // Collapse the second and third dimension if the input is 3D
+        shape_a_to_use.collapse(2U, 1U);
+    }
+
+    // Create reference
+    SimpleTensor<uint8_t> a{ shape_a_to_use, DataType::QASYMM8, 1 };
+    SimpleTensor<uint8_t> b{ shape_b, DataType::QASYMM8, 1 };
+
+    // Fill reference
+    fill(a, 0);
+    fill(b, 1);
+
+    return reference::gemmlowp_matrix_multiply_core<int32_t, uint8_t>(a, b, shape_output, a_offset, b_offset);
+}
+}
+
 template <typename TensorType, typename AccessorType, typename FunctionType, bool reinterpret_input_as_3d = false, bool reinterpret_output_as_3d = false>
 class GEMMLowpMatrixMultiplyCoreValidationFixture : public framework::Fixture
 {
 public:
     template <typename...>
-    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_c, int32_t a_offset, int32_t b_offset)
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, int32_t a_offset, int32_t b_offset)
     {
-        _target    = compute_target(shape_a, shape_b, shape_c, a_offset, b_offset);
-        _reference = compute_reference(shape_a, shape_b, shape_c, a_offset, b_offset);
+        _target    = compute_target(shape_a, shape_b, shape_output, a_offset, b_offset);
+        _reference = compute_reference(shape_a, shape_b, shape_output, a_offset, b_offset);
     }
 
 protected:
-    template <typename U>
-    void fill(U &&tensor, int i)
+    TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, int32_t a_offset, int32_t b_offset)
     {
-        // Between 1 and 254 in order to avoid having -128 and 128 for the DOT product path
-        std::uniform_int_distribution<> distribution(1, 254);
-        library->fill(tensor, distribution, i);
+        return compute_gemmlowp_target<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, int32_t>(shape_a, shape_b, shape_output, a_offset, b_offset);
     }
 
-    TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_c, int32_t a_offset, int32_t b_offset)
+    SimpleTensor<int32_t> compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, int32_t a_offset, int32_t b_offset)
     {
-        // Create tensors
-        TensorType a = create_tensor<TensorType>(shape_a, DataType::QASYMM8, 1);
-        TensorType b = create_tensor<TensorType>(shape_b, DataType::QASYMM8, 1);
-        TensorType c = create_tensor<TensorType>(shape_c, DataType::S32, 1);
-
-        a.info()->set_quantization_info(QuantizationInfo(1.0f / 255, a_offset));
-        b.info()->set_quantization_info(QuantizationInfo(1.0f / 255, b_offset));
-
-        // Create and configure function
-        // The GEMMinfo includes the values of the depth in case of reinterpreted 3d input/output
-        FunctionType gemmlowp;
-        // TODO (COMPMID-1672) - Extending the test to validate add bias in offset contribution
-        gemmlowp.configure(&a, &b, nullptr, &c, GEMMInfo(false, false, false, (reinterpret_output_as_3d ? shape_c[2] : 0), reinterpret_input_as_3d));
-
-        ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(c.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Allocate tensors
-        a.allocator()->allocate();
-        b.allocator()->allocate();
-        c.allocator()->allocate();
-
-        ARM_COMPUTE_EXPECT(!a.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!b.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!c.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Fill tensors
-        fill(AccessorType(a), 0);
-        fill(AccessorType(b), 1);
-
-        // Compute GEMM function
-        gemmlowp.run();
-        return c;
-    }
-
-    SimpleTensor<int32_t> compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_c, int32_t a_offset, int32_t b_offset)
-    {
-        TensorShape shape_a_to_use = shape_a;
-        if(reinterpret_input_as_3d)
-        {
-            // Collapse the second and third dimension if the input is 3D
-            shape_a_to_use.collapse(2U, 1U);
-        }
-
-        // Create reference
-        SimpleTensor<uint8_t> a{ shape_a_to_use, DataType::QASYMM8, 1 };
-        SimpleTensor<uint8_t> b{ shape_b, DataType::QASYMM8, 1 };
-
-        // Fill reference
-        fill(a, 0);
-        fill(b, 1);
-
-        return reference::gemmlowp_matrix_multiply_core<int32_t, uint8_t>(a, b, shape_c, a_offset, b_offset);
+        return compute_gemmlowp_reference<reinterpret_input_as_3d>(shape_a, shape_b, shape_output, a_offset, b_offset);
     }
 
     TensorType            _target{};
     SimpleTensor<int32_t> _reference{};
 };
 
+template <typename TensorType, typename AccessorType, typename FunctionType, bool reinterpret_input_as_3d = false, bool reinterpret_output_as_3d = false>
+class GEMMLowpMatrixMultiplyCoreFusedOffsetOutputValidationFixture : public framework::Fixture
+{
+public:
+    template <typename...>
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage)
+    {
+        ARM_COMPUTE_EXPECT(output_stage.type != GEMMLowpOutputStageType::NONE, framework::LogLevel::ERRORS);
+        _target    = compute_target(shape_a, shape_b, shape_output, a_offset, b_offset, output_stage);
+        _reference = compute_reference(shape_a, shape_b, shape_output, a_offset, b_offset, output_stage);
+    }
+
+protected:
+    TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage)
+    {
+        return compute_gemmlowp_target<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, qasymm8_t, true>(shape_a, shape_b, shape_output, a_offset, b_offset,
+                output_stage);
+    }
+
+    SimpleTensor<qasymm8_t> compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, int32_t a_offset, int32_t b_offset,
+                                              GEMMLowpOutputStageInfo output_stage)
+    {
+        SimpleTensor<int32_t> output = compute_gemmlowp_reference<reinterpret_input_as_3d>(shape_a, shape_b, shape_output, a_offset, b_offset);
+
+        TensorShape           bias_shape(shape_b[0]);
+        SimpleTensor<int32_t> bias{ bias_shape, DataType::S32, 1 };
+        fill(bias, 2);
+
+        switch(output_stage.type)
+        {
+            case GEMMLowpOutputStageType::QUANTIZE_DOWN:
+                return reference::gemmlowp_quantize_down_int32_to_uint8_scale<int32_t>(output, bias,
+                                                                                       output_stage.gemmlowp_offset, output_stage.gemmlowp_multiplier, output_stage.gemmlowp_shift, output_stage.gemmlowp_min_bound, output_stage.gemmlowp_max_bound);
+                break;
+            case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
+                return reference::gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint<int32_t>(output, bias,
+                                                                                                     output_stage.gemmlowp_multiplier, output_stage.gemmlowp_shift, output_stage.gemmlowp_offset, output_stage.gemmlowp_min_bound, output_stage.gemmlowp_max_bound);
+                break;
+            default:
+                ARM_COMPUTE_ERROR("Not Supported!");
+        }
+    }
+
+    TensorType              _target{};
+    SimpleTensor<qasymm8_t> _reference{};
+};
+
 template <typename TensorType, typename AccessorType, typename FunctionType>
 class GEMMLowpQuantizeDownInt32ToUint8ScaleValidationFixture : public framework::Fixture
 {
@@ -536,4 +614,4 @@
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_GEMMLOWP_FIXTURE */
\ No newline at end of file
+#endif /* ARM_COMPUTE_TEST_GEMMLOWP_FIXTURE */