COMPMID-1754: NEON: Implement Maximum, Minimum, SquaredDifference

Change-Id: I77e8c6a8af6ad841293ed5e66ed582035cc1424b
Reviewed-on: https://review.mlplatform.org/339
Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
diff --git a/arm_compute/core/NEON/NEKernels.h b/arm_compute/core/NEON/NEKernels.h
index 755e68a..57a1d4d 100644
--- a/arm_compute/core/NEON/NEKernels.h
+++ b/arm_compute/core/NEON/NEKernels.h
@@ -57,6 +57,7 @@
 #include "arm_compute/core/NEON/kernels/NEDilateKernel.h"
 #include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h"
 #include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h"
+#include "arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h"
 #include "arm_compute/core/NEON/kernels/NEErodeKernel.h"
 #include "arm_compute/core/NEON/kernels/NEFastCornersKernel.h"
 #include "arm_compute/core/NEON/kernels/NEFillArrayKernel.h"
diff --git a/arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h b/arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h
new file mode 100644
index 0000000..93ad437
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEELEMENTWISEOPERATIONKERNEL_H__
+#define __ARM_COMPUTE_NEELEMENTWISEOPERATIONKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for an element-wise operation kernel
+ *
+ * Element-wise operation is computed by:
+ * @f[ output(x,y) = OP(input1(x,y), input2(x,y))@f]
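+ * where, for example, OP is max(a, b) for Maximum, min(a, b) for Minimum and
+ * (a - b)^2 for SquaredDifference.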
+ *
+ */
+class NEElementwiseOperationKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEElementwiseOperationKernel";
+    }
+    /** Default constructor */
+    NEElementwiseOperationKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEElementwiseOperationKernel(const NEElementwiseOperationKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEElementwiseOperationKernel &operator=(const NEElementwiseOperationKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEElementwiseOperationKernel(NEElementwiseOperationKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEElementwiseOperationKernel &operator=(NEElementwiseOperationKernel &&) = default;
+    /** Default destructor */
+    ~NEElementwiseOperationKernel() = default;
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+protected:
+    /** Validate the arguments passed to the kernel
+     *
+     * @param[in] input1 First tensor input. Data types supported: S16/F16/S32/F32.
+     * @param[in] input2 Second tensor input. Data types supported: Same as @p input1.
+     * @param[in] output Output tensor. Data types supported: Same as @p input1.
+     */
+    virtual Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) = 0;
+
+    /** Common configure function for element-wise operators with no additional options (e.g. Min, Max, SquaredDiff)
+     *
+     */
+    template <ArithmeticOperation op>
+    void configure_common(const ITensor *input1, const ITensor *input2, ITensor *output);
+
+    ArithmeticOperation _op; // Code of the operation to execute
+
+private:
+    /** Common signature for all the specialised element-wise functions
+     *
+     * @param[in]  input1 An input tensor. Data types supported: S16/F16/S32/F32
+     * @param[in]  input2 An input tensor. Data types supported: S16/F16/S32/F32
+     * @param[out] output The output tensor. Data types supported: S16/F16/S32/F32
+     * @param[in]  window Region on which to execute the kernel.
+     */
+    using ElementwiseFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window);
+    /** Function to use for the particular tensor types passed to configure() */
+    ElementwiseFunction *_func;
+    const ITensor       *_input1;
+    const ITensor       *_input2;
+    ITensor             *_output;
+};
+
+class NEArithmeticOperationKernel : public NEElementwiseOperationKernel
+{
+public:
+    NEArithmeticOperationKernel()
+        : NEElementwiseOperationKernel()
+    {
+    }
+
+    /** Initialise the kernel's inputs and output and the arithmetic operation to be performed.
+     *
+     * @param[in] op     Arithmetic operation to be executed.
+     * @param[in] input1 First tensor input. Data types supported: QASYMM8/S16/F16/S32/F32.
+     * @param[in] input2 Second tensor input. Data types supported: Same as @p input1.
+     * @param[in] output Output tensor. Data types supported: Same as @p input1.
+     */
+    void configure(ArithmeticOperation op, const ITensor *input1, const ITensor *input2, ITensor *output);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel
+     *
+     * @param[in] op     Arithmetic operation to be executed.
+     * @param[in] input1 First tensor input info. Data types supported: QASYMM8/S16/F16/S32/F32.
+     * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
+     * @param[in] output Output tensor info. Data types supported: Same as @p input1.
+     *
+     * @return a Status
+     */
+    static Status validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
+
+protected:
+    // Inherited methods overridden:
+    Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) override;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEELEMENTWISEOPERATIONKERNEL_H__ */
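
For reference, a minimal usage sketch of the new kernel (not part of this patch; it assumes the
ArithmeticOperation enumerator for squared difference is named SQUARED_DIFF and relies on the
existing Tensor, TensorInfo and NEScheduler APIs):

    #include "arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h"
    #include "arm_compute/runtime/NEON/NEScheduler.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void run_squared_difference()
    {
        Tensor input1, input2, output;
        const TensorInfo info(TensorShape(16U, 16U), 1, DataType::F32);
        input1.allocator()->init(info);
        input2.allocator()->init(info);
        output.allocator()->init(info);
        input1.allocator()->allocate();
        input2.allocator()->allocate();
        output.allocator()->allocate();
        // ... fill input1 and input2 ...

        NEArithmeticOperationKernel kernel;
        // SQUARED_DIFF is assumed to be the enumerator added for this operator.
        kernel.configure(ArithmeticOperation::SQUARED_DIFF, &input1, &input2, &output);

        // Execute over the configured window, splitting work along the Y dimension.
        NEScheduler::get().schedule(&kernel, Window::DimY);
    }
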
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h b/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h
index 77787af..46c8937 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h
@@ -44,5 +44,6 @@
 #include "arm_compute/core/NEON/wrapper/intrinsics/padd.h"
 #include "arm_compute/core/NEON/wrapper/intrinsics/pow.h"
 #include "arm_compute/core/NEON/wrapper/intrinsics/store.h"
+#include "arm_compute/core/NEON/wrapper/intrinsics/sub.h"
 
 #endif /* __ARM_COMPUTE_WRAPPER_INTRINSICS_H__ */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/load.h b/arm_compute/core/NEON/wrapper/intrinsics/load.h
index b5d9ed2..500ec78 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/load.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/load.h
@@ -67,7 +67,6 @@
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 VLOADQ_IMPL(float16_t, float16x8_t, f16)
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
 #undef VLOAD_IMPL
 } // namespace wrapper
 } // namespace arm_compute
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/sub.h b/arm_compute/core/NEON/wrapper/intrinsics/sub.h
new file mode 100644
index 0000000..8119429
--- /dev/null
+++ b/arm_compute/core/NEON/wrapper/intrinsics/sub.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_WRAPPER_SUB_H__
+#define __ARM_COMPUTE_WRAPPER_SUB_H__
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VSUB_IMPL(stype, vtype, prefix, postfix)      \
+    inline vtype vsub(const vtype &a, const vtype &b) \
+    {                                                 \
+        return prefix##_##postfix(a, b);              \
+    }
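+
+// For example, VSUB_IMPL(float32x4_t, float32x4_t, vsubq, f32) expands to the overload:
+//     inline float32x4_t vsub(const float32x4_t &a, const float32x4_t &b) { return vsubq_f32(a, b); }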
+
+VSUB_IMPL(uint8x8_t, uint8x8_t, vsub, u8)
+VSUB_IMPL(int8x8_t, int8x8_t, vsub, s8)
+VSUB_IMPL(uint16x4_t, uint16x4_t, vsub, u16)
+VSUB_IMPL(int16x4_t, int16x4_t, vsub, s16)
+VSUB_IMPL(uint32x2_t, uint32x2_t, vsub, u32)
+VSUB_IMPL(int32x2_t, int32x2_t, vsub, s32)
+VSUB_IMPL(uint64x1_t, uint64x1_t, vsub, u64)
+VSUB_IMPL(int64x1_t, int64x1_t, vsub, s64)
+VSUB_IMPL(float32x2_t, float32x2_t, vsub, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VSUB_IMPL(float16x4_t, float16x4_t, vsub, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VSUB_IMPL(uint8x16_t, uint8x16_t, vsubq, u8)
+VSUB_IMPL(int8x16_t, int8x16_t, vsubq, s8)
+VSUB_IMPL(uint16x8_t, uint16x8_t, vsubq, u16)
+VSUB_IMPL(int16x8_t, int16x8_t, vsubq, s16)
+VSUB_IMPL(uint32x4_t, uint32x4_t, vsubq, u32)
+VSUB_IMPL(int32x4_t, int32x4_t, vsubq, s32)
+VSUB_IMPL(uint64x2_t, uint64x2_t, vsubq, u64)
+VSUB_IMPL(int64x2_t, int64x2_t, vsubq, s64)
+VSUB_IMPL(float32x4_t, float32x4_t, vsubq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VSUB_IMPL(float16x8_t, float16x8_t, vsubq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
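+// A sketch of how these overloads compose in type-generic kernel code (assuming a matching
+// wrapper::vmul overload is available), e.g. a squared difference for any supported vector type:
+//     const auto diff = wrapper::vsub(a, b);
+//     const auto out  = wrapper::vmul(diff, diff);
+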
+#undef VSUB_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_WRAPPER_SUB_H__ */
diff --git a/arm_compute/core/Rounding.h b/arm_compute/core/Rounding.h
index f95058c..a9e9a71 100644
--- a/arm_compute/core/Rounding.h
+++ b/arm_compute/core/Rounding.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,7 +29,7 @@
 /** Rounding method */
 enum class RoundingPolicy
 {
-    TO_ZERO,         /**< Truncates the least significand values that are lost in operations. */
+    TO_ZERO,         /**< Truncates the least significant values that are lost in operations. */
     TO_NEAREST_UP,   /**< Rounds to nearest value; half rounds away from zero */
     TO_NEAREST_EVEN, /**< Rounds to nearest value; half rounds to nearest even */
 };