COMPMID-1959: Implements 2D FFT on OpenCL

Change-Id: I73cf3984a5463acc854c8a59dc2bd9a5234cd99c
Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-on: https://review.mlplatform.org/c/936
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
diff --git a/arm_compute/runtime/CL/CLFunctions.h b/arm_compute/runtime/CL/CLFunctions.h
index f102184..a4fcdc2 100644
--- a/arm_compute/runtime/CL/CLFunctions.h
+++ b/arm_compute/runtime/CL/CLFunctions.h
@@ -67,6 +67,8 @@
 #include "arm_compute/runtime/CL/functions/CLEqualizeHistogram.h"
 #include "arm_compute/runtime/CL/functions/CLErode.h"
 #include "arm_compute/runtime/CL/functions/CLFFT1D.h"
+#include "arm_compute/runtime/CL/functions/CLFFT2D.h"
+#include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h"
 #include "arm_compute/runtime/CL/functions/CLFastCorners.h"
 #include "arm_compute/runtime/CL/functions/CLFillBorder.h"
 #include "arm_compute/runtime/CL/functions/CLFlattenLayer.h"
diff --git a/arm_compute/runtime/CL/functions/CLFFT1D.h b/arm_compute/runtime/CL/functions/CLFFT1D.h
index 1612cf7..029023c 100644
--- a/arm_compute/runtime/CL/functions/CLFFT1D.h
+++ b/arm_compute/runtime/CL/functions/CLFFT1D.h
@@ -28,6 +28,7 @@
 
 #include "arm_compute/core/CL/kernels/CLFFTDigitReverseKernel.h"
 #include "arm_compute/core/CL/kernels/CLFFTRadixStageKernel.h"
+#include "arm_compute/core/CL/kernels/CLFFTScaleKernel.h"
 #include "arm_compute/runtime/CL/CLMemoryGroup.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/FunctionDescriptors.h"
@@ -39,8 +40,9 @@
 
 /** Basic function to execute one dimensional FFT. This function calls the following OpenCL kernels:
  *
- * -# @ref CLFFTDigitReverseKernel Performs digit reverse
- * -# @ref CLFFTRadixStageKernel   A list of FFT kernels depending on the radix decomposition
+ * -# @ref CLFFTDigitReverseKernel Performs digit reverse.
+ * -# @ref CLFFTRadixStageKernel   A list of FFT kernels depending on the radix decomposition.
+ * -# @ref CLFFTScaleKernel        Performs output scaling in case of an inverse FFT.
  */
 class CLFFT1D : public IFunction
 {
@@ -69,11 +71,13 @@
 
 protected:
     CLMemoryGroup                            _memory_group;
-    CLTensor                                 _digit_reversed_input;
-    CLTensor                                 _digit_reverse_indices;
     CLFFTDigitReverseKernel                  _digit_reverse_kernel;
     std::unique_ptr<CLFFTRadixStageKernel[]> _fft_kernels;
+    CLFFTScaleKernel                         _scale_kernel;
+    CLTensor                                 _digit_reversed_input;
+    CLTensor                                 _digit_reverse_indices;
     unsigned int                             _num_ffts;
+    bool                                     _run_scale;
 };
 } // namespace arm_compute
 #endif /*__ARM_COMPUTE_CLFFT1D_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLFFT2D.h b/arm_compute/runtime/CL/functions/CLFFT2D.h
new file mode 100644
index 0000000..a0673ec
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLFFT2D.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLFFT2D_H__
+#define __ARM_COMPUTE_CLFFT2D_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLFFT1D.h"
+#include "arm_compute/runtime/FunctionDescriptors.h"
+
+namespace arm_compute
+{
+// Forward declaration
+class ICLTensor;
+
+/** Basic function to execute a two-dimensional FFT. This function calls the following OpenCL functions:
+ *
+ * -# @ref CLFFT1D 1D FFT is performed on the first given axis
+ * -# @ref CLFFT1D 1D FFT is performed on the second given axis
+ */
+class CLFFT2D : public IFunction
+{
+public:
+    /** Default Constructor */
+    CLFFT2D(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Initialise the function's source, destination and FFT configuration.
+     *
+     * @param[in]  input  Source tensor. Data types supported: F32.
+     * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
+     * @param[in]  config FFT related configuration
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, const FFT2DInfo &config);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLFFT2D.
+     *
+     * @param[in] input  Source tensor info. Data types supported: F32.
+     * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p input.
+     * @param[in] config FFT related configuration
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const FFT2DInfo &config);
+
+    // Inherited methods overridden:
+    void run() override;
+
+protected:
+    CLMemoryGroup _memory_group;
+    CLFFT1D       _first_pass_func;
+    CLFFT1D       _second_pass_func;
+    CLTensor      _first_pass_tensor;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLFFT2D_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h
new file mode 100644
index 0000000..0fd2cf3
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLFFTCONVOLUTIONLAYER_H__
+#define __ARM_COMPUTE_CLFFTCONVOLUTIONLAYER_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
+#include "arm_compute/runtime/CL/functions/CLFFT2D.h"
+#include "arm_compute/runtime/CL/functions/CLPadLayer.h"
+#include "arm_compute/runtime/CL/functions/CLPermute.h"
+#include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h"
+#include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
+#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
+#include "arm_compute/runtime/CL/functions/CLReverse.h"
+#include "arm_compute/runtime/CL/functions/CLSlice.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ICLTensor;
+
+/** Basic function to execute FFT-based convolution on OpenCL. This function calls the following OpenCL functions/kernels:
+ *
+ *  -# @ref CLPermute                        Permute input if NHWC (only NCHW is supported).
+ *  -# @ref CLPadLayer                       Pad input.
+ *  -# @ref CLFFT2D                          Forward transform to the frequency domain.
+ *  -# @ref CLComplexPixelWiseMultiplication Complex element-wise product of input and the weights.
+ *  -# @ref CLReductionOperation             Reduction across channels.
+ *  -# @ref CLFFT2D                          Inverse transform back to the time domain.
+ *  -# @ref CLSlice                          Extract valid output.
+ *  -# @ref CLArithmeticAddition             Add bias.
+ *  -# @ref CLActivationLayer                Perform activation.
+ *  -# @ref CLPermute                        Permute output if NHWC (only NCHW is supported).
+ */
+class CLFFTConvolutionLayer : public IFunction
+{
+public:
+    /** Default constructor */
+    CLFFTConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLFFTConvolutionLayer(const CLFFTConvolutionLayer &) = delete;
+    /** Default move constructor */
+    CLFFTConvolutionLayer(CLFFTConvolutionLayer &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLFFTConvolutionLayer &operator=(const CLFFTConvolutionLayer &) = delete;
+    /** Default move assignment operator */
+    CLFFTConvolutionLayer &operator=(CLFFTConvolutionLayer &&) = default;
+    /** Set the input and output tensors.
+     *
+     * @note: This function only supports square kernels (of any size) and unit strides, for both NCHW and NHWC data layouts
+     *
+     * @param[in]  input     Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+     *                       while every optional dimension from 4 and above represent a batch of inputs.
+     *                       Data types supported: F32.
+     * @param[in]  weights   Weights tensor. Weights are a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
+     * @param[in]  biases    Biases tensor. Shared biases supported. Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[out] output    Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+     *                       Data types supported: Same as @p input.
+     * @param[in]  conv_info Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in]  act_info  (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration of @ref CLFFTConvolutionLayer
+     *
+     * @note: This function only supports square kernels (of any size) and unit strides, for both NCHW and NHWC data layouts
+     *
+     * @param[in]  input     Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
+     *                       while every optional dimension from 4 and above represent a batch of inputs.
+     *                       Data types supported: F32.
+     * @param[in]  weights   Weights tensor info. Weights are a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
+     * @param[in]  biases    Biases tensor info. Shared biases supported. Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[in]  output    Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+     *                       Data types supported: Same as @p input.
+     * @param[in]  conv_info Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in]  act_info  (Optional) Activation layer information in case of a fused activation.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run() override;
+    void prepare() override;
+
+private:
+    CLMemoryGroup                    _memory_group;
+    CLReverse                        _flip_weights_func;
+    CLPermute                        _permute_input_func;
+    CLPermute                        _permute_output_func;
+    CLPermute                        _permute_weights_func;
+    CLPermute                        _permute_bias_func;
+    CLPadLayer                       _pad_input_func;
+    CLPadLayer                       _pad_weights_func;
+    CLFFT2D                          _transform_input_func;
+    CLFFT2D                          _transform_weights_func;
+    CLFFT2D                          _itransform_output_func;
+    CLComplexPixelWiseMultiplication _prod_func;
+    CLReductionOperation             _reduce_func;
+    CLSlice                          _extract_output_func;
+    CLArithmeticAddition             _bias_add_func;
+    CLActivationLayer                _activation_layer_func;
+
+    CLTensor _permuted_input;
+    CLTensor _permuted_weights;
+    CLTensor _permuted_bias;
+    CLTensor _permuted_output;
+    CLTensor _padded_input;
+    CLTensor _padded_weights;
+    CLTensor _flip_axis;
+    CLTensor _flipped_weights;
+    CLTensor _transformed_input;
+    CLTensor _transformed_weights;
+    CLTensor _input_weights_product;
+    CLTensor _output_product;
+    CLTensor _output_reduced;
+    CLTensor _itransformed_output;
+    CLTensor _reshaped_output;
+    CLTensor _bias_output;
+
+    const ICLTensor *_original_weights;
+    const ICLTensor *_original_bias;
+    bool             _is_activationlayer_enabled;
+    bool             _needs_permute;
+    bool             _has_bias;
+    bool             _is_prepared;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLFFTCONVOLUTIONLAYER_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h b/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h
index a59fb4a..0fa40a7 100644
--- a/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h
+++ b/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,6 +29,7 @@
 
 namespace arm_compute
 {
+// Forward declaration
 class ICLTensor;
 
 /** Basic function to run @ref CLPixelWiseMultiplicationKernel. */
@@ -64,5 +65,27 @@
     static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale,
                            ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
 };
-}
+
+/** Basic function to run @ref CLComplexPixelWiseMultiplicationKernel. */
+class CLComplexPixelWiseMultiplication : public ICLSimpleFunction
+{
+public:
+    /** Initialise the kernel's inputs and output.
+     *
+     * @param[in, out] input1 An input tensor. Data types supported: F32. Number of channels supported: 2.
+     *                        The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[in, out] input2 An input tensor. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
+     *                        The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[out]     output The output tensor. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
+     */
+    void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLComplexPixelWiseMultiplication
+     *
+     * @param[in] input1 An input tensor info. Data types supported: F32. Number of channels supported: 2.
+     * @param[in] input2 An input tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
+     * @param[in] output The output tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
+};
+} // namespace arm_compute
 #endif /*__ARM_COMPUTE_CLPIXELWISEMULTIPLICATION_H__ */