COMPMID-477 - Optimized batched case in CLConvolutionLayer

Change-Id: I4ef18f49f1da0cb816aaa0762466b940792c15ed
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/84162
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
index dec63e0..a768a19 100644
--- a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
@@ -30,10 +30,10 @@
 {
 class ICLTensor;
 
-/** OpenCL kernel to multiply two input matrices "A" and "B" or to multiply a vector "A" by a matrix "B". All elements of the output matrix/vector will be multiplied by alpha
+/** OpenCL kernel to multiply two input matrices "A" and "B". All elements of the output matrix will be multiplied by alpha
  *
- * @note If the output tensor is a matrix, the implementation assumes that the input tensors @p input0 and @p input1 are both matrices and reshaped respectively with @ref CLGEMMInterleave4x4Kernel" and @ref CLGEMMTranspose1xWKernel
- * @note If the output tensor is a vector and the data type is F32, the implementation assumes that the first input tensor @p input0 is a vector and the second input tensor @p input1 a matrix. The implementation also assumes that both tensors have not been reshaped
+ * @note If the input tensors @p input0 and @p input1 have been reshaped respectively with @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel,
+ *       the flag @p is_interleaved_transposed must be set to true
  *
  * @attention The second input tensor must have at least 2 dimensions (matrix)
  *
@@ -53,13 +53,13 @@
     CLGEMMMatrixMultiplyKernel &operator=(CLGEMMMatrixMultiplyKernel &&) = default;
     /** Initialise the kernel's input, output and alpha
      *
-     * @param[in]  input0 Input tensor containing the interleaved Matrix A or the vector A. Data types supported: QS8/QS16/F16/F32
-     * @param[in]  input1 Input tensor containing the transposed Matrix B if the first input tensor A is not a vector.
-     *                    If the output tensor is a vector, input1 must contain the matrix B not reshaped. Data type supported: same as @p input0
-     * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
-     * @param[in]  alpha  Weight of the matrix product
+     * @param[in]  input0                    Input tensor containing the Matrix A. Data types supported: QS8/QS16/F16/F32
+     * @param[in]  input1                    Input tensor containing the Matrix B. Data type supported: same as @p input0
+     * @param[out] output                    Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
+     * @param[in]  alpha                     Weight of the matrix product
+     * @param[in]  is_interleaved_transposed (Optional) True if @p input0 and @p input1 have been reshaped respectively using @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel
      */
-    void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha);
+    void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed = true);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
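For reviewers, a minimal sketch of the new kernel interface (illustrative only, not part of this patch; it assumes the CL context is initialised and the tensors are allocated with compatible shapes elsewhere):

    #include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    // Hypothetical helper: configure the kernel on plain (unreshaped) matrices.
    void configure_plain_mm(CLTensor &a, CLTensor &b, CLTensor &out)
    {
        CLGEMMMatrixMultiplyKernel mm_kernel;
        // a and b have NOT been reshaped with CLGEMMInterleave4x4Kernel /
        // CLGEMMTranspose1xWKernel, so the flag must be false; the default
        // (true) keeps the behaviour of the previous interface.
        mm_kernel.configure(&a, &b, &out, 1.0f, false /* is_interleaved_transposed */);
    }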
diff --git a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
index a29f68f..e076f51 100644
--- a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
+++ b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
@@ -24,12 +24,10 @@
 #ifndef __ARM_COMPUTE_CLFULLYCONNECTEDLAYER_H__
 #define __ARM_COMPUTE_CLFULLYCONNECTEDLAYER_H__
 
-#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
 
-#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
 #include "arm_compute/core/CL/kernels/CLIm2ColKernel.h"
 #include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
@@ -38,41 +36,25 @@
 {
 /** Basic function to reshape the weights of Fully Connected layer with OpenCL. This function calls the following kernels:
  *
- *  -# @ref CLTransposeKernel        (if @p transpose_weights is set to true)
- *  -# @ref CLGEMMTranspose1xWKernel (if @p is_batched_fc_layer is set to true)
+ *  -# @ref CLTransposeKernel
  *
  * @note  The fully connected layer accepts "weights" tensors only with 2 dimensions.
  */
-class CLFullyConnectedLayerReshapeWeights : public IFunction
+class CLFullyConnectedLayerReshapeWeights : public ICLSimpleFunction
 {
 public:
-    /** Constructor */
-    CLFullyConnectedLayerReshapeWeights();
     /** Set the input and output tensors.
      *
-     * @param[in]  input               Weights tensor. The weights must be 2 dimensional. Data types supported: QS8/QS16/F16/F32.
-     * @param[out] output              Destination tensor. Data type supported: Same as @p input.
-     * @param[in]  transpose_weights   True if the weights must be transposed. Data types supported: Same as @p weights.
-     * @param[in]  is_batched_fc_layer True if it is a batched fully connected layer
+     * @param[in]  input  Weights tensor. The weights must be 2 dimensional. Data types supported: QS8/QS16/F16/F32.
+     * @param[out] output Destination tensor which stores the transposed input tensor. Data type supported: Same as @p input.
      */
-    void configure(const ICLTensor *input, ICLTensor *output, bool transpose_weights, bool is_batched_fc_layer);
-
-    // Inherited methods overridden:
-    void run() override;
-
-private:
-    CLTransposeKernel        _transpose_kernel;
-    CLGEMMTranspose1xWKernel _transpose1xW_kernel;
-    CLTensor                 _transpose_output;
-    bool                     _transpose_weights;
-    bool                     _is_batched_fc_layer;
+    void configure(const ICLTensor *input, ICLTensor *output);
 };
 
 /** Basic function to compute a Fully Connected layer on OpenCL. This function calls the following OpenCL kernels:
  *
  *  -# @ref CLIm2ColKernel (called when the input comes from a convolutional layer)
- *  -# @ref CLFullyConnectedLayerReshapeWeights (if @p are_weights_reshaped is set to false) (called once)
- *  -# @ref CLGEMMInterleave4x4Kernel (called if we have a multi-batch input)
+ *  -# @ref CLFullyConnectedLayerReshapeWeights (if @p are_weights_reshaped is set to false and @p transpose_weights is set to true) (called once)
  *  -# @ref CLGEMMMatrixMultiplyKernel
  *  -# @ref CLGEMMMatrixAccumulateBiasesKernel (if @p biases is not equal to nullptr)
  *
@@ -85,7 +67,7 @@
     CLFullyConnectedLayer();
     /** Set the input and output tensors.
      *
-     * @param[in]  input                Source tensor. Data type supported: QS8/F16/F32.
+     * @param[in]  input                Source tensor. Data type supported: QS8/QS16/F16/F32.
      * @param[in]  weights              Weights tensor. The weights must be 2 dimensional. Data type supported: Same as @p input
      * @param[in]  biases               Bias tensor. It can be nullptr. Data type supported:Same as @p input.
      * @param[out] output               Destination tensor. Data type supported: Same as @p input.
@@ -98,17 +80,17 @@
     void run() override;
 
 private:
+    void configure_fc_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output);
+    void configure_conv_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output);
+
     CLIm2ColKernel                      _im2col_kernel;
     CLFullyConnectedLayerReshapeWeights _reshape_weights_kernel;
-    CLGEMMInterleave4x4Kernel           _interleave4x4_kernel;
     CLGEMMMatrixMultiplyKernel          _mm_kernel;
     CLGEMMMatrixAccumulateBiasesKernel  _accumulate_biases_kernel;
     CLTensor                            _im2col_output;
-    CLTensor                            _interleave4x4_output;
     CLTensor                            _reshape_weights_output;
     bool                                _are_weights_reshaped;
-    bool                                _is_batched_fc_layer;
-    bool                                _linearize_input;
+    bool                                _is_fc_after_conv;
     bool                                _accumulate_biases;
 };
 }
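For context, a hedged usage sketch of the simplified function (illustrative only; it assumes the configure() overload ends with the transpose_weights / are_weights_reshaped flags documented above and that all tensors are allocated beforehand):

    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"

    using namespace arm_compute;

    // Hypothetical helper: run a fully connected layer whose 2D weights still
    // need the one-off transpose done by CLFullyConnectedLayerReshapeWeights.
    void run_fc(CLTensor &input, CLTensor &weights, CLTensor &biases, CLTensor &output)
    {
        CLFullyConnectedLayer fc;
        fc.configure(&input, &weights, &biases, &output,
                     true  /* transpose_weights: weights are not pre-transposed */,
                     false /* are_weights_reshaped: reshape runs internally, once */);
        fc.run(); // im2col (only after a conv layer), GEMM, then bias accumulation
    }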
diff --git a/arm_compute/runtime/CL/functions/CLGEMM.h b/arm_compute/runtime/CL/functions/CLGEMM.h
index 9207efd..9b88730 100644
--- a/arm_compute/runtime/CL/functions/CLGEMM.h
+++ b/arm_compute/runtime/CL/functions/CLGEMM.h
@@ -76,7 +76,7 @@
     CLGEMMMatrixAdditionKernel _ma_kernel;
     CLTensor                   _tmp_a;
     CLTensor                   _tmp_b;
-    bool                       _run_vector_matrix_multiplication;
+    bool                       _is_interleaved_transposed;
     bool                       _run_addition;
 };
 }
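Finally, a reference sketch of the public CLGEMM interface, which this hunk does not change (illustrative; the configure() signature with an optional C matrix and alpha/beta weights is assumed from this revision, not shown in the diff). After the rename, the function tracks internally whether A and B get interleaved/transposed before the multiplication, rather than special-casing the vector-matrix path:

    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLGEMM.h"

    using namespace arm_compute;

    // Hypothetical helper: D = alpha * A * B (beta unused, so C is nullptr).
    void run_gemm(CLTensor &a, CLTensor &b, CLTensor &dst)
    {
        CLGEMM gemm;
        gemm.configure(&a, &b, nullptr, &dst, 1.0f, 0.0f);
        gemm.run(); // reshapes A/B first only when _is_interleaved_transposed is true
    }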