COMPMID-345 - In-place computation for Activation Layer

Change-Id: I25ebfccc3d3e758cc8164e0b33805c0bb303891a
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/78226
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Pablo Tello <pablo.tello@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
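
Passing a null output tensor to configure() now requests in-place computation:
the result is written back into the input tensor, the CL kernel is built with
-DIN_PLACE, and only the input's access window is registered on both backends.

A minimal caller-side sketch (illustrative only, not part of the patch; the
Tensor/TensorInfo setup follows the library's existing runtime API):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    // Source tensor; with in-place computation it is also the destination.
    Tensor src;
    src.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
    src.allocator()->allocate();
    // ... fill src with input data ...

    // nullptr output => compute the activation in place.
    NEActivationLayer act;
    act.configure(&src, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
    act.run(); // src now holds max(0, src) element-wise

    return 0;
}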
diff --git a/src/core/CL/cl_kernels/activation_layer.cl b/src/core/CL/cl_kernels/activation_layer.cl
index e3cbb6c..136191a 100644
--- a/src/core/CL/cl_kernels/activation_layer.cl
+++ b/src/core/CL/cl_kernels/activation_layer.cl
@@ -25,6 +25,8 @@
 
 /** This performs an activation function on floating point inputs.
  *
+ * @note To perform the activation function in-place, the preprocessor argument -DIN_PLACE must be passed at compile time
+ *
  * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
  * @note Activation function should be given as a preprocessor argument using -DNAME. e.g. -DTANH
  * @note Distinction between floating point and integer is done using the -DTYPE_FP and -DTYPE_INT preprocessor arguments
@@ -48,12 +50,20 @@
  * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
  */
 __kernel void activation_layer(
-    TENSOR3D_DECLARATION(input),
-    TENSOR3D_DECLARATION(output))
+    TENSOR3D_DECLARATION(input)
+#if !defined IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif
+)
 {
     // Get pixels pointer
-    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#if defined IN_PLACE
+    Tensor3D output = input;
+#else
     Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif
 
     // Load data
     VEC_DATA_TYPE(DATA_TYPE, 16)
@@ -63,7 +73,7 @@
 #if defined LOGISTIC
     data = 1 / (1 + exp(-data));
 #elif defined TANH
-    data = (VEC_DATA_TYPE(DATA_TYPE, 16))A * tanh((VEC_DATA_TYPE(DATA_TYPE, 16))B * data);
+    data            = (VEC_DATA_TYPE(DATA_TYPE, 16))A * tanh((VEC_DATA_TYPE(DATA_TYPE, 16))B * data);
 #elif defined RELU
     data = max(0, data);
 #elif defined BRELU
diff --git a/src/core/CL/kernels/CLActivationLayerKernel.cpp b/src/core/CL/kernels/CLActivationLayerKernel.cpp
index 83bbe6a..6439426 100644
--- a/src/core/CL/kernels/CLActivationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLActivationLayerKernel.cpp
@@ -35,17 +35,24 @@
 
 using namespace arm_compute;
 
-void CLActivationLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
+CLActivationLayerKernel::CLActivationLayerKernel()
+    : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLActivationLayerKernel::configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
 
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+    if(output != nullptr)
+    {
+        // Output auto initialization if not yet initialized
+        auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    }
 
     // Set build options
     std::set<std::string> build_opts;
@@ -54,11 +61,55 @@
     build_opts.insert(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
     build_opts.insert(("-DA=" + val_to_string(act_info.a())));
     build_opts.insert(("-DB=" + val_to_string(act_info.b())));
+    build_opts.insert(output == nullptr ? "-DIN_PLACE" : "");
 
     // Create kernel
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("activation_layer", build_opts));
 
     // Make sure _kernel is initialized before calling the parent's configure
     constexpr unsigned int num_elems_processed_per_iteration = 16;
-    ICLSimple3DKernel::configure(input, output, num_elems_processed_per_iteration);
+
+    _input  = input;
+    _output = output;
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+    if(output != nullptr)
+    {
+        AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+        update_window_and_padding(win,
+                                  AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
+                                  output_access);
+
+        output_access.set_valid_region(win, input->info()->valid_region());
+    }
+    else
+    {
+        update_window_and_padding(win,
+                                  AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
+    }
+
+    ICLKernel::configure(win);
+}
+
+void CLActivationLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_3D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, slice);
+        if(_output != nullptr)
+        {
+            add_3D_tensor_argument(idx, _output, slice);
+        }
+        enqueue(queue, *this, slice);
+    }
+    while(window.slide_window_slice_3D(slice));
 }
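
With this change CLActivationLayerKernel accepts a null output: the program is
built with -DIN_PLACE, only the input's access window and padding are updated,
and run() skips the output tensor argument. A host-side sketch of driving the
kernel directly (illustrative only; CLScheduler/CLTensor come from the
library's runtime API, and the CLActivationLayer function below is the usual
entry point):

#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void logistic_in_place(CLTensor &src)
{
    // Assumes CLScheduler::get().default_init() has been called once to set up
    // the context and command queue, and that src is already allocated.

    // nullptr output => the kernel is compiled with -DIN_PLACE and reads/writes src.
    CLActivationLayerKernel kernel;
    kernel.configure(&src, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));

    CLScheduler::get().enqueue(kernel); // per 3D slice only src's arguments are added
    CLScheduler::get().sync();          // wait for the queue to finish
}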
diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
index a878078..1bd0353 100644
--- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
@@ -41,21 +41,29 @@
 using namespace arm_compute;
 
 NEActivationLayerKernel::NEActivationLayerKernel()
-    : _func(nullptr), _act_info(ActivationFunction::LOGISTIC)
+    : _input(nullptr), _output(nullptr), _func(nullptr), _act_info(ActivationFunction::LOGISTIC)
 {
 }
 
-void NEActivationLayerKernel::configure(const ITensor *input, ITensor *output, ActivationLayerInfo activation_info)
+void NEActivationLayerKernel::configure(ITensor *input, ITensor *output, ActivationLayerInfo activation_info)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
 
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+    _input    = input;
+    _act_info = activation_info;
+    _output   = input;
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    if(output != nullptr)
+    {
+        // Output auto initialization if not yet initialized
+        auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+        _output = output;
+    }
 
     // Activation functions : FP32
     static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_f32 =
@@ -85,9 +93,6 @@
         { ActivationFunction::TANH, &NEActivationLayerKernel::activation<ActivationFunction::TANH, qint8_t> },
     };
 
-    _input    = input;
-    _output   = output;
-    _act_info = activation_info;
     switch(input->info()->data_type())
     {
         case DataType::F32:
@@ -102,7 +107,27 @@
 
     constexpr unsigned int num_elems_processed_per_iteration = 16;
 
-    INESimpleKernel::configure(_input, _output, num_elems_processed_per_iteration);
+    // Configure kernel window
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+    if(output != nullptr)
+    {
+        AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+        update_window_and_padding(win,
+                                  AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
+                                  output_access);
+
+        output_access.set_valid_region(win, input->info()->valid_region());
+    }
+    else
+    {
+        // In-place computation
+        update_window_and_padding(win,
+                                  AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
+    }
+
+    ICPPKernel::configure(win);
 }
 
 template <ActivationLayerInfo::ActivationFunction F, typename T>
@@ -295,7 +320,7 @@
 void NEActivationLayerKernel::run(const Window &window)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
     ARM_COMPUTE_ERROR_ON(_func == nullptr);
 
     (this->*_func)(window);
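
The NEON kernel mirrors the CL path: with a null output, _output aliases the
input and only the input's access window is registered, so the activation
writes back into its source. Since run(const Window &) is public, the kernel
can also be driven directly; a minimal single-threaded sketch (illustrative
only, the NEActivationLayer function normally dispatches it):

#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void tanh_in_place(Tensor &src)
{
    // a = 1, b = 1 => data = a * tanh(b * data); src must already be allocated.
    NEActivationLayerKernel kernel;
    kernel.configure(&src, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));

    // Execute the whole configured window on the calling thread.
    kernel.run(kernel.window());
}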
diff --git a/src/runtime/CL/functions/CLActivationLayer.cpp b/src/runtime/CL/functions/CLActivationLayer.cpp
index 9b5bd8b..0d0da0c 100644
--- a/src/runtime/CL/functions/CLActivationLayer.cpp
+++ b/src/runtime/CL/functions/CLActivationLayer.cpp
@@ -28,7 +28,7 @@
 
 using namespace arm_compute;
 
-void CLActivationLayer::configure(const ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
+void CLActivationLayer::configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
 {
     auto k = arm_compute::cpp14::make_unique<CLActivationLayerKernel>();
     k->configure(input, output, act_info);
diff --git a/src/runtime/NEON/functions/NEActivationLayer.cpp b/src/runtime/NEON/functions/NEActivationLayer.cpp
index f5d81d7..447ae64 100644
--- a/src/runtime/NEON/functions/NEActivationLayer.cpp
+++ b/src/runtime/NEON/functions/NEActivationLayer.cpp
@@ -28,7 +28,7 @@
 
 using namespace arm_compute;
 
-void NEActivationLayer::configure(const ITensor *input, ITensor *output, ActivationLayerInfo activation_info)
+void NEActivationLayer::configure(ITensor *input, ITensor *output, ActivationLayerInfo activation_info)
 {
     auto k = arm_compute::cpp14::make_unique<NEActivationLayerKernel>();
     k->configure(input, output, activation_info);