COMPMID-568: Implement Canny edge function for CL/NEON

Change-Id: Ic5f197463f962bac4b23663bcef7ac744be6fc2a
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/114250
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
diff --git a/src/runtime/CL/functions/CLCannyEdge.cpp b/src/runtime/CL/functions/CLCannyEdge.cpp
index 5acb8e7..ed58345 100644
--- a/src/runtime/CL/functions/CLCannyEdge.cpp
+++ b/src/runtime/CL/functions/CLCannyEdge.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,17 +50,23 @@
       _visited(),
       _recorded(),
       _l1_list_counter(),
-      _l1_stack()
+      _l1_stack(),
+      _output(nullptr)
 {
 }
 
-void CLCannyEdge::configure(ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode, uint8_t constant_border_value)
+void CLCannyEdge::configure(ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode,
+                            uint8_t constant_border_value)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON((1 != norm_type) && (2 != norm_type));
+    ARM_COMPUTE_ERROR_ON((gradient_size != 3) && (gradient_size != 5) && (gradient_size != 7));
     ARM_COMPUTE_ERROR_ON(lower_thr > upper_thr);
 
+    _output = output;
+
     const unsigned int L1_hysteresis_stack_size = 8;
     const TensorShape  shape                    = input->info()->tensor_shape();
 
@@ -122,7 +128,7 @@
     }
     else
     {
-        ARM_COMPUTE_ERROR("Gradient %d size not supported", gradient_size);
+        ARM_COMPUTE_ERROR("Gradient size %d not supported", gradient_size);
     }
 
     // Manage intermediate buffers
@@ -187,6 +193,7 @@
     CLScheduler::get().enqueue(_non_max_suppr, false);
 
     // Clear temporary structures and run edge trace
+    _output->clear(CLScheduler::get().queue());
     _visited.clear(CLScheduler::get().queue());
     _recorded.clear(CLScheduler::get().queue());
     _l1_list_counter.clear(CLScheduler::get().queue());