COMPMID-568: Implement Canny edge function for CL/NEON

Change-Id: Ic5f197463f962bac4b23663bcef7ac744be6fc2a
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/114250
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
diff --git a/arm_compute/runtime/CL/functions/CLCannyEdge.h b/arm_compute/runtime/CL/functions/CLCannyEdge.h
index 1d5a5aa..13b31b2 100644
--- a/arm_compute/runtime/CL/functions/CLCannyEdge.h
+++ b/arm_compute/runtime/CL/functions/CLCannyEdge.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -52,6 +52,10 @@
 public:
     /** Constructor */
     CLCannyEdge(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLCannyEdge(const CLCannyEdge &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLCannyEdge &operator=(const CLCannyEdge &) = delete;
     /** Initialise the function's source, destination, thresholds, gradient size, normalization type and border mode.
      *
      * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for border_mode != UNDEFINED)
@@ -63,8 +67,8 @@
      * @param[in]     border_mode           Border mode to use for the convolution.
      * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
      */
-    void configure(ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type,
-                   BorderMode border_mode, uint8_t constant_border_value = 0);
+    void configure(ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode,
+                   uint8_t constant_border_value = 0);
 
     // Inherited methods overridden:
     virtual void run() override;
@@ -82,6 +86,7 @@
     CLImage                       _phase;                                           /**< Source tensor - Phase. */
     CLImage                       _nonmax;                                          /**< Source tensor - Non-Maxima suppressed. */
     CLImage                       _visited, _recorded, _l1_list_counter, _l1_stack; /**< Temporary tensors */
+    ICLTensor                    *_output;                                          /**< Output tensor provided by the user. */
 };
 }
 
diff --git a/src/core/CL/cl_kernels/canny.cl b/src/core/CL/cl_kernels/canny.cl
index 166d681..f60359f 100644
--- a/src/core/CL/cl_kernels/canny.cl
+++ b/src/core/CL/cl_kernels/canny.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -148,6 +148,9 @@
     vstore4(convert_uchar4_sat_rte(p), 0, angle.ptr);
 }
 
+#define EDGE 255
+#define NO_EDGE 0
+
 /** Array that holds the relative coordinates offset for the neighbouring pixels.
  */
 __constant short4 neighbours_coords[] =
@@ -203,6 +206,7 @@
     DATA_TYPE_IN gradient = *((__global DATA_TYPE_IN *)grad.ptr);
     uchar an              = convert_ushort(*angle.ptr);
 
+    // Early return if not greater than lower threshold
     if(gradient <= lower_thr)
     {
         return;
@@ -224,7 +228,6 @@
     }
 }
 
-#define EDGE 255
 #define hysteresis_local_stack_L1 8  // The size of level 1 stack. This has to agree with the host side
 #define hysteresis_local_stack_L2 16 // The size of level 2 stack, adjust this can impact the match rate with VX implementation
 
@@ -333,7 +336,7 @@
     // If less than upper threshold set to NO_EDGE and return
     if(val <= up_thr)
     {
-        *offset(&out, x, y) = 0;
+        *offset(&out, x, y) = NO_EDGE;
         return;
     }
 
@@ -372,7 +375,7 @@
         // Get direction pixel indices
         int N = max(y - 1, 0), S = min(y + 1, height - 2), W = max(x - 1, 0), E = min(x + 1, width - 2);
 
-        // Check 8 pixels around for week edges where low_thr < val <= up_thr
+        // Check 8 pixels around for weak edges where low_thr < val <= up_thr
         x_tmp = vload4(0, (__global DATA_TYPE_IN *)offset(&src, W, N));
         v_tmp = vload4(0, (__global uint *)offset(&visited, W, N));
         check_pixel(((x_tmp.s0 <= low_thr) || v_tmp.s0 || (x_tmp.s0 > up_thr)), W, N, x, y); // NW
diff --git a/src/core/NEON/kernels/NECannyEdgeKernel.cpp b/src/core/NEON/kernels/NECannyEdgeKernel.cpp
index 9dfd580..dc37452 100644
--- a/src/core/NEON/kernels/NECannyEdgeKernel.cpp
+++ b/src/core/NEON/kernels/NECannyEdgeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -567,29 +567,29 @@
     const uint32x4_t mk0_0 = vld1q_u32(in - 1);
     const uint32x4_t mk0_1 = vld1q_u32(in + 1);
     uint32x4_t       mask0 = vceqq_u32(pc32, vdupq_n_u32(0));
-    mask0                  = vandq_u32(mask0, vcgeq_u32(mc, mk0_0));
-    mask0                  = vandq_u32(mask0, vcgeq_u32(mc, mk0_1));
+    mask0                  = vandq_u32(mask0, vcgtq_u32(mc, mk0_0));
+    mask0                  = vandq_u32(mask0, vcgtq_u32(mc, mk0_1));
 
     // 45 degree
     const uint32x4_t mk45_0 = vld1q_u32(in - stride_mag - 1);
     const uint32x4_t mk45_1 = vld1q_u32(in + stride_mag + 1);
     uint32x4_t       mask1  = vceqq_u32(pc32, vdupq_n_u32(1));
-    mask1                   = vandq_u32(mask1, vcgeq_u32(mc, mk45_0));
-    mask1                   = vandq_u32(mask1, vcgeq_u32(mc, mk45_1));
+    mask1                   = vandq_u32(mask1, vcgtq_u32(mc, mk45_0));
+    mask1                   = vandq_u32(mask1, vcgtq_u32(mc, mk45_1));
 
     // 90 degree
     const uint32x4_t mk90_0 = vld1q_u32(in - stride_mag);
     const uint32x4_t mk90_1 = vld1q_u32(in + stride_mag);
     uint32x4_t       mask2  = vceqq_u32(pc32, vdupq_n_u32(2));
-    mask2                   = vandq_u32(mask2, vcgeq_u32(mc, mk90_0));
-    mask2                   = vandq_u32(mask2, vcgeq_u32(mc, mk90_1));
+    mask2                   = vandq_u32(mask2, vcgtq_u32(mc, mk90_0));
+    mask2                   = vandq_u32(mask2, vcgtq_u32(mc, mk90_1));
 
     // 135 degree
     const uint32x4_t mk135_0 = vld1q_u32(in - stride_mag + 1);
     const uint32x4_t mk135_1 = vld1q_u32(in + stride_mag - 1);
     uint32x4_t       mask3   = vceqq_u32(pc32, vdupq_n_u32(3));
-    mask3                    = vandq_u32(mask3, vcgeq_u32(mc, mk135_0));
-    mask3                    = vandq_u32(mask3, vcgeq_u32(mc, mk135_1));
+    mask3                    = vandq_u32(mask3, vcgtq_u32(mc, mk135_0));
+    mask3                    = vandq_u32(mask3, vcgtq_u32(mc, mk135_1));
 
     // Merge masks
     mask0 = vorrq_u32(mask0, mask1);
@@ -1338,29 +1338,29 @@
     const uint16x8_t mk0_0 = vld1q_u16(magnitude - 1);
     const uint16x8_t mk0_1 = vld1q_u16(magnitude + 1);
     uint16x8_t       mask0 = vceqq_u16(pc16, vdupq_n_u16(0));
-    mask0                  = vandq_u16(mask0, vcgeq_u16(mc, mk0_0));
-    mask0                  = vandq_u16(mask0, vcgeq_u16(mc, mk0_1));
+    mask0                  = vandq_u16(mask0, vcgtq_u16(mc, mk0_0));
+    mask0                  = vandq_u16(mask0, vcgtq_u16(mc, mk0_1));
 
     // 45 degree
     const uint16x8_t mk45_0 = vld1q_u16(magnitude - stride_mag - 1);
     const uint16x8_t mk45_1 = vld1q_u16(magnitude + stride_mag + 1);
     uint16x8_t       mask1  = vceqq_u16(pc16, vdupq_n_u16(1));
-    mask1                   = vandq_u16(mask1, vcgeq_u16(mc, mk45_0));
-    mask1                   = vandq_u16(mask1, vcgeq_u16(mc, mk45_1));
+    mask1                   = vandq_u16(mask1, vcgtq_u16(mc, mk45_0));
+    mask1                   = vandq_u16(mask1, vcgtq_u16(mc, mk45_1));
 
     // 90 degree
     const uint16x8_t mk90_0 = vld1q_u16(magnitude - stride_mag);
     const uint16x8_t mk90_1 = vld1q_u16(magnitude + stride_mag);
     uint16x8_t       mask2  = vceqq_u16(pc16, vdupq_n_u16(2));
-    mask2                   = vandq_u16(mask2, vcgeq_u16(mc, mk90_0));
-    mask2                   = vandq_u16(mask2, vcgeq_u16(mc, mk90_1));
+    mask2                   = vandq_u16(mask2, vcgtq_u16(mc, mk90_0));
+    mask2                   = vandq_u16(mask2, vcgtq_u16(mc, mk90_1));
 
     // 135 degree
     const uint16x8_t mk135_0 = vld1q_u16(magnitude - stride_mag + 1);
     const uint16x8_t mk135_1 = vld1q_u16(magnitude + stride_mag - 1);
     uint16x8_t       mask3   = vceqq_u16(pc16, vdupq_n_u16(3));
-    mask3                    = vandq_u16(mask3, vcgeq_u16(mc, mk135_0));
-    mask3                    = vandq_u16(mask3, vcgeq_u16(mc, mk135_1));
+    mask3                    = vandq_u16(mask3, vcgtq_u16(mc, mk135_0));
+    mask3                    = vandq_u16(mask3, vcgtq_u16(mc, mk135_1));
 
     // Merge masks
     mask0 = vorrq_u16(mask0, mask1);
@@ -1399,29 +1399,29 @@
     const uint32x4_t mk0_0 = vld1q_u32(input - 1);
     const uint32x4_t mk0_1 = vld1q_u32(input + 1);
     uint32x4_t       mask0 = vceqq_u32(pc32, vdupq_n_u32(0));
-    mask0                  = vandq_u32(mask0, vcgeq_u32(mc, mk0_0));
-    mask0                  = vandq_u32(mask0, vcgeq_u32(mc, mk0_1));
+    mask0                  = vandq_u32(mask0, vcgtq_u32(mc, mk0_0));
+    mask0                  = vandq_u32(mask0, vcgtq_u32(mc, mk0_1));
 
     // 45 degree
     const uint32x4_t mk45_0 = vld1q_u32(input - stride_mag - 1);
     const uint32x4_t mk45_1 = vld1q_u32(input + stride_mag + 1);
     uint32x4_t       mask1  = vceqq_u32(pc32, vdupq_n_u32(1));
-    mask1                   = vandq_u32(mask1, vcgeq_u32(mc, mk45_0));
-    mask1                   = vandq_u32(mask1, vcgeq_u32(mc, mk45_1));
+    mask1                   = vandq_u32(mask1, vcgtq_u32(mc, mk45_0));
+    mask1                   = vandq_u32(mask1, vcgtq_u32(mc, mk45_1));
 
     // 90 degree
     const uint32x4_t mk90_0 = vld1q_u32(input - stride_mag);
     const uint32x4_t mk90_1 = vld1q_u32(input + stride_mag);
     uint32x4_t       mask2  = vceqq_u32(pc32, vdupq_n_u32(2));
-    mask2                   = vandq_u32(mask2, vcgeq_u32(mc, mk90_0));
-    mask2                   = vandq_u32(mask2, vcgeq_u32(mc, mk90_1));
+    mask2                   = vandq_u32(mask2, vcgtq_u32(mc, mk90_0));
+    mask2                   = vandq_u32(mask2, vcgtq_u32(mc, mk90_1));
 
     // 135 degree
     const uint32x4_t mk135_0 = vld1q_u32(input - stride_mag + 1);
     const uint32x4_t mk135_1 = vld1q_u32(input + stride_mag - 1);
     uint32x4_t       mask3   = vceqq_u32(pc32, vdupq_n_u32(3));
-    mask3                    = vandq_u32(mask3, vcgeq_u32(mc, mk135_0));
-    mask3                    = vandq_u32(mask3, vcgeq_u32(mc, mk135_1));
+    mask3                    = vandq_u32(mask3, vcgtq_u32(mc, mk135_0));
+    mask3                    = vandq_u32(mask3, vcgtq_u32(mc, mk135_1));
 
     // Merge masks
     mask0 = vorrq_u32(mask0, mask1);
diff --git a/src/runtime/CL/functions/CLCannyEdge.cpp b/src/runtime/CL/functions/CLCannyEdge.cpp
index 5acb8e7..ed58345 100644
--- a/src/runtime/CL/functions/CLCannyEdge.cpp
+++ b/src/runtime/CL/functions/CLCannyEdge.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,17 +50,23 @@
       _visited(),
       _recorded(),
       _l1_list_counter(),
-      _l1_stack()
+      _l1_stack(),
+      _output(nullptr)
 {
 }
 
-void CLCannyEdge::configure(ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode, uint8_t constant_border_value)
+void CLCannyEdge::configure(ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode,
+                            uint8_t constant_border_value)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON((1 != norm_type) && (2 != norm_type));
+    ARM_COMPUTE_ERROR_ON((gradient_size != 3) && (gradient_size != 5) && (gradient_size != 7));
     ARM_COMPUTE_ERROR_ON(lower_thr > upper_thr);
 
+    _output = output;
+
     const unsigned int L1_hysteresis_stack_size = 8;
     const TensorShape  shape                    = input->info()->tensor_shape();
 
@@ -122,7 +128,7 @@
     }
     else
     {
-        ARM_COMPUTE_ERROR("Gradient %d size not supported", gradient_size);
+        ARM_COMPUTE_ERROR("Gradient size %d not supported", gradient_size);
     }
 
     // Manage intermediate buffers
@@ -187,6 +193,7 @@
     CLScheduler::get().enqueue(_non_max_suppr, false);
 
     // Clear temporary structures and run edge trace
+    _output->clear(CLScheduler::get().queue());
     _visited.clear(CLScheduler::get().queue());
     _recorded.clear(CLScheduler::get().queue());
     _l1_list_counter.clear(CLScheduler::get().queue());
diff --git a/src/runtime/NEON/functions/NECannyEdge.cpp b/src/runtime/NEON/functions/NECannyEdge.cpp
index c27ff2f..1d73148 100644
--- a/src/runtime/NEON/functions/NECannyEdge.cpp
+++ b/src/runtime/NEON/functions/NECannyEdge.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -61,12 +61,12 @@
 void NECannyEdge::configure(ITensor *input, ITensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode, uint8_t constant_border_value,
                             bool use_fp16)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON(gradient_size < 3);
-    ARM_COMPUTE_ERROR_ON(gradient_size > 7);
-    ARM_COMPUTE_ERROR_ON(lower_thr > upper_thr);
     ARM_COMPUTE_ERROR_ON((1 != norm_type) && (2 != norm_type));
+    ARM_COMPUTE_ERROR_ON((gradient_size != 3) && (gradient_size != 5) && (gradient_size != 7));
+    ARM_COMPUTE_ERROR_ON(lower_thr > upper_thr);
 
     _output = output;
 
@@ -119,7 +119,7 @@
     }
     else
     {
-        ARM_COMPUTE_ERROR("Gradient size not supported\n");
+        ARM_COMPUTE_ERROR("Gradient size %d not supported\n", gradient_size);
     }
 
     // Manage intermediate buffers
@@ -171,24 +171,23 @@
 void NECannyEdge::run()
 {
     ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
-    ARM_COMPUTE_ERROR_ON(_output == nullptr);
 
     _memory_group.acquire();
 
     // Run sobelNxN
     _sobel->run();
 
-    // Fill border before non-maxima suppression. Nop for border mode undefined.
-    NEScheduler::get().schedule(&_border_mag_gradient, Window::DimZ);
-
     // Run gradient
     NEScheduler::get().schedule(_gradient.get(), Window::DimY);
 
+    // Fill border before non-maxima suppression. Nop for border mode undefined.
+    NEScheduler::get().schedule(&_border_mag_gradient, Window::DimZ);
+
     // Run non-maxima suppression
     NEScheduler::get().schedule(&_non_max_suppr, Window::DimY);
 
     ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
-    memset(_output->buffer(), 0, _output->info()->total_size());
+    std::fill_n(_output->buffer(), _output->info()->total_size(), 0);
 
     // Fill border before edge trace
     NEScheduler::get().schedule(&_border_edge_trace, Window::DimZ);
diff --git a/tests/SimpleTensorPrinter.h b/tests/SimpleTensorPrinter.h
new file mode 100644
index 0000000..905a156
--- /dev/null
+++ b/tests/SimpleTensorPrinter.h
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Error.h"
+
+#include "tests/RawTensor.h"
+#include "tests/SimpleTensor.h"
+
+#include <iostream>
+#include <sstream>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace
+{
+template <typename T>
+inline std::string prettify_tensor(const SimpleTensor<T> &input, const IOFormatInfo &io_fmt = IOFormatInfo{ IOFormatInfo::PrintRegion::NoPadding })
+{
+    ARM_COMPUTE_ERROR_ON(input.data() == nullptr);
+
+    RawTensor tensor(std::move(SimpleTensor<T>(input)));
+
+    TensorInfo info(tensor.shape(), tensor.num_channels(), tensor.data_type());
+
+    const DataType    dt           = info.data_type();
+    const size_t      slices2D     = info.tensor_shape().total_size_upper(2);
+    const Strides     strides      = info.strides_in_bytes();
+    const PaddingSize padding      = info.padding();
+    const size_t      num_channels = info.num_channels();
+
+    std::ostringstream os;
+
+    // Set precision
+    if(is_data_type_float(dt) && (io_fmt.precision_type != IOFormatInfo::PrecisionType::Default))
+    {
+        int precision = io_fmt.precision;
+        if(io_fmt.precision_type == IOFormatInfo::PrecisionType::Full)
+        {
+            precision = std::numeric_limits<float>().max_digits10;
+        }
+        os.precision(precision);
+    }
+
+    // Define region to print
+    size_t print_width  = 0;
+    size_t print_height = 0;
+    int    start_offset = 0;
+    switch(io_fmt.print_region)
+    {
+        case IOFormatInfo::PrintRegion::NoPadding:
+            print_width  = info.dimension(0);
+            print_height = info.dimension(1);
+            start_offset = info.offset_first_element_in_bytes();
+            break;
+        case IOFormatInfo::PrintRegion::ValidRegion:
+            print_width  = info.valid_region().shape.x();
+            print_height = info.valid_region().shape.y();
+            start_offset = info.offset_element_in_bytes(Coordinates(info.valid_region().anchor.x(),
+                                                                    info.valid_region().anchor.y()));
+            break;
+        case IOFormatInfo::PrintRegion::Full:
+            print_width  = padding.left + info.dimension(0) + padding.right;
+            print_height = padding.top + info.dimension(1) + padding.bottom;
+            start_offset = static_cast<int>(info.offset_first_element_in_bytes()) - padding.top * strides[1] - padding.left * strides[0];
+            break;
+        default:
+            break;
+    }
+
+    print_width = print_width * num_channels;
+
+    // Set pointer to start
+    const uint8_t *ptr = tensor.data() + start_offset;
+
+    // Start printing
+    for(size_t i = 0; i < slices2D; ++i)
+    {
+        // Find max_width of elements in slice to align columns
+        int max_element_width = 0;
+        if(io_fmt.align_columns)
+        {
+            size_t offset = i * strides[2];
+            for(size_t h = 0; h < print_height; ++h)
+            {
+                max_element_width = std::max<int>(max_element_width, max_consecutive_elements_display_width(os, dt, ptr + offset, print_width));
+                offset += strides[1];
+            }
+        }
+
+        // Print slice
+        {
+            size_t offset = i * strides[2];
+            for(size_t h = 0; h < print_height; ++h)
+            {
+                print_consecutive_elements(os, dt, ptr + offset, print_width, max_element_width, io_fmt.element_delim);
+                offset += strides[1];
+                os << io_fmt.row_delim;
+            }
+            os << io_fmt.row_delim;
+        }
+    }
+
+    return os.str();
+}
+
+template <typename T>
+inline std::ostream &operator<<(std::ostream &os, const SimpleTensor<T> &tensor)
+{
+    os << prettify_tensor(tensor, IOFormatInfo{ IOFormatInfo::PrintRegion::NoPadding });
+    return os;
+}
+
+template <typename T>
+inline std::string to_string(const SimpleTensor<T> &tensor)
+{
+    std::stringstream ss;
+    ss << tensor;
+    return ss.str();
+}
+
+#if PRINT_TENSOR_LIMIT
+template <typename T>
+void print_simpletensor(const SimpleTensor<T> &tensor, const std::string &title, const IOFormatInfo::PrintRegion &region = IOFormatInfo::PrintRegion::NoPadding)
+{
+    if(tensor.num_elements() < PRINT_TENSOR_LIMIT)
+    {
+        std::cout << title << ":" << std::endl;
+        std::cout << prettify_tensor(tensor, IOFormatInfo{ region });
+    }
+}
+#endif // PRINT_TENSOR_LIMIT
+}
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/CL/CannyEdge.cpp b/tests/validation/CL/CannyEdge.cpp
new file mode 100644
index 0000000..7aa178a
--- /dev/null
+++ b/tests/validation/CL/CannyEdge.cpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLCannyEdge.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/CL/CLArrayAccessor.h"
+#include "tests/PaddingCalculator.h"
+#include "tests/datasets/BorderModeDataset.h"
+#include "tests/datasets/ImageFileDatasets.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/CannyEdgeFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+/* Allowed ratio of mismatches between target and reference (1.0 = 100%) */
+const float allowed_mismatch_ratio = 0.1f;
+
+const auto use_fp16 = framework::dataset::make("UseFP16", { false });
+
+const auto data = combine(framework::dataset::make("GradientSize", { 3, 5, 7 }),
+                          combine(framework::dataset::make("Normalization", { MagnitudeType::L1NORM, MagnitudeType::L2NORM }), combine(datasets::BorderModes(), use_fp16)));
+} // namespace
+
+TEST_SUITE(CL)
+TEST_SUITE(CannyEdge)
+
+DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(concat(datasets::Small2DShapes(), datasets::Large2DShapes()), data), framework::dataset::make("Format", Format::U8)),
+               shape, gradient_size, normalization, border_mode, use_fp16, format)
+{
+    ARM_COMPUTE_UNUSED(use_fp16);
+    ARM_COMPUTE_ERROR_ON(use_fp16);
+
+    CannyEdgeParameters params = canny_edge_parameters();
+    // Convert normalisation type to integer
+    const auto norm_type = static_cast<int>(normalization) + 1;
+
+    // Create tensors
+    CLTensor src = create_tensor<CLTensor>(shape, data_type_from_format(format));
+    CLTensor dst = create_tensor<CLTensor>(shape, data_type_from_format(format));
+    src.info()->set_format(format);
+    dst.info()->set_format(format);
+
+    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+    // Create Canny edge configure function
+    CLCannyEdge canny_edge;
+    canny_edge.configure(&src, &dst, params.upper_thresh, params.lower_thresh, gradient_size, norm_type, border_mode, params.constant_border_value);
+
+    // Validate valid region
+    validate(src.info()->valid_region(), shape_to_valid_region(shape, (BorderMode::UNDEFINED == border_mode)));
+
+    // TODO(COMPMID-568): dst region validation fails when Shape=7x7 and GradientSize=7 and BorderMode=UNDEFINED (integer underflow)
+    if(!(shape == TensorShape{ 7u, 7u } && gradient_size == 7 && border_mode == BorderMode::UNDEFINED))
+    {
+        validate(dst.info()->valid_region(), shape_to_valid_region(shape, (BorderMode::UNDEFINED == border_mode), BorderSize(gradient_size / 2 + 1)));
+    }
+
+    // Validate padding
+    PaddingCalculator calculator(shape.x(), 1);
+    calculator.set_border_mode(border_mode);
+    calculator.set_border_size(1);
+    const PaddingSize dst_padding = calculator.required_padding();
+
+    calculator.set_border_size(gradient_size / 2);
+    calculator.set_access_offset(-gradient_size / 2);
+    calculator.set_accessed_elements(16);
+    calculator.set_processed_elements(8);
+    const PaddingSize src_padding = calculator.required_padding();
+
+    validate(src.info()->padding(), src_padding);
+    validate(dst.info()->padding(), dst_padding);
+}
+
+template <typename T>
+using CLCannyEdgeFixture = CannyEdgeValidationFixture<CLTensor, CLAccessor, CLKeyPointArray, CLCannyEdge, T>;
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLCannyEdgeFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallImageFiles(), data), framework::dataset::make("Format", Format::U8)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, AbsoluteTolerance<uint8_t>(0), allowed_mismatch_ratio);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLCannyEdgeFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeImageFiles(), data), framework::dataset::make("Format", Format::U8)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, AbsoluteTolerance<uint8_t>(0), allowed_mismatch_ratio);
+}
+
+TEST_SUITE_END()
+TEST_SUITE_END()
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/Helpers.cpp b/tests/validation/Helpers.cpp
index 25dc6c5..521cc57 100644
--- a/tests/validation/Helpers.cpp
+++ b/tests/validation/Helpers.cpp
@@ -131,6 +131,22 @@
     return params;
 }
 
+CannyEdgeParameters canny_edge_parameters()
+{
+    CannyEdgeParameters params;
+
+    std::mt19937                           gen(library->seed());
+    std::uniform_int_distribution<uint8_t> int_dist(0, 255);
+    std::uniform_int_distribution<uint8_t> threshold_dist(1, 255);
+
+    params.constant_border_value = int_dist(gen);
+    params.upper_thresh          = threshold_dist(gen); // upper_threshold >= 1
+    threshold_dist               = std::uniform_int_distribution<uint8_t>(0, params.upper_thresh);
+    params.lower_thresh          = threshold_dist(gen);
+
+    return params;
+}
+
 SimpleTensor<float> convert_from_asymmetric(const SimpleTensor<uint8_t> &src)
 {
     const QuantizationInfo &quantization_info = src.quantization_info();
diff --git a/tests/validation/Helpers.h b/tests/validation/Helpers.h
index d07803f..76b3e2f 100644
--- a/tests/validation/Helpers.h
+++ b/tests/validation/Helpers.h
@@ -168,6 +168,17 @@
 /** Generate parameters for Harris Corners algorithm. */
 HarrisCornersParameters harris_corners_parameters();
 
+/** Parameters of Canny edge algorithm. */
+struct CannyEdgeParameters
+{
+    int32_t upper_thresh{ 255 };
+    int32_t lower_thresh{ 0 };
+    uint8_t constant_border_value{ 0 };
+};
+
+/** Generate parameters for Canny edge algorithm. */
+CannyEdgeParameters canny_edge_parameters();
+
 /** Helper function to fill the Lut random by a ILutAccessor.
  *
  * @param[in,out] table Accessor at the Lut.
diff --git a/tests/validation/NEON/CannyEdge.cpp b/tests/validation/NEON/CannyEdge.cpp
new file mode 100644
index 0000000..5697b62
--- /dev/null
+++ b/tests/validation/NEON/CannyEdge.cpp
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NECannyEdge.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/NEON/ArrayAccessor.h"
+#include "tests/PaddingCalculator.h"
+#include "tests/datasets/BorderModeDataset.h"
+#include "tests/datasets/ImageFileDatasets.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/CannyEdgeFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+/* Allowed ratio of mismatches between target and reference (1.0 = 100%) */
+const float allowed_mismatch_ratio = 0.1f;
+
+const auto use_fp16 = framework::dataset::make("UseFP16",
+{
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    true,
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+    false
+});
+
+const auto data = combine(framework::dataset::make("GradientSize", { 3, 5, 7 }),
+                          combine(framework::dataset::make("Normalization", { MagnitudeType::L1NORM, MagnitudeType::L2NORM }), combine(datasets::BorderModes(), use_fp16)));
+} // namespace
+
+TEST_SUITE(NEON)
+TEST_SUITE(CannyEdge)
+
+DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(concat(datasets::Small2DShapes(), datasets::Large2DShapes()), data), framework::dataset::make("Format", Format::U8)),
+               shape, gradient_size, normalization, border_mode, use_fp16, format)
+{
+    CannyEdgeParameters params = canny_edge_parameters();
+    // Convert normalisation type to integer
+    const auto norm_type = static_cast<int>(normalization) + 1;
+
+    // Create tensors
+    Tensor src = create_tensor<Tensor>(shape, data_type_from_format(format));
+    Tensor dst = create_tensor<Tensor>(shape, data_type_from_format(format));
+    src.info()->set_format(format);
+    dst.info()->set_format(format);
+
+    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+    // Create Canny edge configure function
+    NECannyEdge canny_edge;
+    canny_edge.configure(&src, &dst, params.upper_thresh, params.lower_thresh, gradient_size, norm_type, border_mode, params.constant_border_value, use_fp16);
+
+    // Validate valid region
+    validate(src.info()->valid_region(), shape_to_valid_region(shape, (BorderMode::UNDEFINED == border_mode)));
+    validate(dst.info()->valid_region(), shape_to_valid_region(shape, (BorderMode::UNDEFINED == border_mode)));
+
+    // Validate padding
+    PaddingCalculator calculator(shape.x(), 8);
+    calculator.set_border_mode(border_mode);
+    calculator.set_border_size(gradient_size / 2);
+    calculator.set_access_offset(-gradient_size / 2);
+    calculator.set_accessed_elements(16);
+
+    validate(src.info()->padding(), calculator.required_padding());
+    validate(dst.info()->padding(), PaddingSize{ 1 });
+}
+
+template <typename T>
+using NECannyEdgeFixture = CannyEdgeValidationFixture<Tensor, Accessor, KeyPointArray, NECannyEdge, T>;
+
+FIXTURE_DATA_TEST_CASE(RunSmall, NECannyEdgeFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallImageFiles(), data), framework::dataset::make("Format", Format::U8)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, AbsoluteTolerance<uint8_t>(0), allowed_mismatch_ratio);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NECannyEdgeFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeImageFiles(), data), framework::dataset::make("Format", Format::U8)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, AbsoluteTolerance<uint8_t>(0), allowed_mismatch_ratio);
+}
+
+TEST_SUITE_END()
+TEST_SUITE_END()
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/Validation.h b/tests/validation/Validation.h
index ac3643e..0c96052 100644
--- a/tests/validation/Validation.h
+++ b/tests/validation/Validation.h
@@ -448,7 +448,7 @@
         const float   percent_mismatches        = static_cast<float>(num_mismatches) / num_elements * 100.f;
 
         ARM_COMPUTE_TEST_INFO(num_mismatches << " values (" << std::fixed << std::setprecision(2) << percent_mismatches
-                              << "%) mismatched (maximum tolerated " << std::setprecision(2) << tolerance_number << "%)");
+                              << "%) mismatched (maximum tolerated " << std::setprecision(2) << tolerance_number * 100 << "%)");
         ARM_COMPUTE_EXPECT(num_mismatches <= absolute_tolerance_number, framework::LogLevel::ERRORS);
     }
 }
@@ -536,7 +536,7 @@
         const float   percent_mismatches        = static_cast<float>(num_mismatches) / num_elements * 100.f;
 
         ARM_COMPUTE_TEST_INFO(num_mismatches << " values (" << std::fixed << std::setprecision(2) << percent_mismatches
-                              << "%) mismatched (maximum tolerated " << std::setprecision(2) << tolerance_number << "%)");
+                              << "%) mismatched (maximum tolerated " << std::setprecision(2) << tolerance_number * 100 << "%)");
         ARM_COMPUTE_EXPECT(num_mismatches <= absolute_tolerance_number, framework::LogLevel::ERRORS);
     }
 }
@@ -615,7 +615,7 @@
         const float   percent_mismatches        = static_cast<float>(num_mismatches) / num_elements * 100.f;
 
         ARM_COMPUTE_TEST_INFO(num_mismatches << " values (" << std::fixed << std::setprecision(2) << percent_mismatches
-                              << "%) mismatched (maximum tolerated " << std::setprecision(2) << tolerance_number << "%)");
+                              << "%) mismatched (maximum tolerated " << std::setprecision(2) << tolerance_number * 100 << "%)");
         ARM_COMPUTE_EXPECT(num_mismatches <= absolute_tolerance_number, framework::LogLevel::ERRORS);
     }
 }
diff --git a/tests/validation/fixtures/CannyEdgeFixture.h b/tests/validation/fixtures/CannyEdgeFixture.h
new file mode 100644
index 0000000..0f37c46
--- /dev/null
+++ b/tests/validation/fixtures/CannyEdgeFixture.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_CANNY_EDGE_FIXTURE
+#define ARM_COMPUTE_TEST_CANNY_EDGE_FIXTURE
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "tests/AssetsLibrary.h"
+#include "tests/Globals.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Fixture.h"
+#include "tests/validation/Helpers.h"
+#include "tests/validation/reference/CannyEdgeDetector.h"
+
+namespace arm_compute
+{
+class CLCannyEdge;
+class NECannyEdge;
+
+namespace test
+{
+namespace validation
+{
+template <typename TensorType, typename AccessorType, typename ArrayType, typename FunctionType, typename T>
+class CannyEdgeValidationFixture : public framework::Fixture
+{
+public:
+    template <typename...>
+    void setup(std::string image, int gradient_size, MagnitudeType norm_type, BorderMode border_mode, bool use_fp16, Format format)
+    {
+        CannyEdgeParameters params = canny_edge_parameters(); // thresholds and constant border value shared by target and reference
+
+        _target = compute_target(image, gradient_size, norm_type, border_mode, use_fp16, format, params);
+        //TODO(COMPMID-543): Add use_fp16 to reference
+        _reference = compute_reference(image, gradient_size, norm_type, border_mode, format, params);
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor, RawTensor raw)
+    {
+        library->fill(tensor, raw); // copy the raw image data into the tensor
+    }
+    // NEON overload: NECannyEdge::configure() additionally takes the use_fp16 flag.
+    template <typename F, typename std::enable_if<std::is_same<F, NECannyEdge>::value, int>::type = 0>
+    void configure_target(F &func, TensorType &src, TensorType &dst, int gradient_size, int norm_type, BorderMode border_mode, bool use_fp16, const CannyEdgeParameters &params)
+    {
+        func.configure(&src, &dst, params.upper_thresh, params.lower_thresh, gradient_size, norm_type, border_mode, params.constant_border_value, use_fp16);
+    }
+    // OpenCL overload: CLCannyEdge has no fp16 path, so use_fp16 must be false here.
+    template <typename F, typename std::enable_if<std::is_same<F, CLCannyEdge>::value, int>::type = 0>
+    void configure_target(F &func, TensorType &src, TensorType &dst, int gradient_size, int norm_type, BorderMode border_mode, bool use_fp16, const CannyEdgeParameters &params)
+    {
+        ARM_COMPUTE_UNUSED(use_fp16);
+        ARM_COMPUTE_ERROR_ON(use_fp16);
+        func.configure(&src, &dst, params.upper_thresh, params.lower_thresh, gradient_size, norm_type, border_mode, params.constant_border_value);
+    }
+
+    TensorType compute_target(const std::string &image, int gradient_size, MagnitudeType norm_type, BorderMode border_mode, bool use_fp16, Format format, const CannyEdgeParameters &params)
+    {
+        // Load the image (cached by the library if loaded before)
+        const RawTensor &raw = library->get(image, format);
+
+        // Create tensors
+        TensorType src = create_tensor<TensorType>(raw.shape(), format);
+        TensorType dst = create_tensor<TensorType>(raw.shape(), format);
+        src.info()->set_format(format);
+        dst.info()->set_format(format);
+
+        // Create Canny edge configure function; configure() takes the norm type as MagnitudeType value + 1
+        FunctionType canny_edge;
+        configure_target<FunctionType>(canny_edge, src, dst, gradient_size, static_cast<int>(norm_type) + 1, border_mode, use_fp16, params);
+
+        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Allocate tensors
+        src.allocator()->allocate();
+        dst.allocator()->allocate();
+
+        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Fill tensors
+        fill(AccessorType(src), raw);
+
+        // Compute function
+        canny_edge.run();
+
+        return dst;
+    }
+
+    SimpleTensor<T> compute_reference(const std::string &image, int gradient_size, MagnitudeType norm_type, BorderMode border_mode, Format format, const CannyEdgeParameters &params)
+    {
+        ARM_COMPUTE_ERROR_ON(format != Format::U8);
+
+        // Load the image (cached by the library if loaded before)
+        const RawTensor &raw = library->get(image, format);
+
+        // Create reference
+        SimpleTensor<T> src{ raw.shape(), format };
+
+        // Fill reference
+        fill(src, raw);
+
+        return reference::canny_edge_detector<T>(src, params.upper_thresh, params.lower_thresh, gradient_size, norm_type, border_mode, params.constant_border_value);
+    }
+
+    TensorType      _target{};    // output of the function under test
+    SimpleTensor<T> _reference{}; // output of the reference implementation
+};
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_CANNY_EDGE_FIXTURE */
diff --git a/tests/validation/reference/CannyEdgeDetector.cpp b/tests/validation/reference/CannyEdgeDetector.cpp
new file mode 100644
index 0000000..45b244f
--- /dev/null
+++ b/tests/validation/reference/CannyEdgeDetector.cpp
@@ -0,0 +1,230 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CannyEdgeDetector.h"
+
+#include "Utils.h"
+#include "support/ToolchainSupport.h"
+#include "tests/validation/Helpers.h"
+#include "tests/validation/reference/Magnitude.h"
+#include "tests/validation/reference/NonMaximaSuppression.h"
+#include "tests/validation/reference/Phase.h"
+#include "tests/validation/reference/Sobel.h"
+
+#include "tests/SimpleTensorPrinter.h"
+
+#include <cmath>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+namespace
+{
+const auto MARK_ZERO  = 0u;
+const auto MARK_MAYBE = 127u;
+const auto MARK_EDGE  = 255u;
+// Single-pass hysteresis: promote MARK_MAYBE pixels touching a strong-magnitude neighbour, suppress the rest.
+template <typename U, typename T, typename F>
+void trace_edge(SimpleTensor<T> &dst, SimpleTensor<U> &grad_mag, const ValidRegion &valid_region, std::vector<bool> &visited, uint32_t upper_thresh, const F &pixel_at_offset)
+{
+    for(auto i = 0; i < dst.num_elements(); ++i)
+    {
+        Coordinates coord; // assigned inline inside the condition below
+        if(visited[i] || dst[i] != MARK_MAYBE || !is_in_valid_region(valid_region, coord = index2coord(dst.shape(), i)))
+        {
+            continue; // Skip visited or confirmed ZERO/EDGE pixels
+        }
+        visited[i] = true; // Mark as visited
+
+        // Check whether any of the 8-connected neighbours has a strong gradient magnitude (>= upper_thresh)
+        std::array<U, 8> neighbours =
+        {
+            {
+                pixel_at_offset(grad_mag, coord, -1, 0),
+                pixel_at_offset(grad_mag, coord, 1, 0),
+                pixel_at_offset(grad_mag, coord, -1, -1),
+                pixel_at_offset(grad_mag, coord, +1, +1),
+                pixel_at_offset(grad_mag, coord, 0, -1),
+                pixel_at_offset(grad_mag, coord, 0, +1),
+                pixel_at_offset(grad_mag, coord, +1, -1),
+                pixel_at_offset(grad_mag, coord, -1, +1)
+            }
+        };
+
+        const auto is_edge_connected = std::any_of(neighbours.begin(), neighbours.end(), [&](const U & pixel)
+        {
+            return pixel >= upper_thresh;
+        });
+        dst[i] = is_edge_connected ? MARK_EDGE : MARK_ZERO; // promote the weak pixel to an edge, or suppress it
+    }
+}
+
+template <typename U, typename T>
+SimpleTensor<T> canny_edge_detector_impl(const SimpleTensor<T> &src, int32_t upper, int32_t lower, int gradient_size, MagnitudeType norm_type,
+                                         BorderMode border_mode, T constant_border_value)
+{
+    ARM_COMPUTE_ERROR_ON(gradient_size != 3 && gradient_size != 5 && gradient_size != 7);
+    ARM_COMPUTE_ERROR_ON(lower < 0 || lower >= upper);
+
+    // Output: T == uint8_t; pixels end up as MARK_ZERO, MARK_MAYBE (transient) or MARK_EDGE
+    SimpleTensor<T> dst{ src.shape(), src.data_type() };
+    ValidRegion     valid_region = shape_to_valid_region(src.shape(), border_mode == BorderMode::UNDEFINED, BorderSize(gradient_size / 2 + 1));
+
+    // Sobel computation: U == int16_t or int32_t
+    SimpleTensor<U> gx, gy;
+    std::tie(gx, gy) = sobel<U>(src, gradient_size, border_mode, constant_border_value, GradientDimension::GRAD_XY);
+
+    using unsigned_U = typename traits::make_unsigned_conditional_t<U>::type;
+    using promoted_U = typename common_promoted_signed_type<U>::intermediate_type;
+
+    // Gradient magnitude and phase (edge direction)
+    const DataType           mag_data_type = gx.data_type() == DataType::S16 ? DataType::U16 : DataType::U32;
+    SimpleTensor<unsigned_U> grad_mag{ gx.shape(), mag_data_type };
+    SimpleTensor<uint8_t>    grad_dir{ gy.shape(), DataType::U8 };
+
+    for(auto i = 0; i < grad_mag.num_elements(); ++i)
+    {
+        double mag = 0.f;
+
+        if(norm_type == MagnitudeType::L2NORM)
+        {
+            mag = support::cpp11::round(std::sqrt(static_cast<promoted_U>(gx[i]) * gx[i] + static_cast<promoted_U>(gy[i]) * gy[i]));
+        }
+        else // MagnitudeType::L1NORM
+        {
+            mag = static_cast<promoted_U>(std::abs(gx[i])) + static_cast<promoted_U>(std::abs(gy[i]));
+        }
+
+        float angle = 180.f * std::atan2(static_cast<float>(gy[i]), static_cast<float>(gx[i])) / M_PI;
+        grad_dir[i] = support::cpp11::round(angle < 0.f ? 180 + angle : angle); // fold phase into [0, 180] degrees
+        grad_mag[i] = saturate_cast<unsigned_U>(mag);                           // clamp magnitude into the unsigned range
+    }
+
+    /*
+        Quantise the phase into 4 directions
+          0°  dir=0    0.0 <= p <  22.5 or 157.5 <= p < 180
+         45°  dir=1   22.5 <= p <  67.5
+         90°  dir=2   67.5 <= p < 112.5
+        135°  dir=3  112.5 <= p < 157.5
+    */
+    for(auto i = 0; i < grad_dir.num_elements(); ++i)
+    {
+        const auto direction = std::fabs(grad_dir[i]);
+        grad_dir[i]          = (direction < 22.5 || direction >= 157.5) ? 0 : (direction < 67.5) ? 1 : (direction < 112.5) ? 2 : 3;
+    }
+
+    // Non-maximum suppression: keep only pixels that are local maxima along the gradient direction
+    std::vector<int> strong_edges;
+    const auto       upper_thresh = static_cast<uint32_t>(upper);
+    const auto       lower_thresh = static_cast<uint32_t>(lower);
+
+    const auto pixel_at_offset = [&](const SimpleTensor<unsigned_U> &tensor, const Coordinates & coord, int xoffset, int yoffset)
+    {
+        return tensor_elem_at(tensor, Coordinates{ coord.x() + xoffset, coord.y() + yoffset }, border_mode, static_cast<unsigned_U>(constant_border_value));
+    };
+
+    for(auto i = 0; i < dst.num_elements(); ++i)
+    {
+        const auto coord = index2coord(dst.shape(), i);
+        if(!is_in_valid_region(valid_region, coord) || grad_mag[i] <= lower_thresh)
+        {
+            dst[i] = MARK_ZERO;
+            continue;
+        }
+
+        unsigned_U mag_90, mag90; // the two neighbouring magnitudes to compare against
+        switch(grad_dir[i])
+        {
+            case 0: // North/South edge direction, compare against East/West pixels (left & right)
+                mag_90 = pixel_at_offset(grad_mag, coord, -1, 0);
+                mag90  = pixel_at_offset(grad_mag, coord, 1, 0);
+                break;
+            case 1: // NE/SW edge direction, compare against NW/SE pixels (top-left & bottom-right)
+                mag_90 = pixel_at_offset(grad_mag, coord, -1, -1);
+                mag90  = pixel_at_offset(grad_mag, coord, +1, +1);
+                break;
+            case 2: // East/West edge direction, compare against North/South pixels (top & bottom)
+                mag_90 = pixel_at_offset(grad_mag, coord, 0, -1);
+                mag90  = pixel_at_offset(grad_mag, coord, 0, +1);
+                break;
+            case 3: // NW/SE edge direction, compare against NE/SW pixels (top-right & bottom-left)
+                mag_90 = pixel_at_offset(grad_mag, coord, +1, -1);
+                mag90  = pixel_at_offset(grad_mag, coord, -1, +1);
+                break;
+            default:
+                ARM_COMPUTE_ERROR("Invalid gradient phase provided");
+                break;
+        }
+
+        // Potential edge if greater than both pixels at +/-90° on either side
+        if(grad_mag[i] > mag_90 && grad_mag[i] > mag90)
+        {
+            // Double thresholding and edge tracing
+            if(grad_mag[i] > upper_thresh)
+            {
+                dst[i] = MARK_EDGE; // Definite edge pixel
+                strong_edges.emplace_back(i);
+            }
+            else
+            {
+                dst[i] = MARK_MAYBE;
+            }
+        }
+        else
+        {
+            dst[i] = MARK_ZERO; // Since not greater than neighbours
+        }
+    }
+
+    // Final edge tracing: resolve remaining MARK_MAYBE pixels via connectivity to strong-magnitude neighbours
+    std::vector<bool> visited(dst.num_elements(), false);
+    trace_edge<unsigned_U>(dst, grad_mag, valid_region, visited, upper_thresh, pixel_at_offset);
+    return dst;
+}
+} // namespace
+// Reference Canny edge detector: dispatches on gradient size to pick the Sobel accumulator width.
+template <typename T>
+SimpleTensor<T> canny_edge_detector(const SimpleTensor<T> &src, int32_t upper_thresh, int32_t lower_thresh, int gradient_size, MagnitudeType norm_type,
+                                    BorderMode border_mode, T constant_border_value)
+{
+    if(gradient_size < 7) // 3x3 and 5x5 use the int16_t accumulator; 7x7 needs the wider type
+    {
+        return canny_edge_detector_impl<int16_t>(src, upper_thresh, lower_thresh, gradient_size, norm_type, border_mode, constant_border_value);
+    }
+    else
+    {
+        return canny_edge_detector_impl<int32_t>(src, upper_thresh, lower_thresh, gradient_size, norm_type, border_mode, constant_border_value);
+    }
+}
+
+template SimpleTensor<uint8_t> canny_edge_detector(const SimpleTensor<uint8_t> &src, int32_t upper_thresh, int32_t lower_thresh, int gradient_size, MagnitudeType norm_type,
+                                                   BorderMode border_mode, uint8_t constant_border_value);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/CannyEdgeDetector.h b/tests/validation/reference/CannyEdgeDetector.h
new file mode 100644
index 0000000..a46c145
--- /dev/null
+++ b/tests/validation/reference/CannyEdgeDetector.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_CANNY_EDGE_DETECTOR_H__
+#define __ARM_COMPUTE_TEST_CANNY_EDGE_DETECTOR_H__
+
+#include "arm_compute/core/Types.h"
+#include "tests/SimpleTensor.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T>
+SimpleTensor<T> canny_edge_detector(const SimpleTensor<T> &src, int32_t upper_thresh, int32_t lower_thresh, int gradient_size, MagnitudeType norm_type,
+                                    BorderMode border_mode, T constant_border_value = 0);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_TEST_CANNY_EDGE_DETECTOR_H__ */