Port DepthConvert to new API

- Renames DepthConvert to Cast
- Ports both the NEON and OpenCL variants (NEDepthConvertLayer/NECast
  and CLDepthConvertLayer/CLCast)
- Removes the legacy shift capability from DepthConvert; only a shift
  of 0 is now accepted (a usage sketch follows below)
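
For reference, a minimal sketch of the ported function-level API
(hypothetical shapes and data types; the configure/run calls match the
new code in this patch):

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NECast.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        // Hypothetical 16x16 tensors: convert U8 -> F32.
        Tensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::U8));
        dst.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
        src.allocator()->allocate();
        dst.allocator()->allocate();

        NECast cast;
        cast.configure(&src, &dst, ConvertPolicy::SATURATE);
        cast.run(); // forwards to cpu::CpuCast through an ITensorPack internally

        return 0;
    }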

Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: I806a0f8eb23d23502b632c529fda7edde19c8176
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5565
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/runtime/CL/functions/CLCast.cpp b/src/runtime/CL/functions/CLCast.cpp
index 202140d..53256eb 100644
--- a/src/runtime/CL/functions/CLCast.cpp
+++ b/src/runtime/CL/functions/CLCast.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,12 +23,31 @@
  */
 #include "arm_compute/runtime/CL/functions/CLCast.h"
 
-#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Validate.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/runtime/gpu/cl/operators/ClCast.h"
 
 #include <utility>
 
 namespace arm_compute
 {
+struct CLCast::Impl
+{
+    const ICLTensor                *src{ nullptr };
+    ICLTensor                      *dst{ nullptr };
+    std::unique_ptr<opencl::ClCast> op{ nullptr };
+};
+
+CLCast::CLCast()
+    : _impl(std::make_unique<Impl>())
+{
+}
+CLCast::CLCast(CLCast &&) = default;
+CLCast &CLCast::operator=(CLCast &&) = default;
+CLCast::~CLCast()                    = default;
+
 void CLCast::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output, policy);
@@ -36,13 +55,23 @@
 
 void CLCast::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy)
 {
-    auto k = std::make_unique<CLDepthConvertLayerKernel>();
-    k->configure(compile_context, input, output, policy, 0);
-    _kernel = std::move(k);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    _impl->src = input;
+    _impl->dst = output;
+
+    _impl->op = std::make_unique<opencl::ClCast>();
+    _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), policy);
 }
 
 Status CLCast::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy)
 {
-    return CLDepthConvertLayerKernel::validate(input, output, policy, 0);
+    return opencl::ClCast::validate(input, output, policy);
+}
+
+void CLCast::run()
+{
+    ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } };
+    _impl->op->run(pack);
 }
 } // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLDepthConvertLayer.cpp b/src/runtime/CL/functions/CLDepthConvertLayer.cpp
index 47bc523..6aa370b 100644
--- a/src/runtime/CL/functions/CLDepthConvertLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthConvertLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,12 +23,31 @@
  */
 #include "arm_compute/runtime/CL/functions/CLDepthConvertLayer.h"
 
-#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Validate.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/runtime/gpu/cl/operators/ClCast.h"
 
 #include <utility>
 
 namespace arm_compute
 {
+struct CLDepthConvertLayer::Impl
+{
+    const ICLTensor                *src{ nullptr };
+    ICLTensor                      *dst{ nullptr };
+    std::unique_ptr<opencl::ClCast> op{ nullptr };
+};
+
+CLDepthConvertLayer::CLDepthConvertLayer()
+    : _impl(std::make_unique<Impl>())
+{
+}
+CLDepthConvertLayer::CLDepthConvertLayer(CLDepthConvertLayer &&) = default;
+CLDepthConvertLayer &CLDepthConvertLayer::operator=(CLDepthConvertLayer &&) = default;
+CLDepthConvertLayer::~CLDepthConvertLayer()                                 = default;
+
 void CLDepthConvertLayer::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output, policy, shift);
@@ -36,13 +55,27 @@
 
 void CLDepthConvertLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
 {
-    auto k = std::make_unique<CLDepthConvertLayerKernel>();
-    k->configure(compile_context, input, output, policy, shift);
-    _kernel = std::move(k);
+    ARM_COMPUTE_UNUSED(shift);
+
+    _impl->src = input;
+    _impl->dst = output;
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->src, _impl->dst);
+    ARM_COMPUTE_ERROR_ON(shift != 0);
+
+    _impl->op = std::make_unique<opencl::ClCast>();
+    _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), policy);
 }
 
 Status CLDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
 {
-    return CLDepthConvertLayerKernel::validate(input, output, policy, shift);
+    ARM_COMPUTE_RETURN_ERROR_ON(shift != 0);
+    return opencl::ClCast::validate(input, output, policy);
+}
+
+void CLDepthConvertLayer::run()
+{
+    ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } };
+    _impl->op->run(pack);
 }
 } // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 991472b..50a145f 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -28,7 +28,6 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
 #include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
 #include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index 5dc7556..3184d5d 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -31,7 +31,6 @@
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "src/core/CL/kernels/CLCol2ImKernel.h"
-#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h"
 #include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
 #include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
 #include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
diff --git a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
index 7a01018..d5d1b5f 100644
--- a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
@@ -29,7 +29,6 @@
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h"
-#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
 #include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
 #include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 099a2c9..3be0958 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -34,12 +34,12 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h"
 #include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
 #include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
 #include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
 #include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
 #include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h"
+#include "src/core/gpu/cl/kernels/ClCastKernel.h"
 #include "src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h"
@@ -189,7 +189,7 @@
 
 CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)),
-      _weights_to_qasymm8(std::make_unique<CLDepthConvertLayerKernel>()),
+      _weights_to_qasymm8(std::make_unique<opencl::kernels::ClCastKernel>()),
       _mm_native_kernel(std::make_unique<CLGEMMLowpMatrixMultiplyNativeKernel>()),
       _mm_reshaped_only_rhs_kernel(std::make_unique<CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel>()),
       _mtx_b_reshape_kernel(std::make_unique<opencl::kernels::ClGemmReshapeRhsMatrixKernel>()),
@@ -272,7 +272,7 @@
         TensorInfo weights_info(*b->info());
         weights_info.set_data_type(DataType::QASYMM8);
         _qasymm8_weights.allocator()->init(weights_info);
-        _weights_to_qasymm8->configure(compile_context, b, &_qasymm8_weights, ConvertPolicy::WRAP, 0);
+        _weights_to_qasymm8->configure(compile_context, b->info(), _qasymm8_weights.info(), ConvertPolicy::WRAP);
     }
 
     const ICLTensor *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : b;
@@ -480,7 +480,7 @@
     {
         b_offset = -128;
         weights_info.set_data_type(DataType::QASYMM8);
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthConvertLayerKernel::validate(b, &weights_info, ConvertPolicy::WRAP, 0));
+        ARM_COMPUTE_RETURN_ON_ERROR(opencl::kernels::ClCastKernel::validate(b, &weights_info, ConvertPolicy::WRAP));
     }
     const ITensorInfo *matrix_b_info = &weights_info;
     if(reshape_matrix_b)
@@ -681,7 +681,8 @@
         if(_convert_to_qasymm8)
         {
             _qasymm8_weights.allocator()->allocate();
-            CLScheduler::get().enqueue(*_weights_to_qasymm8, false);
+            ITensorPack convert_to_qs8_pack = { { ACL_SRC, _original_b }, { ACL_DST, &_qasymm8_weights } };
+            CLScheduler::get().enqueue_op(*_weights_to_qasymm8, convert_to_qs8_pack, false);
         }
 
         if(_is_gemm_reshaped && _reshape_b_only_on_first_run)
diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp
index 146ac8f..85d13c2 100644
--- a/src/runtime/CL/functions/CLLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayer.cpp
@@ -29,7 +29,6 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
 #include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
 #include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
diff --git a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
index 6997442..a44dcd2 100644
--- a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
@@ -27,7 +27,6 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
 #include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
 #include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
diff --git a/src/runtime/CL/functions/CLQLSTMLayer.cpp b/src/runtime/CL/functions/CLQLSTMLayer.cpp
index 7b6ec8f..fcf5b9d 100644
--- a/src/runtime/CL/functions/CLQLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLQLSTMLayer.cpp
@@ -30,7 +30,6 @@
 #include "arm_compute/core/utils/misc/InfoHelpers.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
 #include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
 #include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp
index 45ced35..755fa40 100644
--- a/src/runtime/CL/functions/CLRNNLayer.cpp
+++ b/src/runtime/CL/functions/CLRNNLayer.cpp
@@ -28,7 +28,6 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
 #include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
 #include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
diff --git a/src/runtime/NEON/functions/NECast.cpp b/src/runtime/NEON/functions/NECast.cpp
index a42f512..b519576 100644
--- a/src/runtime/NEON/functions/NECast.cpp
+++ b/src/runtime/NEON/functions/NECast.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,23 +23,45 @@
  */
 #include "arm_compute/runtime/NEON/functions/NECast.h"
 
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "src/core/NEON/kernels/NEDepthConvertLayerKernel.h"
-
-#include <utility>
+#include "arm_compute/core/Validate.h"
+#include "src/runtime/cpu/operators/CpuCast.h"
 
 namespace arm_compute
 {
+struct NECast::Impl
+{
+    const ITensor                *src{ nullptr };
+    ITensor                      *dst{ nullptr };
+    std::unique_ptr<cpu::CpuCast> op{ nullptr };
+};
+
+NECast::NECast()
+    : _impl(std::make_unique<Impl>())
+{
+}
+NECast::NECast(NECast &&) = default;
+NECast &NECast::operator=(NECast &&) = default;
+NECast::~NECast()                    = default;
+
 void NECast::configure(ITensor *input, ITensor *output, ConvertPolicy policy)
 {
-    auto k = std::make_unique<NEDepthConvertLayerKernel>();
-    k->configure(input, output, policy, 0);
-    _kernel = std::move(k);
+    _impl->src = input;
+    _impl->dst = output;
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->src, _impl->dst);
+
+    _impl->op = std::make_unique<cpu::CpuCast>();
+    _impl->op->configure(_impl->src->info(), _impl->dst->info(), policy);
 }
 
 Status NECast::validate(ITensorInfo *input, ITensorInfo *output, ConvertPolicy policy)
 {
-    return NEDepthConvertLayerKernel::validate(input, output, policy, 0);
+    return cpu::CpuCast::validate(input, output, policy);
+}
+
+void NECast::run()
+{
+    ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } };
+    _impl->op->run(pack);
 }
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
index 761de8e..07e985c 100644
--- a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,20 +23,51 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h"
 
-#include "src/core/NEON/kernels/NEDepthConvertLayerKernel.h"
+#include "arm_compute/core/Validate.h"
+#include "src/runtime/cpu/operators/CpuCast.h"
 
 #include <utility>
 
-using namespace arm_compute;
+namespace arm_compute
+{
+struct NEDepthConvertLayer::Impl
+{
+    const ITensor                *src{ nullptr };
+    ITensor                      *dst{ nullptr };
+    std::unique_ptr<cpu::CpuCast> op{ nullptr };
+};
+
+NEDepthConvertLayer::NEDepthConvertLayer()
+    : _impl(std::make_unique<Impl>())
+{
+}
+NEDepthConvertLayer::NEDepthConvertLayer(NEDepthConvertLayer &&) = default;
+NEDepthConvertLayer &NEDepthConvertLayer::operator=(NEDepthConvertLayer &&) = default;
+NEDepthConvertLayer::~NEDepthConvertLayer()                                 = default;
 
 void NEDepthConvertLayer::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
 {
-    auto k = std::make_unique<NEDepthConvertLayerKernel>();
-    k->configure(input, output, policy, shift);
-    _kernel = std::move(k);
+    ARM_COMPUTE_UNUSED(shift);
+
+    _impl->src = input;
+    _impl->dst = output;
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->src, _impl->dst);
+    ARM_COMPUTE_ERROR_ON(shift != 0);
+
+    _impl->op = std::make_unique<cpu::CpuCast>();
+    _impl->op->configure(_impl->src->info(), _impl->dst->info(), policy);
 }
 
 Status NEDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
 {
-    return NEDepthConvertLayerKernel::validate(input, output, policy, shift);
+    ARM_COMPUTE_RETURN_ERROR_ON(shift != 0);
+    return cpu::CpuCast::validate(input, output, policy);
 }
+
+void NEDepthConvertLayer::run()
+{
+    ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } };
+    _impl->op->run(pack);
+}
+} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuCast.cpp b/src/runtime/cpu/operators/CpuCast.cpp
new file mode 100644
index 0000000..5a4f6c5
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuCast.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/cpu/operators/CpuCast.h"
+
+#include "src/core/cpu/kernels/CpuCastKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void CpuCast::configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy)
+{
+    auto k = std::make_unique<kernels::CpuCastKernel>();
+    k->configure(src, dst, policy);
+    _kernel = std::move(k);
+}
+
+Status CpuCast::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
+{
+    return kernels::CpuCastKernel::validate(src, dst, policy);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuCast.h b/src/runtime/cpu/operators/CpuCast.h
new file mode 100644
index 0000000..2aea2d2
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuCast.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_CAST_H
+#define ARM_COMPUTE_CPU_CAST_H
+
+#include "src/runtime/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuCastKernel */
+class CpuCast : public ICpuOperator
+{
+public:
+    /** Constructor */
+    CpuCast() = default;
+    /** Configure operator for a given list of arguments
+     *
+     * Input data type must be different from the output data type.
+     *
+     * Valid data layouts:
+     * - All
+     *
+     * Valid data type configurations:
+     * |src            |dst                                             |
+     * |:--------------|:-----------------------------------------------|
+     * |QASYMM8_SIGNED | S16, S32, F32, F16                             |
+     * |QASYMM8        | U16, S16, S32, F32, F16                        |
+     * |U8             | U16, S16, S32, F32, F16                        |
+     * |U16            | U8, U32                                        |
+     * |S16            | QASYMM8_SIGNED, U8, S32                        |
+     * |F16            | QASYMM8_SIGNED, QASYMM8, F32, S32, U8          |
+     * |S32            | QASYMM8_SIGNED, QASYMM8, F16, F32, U8          |
+     * |F32            | QASYMM8_SIGNED, QASYMM8, BFLOAT16, F16, S32, U8|
+     *
+     * @param[in]  src    The source tensor to convert. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/F16/S32/F32.
+     * @param[out] dst    The destination tensor. Data types supported: U8/U16/S16/U32/S32/BFLOAT16/F16/F32/QASYMM8/QASYMM8_SIGNED.
+     * @param[in]  policy Conversion policy.
+     */
+    void configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref CpuCast::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy);
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_CAST_H */
diff --git a/src/runtime/gpu/cl/operators/ClCast.cpp b/src/runtime/gpu/cl/operators/ClCast.cpp
new file mode 100644
index 0000000..3f54004
--- /dev/null
+++ b/src/runtime/gpu/cl/operators/ClCast.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/gpu/cl/operators/ClCast.h"
+
+#include "src/core/gpu/cl/ClCompileContext.h"
+#include "src/core/gpu/cl/kernels/ClCastKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClCast::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy)
+{
+    auto k = std::make_unique<kernels::ClCastKernel>();
+    k->configure(compile_context, src, dst, policy);
+    _kernel = std::move(k);
+}
+
+Status ClCast::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
+{
+    return kernels::ClCastKernel::validate(src, dst, policy);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClCast.h b/src/runtime/gpu/cl/operators/ClCast.h
new file mode 100644
index 0000000..69e028d
--- /dev/null
+++ b/src/runtime/gpu/cl/operators/ClCast.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_CAST_H
+#define ARM_COMPUTE_CL_CAST_H
+
+#include "src/core/gpu/cl/ClCompileContext.h"
+#include "src/runtime/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClCastKernel */
+class ClCast : public IClOperator
+{
+public:
+    /** Constructor */
+    ClCast() = default;
+    /** Configure operator for a given list of arguments
+     *
+     * @note Input data type must be different from the output data type.
+     *
+     * Valid data layouts:
+     * - All
+     *
+     * Valid data type configurations:
+     * |src            |dst                                    |
+     * |:--------------|:--------------------------------------|
+     * |U8             | S8, U16, S16, U32, S32, F16, F32      |
+     * |U16            | U8, S8, S16, U32, S32, F16, F32       |
+     * |S16            | U8, S8, U16, U32, S32, F16, F32       |
+     * |U32            | U8, S8, U16, S16, S32, F16, F32       |
+     * |S32            | U8, S8, U16, S16, U32, F16, F32       |
+     * |F16            | U8, S8, U16, S16, U32, F32            |
+     * |F32            | U8, S8, U16, S16, U32, F16            |
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             The source tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
+     * @param[out] dst             The destination tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
+     * @param[in]  policy          Conversion policy.
+     */
+    void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClCast::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_CAST_H */
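
For completeness, the run-time functions above are now thin wrappers over
stateless operators. Below is a sketch of driving the internal cpu::CpuCast
operator directly, using the configure-on-ITensorInfo / run-with-ITensorPack
pattern shown in this patch (internal headers, not part of the public API
and subject to change; shapes and data types are hypothetical):

    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "src/runtime/cpu/operators/CpuCast.h"

    using namespace arm_compute;

    int main()
    {
        Tensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::S16));
        dst.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::S32));
        src.allocator()->allocate();
        dst.allocator()->allocate();

        // Configure against tensor metadata only; the operator holds no tensor state.
        cpu::CpuCast op;
        op.configure(src.info(), dst.info(), ConvertPolicy::SATURATE);

        // Concrete tensors are bound at run time through an ITensorPack.
        ITensorPack pack = { { ACL_SRC, &src }, { ACL_DST, &dst } };
        op.run(pack);

        return 0;
    }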