Added S64/U64 support for the input in CLCast

* Partially resolves MLCE-1089
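
A minimal usage sketch of the new path (illustrative only, not part of
this patch; the tensor shapes, the SATURATE policy and the S32
destination type are arbitrary choices):

  #include "arm_compute/core/TensorInfo.h"
  #include "arm_compute/core/TensorShape.h"
  #include "arm_compute/core/Types.h"
  #include "arm_compute/runtime/CL/CLScheduler.h"
  #include "arm_compute/runtime/CL/CLTensor.h"
  #include "arm_compute/runtime/CL/functions/CLCast.h"

  int main()
  {
      using namespace arm_compute;
      CLScheduler::get().default_init();

      // S64 is now a valid source type; the destination must still be one of
      // the narrower types listed in the CLCast documentation (S32 here).
      CLTensor src, dst;
      src.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::S64));
      dst.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::S32));

      CLCast cast;
      cast.configure(&src, &dst, ConvertPolicy::SATURATE);

      src.allocator()->allocate();
      dst.allocator()->allocate();
      // ... fill src (e.g. via src.map()/src.unmap()) ...

      cast.run();
      CLScheduler::get().sync();
      return 0;
  }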

Change-Id: Ie3d2fc2f755ae99cdb17b57cc90bb3f99a1843e0
Signed-off-by: Pablo Marquez Tello <pablo.tello@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9909
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h
index 188ae8c..6fa983d 100644
--- a/arm_compute/core/Utils.h
+++ b/arm_compute/core/Utils.h
@@ -164,6 +164,9 @@
         case DataType::S32:
         case DataType::F32:
             return 4;
+        case DataType::U64:
+        case DataType::S64:
+            return 8;
         default:
             ARM_COMPUTE_ERROR("Undefined element size for given data type");
             return 0;
@@ -712,7 +715,7 @@
  *
  * @return The pair with minimum and maximum values
  */
-std::pair<int32_t, int32_t> get_quantized_activation_min_max(const ActivationLayerInfo& act_info, DataType data_type, UniformQuantizationInfo oq_info);
+std::pair<int32_t, int32_t> get_quantized_activation_min_max(const ActivationLayerInfo &act_info, DataType data_type, UniformQuantizationInfo oq_info);
 
 /** Convert a tensor format into a string.
  *
@@ -749,7 +752,7 @@
  *
  * @return The string describing the activation function.
  */
-const std::string &string_from_activation_func(const ActivationFunction& act);
+const std::string &string_from_activation_func(const ActivationFunction &act);
 /** Translates a given interpolation policy to a string.
  *
  * @param[in] policy @ref InterpolationPolicy to be translated to string.
@@ -798,7 +801,7 @@
  *
  * @return True if padding is symmetric
  */
-inline bool is_symmetric(const Padding3D& info)
+inline bool is_symmetric(const Padding3D &info)
 {
     return ((info.left == info.right) && (info.top == info.bottom) && (info.front == info.back));
 }
diff --git a/arm_compute/runtime/CL/functions/CLCast.h b/arm_compute/runtime/CL/functions/CLCast.h
index d2cea7a..650cd11 100644
--- a/arm_compute/runtime/CL/functions/CLCast.h
+++ b/arm_compute/runtime/CL/functions/CLCast.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -61,43 +61,28 @@
      * |src            |dst                                    |
      * |:--------------|:--------------------------------------|
      * |U8             | S8, U16, S16, U32, S32, F16, F32      |
+     * |S8             | U8, U16, S16, U32, S32, F16, F32      |
      * |U16            | U8, S8, S16, U32, S32, F16, F32       |
      * |S16            | U8, S8, U16, U32, S32, F16, F32       |
      * |U32            | U8, S8, U16, S16, S32, F16, F32       |
      * |S32            | U8, S8, U16, S16, U32, F16, F32       |
-     * |F16            | U8, S8, U16, S16, U32, F32            |
-     * |F32            | U8, S8, U16, S16, U32, F16            |
+     * |U64            | U8, S8, U16, S16, U32, S32, F16, F32  |
+     * |S64            | U8, S8, U16, S16, U32, S32, F16, F32  |
+     * |F16            | U8, S8, U16, S16, S32, U32, F32       |
+     * |F32            | U8, S8, U16, S16, S32, U32, F16       |
      *
      * Input data type must be different than output data type.
      *
-     * @param[in]  input  The input tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
+     * @param[in]  input  The input tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/U64/S64/F16/F32.
      * @param[out] output The output tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
      * @param[in]  policy Conversion policy.
      */
     void configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy);
-    /** Initialize the function's source, destination
-     *
-     * Input data type must be different than output data type.
-     *
-     * Valid conversions Input -> Output :
-     *
-     *   - U8  -> S8, U16, S16, U32, S32, F16, F32
-     *   - U16 -> U8, S8, S16, U32, S32, F16, F32
-     *   - S16 -> U8, S8, U16, U32, S32, F16, F32
-     *   - U32 -> U8, S8, U16, S16, S32, F16, F32
-     *   - S32 -> U8, S8, U16, S16, U32, F16, F32
-     *   - F16 -> U8, S8, U16, S16, U32, F32
-     *   - F32 -> U8, S8, U16, S16, U32, F16
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           The input tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
-     * @param[out] output          The output tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
-     * @param[in]  policy          Conversion policy.
-     */
+    // Initialize the function's source, destination
     void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy);
     /** Static function to check if given info will lead to a valid configuration of @ref CLCast
      *
-     * @param[in] input  Source tensor info. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
+     * @param[in] input  Source tensor info. Data types supported: U8/S8/U16/S16/U32/S32/U64/S64/F16/F32.
      * @param[in] output Destination tensor info. Data type supported: U8/S8/U16/S16/U32/S32/F16/F32.
      * @param[in] policy Conversion policy.
      *
diff --git a/docs/user_guide/operator_list.dox b/docs/user_guide/operator_list.dox
index 8d34a76..66b8988 100644
--- a/docs/user_guide/operator_list.dox
+++ b/docs/user_guide/operator_list.dox
@@ -437,12 +437,15 @@
     <table>
     <tr><th>src<th>dst
     <tr><td>U8<td>S8, U16, S16, U32, S32, F16, F32
+    <tr><td>S8<td>U8, U16, S16, U32, S32, F16, F32
     <tr><td>U16<td>U8, S8, S16, U32, S32, F16, F32
     <tr><td>S16<td>U8, S8, U16, U32, S32, F16, F32
     <tr><td>U32<td>U8, S8, U16, S16, S32, F16, F32
     <tr><td>S32<td>U8, S8, U16, S16, U32, F16, F32
-    <tr><td>F16<td>U8, S8, U16, S16, U32, F32
-    <tr><td>F32<td>U8, S8, U16, S16, U32, F16
+    <tr><td>U64<td>U8, S8, U16, S16, U32, S32, F16, F32
+    <tr><td>S64<td>U8, S8, U16, S16, U32, S32, F16, F32
+    <tr><td>F16<td>U8, S8, U16, S16, S32, U32, F32
+    <tr><td>F32<td>U8, S8, U16, S16, S32, U32, F16
     </table>
 <tr>
   <td rowspan="2">ChannelShuffleLayer
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox
index ec96f60..ce96370 100644
--- a/docs/user_guide/release_version_and_change_log.dox
+++ b/docs/user_guide/release_version_and_change_log.dox
@@ -48,7 +48,7 @@
    - Add new OpenCL™ kernels:
      - @ref opencl::kernels::ClMatMulNativeMMULKernel support for FP32 and FP16, with batch support
    - Enable transposed convolution with non-square kernels on CPU and GPU.
-
+   - Added support for input data type U64/S64 in CLCast.
 v23.05.1 Public patch release
  - Enable CMake and Bazel option to build multi_isa without FP16 support.
  - Fix compilation error in NEReorderLayer (aarch64 only).
diff --git a/src/gpu/cl/kernels/ClCastKernel.cpp b/src/gpu/cl/kernels/ClCastKernel.cpp
index 6baa31e..991867d 100644
--- a/src/gpu/cl/kernels/ClCastKernel.cpp
+++ b/src/gpu/cl/kernels/ClCastKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2022 Arm Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -54,7 +54,7 @@
                                                          1,
                                                          DataType::U8, DataType::S8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S16,
                                                          DataType::U16, DataType::U32, DataType::S32, DataType::F16,
-                                                         DataType::F32);
+                                                         DataType::F32, DataType::S64, DataType::U64);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst,
                                                          1,
                                                          DataType::U8, DataType::S8, DataType::QASYMM8, DataType::S16,
diff --git a/src/gpu/cl/kernels/ClCastKernel.h b/src/gpu/cl/kernels/ClCastKernel.h
index 7fadfa7..a021b3c 100644
--- a/src/gpu/cl/kernels/ClCastKernel.h
+++ b/src/gpu/cl/kernels/ClCastKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2022 Arm Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -53,12 +53,14 @@
      *   - U16 -> U8, S8, S16, U32, S32, F16, F32
      *   - S16 -> U8, S8, U16, U32, S32, F16, F32
      *   - U32 -> U8, S8, U16, S16, S32, F16, F32
+     *   - S64 -> U8, S8, U16, S16, U32, S32, F16, F32
+     *   - U64 -> U8, S8, U16, S16, U32, S32, F16, F32
      *   - S32 -> U8, S8, U16, S16, U32, F16, F32
-     *   - F16 -> U8, S8, U16, S16, U32, F32
-     *   - F32 -> U8, S8, U16, S16, U32, F16
+     *   - F16 -> U8, S8, U16, S16, U32, S32, F32
+     *   - F32 -> U8, S8, U16, S16, U32, S32, F16
      *
      * @param[in]  compile_context The compile context to be used.
-     * @param[in]  src             The source tensor to convert. Data types supported: U8/S8/QSYMM8_PER_CHANNEL/U16/S16/U32/S32/F16/F32.
+     * @param[in]  src             The source tensor to convert. Data types supported: U8/S8/QSYMM8_PER_CHANNEL/U16/S16/U32/S32/U64/S64/F16/F32.
      * @param[out] dst             The destination tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
      * @param[in]  policy          Conversion policy
      */
diff --git a/tests/validation/CL/Cast.cpp b/tests/validation/CL/Cast.cpp
index 84455ba..3d04b80 100644
--- a/tests/validation/CL/Cast.cpp
+++ b/tests/validation/CL/Cast.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020, 2022 Arm Limited.
+ * Copyright (c) 2018-2020, 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -122,6 +122,26 @@
 const auto CastF32toU32Dataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::U32));
 const auto CastF32toS32Dataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::S32));
 const auto CastF32toF16Dataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::F16));
+
+// U64
+const auto CastU64toU8Dataset  = combine(framework::dataset::make("DataType", DataType::U64), framework::dataset::make("DataType", DataType::U8));
+const auto CastU64toS8Dataset  = combine(framework::dataset::make("DataType", DataType::U64), framework::dataset::make("DataType", DataType::S8));
+const auto CastU64toU16Dataset = combine(framework::dataset::make("DataType", DataType::U64), framework::dataset::make("DataType", DataType::U16));
+const auto CastU64toS16Dataset = combine(framework::dataset::make("DataType", DataType::U64), framework::dataset::make("DataType", DataType::S16));
+const auto CastU64toU32Dataset = combine(framework::dataset::make("DataType", DataType::U64), framework::dataset::make("DataType", DataType::U32));
+const auto CastU64toS32Dataset = combine(framework::dataset::make("DataType", DataType::U64), framework::dataset::make("DataType", DataType::S32));
+const auto CastU64toF16Dataset = combine(framework::dataset::make("DataType", DataType::U64), framework::dataset::make("DataType", DataType::F16));
+const auto CastU64toF32Dataset = combine(framework::dataset::make("DataType", DataType::U64), framework::dataset::make("DataType", DataType::F32));
+
+// S64
+const auto CastS64toU8Dataset  = combine(framework::dataset::make("DataType", DataType::S64), framework::dataset::make("DataType", DataType::U8));
+const auto CastS64toS8Dataset  = combine(framework::dataset::make("DataType", DataType::S64), framework::dataset::make("DataType", DataType::S8));
+const auto CastS64toU16Dataset = combine(framework::dataset::make("DataType", DataType::S64), framework::dataset::make("DataType", DataType::U16));
+const auto CastS64toS16Dataset = combine(framework::dataset::make("DataType", DataType::S64), framework::dataset::make("DataType", DataType::S16));
+const auto CastS64toU32Dataset = combine(framework::dataset::make("DataType", DataType::S64), framework::dataset::make("DataType", DataType::U32));
+const auto CastS64toS32Dataset = combine(framework::dataset::make("DataType", DataType::S64), framework::dataset::make("DataType", DataType::S32));
+const auto CastS64toF16Dataset = combine(framework::dataset::make("DataType", DataType::S64), framework::dataset::make("DataType", DataType::F16));
+const auto CastS64toF32Dataset = combine(framework::dataset::make("DataType", DataType::S64), framework::dataset::make("DataType", DataType::F32));
 } // namespace
 
 TEST_SUITE(CL)
@@ -227,6 +247,24 @@
 CAST_SUITE(F32_to_S32, DataType::F32, DataType::S32, CLCastToS32Fixture<float>, CastF32toS32Dataset, one_tolerance)
 CAST_SUITE(F32_to_F16, DataType::F32, DataType::F16, CLCastToF16Fixture<float>, CastF32toF16Dataset, zero_tolerance)
 
+// S64
+CAST_SUITE(S64_to_U8, DataType::S64, DataType::U8, CLCastToU8Fixture<int64_t>, CastS64toU8Dataset, one_tolerance)
+CAST_SUITE(S64_to_S8, DataType::S64, DataType::S8, CLCastToS8Fixture<int64_t>, CastS64toS8Dataset, one_tolerance)
+CAST_SUITE(S64_to_U16, DataType::S64, DataType::U16, CLCastToU16Fixture<int64_t>, CastS64toU16Dataset, one_tolerance)
+CAST_SUITE(S64_to_S16, DataType::S64, DataType::S16, CLCastToS16Fixture<int64_t>, CastS64toS16Dataset, one_tolerance)
+CAST_SUITE(S64_to_U32, DataType::S64, DataType::U32, CLCastToU32Fixture<int64_t>, CastS64toU32Dataset, one_tolerance)
+CAST_SUITE(S64_to_S32, DataType::S64, DataType::S32, CLCastToS32Fixture<int64_t>, CastS64toS32Dataset, one_tolerance)
+CAST_SUITE(S64_to_F16, DataType::S64, DataType::F16, CLCastToF16Fixture<int64_t>, CastS64toF16Dataset, zero_tolerance)
+
+// U64
+CAST_SUITE(U64_to_U8, DataType::U64, DataType::U8, CLCastToU8Fixture<uint64_t>, CastU64toU8Dataset, one_tolerance)
+CAST_SUITE(U64_to_S8, DataType::U64, DataType::S8, CLCastToS8Fixture<uint64_t>, CastU64toS8Dataset, one_tolerance)
+CAST_SUITE(U64_to_U16, DataType::U64, DataType::U16, CLCastToU16Fixture<uint64_t>, CastU64toU16Dataset, one_tolerance)
+CAST_SUITE(U64_to_S16, DataType::U64, DataType::S16, CLCastToS16Fixture<uint64_t>, CastU64toS16Dataset, one_tolerance)
+CAST_SUITE(U64_to_U32, DataType::U64, DataType::U32, CLCastToU32Fixture<uint64_t>, CastU64toU32Dataset, one_tolerance)
+CAST_SUITE(U64_to_S32, DataType::U64, DataType::S32, CLCastToS32Fixture<uint64_t>, CastU64toS32Dataset, one_tolerance)
+CAST_SUITE(U64_to_F16, DataType::U64, DataType::F16, CLCastToF16Fixture<uint64_t>, CastU64toF16Dataset, zero_tolerance)
+
 TEST_SUITE_END() // Cast
 TEST_SUITE_END() // CL
 } // namespace validation
diff --git a/tests/validation/fixtures/CastFixture.h b/tests/validation/fixtures/CastFixture.h
index 60b4c2b..e9d624e 100644
--- a/tests/validation/fixtures/CastFixture.h
+++ b/tests/validation/fixtures/CastFixture.h
@@ -85,6 +85,16 @@
                     library->fill_tensor_uniform(tensor, i, static_cast<int32_t>(signed_min), static_cast<int32_t>(signed_max));
                     break;
                 }
+                case DataType::U64:
+                {
+                    library->fill_tensor_uniform(tensor, i, static_cast<uint64_t>(unsigned_min), static_cast<uint64_t>(unsigned_max));
+                    break;
+                }
+                case DataType::S64:
+                {
+                    library->fill_tensor_uniform(tensor, i, static_cast<int64_t>(signed_min), static_cast<int64_t>(signed_max));
+                    break;
+                }
                 default:
                     ARM_COMPUTE_ERROR("NOT SUPPORTED!");
             }
diff --git a/tests/validation/reference/DepthConvertLayer.cpp b/tests/validation/reference/DepthConvertLayer.cpp
index 94c719a..8797722 100644
--- a/tests/validation/reference/DepthConvertLayer.cpp
+++ b/tests/validation/reference/DepthConvertLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -186,6 +186,23 @@
 template SimpleTensor<half> depth_convert(const SimpleTensor<float> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
 template SimpleTensor<bfloat16> depth_convert(const SimpleTensor<float> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
 
+// S64
+template SimpleTensor<uint8_t> depth_convert(const SimpleTensor<int64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<int8_t> depth_convert(const SimpleTensor<int64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<uint16_t> depth_convert(const SimpleTensor<int64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<int16_t> depth_convert(const SimpleTensor<int64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<uint32_t> depth_convert(const SimpleTensor<int64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<int32_t> depth_convert(const SimpleTensor<int64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<half> depth_convert(const SimpleTensor<int64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+
+// U64
+template SimpleTensor<uint8_t> depth_convert(const SimpleTensor<uint64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<int8_t> depth_convert(const SimpleTensor<uint64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<uint16_t> depth_convert(const SimpleTensor<uint64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<int16_t> depth_convert(const SimpleTensor<uint64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<uint32_t> depth_convert(const SimpleTensor<uint64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<int32_t> depth_convert(const SimpleTensor<uint64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<half> depth_convert(const SimpleTensor<uint64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
 } // namespace reference
 } // namespace validation
 } // namespace test