Reference model changes for fp16 support

Add FP16 as a supported data type across the reference model ops
(activation, comparison, elementwise, data layout, reduction,
scatter/gather, image and tensor ops). Introduce an explicit
accumulator data type template parameter, with matching
DEF_INSTANTIATE_*_ONE_ACCUM and DEF_FACTORY_*_ONE_ACCUM macros and an
ACCUM_FROM_ATTRIBUTE helper, so that AVG_POOL2D, CONV2D, CONV3D,
DEPTHWISE_CONV2D, FULLY_CONNECTED, MATMUL and TRANSPOSE_CONV2D can
accumulate in either FP16 or FP32. FP16 tensors are held at float
precision internally; the FP16 accumulator path uses half_float::half
from the Half library, which is added as a build dependency (HALF_DIR).

Change-Id: I72f21fcfa153046274969d327313e3349981dbe6
Signed-off-by: James Ward <james.ward@arm.com>
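
For context, a minimal standalone sketch of the accumulator type selection
this change introduces (illustrative only, not part of the patch; the DType
enumerators below are simplified stand-ins, and half.hpp from the Half
library is assumed to be on the include path as wired up in CMake below):

    // Sketch of the GetEigenType / GetAccEigenType pattern used in
    // reference_model/src/ops/template_types.h after this change.
    #include <cstdint>
    #include <type_traits>
    #include "half.hpp" // provides half_float::half (Half library, HALF_DIR)

    enum DType { DType_FLOAT, DType_FP16, DType_INT32 };

    // Storage type used by the reference model for each DType.
    template <DType Dtype> struct GetEigenType;
    template <> struct GetEigenType<DType_FLOAT> { using type = float; };
    template <> struct GetEigenType<DType_FP16>  { using type = float; }; // fp16 kept at full precision
    template <> struct GetEigenType<DType_INT32> { using type = int32_t; };

    // Accumulator type: identical to the storage type except for FP16,
    // which accumulates in half_float::half so intermediates round to fp16.
    template <DType Dtype> struct GetAccEigenType { using type = typename GetEigenType<Dtype>::type; };
    template <> struct GetAccEigenType<DType_FP16> { using type = half_float::half; };

    static_assert(std::is_same<GetAccEigenType<DType_FLOAT>::type, float>::value, "");
    static_assert(std::is_same<GetAccEigenType<DType_FP16>::type, half_float::half>::value, "");
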
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d3281c6..6556b27 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -12,6 +12,7 @@
 option(SERIALIZATION_DIR "Location where the TOSA Serialization Library 'include' folder is found" Off)
 option(FLATBUFFERS_DIR "Location where the FlatBuffers 'include' and 'lib' folders is found" Off)
 option(EIGEN_DIR "Location where the Eigen folder is found" Off)
+option(HALF_DIR "Location where the Half folder is found" Off)
 option(DOCTEST_DIR "Location where the doctest folder is found (If building unit tests)" Off)
 
 add_subdirectory(thirdparty)
diff --git a/reference_model/CMakeLists.txt b/reference_model/CMakeLists.txt
index a790968..04b0db5 100644
--- a/reference_model/CMakeLists.txt
+++ b/reference_model/CMakeLists.txt
@@ -43,7 +43,7 @@
   set(SERIALIZATION_DIR "../thirdparty/serialization_lib/")
 endif()
 
-# If Flatbuffers or Eigen path isn't specified, set to thirdparty directory.
+# If the Flatbuffers, Eigen, or Half path isn't specified, set it to the thirdparty directory.
 if(NOT FLATBUFFERS_DIR)
   set(FLATBUFFERS_DIR "../thirdparty/serialization_lib/third_party/flatbuffers/")
 endif()
@@ -52,6 +52,10 @@
   set(EIGEN_DIR "../thirdparty/eigen/")
 endif()
 
+if(NOT HALF_DIR)
+  set(HALF_DIR "../thirdparty/serialization_lib/third_party/half")
+endif()
+
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
 
 # Common sources required for TOSA Reference Model library, executable and unit tests
@@ -92,6 +96,7 @@
     ${EIGEN_DIR}
     ${EIGEN_DIR}/unsupported/
     ${SERIALIZATION_DIR}/include
+    ${HALF_DIR}/include
 )
 
 target_link_libraries(tosa_reference_model_lib
@@ -130,6 +135,7 @@
       ${EIGEN_DIR}
       ${EIGEN_DIR}/unsupported/
       ${SERIALIZATION_DIR}/include
+      ${HALF_DIR}/include
   )
 
   target_link_libraries(tosa_reference_model
@@ -171,6 +177,7 @@
       ${EIGEN_DIR}
       ${EIGEN_DIR}/unsupported/
       ${SERIALIZATION_DIR}/include
+      ${HALF_DIR}/include
       ${DOCTEST_DIR}
     )
 
@@ -203,6 +210,7 @@
       ${EIGEN_DIR}
       ${EIGEN_DIR}/unsupported/
       ${SERIALIZATION_DIR}/include
+      ${HALF_DIR}/include
   )
 
   target_link_libraries(model_runner_sample
diff --git a/reference_model/src/graph_node.h b/reference_model/src/graph_node.h
index adf81b2..874f1d8 100644
--- a/reference_model/src/graph_node.h
+++ b/reference_model/src/graph_node.h
@@ -1,5 +1,5 @@
 
-// Copyright (c) 2020, ARM Limited.
+// Copyright (c) 2020-2022, ARM Limited.
 //
 //    Licensed under the Apache License, Version 2.0 (the "License");
 //    you may not use this file except in compliance with the License.
@@ -24,6 +24,9 @@
 
 #define DEF_INSTANTIATE_ONE_RANK_ONE_TYPE(OP, RANK, DTYPE) template class TosaReference::OP<RANK, DType_##DTYPE>;
 
+#define DEF_INSTANTIATE_ONE_RANK_ONE_TYPE_ONE_ACCUM(OP, RANK, DTYPE, ACCUM_DTYPE)                                      \
+    template class TosaReference::OP<RANK, DType_##DTYPE, DType_##ACCUM_DTYPE>;
+
 #define DEF_INSTANTIATE_ONE_RANK_TWO_TYPE(OP, RANK, DTYPE1, DTYPE2)                                                    \
     template class TosaReference::OP<RANK, DType_##DTYPE1, DType_##DTYPE2>;
 
@@ -35,8 +38,14 @@
 
 #define DEF_INSTANTIATE_ONE_TYPE(OP, DTYPE) template class TosaReference::OP<DType_##DTYPE>;
 
+#define DEF_INSTANTIATE_ONE_TYPE_ONE_ACCUM(OP, DTYPE, ACCUM_DTYPE)                                                     \
+    template class TosaReference::OP<DType_##DTYPE, DType_##ACCUM_DTYPE>;
+
 #define DEF_INSTANTIATE_TWO_TYPE(OP, DTYPE1, DTYPE2) template class TosaReference::OP<DType_##DTYPE1, DType_##DTYPE2>;
 
+#define DEF_INSTANTIATE_TWO_TYPE_ONE_ACCUM(OP, DTYPE1, DTYPE2, ACCUM_DTYPE)                                            \
+    template class TosaReference::OP<DType_##DTYPE1, DType_##DTYPE2, DType_##ACCUM_DTYPE>;
+
 #define DEF_INSTANTIATE_THREE_TYPE(OP, DTYPE1, DTYPE2, OP_TYPE)                                                        \
     template class TosaReference::OP<DType_##DTYPE1, DType_##DTYPE2, OP_TYPE>;
 
@@ -57,6 +66,14 @@
     DEF_INSTANTIATE_ONE_RANK_ONE_TYPE(OP, 5, DTYPE)                                                                    \
     DEF_INSTANTIATE_ONE_RANK_ONE_TYPE(OP, 6, DTYPE)
 
+#define DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE_ONE_ACCUM(OP, DTYPE, ACCUM_DTYPE)                                    \
+    DEF_INSTANTIATE_ONE_RANK_ONE_TYPE_ONE_ACCUM(OP, 1, DTYPE, ACCUM_DTYPE)                                             \
+    DEF_INSTANTIATE_ONE_RANK_ONE_TYPE_ONE_ACCUM(OP, 2, DTYPE, ACCUM_DTYPE)                                             \
+    DEF_INSTANTIATE_ONE_RANK_ONE_TYPE_ONE_ACCUM(OP, 3, DTYPE, ACCUM_DTYPE)                                             \
+    DEF_INSTANTIATE_ONE_RANK_ONE_TYPE_ONE_ACCUM(OP, 4, DTYPE, ACCUM_DTYPE)                                             \
+    DEF_INSTANTIATE_ONE_RANK_ONE_TYPE_ONE_ACCUM(OP, 5, DTYPE, ACCUM_DTYPE)                                             \
+    DEF_INSTANTIATE_ONE_RANK_ONE_TYPE_ONE_ACCUM(OP, 6, DTYPE, ACCUM_DTYPE)
+
 #define DEF_INSTANTIATE_RANK0_6_ONE_RANK_TWO_TYPE(OP, DTYPE1, DTYPE2)                                                  \
     DEF_INSTANTIATE_ONE_RANK_TWO_TYPE(OP, 0, DTYPE1, DTYPE2)                                                           \
     DEF_INSTANTIATE_ONE_RANK_TWO_TYPE(OP, 1, DTYPE1, DTYPE2)                                                           \
diff --git a/reference_model/src/ops/activation_funcs.cc b/reference_model/src/ops/activation_funcs.cc
index c344bcb..1c0c23a 100644
--- a/reference_model/src/ops/activation_funcs.cc
+++ b/reference_model/src/ops/activation_funcs.cc
@@ -1,5 +1,5 @@
 
-// Copyright (c) 2020-2021, ARM Limited.
+// Copyright (c) 2020-2022, ARM Limited.
 //
 //    Licensed under the Apache License, Version 2.0 (the "License");
 //    you may not use this file except in compliance with the License.
@@ -27,6 +27,7 @@
 {
     switch (Dtype)
     {
+        case DType_FP16:
         case DType_FLOAT:
         {
             InEigenType min = (InEigenType)attribute->min_fp();
@@ -57,6 +58,7 @@
 {
     switch (Dtype)
     {
+        case DType_FP16:
         case DType_FLOAT:
             this->fcn = [](InEigenType a) -> OutEigenType { return (1.0 / (1.0 + (expf(-1.0 * a)))); };
             break;
@@ -72,6 +74,7 @@
 {
     switch (Dtype)
     {
+        case DType_FP16:
         case DType_FLOAT:
             this->fcn = [](InEigenType a) -> OutEigenType { return tanhf(a); };
             break;
@@ -83,10 +86,13 @@
 }
 
 // template explicit instantiation
+DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpClamp, FP16);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpClamp, FLOAT);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpClamp, INT8);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpClamp, INT16);
 
+DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpSigmoid, FP16);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpSigmoid, FLOAT);
 
+DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpTanh, FP16);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpTanh, FLOAT);
diff --git a/reference_model/src/ops/comparison.cc b/reference_model/src/ops/comparison.cc
index ab89e24..5930c1a 100644
--- a/reference_model/src/ops/comparison.cc
+++ b/reference_model/src/ops/comparison.cc
@@ -1,5 +1,5 @@
 
-// Copyright (c) 2020, ARM Limited.
+// Copyright (c) 2020-2022, ARM Limited.
 //
 //    Licensed under the Apache License, Version 2.0 (the "License");
 //    you may not use this file except in compliance with the License.
@@ -27,6 +27,7 @@
 {
     switch (Dtype)
     {
+        case DType_FP16:
         case DType_FLOAT:
         case DType_INT32:
             this->fcn = [](InEigenType a, InEigenType b) -> OutEigenType { return a == b; };
@@ -43,6 +44,7 @@
 {
     switch (Dtype)
     {
+        case DType_FP16:
         case DType_FLOAT:
         case DType_INT32:
             this->fcn = [](InEigenType a, InEigenType b) -> OutEigenType { return a > b; };
@@ -59,6 +61,7 @@
 {
     switch (Dtype)
     {
+        case DType_FP16:
         case DType_FLOAT:
         case DType_INT32:
             this->fcn = [](InEigenType a, InEigenType b) -> OutEigenType { return a >= b; };
@@ -71,11 +74,14 @@
 }
 
 // template explicit instantiation
+DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpEqual, FP16);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpEqual, FLOAT);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpEqual, INT32);
 
+DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpGreater, FP16);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpGreater, FLOAT);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpGreater, INT32);
 
+DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpGreaterEqual, FP16);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpGreaterEqual, FLOAT);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpGreaterEqual, INT32);
diff --git a/reference_model/src/ops/data_layout.cc b/reference_model/src/ops/data_layout.cc
index 7840450..1ed0be2 100644
--- a/reference_model/src/ops/data_layout.cc
+++ b/reference_model/src/ops/data_layout.cc
@@ -1,5 +1,5 @@
 
-// Copyright (c) 2020-2021, ARM Limited.
+// Copyright (c) 2020-2022, ARM Limited.
 //
 //    Licensed under the Apache License, Version 2.0 (the "License");
 //    you may not use this file except in compliance with the License.
@@ -190,6 +190,7 @@
         case DType_INT32:
             pad_value = (InEigenType)attribute->pad_const_int();
             break;
+        case DType_FP16:
         case DType_FLOAT:
             pad_value = (InEigenType)attribute->pad_const_fp();
             break;
@@ -637,42 +638,49 @@
 }
 
 // template explicit instantiation
+DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpConcat, FP16)
 DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpConcat, FLOAT)
 DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpConcat, INT8)
 DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpConcat, INT16)
 DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpConcat, INT32)
 DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpConcat, BOOL)
 
+DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpPad, FP16);
 DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpPad, FLOAT);
 DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpPad, INT8);
 DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpPad, INT16);
 DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpPad, INT32);
 DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpPad, BOOL);
 
+DEF_INSTANTIATE_RESHAPE(OpReshape, FP16);
 DEF_INSTANTIATE_RESHAPE(OpReshape, FLOAT);
 DEF_INSTANTIATE_RESHAPE(OpReshape, INT8);
 DEF_INSTANTIATE_RESHAPE(OpReshape, INT16);
 DEF_INSTANTIATE_RESHAPE(OpReshape, INT32);
 DEF_INSTANTIATE_RESHAPE(OpReshape, BOOL);
 
+DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpReverse, FP16);
 DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpReverse, FLOAT);
 DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpReverse, INT8);
 DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpReverse, INT16);
 DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpReverse, INT32);
 DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpReverse, BOOL);
 
+DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpSlice, FP16);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpSlice, FLOAT);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpSlice, INT8);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpSlice, INT16);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpSlice, INT32);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpSlice, BOOL);
 
+DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpTile, FP16);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpTile, FLOAT);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpTile, INT8);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpTile, INT16);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpTile, INT32);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpTile, BOOL);
 
+DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpTranspose, FP16);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpTranspose, FLOAT);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpTranspose, INT8);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpTranspose, INT16);
diff --git a/reference_model/src/ops/data_nodes.cc b/reference_model/src/ops/data_nodes.cc
index 30c9511..4ff08be 100644
--- a/reference_model/src/ops/data_nodes.cc
+++ b/reference_model/src/ops/data_nodes.cc
@@ -1,5 +1,5 @@
 
-// Copyright (c) 2020-2021, ARM Limited.
+// Copyright (c) 2020-2022, ARM Limited.
 //
 //    Licensed under the Apache License, Version 2.0 (the "License");
 //    you may not use this file except in compliance with the License.
@@ -89,6 +89,7 @@
 // template explicit instantiation
 // note OpConst is not templated
 
+DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpIdentity, FP16);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpIdentity, FLOAT);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpIdentity, INT8);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpIdentity, INT16);
diff --git a/reference_model/src/ops/ewise_binary.cc b/reference_model/src/ops/ewise_binary.cc
index a5e1a20..917d56e 100644
--- a/reference_model/src/ops/ewise_binary.cc
+++ b/reference_model/src/ops/ewise_binary.cc
@@ -1,5 +1,5 @@
 
-// Copyright (c) 2020-2021, ARM Limited.
+// Copyright (c) 2020-2022, ARM Limited.
 //
 //    Licensed under the Apache License, Version 2.0 (the "License");
 //    you may not use this file except in compliance with the License.
@@ -142,6 +142,7 @@
                 return static_cast<InEigenType>(res_in_64);
             };
             break;
+        case DType_FP16:
         case DType_FLOAT:
             this->fcn = [](InEigenType a, InEigenType b) -> OutEigenType { return a + b; };
             break;
@@ -369,6 +370,7 @@
 {
     switch (Dtype)
     {
+        case DType_FP16:
         case DType_FLOAT:
         case DType_INT32:
             this->fcn = [](InEigenType a, InEigenType b) -> OutEigenType { return a > b ? a : b; };
@@ -385,6 +387,7 @@
 {
     switch (Dtype)
     {
+        case DType_FP16:
         case DType_FLOAT:
         case DType_INT32:
             this->fcn = [](InEigenType a, InEigenType b) -> OutEigenType { return a < b ? a : b; };
@@ -403,6 +406,7 @@
 
     switch (InDtype)
     {
+        case DType_FP16:
         case DType_FLOAT:
             this->fcn = [shift](InEigenType a, InEigenType b) -> OutEigenType { return a * b; };
             break;
@@ -452,6 +456,7 @@
 {
     switch (Dtype)
     {
+        case DType_FP16:
         case DType_FLOAT:
             this->fcn = [](InEigenType a, InEigenType b) -> OutEigenType { return powf(a, b); };
             break;
@@ -476,6 +481,7 @@
                 return static_cast<InEigenType>(res_in_64);
             };
             break;
+        case DType_FP16:
         case DType_FLOAT:
             this->fcn = [](InEigenType a, InEigenType b) -> OutEigenType { return a - b; };
             break;
@@ -574,6 +580,7 @@
 }
 
 // template explicit instantiation
+DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpAdd, FP16);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpAdd, FLOAT);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpAdd, INT32);
 
@@ -609,24 +616,32 @@
 
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpLogicalXor, BOOL);
 
+DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpMaximum, FP16);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpMaximum, FLOAT);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpMaximum, INT32);
 
+DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpMinimum, FP16);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpMinimum, FLOAT);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpMinimum, INT32);
 
+DEF_INSTANTIATE_RANK0_6_ONE_RANK_TWO_TYPE(OpMul, FP16, FP16);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_TWO_TYPE(OpMul, FLOAT, FLOAT);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_TWO_TYPE(OpMul, INT8, INT32);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_TWO_TYPE(OpMul, INT16, INT32);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_TWO_TYPE(OpMul, INT32, INT32);
 
+DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpPow, FP16);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpPow, FLOAT);
 
+DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpSub, FP16);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpSub, FLOAT);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpSub, INT32);
 
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpTable, INT8);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpTable, INT16);
 
+// Instantiation of nodes for the comparison operators OpEqual, OpGreater
+// and OpGreaterEqual
+DEF_INSTANTIATE_RANK0_6_ONE_RANK_TWO_TYPE(BinaryNode, FP16, BOOL);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_TWO_TYPE(BinaryNode, FLOAT, BOOL);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_TWO_TYPE(BinaryNode, INT32, BOOL);
diff --git a/reference_model/src/ops/ewise_ternary.cc b/reference_model/src/ops/ewise_ternary.cc
index 72fe5a0..da046a7 100644
--- a/reference_model/src/ops/ewise_ternary.cc
+++ b/reference_model/src/ops/ewise_ternary.cc
@@ -1,5 +1,5 @@
 
-// Copyright (c) 2020-2021, ARM Limited.
+// Copyright (c) 2020-2022, ARM Limited.
 //
 //    Licensed under the Apache License, Version 2.0 (the "License");
 //    you may not use this file except in compliance with the License.
@@ -107,6 +107,7 @@
 }
 
 // template explicit instantiation
+DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpSelect, FP16);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpSelect, FLOAT);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpSelect, INT8);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpSelect, INT16);
diff --git a/reference_model/src/ops/ewise_unary.cc b/reference_model/src/ops/ewise_unary.cc
index 8ef1e3c..52f5aff 100644
--- a/reference_model/src/ops/ewise_unary.cc
+++ b/reference_model/src/ops/ewise_unary.cc
@@ -1,5 +1,5 @@
 
-// Copyright (c) 2020-2021, ARM Limited.
+// Copyright (c) 2020-2022, ARM Limited.
 //
 //    Licensed under the Apache License, Version 2.0 (the "License");
 //    you may not use this file except in compliance with the License.
@@ -79,6 +79,7 @@
     switch (Dtype)
     {
         case DType_FLOAT:
+        case DType_FP16:
         case DType_INT32:
             this->fcn = [](InEigenType a) -> OutEigenType { return a > (InEigenType)0 ? a : (-a); };
             break;
@@ -111,6 +112,7 @@
 {
     switch (Dtype)
     {
+        case DType_FP16:
         case DType_FLOAT:
             this->fcn = [](InEigenType a) -> OutEigenType { return ceilf(a); };
             break;
@@ -158,6 +160,7 @@
 {
     switch (Dtype)
     {
+        case DType_FP16:
         case DType_FLOAT:
             this->fcn = [](InEigenType a) -> OutEigenType { return expf(a); };
             break;
@@ -173,6 +176,7 @@
 {
     switch (Dtype)
     {
+        case DType_FP16:
         case DType_FLOAT:
             this->fcn = [](InEigenType a) -> OutEigenType { return floorf(a); };
             break;
@@ -188,6 +192,7 @@
 {
     switch (Dtype)
     {
+        case DType_FP16:
         case DType_FLOAT:
             this->fcn = [](InEigenType a) -> OutEigenType { return logf(a); };
             break;
@@ -239,6 +244,7 @@
 
     switch (Dtype)
     {
+        case DType_FP16:
         case DType_FLOAT:
             this->fcn = [](InEigenType a) -> OutEigenType {
                 InEigenType result = -(a);
@@ -290,6 +296,7 @@
 {
     switch (Dtype)
     {
+        case DType_FP16:
         case DType_FLOAT:
             this->fcn = [](InEigenType a) -> OutEigenType { return 1.0 / a; };
             break;
@@ -305,6 +312,7 @@
 {
     switch (Dtype)
     {
+        case DType_FP16:
         case DType_FLOAT:
             this->fcn = [](InEigenType a) -> OutEigenType { return 1.0 / sqrtf(a); };
             break;
@@ -316,6 +324,7 @@
 }
 
 // template explicit instantiation
+DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpAbs, FP16);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpAbs, FLOAT);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpAbs, INT32);
 
@@ -323,23 +332,30 @@
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpBitwiseNot, INT16);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpBitwiseNot, INT32);
 
+DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpCeil, FP16);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpCeil, FLOAT);
 
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpClz, INT32);
 
+DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpExp, FP16);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpExp, FLOAT);
 
+DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpFloor, FP16);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpFloor, FLOAT);
 
+DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpLog, FP16);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpLog, FLOAT);
 
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpLogicalNot, BOOL);
 
+DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpNegate, FP16);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpNegate, FLOAT);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpNegate, INT8);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpNegate, INT16);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpNegate, INT32);
 
+DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpRsqrt, FP16);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpRsqrt, FLOAT);
 
+DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpReciprocal, FP16);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_ONE_TYPE(OpReciprocal, FLOAT);
diff --git a/reference_model/src/ops/image.cc b/reference_model/src/ops/image.cc
index a021226..891261b 100644
--- a/reference_model/src/ops/image.cc
+++ b/reference_model/src/ops/image.cc
@@ -1,5 +1,5 @@
 
-// Copyright (c) 2020, ARM Limited.
+// Copyright (c) 2020-2022, ARM Limited.
 //
 //    Licensed under the Apache License, Version 2.0 (the "License");
 //    you may not use this file except in compliance with the License.
@@ -63,7 +63,7 @@
 
     if (this->mode == ResizeMode_BILINEAR)
     {
-        if (OutDtype != DType_INT32 && OutDtype != DType_INT48 && OutDtype != DType_FLOAT)
+        if (OutDtype != DType_INT32 && OutDtype != DType_INT48 && OutDtype != DType_FLOAT && OutDtype != DType_FP16)
         {
             printNodeValidationError("OpResize: invalid data type for BILINEAR");
             return 1;
@@ -71,7 +71,7 @@
     }
     else
     {
-        if (OutDtype != DType_INT8 && OutDtype != DType_INT16 && OutDtype != DType_FLOAT)
+        if (OutDtype != DType_INT8 && OutDtype != DType_INT16 && OutDtype != DType_FLOAT && OutDtype != DType_FP16)
         {
             printNodeValidationError("OpResize: invalid data type for NEAREST");
             return 1;
@@ -224,4 +224,5 @@
 DEF_INSTANTIATE_THREE_TYPE(OpResize, INT8, INT8, int16_t);
 DEF_INSTANTIATE_THREE_TYPE(OpResize, INT16, INT48, int16_t);
 DEF_INSTANTIATE_THREE_TYPE(OpResize, INT16, INT16, int16_t);
+DEF_INSTANTIATE_THREE_TYPE(OpResize, FP16, FP16, float);
 DEF_INSTANTIATE_THREE_TYPE(OpResize, FLOAT, FLOAT, float);
diff --git a/reference_model/src/ops/op_factory.cc b/reference_model/src/ops/op_factory.cc
index b6a2e15..fd73eb5 100644
--- a/reference_model/src/ops/op_factory.cc
+++ b/reference_model/src/ops/op_factory.cc
@@ -1,5 +1,5 @@
 
-// Copyright (c) 2020-2021, ARM Limited.
+// Copyright (c) 2020-2022, ARM Limited.
 //
 //    Licensed under the Apache License, Version 2.0 (the "License");
 //    you may not use this file except in compliance with the License.
@@ -48,71 +48,91 @@
     {
         // tensor_ops
         case Op_ARGMAX:
+            DEF_FACTORY_RANK1_6_ONE_RANK_ONE_TYPE(OpArgMax, FP16);
             DEF_FACTORY_RANK1_6_ONE_RANK_ONE_TYPE(OpArgMax, FLOAT);
             DEF_FACTORY_RANK1_6_ONE_RANK_ONE_TYPE(OpArgMax, INT8);
             DEF_FACTORY_RANK1_6_ONE_RANK_ONE_TYPE(OpArgMax, INT16);
             break;
         case Op_AVG_POOL2D:
-            DEF_FACTORY_ONE_TYPE(OpAvgPool2d, FLOAT);
-            DEF_FACTORY_ONE_TYPE(OpAvgPool2d, INT8);
-            DEF_FACTORY_ONE_TYPE(OpAvgPool2d, INT16);
+            DEF_FACTORY_ONE_TYPE_ONE_ACCUM(OpAvgPool2d, Pool, FP16, FP16);
+            DEF_FACTORY_ONE_TYPE_ONE_ACCUM(OpAvgPool2d, Pool, FP16, FLOAT);
+            DEF_FACTORY_ONE_TYPE_ONE_ACCUM(OpAvgPool2d, Pool, FLOAT, FLOAT);
+            DEF_FACTORY_ONE_TYPE_ONE_ACCUM(OpAvgPool2d, Pool, INT8, INT32);
+            DEF_FACTORY_ONE_TYPE_ONE_ACCUM(OpAvgPool2d, Pool, INT16, INT32);
             break;
         case Op_CONV2D:
-            DEF_FACTORY_TWO_TYPE(OpConv2d, FLOAT, FLOAT);
-            DEF_FACTORY_TWO_TYPE(OpConv2d, INT8, INT4);
-            DEF_FACTORY_TWO_TYPE(OpConv2d, INT8, INT8);
-            DEF_FACTORY_TWO_TYPE(OpConv2d, INT16, INT8);
+            DEF_FACTORY_TWO_TYPE_ONE_ACCUM(OpConv2d, Conv, FP16, FP16, FP16);
+            DEF_FACTORY_TWO_TYPE_ONE_ACCUM(OpConv2d, Conv, FP16, FP16, FLOAT);
+            DEF_FACTORY_TWO_TYPE_ONE_ACCUM(OpConv2d, Conv, FLOAT, FLOAT, FLOAT);
+            DEF_FACTORY_TWO_TYPE_ONE_ACCUM(OpConv2d, Conv, INT8, INT4, INT32);
+            DEF_FACTORY_TWO_TYPE_ONE_ACCUM(OpConv2d, Conv, INT8, INT8, INT32);
+            DEF_FACTORY_TWO_TYPE_ONE_ACCUM(OpConv2d, Conv, INT16, INT8, INT48);
             break;
         case Op_CONV3D:
-            DEF_FACTORY_TWO_TYPE(OpConv3d, FLOAT, FLOAT);
-            DEF_FACTORY_TWO_TYPE(OpConv3d, INT8, INT4);
-            DEF_FACTORY_TWO_TYPE(OpConv3d, INT8, INT8);
-            DEF_FACTORY_TWO_TYPE(OpConv3d, INT16, INT8);
+            DEF_FACTORY_TWO_TYPE_ONE_ACCUM(OpConv3d, Conv, FP16, FP16, FP16);
+            DEF_FACTORY_TWO_TYPE_ONE_ACCUM(OpConv3d, Conv, FP16, FP16, FLOAT);
+            DEF_FACTORY_TWO_TYPE_ONE_ACCUM(OpConv3d, Conv, FLOAT, FLOAT, FLOAT);
+            DEF_FACTORY_TWO_TYPE_ONE_ACCUM(OpConv3d, Conv, INT8, INT4, INT32);
+            DEF_FACTORY_TWO_TYPE_ONE_ACCUM(OpConv3d, Conv, INT8, INT8, INT32);
+            DEF_FACTORY_TWO_TYPE_ONE_ACCUM(OpConv3d, Conv, INT16, INT8, INT48);
             break;
         case Op_DEPTHWISE_CONV2D:
-            DEF_FACTORY_TWO_TYPE(OpDepthwiseConv2d, FLOAT, FLOAT);
-            DEF_FACTORY_TWO_TYPE(OpDepthwiseConv2d, INT8, INT4);
-            DEF_FACTORY_TWO_TYPE(OpDepthwiseConv2d, INT8, INT8);
-            DEF_FACTORY_TWO_TYPE(OpDepthwiseConv2d, INT16, INT8);
+            DEF_FACTORY_TWO_TYPE_ONE_ACCUM(OpDepthwiseConv2d, Conv, FP16, FP16, FP16);
+            DEF_FACTORY_TWO_TYPE_ONE_ACCUM(OpDepthwiseConv2d, Conv, FP16, FP16, FLOAT);
+            DEF_FACTORY_TWO_TYPE_ONE_ACCUM(OpDepthwiseConv2d, Conv, FLOAT, FLOAT, FLOAT);
+            DEF_FACTORY_TWO_TYPE_ONE_ACCUM(OpDepthwiseConv2d, Conv, INT8, INT4, INT32);
+            DEF_FACTORY_TWO_TYPE_ONE_ACCUM(OpDepthwiseConv2d, Conv, INT8, INT8, INT32);
+            DEF_FACTORY_TWO_TYPE_ONE_ACCUM(OpDepthwiseConv2d, Conv, INT16, INT8, INT48);
             break;
         case Op_FULLY_CONNECTED:
-            DEF_FACTORY_TWO_TYPE(OpFullyConnected, FLOAT, FLOAT);
-            DEF_FACTORY_TWO_TYPE(OpFullyConnected, INT8, INT4);
-            DEF_FACTORY_TWO_TYPE(OpFullyConnected, INT8, INT8);
-            DEF_FACTORY_TWO_TYPE(OpFullyConnected, INT16, INT8);
+            DEF_FACTORY_TWO_TYPE_ONE_ACCUM(OpFullyConnected, FullyConnected, FP16, FP16, FP16);
+            DEF_FACTORY_TWO_TYPE_ONE_ACCUM(OpFullyConnected, FullyConnected, FP16, FP16, FLOAT);
+            DEF_FACTORY_TWO_TYPE_ONE_ACCUM(OpFullyConnected, FullyConnected, FLOAT, FLOAT, FLOAT);
+            DEF_FACTORY_TWO_TYPE_ONE_ACCUM(OpFullyConnected, FullyConnected, INT8, INT4, INT32);
+            DEF_FACTORY_TWO_TYPE_ONE_ACCUM(OpFullyConnected, FullyConnected, INT8, INT8, INT32);
+            DEF_FACTORY_TWO_TYPE_ONE_ACCUM(OpFullyConnected, FullyConnected, INT16, INT8, INT48);
             break;
         case Op_MATMUL:
-            DEF_FACTORY_ONE_TYPE(OpMatMul, FLOAT);
-            DEF_FACTORY_ONE_TYPE(OpMatMul, INT8);
-            DEF_FACTORY_ONE_TYPE(OpMatMul, INT16);
+            DEF_FACTORY_ONE_TYPE_ONE_ACCUM(OpMatMul, MatMul, FP16, FP16);
+            DEF_FACTORY_ONE_TYPE_ONE_ACCUM(OpMatMul, MatMul, FP16, FLOAT);
+            DEF_FACTORY_ONE_TYPE_ONE_ACCUM(OpMatMul, MatMul, FLOAT, FLOAT);
+            DEF_FACTORY_ONE_TYPE_ONE_ACCUM(OpMatMul, MatMul, INT8, INT32);
+            DEF_FACTORY_ONE_TYPE_ONE_ACCUM(OpMatMul, MatMul, INT16, INT48);
             break;
         case Op_MAX_POOL2D:
+            DEF_FACTORY_ONE_TYPE(OpMaxPool2d, FP16);
             DEF_FACTORY_ONE_TYPE(OpMaxPool2d, FLOAT);
             DEF_FACTORY_ONE_TYPE(OpMaxPool2d, INT8);
             DEF_FACTORY_ONE_TYPE(OpMaxPool2d, INT16);
             break;
         case Op_TRANSPOSE_CONV2D:
-            DEF_FACTORY_TWO_TYPE(OpTransposeConv2d, FLOAT, FLOAT);
-            DEF_FACTORY_TWO_TYPE(OpTransposeConv2d, INT8, INT4);
-            DEF_FACTORY_TWO_TYPE(OpTransposeConv2d, INT8, INT8);
-            DEF_FACTORY_TWO_TYPE(OpTransposeConv2d, INT16, INT8);
+            DEF_FACTORY_TWO_TYPE_ONE_ACCUM(OpTransposeConv2d, TransposeConv, FP16, FP16, FP16);
+            DEF_FACTORY_TWO_TYPE_ONE_ACCUM(OpTransposeConv2d, TransposeConv, FP16, FP16, FLOAT);
+            DEF_FACTORY_TWO_TYPE_ONE_ACCUM(OpTransposeConv2d, TransposeConv, FLOAT, FLOAT, FLOAT);
+            DEF_FACTORY_TWO_TYPE_ONE_ACCUM(OpTransposeConv2d, TransposeConv, INT8, INT4, INT32);
+            DEF_FACTORY_TWO_TYPE_ONE_ACCUM(OpTransposeConv2d, TransposeConv, INT8, INT8, INT32);
+            DEF_FACTORY_TWO_TYPE_ONE_ACCUM(OpTransposeConv2d, TransposeConv, INT16, INT8, INT48);
             break;
 
         // activation_funcs
         case Op_CLAMP:
+            DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpClamp, FP16);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpClamp, FLOAT);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpClamp, INT8);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpClamp, INT16);
             break;
         case Op_SIGMOID:
+            DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpSigmoid, FP16);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpSigmoid, FLOAT);
             break;
         case Op_TANH:
+            DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpTanh, FP16);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpTanh, FLOAT);
             break;
 
         // ewise_binary
         case Op_ADD:
+            DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpAdd, FP16);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpAdd, FLOAT);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpAdd, INT32);
             break;
@@ -159,23 +179,28 @@
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpLogicalXor, BOOL);
             break;
         case Op_MAXIMUM:
+            DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpMaximum, FP16);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpMaximum, FLOAT);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpMaximum, INT32);
             break;
         case Op_MINIMUM:
+            DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpMinimum, FP16);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpMinimum, FLOAT);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpMinimum, INT32);
             break;
         case Op_MUL:
+            DEF_FACTORY_RANK0_6_ONE_RANK_TWO_TYPE(OpMul, FP16, FP16);
             DEF_FACTORY_RANK0_6_ONE_RANK_TWO_TYPE(OpMul, FLOAT, FLOAT);
             DEF_FACTORY_RANK0_6_ONE_RANK_TWO_TYPE(OpMul, INT8, INT32);
             DEF_FACTORY_RANK0_6_ONE_RANK_TWO_TYPE(OpMul, INT16, INT32);
             DEF_FACTORY_RANK0_6_ONE_RANK_TWO_TYPE(OpMul, INT32, INT32);
             break;
         case Op_POW:
+            DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpPow, FP16);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpPow, FLOAT);
             break;
         case Op_SUB:
+            DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpSub, FP16);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpSub, FLOAT);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpSub, INT32);
             break;
@@ -186,6 +211,7 @@
 
         // ewise_unary
         case Op_ABS:
+            DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpAbs, FP16);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpAbs, FLOAT);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpAbs, INT32);
             break;
@@ -195,38 +221,46 @@
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpBitwiseNot, INT32);
             break;
         case Op_CEIL:
+            DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpCeil, FP16);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpCeil, FLOAT);
             break;
         case Op_CLZ:
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpClz, INT32);
             break;
         case Op_EXP:
+            DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpExp, FP16);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpExp, FLOAT);
             break;
         case Op_FLOOR:
+            DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpFloor, FP16);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpFloor, FLOAT);
             break;
         case Op_LOG:
+            DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpLog, FP16);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpLog, FLOAT);
             break;
         case Op_LOGICAL_NOT:
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpLogicalNot, BOOL);
             break;
         case Op_NEGATE:
+            DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpNegate, FP16);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpNegate, FLOAT);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpNegate, INT8);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpNegate, INT16);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpNegate, INT32);
             break;
         case Op_RECIPROCAL:
+            DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpReciprocal, FP16);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpReciprocal, FLOAT);
             break;
         case Op_RSQRT:
+            DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpRsqrt, FP16);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpRsqrt, FLOAT);
             break;
 
         // ewise_ternary
         case Op_SELECT:
+            DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpSelect, FP16);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpSelect, FLOAT);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpSelect, INT8);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpSelect, INT16);
@@ -236,14 +270,17 @@
 
         // comparison
         case Op_EQUAL:
+            DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpEqual, FP16);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpEqual, FLOAT);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpEqual, INT32);
             break;
         case Op_GREATER:
+            DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpGreater, FP16);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpGreater, FLOAT);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpGreater, INT32);
             break;
         case Op_GREATER_EQUAL:
+            DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpGreaterEqual, FP16);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpGreaterEqual, FLOAT);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpGreaterEqual, INT32);
             break;
@@ -256,27 +293,32 @@
             DEF_FACTORY_RANK1_6_ONE_RANK_ONE_TYPE(OpReduceAny, BOOL);
             break;
         case Op_REDUCE_MAX:
+            DEF_FACTORY_RANK1_6_ONE_RANK_ONE_TYPE(OpReduceMax, FP16);
             DEF_FACTORY_RANK1_6_ONE_RANK_ONE_TYPE(OpReduceMax, FLOAT);
             DEF_FACTORY_RANK1_6_ONE_RANK_ONE_TYPE(OpReduceMax, INT8);
             DEF_FACTORY_RANK1_6_ONE_RANK_ONE_TYPE(OpReduceMax, INT16);
             DEF_FACTORY_RANK1_6_ONE_RANK_ONE_TYPE(OpReduceMax, INT32);
             break;
         case Op_REDUCE_MIN:
+            DEF_FACTORY_RANK1_6_ONE_RANK_ONE_TYPE(OpReduceMin, FP16);
             DEF_FACTORY_RANK1_6_ONE_RANK_ONE_TYPE(OpReduceMin, FLOAT);
             DEF_FACTORY_RANK1_6_ONE_RANK_ONE_TYPE(OpReduceMin, INT8);
             DEF_FACTORY_RANK1_6_ONE_RANK_ONE_TYPE(OpReduceMin, INT16);
             DEF_FACTORY_RANK1_6_ONE_RANK_ONE_TYPE(OpReduceMin, INT32);
             break;
         case Op_REDUCE_PRODUCT:
+            DEF_FACTORY_RANK1_6_ONE_RANK_ONE_TYPE(OpReduceProduct, FP16);
             DEF_FACTORY_RANK1_6_ONE_RANK_ONE_TYPE(OpReduceProduct, FLOAT);
             break;
         case Op_REDUCE_SUM:
+            DEF_FACTORY_RANK1_6_ONE_RANK_ONE_TYPE(OpReduceSum, FP16);
             DEF_FACTORY_RANK1_6_ONE_RANK_ONE_TYPE(OpReduceSum, FLOAT);
             DEF_FACTORY_RANK1_6_ONE_RANK_ONE_TYPE(OpReduceSumInt, INT32);
             break;
 
         // data layout
         case Op_CONCAT:
+            DEF_FACTORY_RANK1_6_ONE_RANK_ONE_TYPE(OpConcat, FP16);
             DEF_FACTORY_RANK1_6_ONE_RANK_ONE_TYPE(OpConcat, FLOAT);
             DEF_FACTORY_RANK1_6_ONE_RANK_ONE_TYPE(OpConcat, INT8);
             DEF_FACTORY_RANK1_6_ONE_RANK_ONE_TYPE(OpConcat, INT16);
@@ -284,6 +326,7 @@
             DEF_FACTORY_RANK1_6_ONE_RANK_ONE_TYPE(OpConcat, BOOL);
             break;
         case Op_PAD:
+            DEF_FACTORY_RANK1_6_ONE_RANK_ONE_TYPE(OpPad, FP16);
             DEF_FACTORY_RANK1_6_ONE_RANK_ONE_TYPE(OpPad, FLOAT);
             DEF_FACTORY_RANK1_6_ONE_RANK_ONE_TYPE(OpPad, INT32);
             DEF_FACTORY_RANK1_6_ONE_RANK_ONE_TYPE(OpPad, INT8);
@@ -291,6 +334,7 @@
             DEF_FACTORY_RANK1_6_ONE_RANK_ONE_TYPE(OpPad, BOOL);
             break;
         case Op_RESHAPE:
+            DEF_FACTORY_RESHAPE(OpReshape, FP16);
             DEF_FACTORY_RESHAPE(OpReshape, FLOAT);
             DEF_FACTORY_RESHAPE(OpReshape, INT8);
             DEF_FACTORY_RESHAPE(OpReshape, INT16);
@@ -298,6 +342,7 @@
             DEF_FACTORY_RESHAPE(OpReshape, BOOL);
             break;
         case Op_REVERSE:
+            DEF_FACTORY_RANK1_6_ONE_RANK_ONE_TYPE(OpReverse, FP16);
             DEF_FACTORY_RANK1_6_ONE_RANK_ONE_TYPE(OpReverse, FLOAT);
             DEF_FACTORY_RANK1_6_ONE_RANK_ONE_TYPE(OpReverse, INT8);
             DEF_FACTORY_RANK1_6_ONE_RANK_ONE_TYPE(OpReverse, INT16);
@@ -305,6 +350,7 @@
             DEF_FACTORY_RANK1_6_ONE_RANK_ONE_TYPE(OpReverse, BOOL);
             break;
         case Op_SLICE:
+            DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpSlice, FP16);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpSlice, FLOAT);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpSlice, INT8);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpSlice, INT16);
@@ -312,6 +358,7 @@
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpSlice, BOOL);
             break;
         case Op_TILE:
+            DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpTile, FP16);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpTile, FLOAT);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpTile, INT8);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpTile, INT16);
@@ -320,6 +367,7 @@
             break;
         case Op_TRANSPOSE:
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpTranspose, BOOL);
+            DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpTranspose, FP16);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpTranspose, FLOAT);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpTranspose, INT8);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpTranspose, INT16);
@@ -331,12 +379,14 @@
             DEF_FACTORY_ONE_TYPE(OpGather, INT8);
             DEF_FACTORY_ONE_TYPE(OpGather, INT16);
             DEF_FACTORY_ONE_TYPE(OpGather, INT32);
+            DEF_FACTORY_ONE_TYPE(OpGather, FP16);
             DEF_FACTORY_ONE_TYPE(OpGather, FLOAT);
             break;
         case Op_SCATTER:
             DEF_FACTORY_ONE_TYPE(OpScatter, INT8);
             DEF_FACTORY_ONE_TYPE(OpScatter, INT16);
             DEF_FACTORY_ONE_TYPE(OpScatter, INT32);
+            DEF_FACTORY_ONE_TYPE(OpScatter, FP16);
             DEF_FACTORY_ONE_TYPE(OpScatter, FLOAT);
             break;
 
@@ -346,6 +396,7 @@
             DEF_FACTORY_TWO_TYPE_RESIZE_INT16(OpResize, INT8, INT8);
             DEF_FACTORY_TWO_TYPE_RESIZE_INT16(OpResize, INT16, INT48);
             DEF_FACTORY_TWO_TYPE_RESIZE_INT16(OpResize, INT16, INT16);
+            DEF_FACTORY_TWO_TYPE_RESIZE_FP16(OpResize, FP16, FP16);
             DEF_FACTORY_TWO_TYPE_RESIZE_FLOAT(OpResize, FLOAT, FLOAT);
             break;
 
@@ -353,6 +404,7 @@
         case Op_CONST:
             return new OpConst(sgt, id);
         case Op_IDENTITY:
+            DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpIdentity, FP16);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpIdentity, FLOAT);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpIdentity, INT32);
             DEF_FACTORY_RANK0_6_ONE_RANK_ONE_TYPE(OpIdentity, INT8);
@@ -368,15 +420,21 @@
             DEF_FACTORY_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, INT8, BOOL);
             DEF_FACTORY_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, INT8, INT16);
             DEF_FACTORY_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, INT8, INT32);
+            DEF_FACTORY_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, INT8, FP16);
             DEF_FACTORY_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, INT8, FLOAT);
             DEF_FACTORY_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, INT16, BOOL);
             DEF_FACTORY_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, INT16, INT8);
             DEF_FACTORY_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, INT16, INT32);
+            DEF_FACTORY_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, INT16, FP16);
             DEF_FACTORY_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, INT16, FLOAT);
             DEF_FACTORY_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, INT32, BOOL);
             DEF_FACTORY_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, INT32, INT8);
             DEF_FACTORY_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, INT32, INT16);
+            DEF_FACTORY_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, INT32, FP16);
             DEF_FACTORY_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, INT32, FLOAT);
+            DEF_FACTORY_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, FP16, INT8);
+            DEF_FACTORY_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, FP16, INT16);
+            DEF_FACTORY_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, FP16, INT32);
             DEF_FACTORY_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, FLOAT, INT8);
             DEF_FACTORY_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, FLOAT, INT16);
             DEF_FACTORY_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, FLOAT, INT32);
diff --git a/reference_model/src/ops/op_factory.h b/reference_model/src/ops/op_factory.h
index 341d7dc..25dfc6e 100644
--- a/reference_model/src/ops/op_factory.h
+++ b/reference_model/src/ops/op_factory.h
@@ -1,5 +1,5 @@
 
-// Copyright (c) 2020, ARM Limited.
+// Copyright (c) 2020-2022, ARM Limited.
 //
 //    Licensed under the Apache License, Version 2.0 (the "License");
 //    you may not use this file except in compliance with the License.
@@ -62,18 +62,55 @@
         return new OP<DType_##DTYPE>(sgt, attribute, id);                                                              \
     }
 
+#define DEF_FACTORY_ONE_TYPE_ONE_ACCUM(OP, ATTR_NAME, DTYPE, ACCUM_DTYPE)                                              \
+    if (inputDType == DType_##DTYPE && ACCUM_FROM_ATTRIBUTE(ATTR_NAME) == DType_##ACCUM_DTYPE)                         \
+    {                                                                                                                  \
+        return new OP<DType_##DTYPE, DType_##ACCUM_DTYPE>(sgt, attribute, id);                                         \
+    }
+
 #define DEF_FACTORY_TWO_TYPE(OP, DTYPE1, DTYPE2)                                                                       \
     if (inputDType == DType_##DTYPE1 && weightDType == DType_##DTYPE2)                                                 \
     {                                                                                                                  \
         return new OP<DType_##DTYPE1, DType_##DTYPE2>(sgt, attribute, id);                                             \
     }
 
+#define DEF_FACTORY_TWO_TYPE_ONE_ACCUM(OP, ATTR_NAME, DTYPE1, DTYPE2, ACCUM_DTYPE)                                     \
+    if (inputDType == DType_##DTYPE1 && weightDType == DType_##DTYPE2                                                  \
+            && ACCUM_FROM_ATTRIBUTE(ATTR_NAME) == DType_##ACCUM_DTYPE)                                                 \
+    {                                                                                                                  \
+        return new OP<DType_##DTYPE1, DType_##DTYPE2, DType_##ACCUM_DTYPE>(sgt, attribute, id);                        \
+    }                                                                                                                  \
+
+// Statement expression that extracts the accumulator dtype from the operator's attribute in place
+#define ACCUM_FROM_ATTRIBUTE(ATTRIBUTE_NAME)                                                                           \
+    ({                                                                                                                 \
+        tosa::DType accumDType = tosa::DType_UNKNOWN;                                                                  \
+        if (auto p = dynamic_cast<tosa::Tosa##ATTRIBUTE_NAME##Attribute*>(attribute))                                  \
+        {                                                                                                              \
+            auto attr = new tosa::Tosa##ATTRIBUTE_NAME##Attribute(p);                                                  \
+            ASSERT_MEM(attr);                                                                                          \
+            accumDType = tosa::EnumValuesDType()[attr->accum_dtype()];                                                 \
+        }                                                                                                              \
+        else                                                                                                           \
+        {                                                                                                              \
+            FATAL_ERROR("Can't initialize Tosa" #ATTRIBUTE_NAME "Attribute.\nPre-initialization "                      \
+                        "of this attribute is required in order to determine the accumulate type.");                   \
+        }                                                                                                              \
+        accumDType;                                                                                                    \
+    })                                                                                                                 \
+
 #define DEF_FACTORY_TWO_TYPE_RESIZE_INT16(OP, DTYPE1, DTYPE2)                                                          \
     if (inputDType == DType_##DTYPE1 && outputDType == DType_##DTYPE2)                                                 \
     {                                                                                                                  \
         return new OP<DType_##DTYPE1, DType_##DTYPE2, int16_t>(sgt, attribute, id);                                    \
     }
 
+#define DEF_FACTORY_TWO_TYPE_RESIZE_FP16(OP, DTYPE1, DTYPE2)                                                           \
+    if (inputDType == DType_##DTYPE1 && outputDType == DType_##DTYPE2)                                                 \
+    {                                                                                                                  \
+        return new OP<DType_##DTYPE1, DType_##DTYPE2, float>(sgt, attribute, id);                                      \
+    }
+
 #define DEF_FACTORY_TWO_TYPE_RESIZE_FLOAT(OP, DTYPE1, DTYPE2)                                                          \
     if (inputDType == DType_##DTYPE1 && outputDType == DType_##DTYPE2)                                                 \
     {                                                                                                                  \
diff --git a/reference_model/src/ops/reduction.cc b/reference_model/src/ops/reduction.cc
index 18fac44..03ee660 100644
--- a/reference_model/src/ops/reduction.cc
+++ b/reference_model/src/ops/reduction.cc
@@ -1,5 +1,5 @@
 
-// Copyright (c) 2020-2021, ARM Limited.
+// Copyright (c) 2020-2022, ARM Limited.
 //
 //    Licensed under the Apache License, Version 2.0 (the "License");
 //    you may not use this file except in compliance with the License.
@@ -158,17 +158,21 @@
 
 DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpReduceAny, BOOL);
 
+DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpReduceMax, FP16);
 DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpReduceMax, FLOAT);
 DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpReduceMax, INT8);
 DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpReduceMax, INT16);
 DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpReduceMax, INT32);
 
+DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpReduceMin, FP16);
 DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpReduceMin, FLOAT);
 DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpReduceMin, INT8);
 DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpReduceMin, INT16);
 DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpReduceMin, INT32);
 
+DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpReduceProduct, FP16);
 DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpReduceProduct, FLOAT);
 
+DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpReduceSum, FP16);
 DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpReduceSum, FLOAT);
 DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpReduceSumInt, INT32);
diff --git a/reference_model/src/ops/scatter_gather.cc b/reference_model/src/ops/scatter_gather.cc
index faf7db9..25174bd 100644
--- a/reference_model/src/ops/scatter_gather.cc
+++ b/reference_model/src/ops/scatter_gather.cc
@@ -1,5 +1,5 @@
 
-// Copyright (c) 2020-2021, ARM Limited.
+// Copyright (c) 2020-2022, ARM Limited.
 //
 //    Licensed under the Apache License, Version 2.0 (the "License");
 //    you may not use this file except in compliance with the License.
@@ -226,9 +226,11 @@
 DEF_INSTANTIATE_ONE_TYPE(OpGather, INT8);
 DEF_INSTANTIATE_ONE_TYPE(OpGather, INT16);
 DEF_INSTANTIATE_ONE_TYPE(OpGather, INT32);
+DEF_INSTANTIATE_ONE_TYPE(OpGather, FP16);
 DEF_INSTANTIATE_ONE_TYPE(OpGather, FLOAT);
 
 DEF_INSTANTIATE_ONE_TYPE(OpScatter, INT8);
 DEF_INSTANTIATE_ONE_TYPE(OpScatter, INT16);
 DEF_INSTANTIATE_ONE_TYPE(OpScatter, INT32);
+DEF_INSTANTIATE_ONE_TYPE(OpScatter, FP16);
 DEF_INSTANTIATE_ONE_TYPE(OpScatter, FLOAT);
diff --git a/reference_model/src/ops/template_types.h b/reference_model/src/ops/template_types.h
index 2bc7e04..9511c31 100644
--- a/reference_model/src/ops/template_types.h
+++ b/reference_model/src/ops/template_types.h
@@ -1,5 +1,5 @@
 
-// Copyright (c) 2020-2021, ARM Limited.
+// Copyright (c) 2020-2022, ARM Limited.
 //
 //    Licensed under the Apache License, Version 2.0 (the "License");
 //    you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@
 
 #include "tosa_generated.h"
 #include <Eigen/CXX11/Tensor>
+#include "half.hpp"
 
 using namespace tosa;
 
@@ -69,6 +70,12 @@
     using type = float;
 };
 template <>
+struct GetEigenType<DType_FP16>
+{
+    // NOTE: fp16 values are stored at full (float) precision in the reference model
+    using type = float;
+};
+template <>
 struct GetEigenType<DType_INT32>
 {
     using type = int32_t;
@@ -109,6 +116,28 @@
     using type = int32_t;
 };
 
+/* Get Accumulator Eigen Type:
+Same behaviour as GetEigenType for all DTypes except the
+single specialised case of DType_FP16. */
+template <DType Dtype>
+struct GetAccEigenType;
+template <>
+struct GetAccEigenType<DType_FP16>
+{
+    using type = half_float::half;
+};
+template <DType Dtype>
+struct GetAccEigenType
+{
+    using type = typename GetEigenType<Dtype>::type;
+};
+
+template <DType Dtype>
+struct GetHalfEigenType
+{
+    using type = half_float::half;
+};
+
 // Meta function to get number of bits
 template <DType T>
 struct GetNumBits
@@ -155,6 +184,11 @@
 {
     static constexpr int32_t value = 48;
 };
+template <>
+struct GetNumBits<DType_FP16>
+{
+    static constexpr int32_t value = 16;
+};
 
 // Meta function to get quantized min/max in compile time
 template <DType T>
@@ -262,6 +296,11 @@
     static constexpr DType value = DType_INT48;
 };
 template <>
+struct GetAccDType<DType_FP16, DType_FP16>
+{
+    static constexpr DType value = DType_FP16;
+};
+template <>
 struct GetAccDType<DType_FLOAT, DType_FLOAT>
 {
     static constexpr DType value = DType_FLOAT;
diff --git a/reference_model/src/ops/tensor_ops.cc b/reference_model/src/ops/tensor_ops.cc
index 2cd94bb..c617dda 100644
--- a/reference_model/src/ops/tensor_ops.cc
+++ b/reference_model/src/ops/tensor_ops.cc
@@ -1,5 +1,5 @@
 
-// Copyright (c) 2020-2021, ARM Limited.
+// Copyright (c) 2020-2022, ARM Limited.
 //
 //    Licensed under the Apache License, Version 2.0 (the "License");
 //    you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@
 #include "tensor_ops.h"
 #include "quant_util.h"
 #include "template_types.h"
+#include "half.hpp"
 
 using namespace TosaReference;
 using namespace Eigen;
@@ -329,8 +330,8 @@
     return GraphNode::eval();
 }
 
-template <DType Dtype>
-OpAvgPool2d<Dtype>::OpAvgPool2d(SubgraphTraverser* sgt_,
+template <DType Dtype, DType AccDtype>
+OpAvgPool2d<Dtype, AccDtype>::OpAvgPool2d(SubgraphTraverser* sgt_,
                                 TosaAttributeBase* attribute_,
                                 uint64_t id_)
     : GraphNode(sgt_, Op_AVG_POOL2D, id_)
@@ -341,15 +342,15 @@
     INIT_ATTRIBUTE(Pool);
 }
 
-template <DType Dtype>
-OpAvgPool2d<Dtype>::~OpAvgPool2d()
+template <DType Dtype, DType AccDtype>
+OpAvgPool2d<Dtype, AccDtype>::~OpAvgPool2d()
 {
     if (attribute)
         delete attribute;
 }
 
-template <DType Dtype>
-int OpAvgPool2d<Dtype>::checkTensorAttributes()
+template <DType Dtype, DType AccDtype>
+int OpAvgPool2d<Dtype, AccDtype>::checkTensorAttributes()
 {
     if (validateRequiredOperands())
         return 1;
@@ -385,8 +386,8 @@
 // This calculates the number of padding elements used for each location along an axis
 // Average pooling only divides by the number of elements used, not including padding.
 // This function uses left/right, but is also used for vertical padding with top/bottom
-template <DType Dtype>
-ETensor1<int32_t> OpAvgPool2d<Dtype>::calculate_div_map_1d(int in_size, int out_size, int kernel_size, int stride, int32_t pad_left, int32_t pad_right)
+template <DType Dtype, DType AccDtype>
+ETensor1<int32_t> OpAvgPool2d<Dtype, AccDtype>::calculate_div_map_1d(int in_size, int out_size, int kernel_size, int stride, int32_t pad_left, int32_t pad_right)
 {
     ETensor1<int32_t> result(out_size);
 
@@ -414,8 +415,8 @@
 
 // assuming input and output tensor have same scales like tflite reference
 // so no need to scale input and output
-template <DType Dtype>
-int OpAvgPool2d<Dtype>::eval()
+template <DType Dtype, DType AccDtype>
+int OpAvgPool2d<Dtype, AccDtype>::eval()
 {
     int in_batch    = this->in->getShape()[0];
     int in_height   = this->in->getShape()[1];
@@ -439,11 +440,13 @@
     int stride_h       = this->attribute->stride()[0];
     int stride_w       = this->attribute->stride()[1];
 
+    tosa::DType accum_dtype       = (tosa::DType)this->attribute->accum_dtype();
+
     DEBUG_INFO(OP,
                "perform AvgPool2d, input.shape=[%d,%d,%d,%d], output.shape=[%d,%d,%d,%d], kernel=[%d,%d], "
-               "stride=[%d,%d], pad=[%d,%d,%d,%d]",
+               "stride=[%d,%d], pad=[%d,%d,%d,%d], accum_dtype=%s",
                in_batch, in_height, in_width, in_channels, out_batch, out_height, out_width, out_channels, kernel_h,
-               kernel_w, stride_h, stride_w, pad_top, pad_bottom, pad_left, pad_right);
+               kernel_w, stride_h, stride_w, pad_top, pad_bottom, pad_left, pad_right, EnumNamesDType()[accum_dtype]);
 
     Eigen::array<Eigen::Index, 2> im2col_input_dims;
     im2col_input_dims[0] = kernel_h * kernel_w;
@@ -509,8 +512,7 @@
             .contract(div_map_w.reshape(Eigen::array<Eigen::Index, 2>{ 1, out_width }), contract_dims)
             .reshape(Eigen::array<Eigen::Index, 4>{ 1, out_height, out_width, 1 })
             .broadcast(bcast);
-
-    if (Dtype != DType_FLOAT)
+    if (Dtype != DType_FLOAT && Dtype != DType_FP16)
     {
         try
         {
@@ -531,14 +533,15 @@
     }
     else
     {
+        // Floating-point case (FLOAT, FP16): divide the sum by the element count directly
         this->out->getTensor() = (sum / div_map.template cast<AccEigenType>()).template cast<OutEigenType>();
     }
 
     return GraphNode::eval();
 }
 
-template <DType InDtype, DType WeightDtype>
-OpConv2d<InDtype, WeightDtype>::OpConv2d(SubgraphTraverser* sgt_,
+template <DType InDtype, DType WeightDtype, DType AccDtype>
+OpConv2d<InDtype, WeightDtype, AccDtype>::OpConv2d(SubgraphTraverser* sgt_,
                                          TosaAttributeBase* attribute_,
                                          uint64_t id_)
     : GraphNode(sgt_, Op_CONV2D, id_)
@@ -549,15 +552,15 @@
     INIT_ATTRIBUTE(Conv);
 }
 
-template <DType InDtype, DType WeightDtype>
-OpConv2d<InDtype, WeightDtype>::~OpConv2d()
+template <DType InDtype, DType WeightDtype, DType AccDtype>
+OpConv2d<InDtype, WeightDtype, AccDtype>::~OpConv2d()
 {
     if (attribute)
         delete attribute;
 }
 
-template <DType InDtype, DType WeightDtype>
-int OpConv2d<InDtype, WeightDtype>::checkTensorAttributes()
+template <DType InDtype, DType WeightDtype, DType AccDtype>
+int OpConv2d<InDtype, WeightDtype, AccDtype>::checkTensorAttributes()
 {
     if (validateRequiredOperands())
         return 1;
@@ -574,12 +577,12 @@
     }
 
     ERROR_IF(outputs[0]->getDtype() != AccDtype,
-             "OpConv2d: Output data type not supported for this configuration of operator");
+                "OpConv2d: Output data type not supported for this configuration of operator");
 
     input  = dynamic_cast<TosaReference::TensorTemplate<TIn>*>(inputs[0]);
     weight = dynamic_cast<TosaReference::TensorTemplate<TWeight>*>(inputs[1]);
     bias   = dynamic_cast<TosaReference::TensorTemplate<TBias>*>(inputs[2]);
-    output = dynamic_cast<TosaReference::TensorTemplate<TAcc>*>(outputs[0]);
+    output = dynamic_cast<TosaReference::TensorTemplate<TOut>*>(outputs[0]);
 
     std::string msg;
     if (check_conv_attribute(attribute, 2 /* conv_dimension */, input->getShape(), output->getShape(),
@@ -593,8 +596,8 @@
     return 0;
 }
 
-template <DType InDtype, DType WeightDtype>
-int OpConv2d<InDtype, WeightDtype>::eval()
+template <DType InDtype, DType WeightDtype, DType AccDtype>
+int OpConv2d<InDtype, WeightDtype, AccDtype>::eval()
 {
     int in_batch    = this->input->getShape()[0];
     int in_height   = this->input->getShape()[1];
@@ -630,12 +633,14 @@
     int dilation_h     = this->attribute->dilation()[0];
     int dilation_w     = this->attribute->dilation()[1];
 
+    tosa::DType accum_dtype = (tosa::DType)this->attribute->accum_dtype();
+
     DEBUG_INFO(OP,
                "perform OpConv2d, input.shape=[%d,%d,%d,%d], weight.shape=[%d,%d,%d,%d], output.shape=[%d,%d,%d,%d], "
-               "stride=[%d,%d], dilation=[%d,%d], pad=[%d,%d,%d,%d]",
+               "stride=[%d,%d], dilation=[%d,%d], pad=[%d,%d,%d,%d], accum_dtype=%s",
                in_batch, in_height, in_width, in_channels, f_height, f_width, f_in_channels, f_out_channels, out_batch,
                out_height, out_width, out_channels, stride_h, stride_w, dilation_h, dilation_w, pad_top,
-               pad_bottom, pad_left, pad_right);
+               pad_bottom, pad_left, pad_right, EnumNamesDType()[accum_dtype]);
 
     // GEMM-conv2d, left matrix is input, right matrix is weight
     Eigen::array<Eigen::Index, 2> im2col_input_dims;
@@ -695,33 +700,33 @@
 
     // transpose and reshape weight from [OC, H, W, IC] to [H * W * IC, OC]
     ETensor2<WeightEigenType> im2col_weight =
-        weight_val.shuffle(Eigen::array<Eigen::Index, 4>({ 1, 2, 3, 0 })).reshape(im2col_weight_dims);
+    weight_val.shuffle(Eigen::array<Eigen::Index, 4>({ 1, 2, 3, 0 })).reshape(im2col_weight_dims);
 
     // don't need to apply bias_multiplier ( * bias_scale and >> bias_shift) since tflite already scale it
     // and reshaped from [C] to [1, C], and broadcast to [N * H * W, C]
-    ETensor2<AccEigenType> bias_2d = this->bias->getTensor().reshape(bias_reshaped_dims).broadcast(bias_bcast_dims);
+    ETensor2<OutEigenType> bias_2d = (this->bias->getTensor().reshape(bias_reshaped_dims).broadcast(bias_bcast_dims)).template cast<OutEigenType>();
 
     // output matrix is [N * H * W, C]
-    ETensor2<AccEigenType> contracted_result =
-        im2col_input.template cast<AccEigenType>().contract(im2col_weight.template cast<AccEigenType>(), contract_dims);
+    ETensor2<OutEigenType> contracted_result =
+        (im2col_input.template cast<AccEigenType>().contract(im2col_weight.template cast<AccEigenType>(), contract_dims)).template cast<OutEigenType>();
 
     // adding bias
-    ETensor2<AccEigenType> biased_output = contracted_result + bias_2d.template cast<AccEigenType>();
+    ETensor2<OutEigenType> biased_output = contracted_result + bias_2d;
 
     // reshape back to [N, H, W, C]
     this->output->getTensor() = biased_output.reshape(col2im_output_dims);
 
     if (AccDtype == DType_INT48)
     {
-        this->output->getTensor() = this->output->getTensor().cwiseMax((AccEigenType)AccQMin);
-        this->output->getTensor() = this->output->getTensor().cwiseMin((AccEigenType)AccQMax);
+        this->output->getTensor() = this->output->getTensor().cwiseMax((OutEigenType)AccQMin);
+        this->output->getTensor() = this->output->getTensor().cwiseMin((OutEigenType)AccQMax);
     }
 
     return GraphNode::eval();
 }
 
-template <DType InDtype, DType WeightDtype>
-OpConv3d<InDtype, WeightDtype>::OpConv3d(SubgraphTraverser* sgt_,
+template <DType InDtype, DType WeightDtype, DType AccDtype>
+OpConv3d<InDtype, WeightDtype, AccDtype>::OpConv3d(SubgraphTraverser* sgt_,
                                          TosaAttributeBase* attribute_,
                                          uint64_t id_)
     : GraphNode(sgt_, Op_CONV3D, id_)
@@ -732,15 +737,15 @@
     INIT_ATTRIBUTE(Conv);
 }
 
-template <DType InDtype, DType WeightDtype>
-OpConv3d<InDtype, WeightDtype>::~OpConv3d()
+template <DType InDtype, DType WeightDtype, DType AccDtype>
+OpConv3d<InDtype, WeightDtype, AccDtype>::~OpConv3d()
 {
     if (attribute)
         delete attribute;
 }
 
-template <DType InDtype, DType WeightDtype>
-int OpConv3d<InDtype, WeightDtype>::checkTensorAttributes()
+template <DType InDtype, DType WeightDtype, DType AccDtype>
+int OpConv3d<InDtype, WeightDtype, AccDtype>::checkTensorAttributes()
 {
     if (validateRequiredOperands())
         return 1;
@@ -757,12 +762,12 @@
     }
 
     ERROR_IF(outputs[0]->getDtype() != AccDtype,
-             "OpConv3d: Output data type not supported for this configuration of operator");
+                "OpConv3d: Output data type not supported for this configuration of operator");
 
     input  = dynamic_cast<TosaReference::TensorTemplate<TIn>*>(inputs[0]);
     weight = dynamic_cast<TosaReference::TensorTemplate<TWeight>*>(inputs[1]);
     bias   = dynamic_cast<TosaReference::TensorTemplate<TBias>*>(inputs[2]);
-    output = dynamic_cast<TosaReference::TensorTemplate<TAcc>*>(outputs[0]);
+    output = dynamic_cast<TosaReference::TensorTemplate<TOut>*>(outputs[0]);
 
     std::string msg;
     if (check_conv_attribute(attribute, 3 /* conv_dimension */, input->getShape(), output->getShape(),
@@ -776,8 +781,8 @@
     return 0;
 }
 
-template <DType InDtype, DType WeightDtype>
-int OpConv3d<InDtype, WeightDtype>::eval()
+template <DType InDtype, DType WeightDtype, DType AccDtype>
+int OpConv3d<InDtype, WeightDtype, AccDtype>::eval()
 {
     int in_batch    = this->input->getShape()[0];
     int in_depth    = this->input->getShape()[1];
@@ -821,13 +826,15 @@
     int dilation_h     = this->attribute->dilation()[1];
     int dilation_w     = this->attribute->dilation()[2];
 
+    tosa::DType accum_dtype = (tosa::DType)this->attribute->accum_dtype();
+
     DEBUG_INFO(
         OP,
         "perform OpConv3d, input.shape=[%d,%d,%d,%d,%d], weight.shape=[%d,%d,%d,%d,%d], output.shape=[%d,%d,%d,%d,%d], "
-        "stride=[%d,%d,%d], dilation=[%d,%d,%d], pad=[%d,%d,%d,%d,%d,%d]",
+        "stride=[%d,%d,%d], dilation=[%d,%d,%d], pad=[%d,%d,%d,%d,%d,%d], accum_dtype=%s",
         in_batch, in_depth, in_height, in_width, in_channels, f_out_channels, f_depth, f_height, f_width, f_in_channels,
         out_batch, out_depth, out_height, out_width, out_channels, stride_d, stride_h, stride_w, dilation_d, dilation_h,
-        dilation_w, pad_d0, pad_d1, pad_top, pad_bottom, pad_left, pad_right);
+        dilation_w, pad_d0, pad_d1, pad_top, pad_bottom, pad_left, pad_right, EnumNamesDType()[accum_dtype]);
 
     Eigen::array<std::pair<int32_t, int32_t>, 5> pad;
     pad[0] = std::make_pair(0, 0);
@@ -860,7 +867,7 @@
     this->output->getTensor() = this->bias->getTensor().reshape(reshape_dim).broadcast(bcast);
 
     // 2. direct convolution
-    AccEigenType acc = 0;
+    AccEigenType acc(0.0);
     int d_idx, h_idx, w_idx;
 
     for (int ob = 0; ob < out_batch; ob++)
@@ -874,7 +881,7 @@
                     for (int oc = 0; oc < out_channels; oc++)
                     {
                         // Initialize accumulator with bias value
-                        acc = this->output->getTensor()(ob, od, oh, ow, oc);
+                        acc = (AccEigenType)this->output->getTensor()(ob, od, oh, ow, oc);
                         for (int fd = 0; fd < f_depth; fd++)
                         {
                             d_idx = od * stride_d + fd * dilation_d;
@@ -892,7 +899,7 @@
                                 }
                             }
                         }
-                        this->output->getTensor()(ob, od, oh, ow, oc) = acc;
+                        this->output->getTensor()(ob, od, oh, ow, oc) = (OutEigenType)acc;
                     }
                 }
             }
@@ -901,15 +908,15 @@
 
     if (AccDtype == DType_INT48)
     {
-        this->output->getTensor() = this->output->getTensor().cwiseMax((AccEigenType)AccQMin);
-        this->output->getTensor() = this->output->getTensor().cwiseMin((AccEigenType)AccQMax);
+        this->output->getTensor() = this->output->getTensor().cwiseMax((OutEigenType)AccQMin);
+        this->output->getTensor() = this->output->getTensor().cwiseMin((OutEigenType)AccQMax);
     }
 
     return GraphNode::eval();
 }
 
-template <DType InDtype, DType WeightDtype>
-OpDepthwiseConv2d<InDtype, WeightDtype>::OpDepthwiseConv2d(SubgraphTraverser* sgt_,
+template <DType InDtype, DType WeightDtype, DType AccDtype>
+OpDepthwiseConv2d<InDtype, WeightDtype, AccDtype>::OpDepthwiseConv2d(SubgraphTraverser* sgt_,
                                                            TosaAttributeBase* attribute_,
                                                            uint64_t id_)
     : GraphNode(sgt_, Op_DEPTHWISE_CONV2D, id_)
@@ -920,15 +927,15 @@
     INIT_ATTRIBUTE(Conv);
 }
 
-template <DType InDtype, DType WeightDtype>
-OpDepthwiseConv2d<InDtype, WeightDtype>::~OpDepthwiseConv2d()
+template <DType InDtype, DType WeightDtype, DType AccDtype>
+OpDepthwiseConv2d<InDtype, WeightDtype, AccDtype>::~OpDepthwiseConv2d()
 {
     if (attribute)
         delete attribute;
 }
 
-template <DType InDtype, DType WeightDtype>
-int OpDepthwiseConv2d<InDtype, WeightDtype>::checkTensorAttributes()
+template <DType InDtype, DType WeightDtype, DType AccDtype>
+int OpDepthwiseConv2d<InDtype, WeightDtype, AccDtype>::checkTensorAttributes()
 {
     if (validateRequiredOperands())
         return 1;
@@ -945,12 +952,12 @@
     }
 
     ERROR_IF(outputs[0]->getDtype() != AccDtype,
-             "OpDepthwiseConv2d: Output data type not supported for this configuration of operator");
+                "OpDepthwiseConv2d: Output data type not supported for this configuration of operator");
 
     input  = dynamic_cast<TosaReference::TensorTemplate<TIn>*>(inputs[0]);
     weight = dynamic_cast<TosaReference::TensorTemplate<TWeight>*>(inputs[1]);
     bias   = dynamic_cast<TosaReference::TensorTemplate<TBias>*>(inputs[2]);
-    output = dynamic_cast<TosaReference::TensorTemplate<TAcc>*>(outputs[0]);
+    output = dynamic_cast<TosaReference::TensorTemplate<TOut>*>(outputs[0]);
 
     std::string msg;
     if (check_conv_attribute(attribute, 2 /* conv_dimension */, input->getShape(), output->getShape(),
@@ -964,8 +971,8 @@
     return 0;
 }
 
-template <DType InDtype, DType WeightDtype>
-int OpDepthwiseConv2d<InDtype, WeightDtype>::eval()
+template <DType InDtype, DType WeightDtype, DType AccDtype>
+int OpDepthwiseConv2d<InDtype, WeightDtype, AccDtype>::eval()
 {
     int in_batch    = this->input->getShape()[0];
     int in_height   = this->input->getShape()[1];
@@ -1002,12 +1009,14 @@
     int dilation_h     = this->attribute->dilation()[0];
     int dilation_w     = this->attribute->dilation()[1];
 
+    tosa::DType accum_dtype = (tosa::DType)this->attribute->accum_dtype();
+
     DEBUG_INFO(OP,
                "perform OpDepthwiseConv2d, input.shape=[%d,%d,%d,%d], weight.shape=[%d,%d,%d,%d], "
-               "output.shape=[%d,%d,%d,%d], stride=[%d,%d], dilation=[%d,%d], pad=[%d,%d,%d,%d]",
+               "output.shape=[%d,%d,%d,%d], stride=[%d,%d], dilation=[%d,%d], pad=[%d,%d,%d,%d], accum_dtype=%s",
                in_batch, in_height, in_width, in_channels, f_height, f_width, f_in_channels, f_multiplier, out_batch,
                out_height, out_width, out_channels, stride_h, stride_w, dilation_h, dilation_w, pad_top,
-               pad_bottom, pad_left, pad_right);
+               pad_bottom, pad_left, pad_right, EnumNamesDType()[accum_dtype]);
 
     Eigen::array<std::pair<int32_t, int32_t>, 4> pad;
     pad[0] = std::make_pair(0, 0);
@@ -1061,9 +1070,10 @@
                         {
                             for (int fw = 0; fw < f_width; fw++)
                             {
+                                // Perform multiplication in AccEigenType then cast to OutEigenType
                                 this->output->getTensor()(ob, oh, ow, ic * f_multiplier + cm) +=
-                                    ((AccEigenType)input_extract_patches(ob, fh, fw, ow * out_height + oh, ic) *
-                                     (AccEigenType)weight_val(fh, fw, ic, cm));
+                                (OutEigenType)((AccEigenType)input_extract_patches(ob, fh, fw, ow * out_height + oh, ic) *
+                                (AccEigenType)weight_val(fh, fw, ic, cm));
                             }
                         }
                     }
@@ -1074,15 +1084,15 @@
 
     if (AccDtype == DType_INT48)
     {
-        this->output->getTensor() = this->output->getTensor().cwiseMax((AccEigenType)AccQMin);
-        this->output->getTensor() = this->output->getTensor().cwiseMin((AccEigenType)AccQMax);
+        this->output->getTensor() = this->output->getTensor().cwiseMax((OutEigenType)AccQMin);
+        this->output->getTensor() = this->output->getTensor().cwiseMin((OutEigenType)AccQMax);
     }
 
     return GraphNode::eval();
 }
 
-template <DType InDtype, DType WeightDtype>
-OpFullyConnected<InDtype, WeightDtype>::OpFullyConnected(SubgraphTraverser* sgt_,
+template <DType InDtype, DType WeightDtype, DType AccDtype>
+OpFullyConnected<InDtype, WeightDtype, AccDtype>::OpFullyConnected(SubgraphTraverser* sgt_,
                                                          TosaAttributeBase* attribute_,
                                                          uint64_t id_)
     : GraphNode(sgt_, Op_FULLY_CONNECTED, id_)
@@ -1093,15 +1103,15 @@
     INIT_ATTRIBUTE(FullyConnected);
 }
 
-template <DType InDtype, DType WeightDtype>
-OpFullyConnected<InDtype, WeightDtype>::~OpFullyConnected()
+template <DType InDtype, DType WeightDtype, DType AccDtype>
+OpFullyConnected<InDtype, WeightDtype, AccDtype>::~OpFullyConnected()
 {
     if (attribute)
         delete attribute;
 }
 
-template <DType InDtype, DType WeightDtype>
-int OpFullyConnected<InDtype, WeightDtype>::checkTensorAttributes()
+template <DType InDtype, DType WeightDtype, DType AccDtype>
+int OpFullyConnected<InDtype, WeightDtype, AccDtype>::checkTensorAttributes()
 {
     if (validateRequiredOperands())
         return 1;
@@ -1128,9 +1138,9 @@
     }
 
     ERROR_IF(outputs[0]->getDtype() != AccDtype,
-             "OpFullyConnected: Output data type not supported for this configuration of operator");
+                "OpFullyConnected: Output data type not supported for this configuration of operator");
 
-    output = dynamic_cast<TosaReference::TensorTemplate<TAcc>*>(outputs[0]);
+    output = dynamic_cast<TosaReference::TensorTemplate<TOut>*>(outputs[0]);
 
     ERROR_IF(InDtype != DType_INT8 && attribute->input_zp() != 0, "OpFullyConnected: Input zeropoint must be zero for non int8_t data");
     ERROR_IF(WeightDtype != DType_INT8 && attribute->weight_zp() != 0, "OpFullyConnected: Weight zeropoint must be zero for non int8_t data");
@@ -1138,8 +1148,8 @@
     return 0;
 }
 
-template <DType InDtype, DType WeightDtype>
-int OpFullyConnected<InDtype, WeightDtype>::eval()
+template <DType InDtype, DType WeightDtype, DType AccDtype>
+int OpFullyConnected<InDtype, WeightDtype, AccDtype>::eval()
 {
     typedef Eigen::Tensor<int, 1>::DimensionPair DimPair;
     Eigen::array<DimPair, 1> dims{ { DimPair(1, 0) } };
@@ -1163,19 +1173,19 @@
     }
 
     this->output->getTensor() =
-        input_val.template cast<AccEigenType>().contract(weight_val.template cast<AccEigenType>(), dims) +
-        this->bias->getTensor().reshape(bias_reshape).broadcast(bias_bcast);
+        input_val.template cast<AccEigenType>().contract(weight_val.template cast<AccEigenType>(), dims).template cast<OutEigenType>() +
+            this->bias->getTensor().reshape(bias_reshape).broadcast(bias_bcast);
 
     if (AccDtype == DType_INT48)
     {
-        this->output->getTensor() = this->output->getTensor().cwiseMax((AccEigenType)AccQMin);
-        this->output->getTensor() = this->output->getTensor().cwiseMin((AccEigenType)AccQMax);
+        this->output->getTensor() = this->output->getTensor().cwiseMax((OutEigenType)AccQMin);
+        this->output->getTensor() = this->output->getTensor().cwiseMin((OutEigenType)AccQMax);
     }
     return GraphNode::eval();
 }
 
-template <DType Dtype>
-OpMatMul<Dtype>::OpMatMul(SubgraphTraverser* sgt_,
+template <DType Dtype, DType AccDtype>
+OpMatMul<Dtype, AccDtype>::OpMatMul(SubgraphTraverser* sgt_,
                           TosaAttributeBase* attribute_,
                           uint64_t id_)
     : GraphNode(sgt_, Op_MATMUL, id_)
@@ -1186,15 +1196,15 @@
     INIT_ATTRIBUTE(MatMul);
 }
 
-template <DType Dtype>
-OpMatMul<Dtype>::~OpMatMul()
+template <DType Dtype, DType AccDtype>
+OpMatMul<Dtype, AccDtype>::~OpMatMul()
 {
     if (attribute)
         delete attribute;
 }
 
-template <DType Dtype>
-int OpMatMul<Dtype>::checkTensorAttributes()
+template <DType Dtype, DType AccDtype>
+int OpMatMul<Dtype, AccDtype>::checkTensorAttributes()
 {
     if (validateRequiredOperands())
         return 1;
@@ -1205,11 +1215,11 @@
     }
 
     ERROR_IF(outputs[0]->getDtype() != AccDtype,
-             "OpMatMul: Output data type not supported for this configuration of operator");
+                "OpMatMul: Output data type not supported for this configuration of operator");
 
     a      = dynamic_cast<TosaReference::TensorTemplate<TIn>*>(inputs[0]);
     b      = dynamic_cast<TosaReference::TensorTemplate<TIn>*>(inputs[1]);
-    output = dynamic_cast<TosaReference::TensorTemplate<TAcc>*>(outputs[0]);
+    output = dynamic_cast<TosaReference::TensorTemplate<TOut>*>(outputs[0]);
 
     ASSERT_MEM(a && b && output);
 
@@ -1255,8 +1265,8 @@
     return 0;
 }
 
-template <DType Dtype>
-int OpMatMul<Dtype>::eval()
+template <DType Dtype, DType AccDtype>
+int OpMatMul<Dtype, AccDtype>::eval()
 {
     typedef Eigen::Tensor<int, 1>::DimensionPair DimPair;
     Eigen::array<DimPair, 1> dims{ { DimPair(1, 0) } };
@@ -1289,22 +1299,22 @@
         TInRank2 b_rank2_val = b_val.slice(b_begin_array, b_size_array).reshape(b_rank2_shape);
         TAccRank2 output_rank2_val =
             a_rank2_val.template cast<AccEigenType>().contract(b_rank2_val.template cast<AccEigenType>(), dims);
-        TAcc output_rank3_val = output_rank2_val.reshape(output_rank3_shape);
+        TOut output_rank3_val = output_rank2_val.reshape(output_rank3_shape).template cast<OutEigenType>();
         if (i == 0)
         {
             this->output->getTensor() = output_rank3_val;
         }
         else
         {
-            TAcc temp                 = this->output->getTensor().concatenate(output_rank3_val, 0);
+            TOut temp                 = this->output->getTensor().concatenate(output_rank3_val, 0);
             this->output->getTensor() = temp;
         }
     }
 
     if (AccDtype == DType_INT48)
     {
-        this->output->getTensor() = this->output->getTensor().cwiseMax((AccEigenType)AccQMin);
-        this->output->getTensor() = this->output->getTensor().cwiseMin((AccEigenType)AccQMax);
+        this->output->getTensor() = this->output->getTensor().cwiseMax((OutEigenType)AccQMin);
+        this->output->getTensor() = this->output->getTensor().cwiseMin((OutEigenType)AccQMax);
     }
 
     return GraphNode::eval();
@@ -1442,8 +1452,8 @@
     return GraphNode::eval();
 }
 
-template <DType InDtype, DType WeightDtype>
-OpTransposeConv2d<InDtype, WeightDtype>::OpTransposeConv2d(SubgraphTraverser* sgt_,
+template <DType InDtype, DType WeightDtype, DType AccDtype>
+OpTransposeConv2d<InDtype, WeightDtype, AccDtype>::OpTransposeConv2d(SubgraphTraverser* sgt_,
                                                            TosaAttributeBase* attribute_,
                                                            uint64_t id_)
     : GraphNode(sgt_, Op_TRANSPOSE_CONV2D, id_)
@@ -1454,15 +1464,15 @@
     INIT_ATTRIBUTE(TransposeConv);
 }
 
-template <DType InDtype, DType WeightDtype>
-OpTransposeConv2d<InDtype, WeightDtype>::~OpTransposeConv2d()
+template <DType InDtype, DType WeightDtype, DType AccDtype>
+OpTransposeConv2d<InDtype, WeightDtype, AccDtype>::~OpTransposeConv2d()
 {
     if (attribute)
         delete attribute;
 }
 
-template <DType InDtype, DType WeightDtype>
-int OpTransposeConv2d<InDtype, WeightDtype>::checkTensorAttributes()
+template <DType InDtype, DType WeightDtype, DType AccDtype>
+int OpTransposeConv2d<InDtype, WeightDtype, AccDtype>::checkTensorAttributes()
 {
     if (validateRequiredOperands())
         return 1;
@@ -1473,12 +1483,12 @@
     }
 
     ERROR_IF(outputs[0]->getDtype() != AccDtype,
-             "OpTransposeConv2d: Output data type not supported for this configuration of operator");
+                "OpTransposeConv2d: Output data type not supported for this configuration of operator");
 
     input  = dynamic_cast<TosaReference::TensorTemplate<TIn>*>(inputs[0]);
     weight = dynamic_cast<TosaReference::TensorTemplate<TWeight>*>(inputs[1]);
     bias   = dynamic_cast<TosaReference::TensorTemplate<TBias>*>(inputs[2]);
-    output = dynamic_cast<TosaReference::TensorTemplate<TAcc>*>(outputs[0]);
+    output = dynamic_cast<TosaReference::TensorTemplate<TOut>*>(outputs[0]);
 
     if (attribute->out_pad().size() != 4)
     {
@@ -1556,8 +1566,8 @@
     return 0;
 }
 
-template <DType InDtype, DType WeightDtype>
-int OpTransposeConv2d<InDtype, WeightDtype>::eval()
+template <DType InDtype, DType WeightDtype, DType AccDtype>
+int OpTransposeConv2d<InDtype, WeightDtype, AccDtype>::eval()
 {
     int in_batch    = this->input->getShape()[0];
     int in_height   = this->input->getShape()[1];
@@ -1584,6 +1594,8 @@
     int stride_h = this->attribute->stride()[0];
     int stride_w = this->attribute->stride()[1];
 
+    tosa::DType accum_dtype = (tosa::DType)this->attribute->accum_dtype();
+
     ERROR_IF(in_batch != out_batch, "OpTransposeConv2d: tensor batch mismatch %d != %d", in_batch, out_batch);
     ERROR_IF(f_in_channels != in_channels, "OpTransposeConv2d: tensor input channel mismatch %d != %d", f_in_channels,
              in_channels);
@@ -1594,10 +1606,10 @@
 
     DEBUG_INFO(OP,
                "perform OpTransposeConv2d, input.shape=[%d,%d,%d,%d], weight.shape=[%d,%d,%d,%d], "
-               "output.shape=[%d,%d,%d,%d], stride=[%d,%d], out_pad=[%d,%d,%d,%d]",
+               "output.shape=[%d,%d,%d,%d], stride=[%d,%d], out_pad=[%d,%d,%d,%d], accum_dtype=%s",
                in_batch, in_height, in_width, in_channels, f_height, f_width, f_out_channels, f_in_channels,
                out_batch, out_height, out_width, out_channels, stride_h, stride_w, out_pad_top,
-               out_pad_bottom, out_pad_left, out_pad_right);
+               out_pad_bottom, out_pad_left, out_pad_right, EnumNamesDType()[accum_dtype]);
 
     TIn input_val      = this->input->getTensor();
     TWeight weight_val = this->weight->getTensor();
@@ -1645,8 +1657,8 @@
                                 if ((out_x >= 0 && out_x < out_width) && (out_y >= 0 && out_y < out_height))
                                 {
                                     this->output->getTensor()(ob, out_y, out_x, oc) +=
-                                        ((AccEigenType)input_val(ob, ih, iw, ic) *
-                                         (AccEigenType)weight_val(oc, fh, fw, ic));
+                                        (OutEigenType) ((AccEigenType)input_val(ob, ih, iw, ic) *
+                                            (AccEigenType)weight_val(oc, fh, fw, ic));
                                 }
                             }
                         }
@@ -1658,51 +1670,68 @@
 
     if (AccDtype == DType_INT48)
     {
-        this->output->getTensor() = this->output->getTensor().cwiseMax((AccEigenType)AccQMin);
-        this->output->getTensor() = this->output->getTensor().cwiseMin((AccEigenType)AccQMax);
+        this->output->getTensor() = this->output->getTensor().cwiseMax((OutEigenType)AccQMin);
+        this->output->getTensor() = this->output->getTensor().cwiseMin((OutEigenType)AccQMax);
     }
 
     return GraphNode::eval();
 }
 
 // template explicit instantiation
+DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpArgMax, FP16);
 DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpArgMax, FLOAT);
 DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpArgMax, INT8);
 DEF_INSTANTIATE_RANK1_6_ONE_RANK_ONE_TYPE(OpArgMax, INT16);
 
-DEF_INSTANTIATE_ONE_TYPE(OpAvgPool2d, FLOAT)
-DEF_INSTANTIATE_ONE_TYPE(OpAvgPool2d, INT8)
-DEF_INSTANTIATE_ONE_TYPE(OpAvgPool2d, INT16)
+DEF_INSTANTIATE_ONE_TYPE_ONE_ACCUM(OpAvgPool2d, FP16, FP16);
+DEF_INSTANTIATE_ONE_TYPE_ONE_ACCUM(OpAvgPool2d, FP16, FLOAT);
+DEF_INSTANTIATE_ONE_TYPE_ONE_ACCUM(OpAvgPool2d, FLOAT, FLOAT);
+DEF_INSTANTIATE_ONE_TYPE_ONE_ACCUM(OpAvgPool2d, INT8, INT32);
+DEF_INSTANTIATE_ONE_TYPE_ONE_ACCUM(OpAvgPool2d, INT16, INT32);
 
-DEF_INSTANTIATE_TWO_TYPE(OpConv2d, FLOAT, FLOAT);
-DEF_INSTANTIATE_TWO_TYPE(OpConv2d, INT8, INT4);
-DEF_INSTANTIATE_TWO_TYPE(OpConv2d, INT8, INT8);
-DEF_INSTANTIATE_TWO_TYPE(OpConv2d, INT16, INT8);
+                                          // [in_t, weight_t, acc_t]
+DEF_INSTANTIATE_TWO_TYPE_ONE_ACCUM(OpConv2d, FP16, FP16, FP16);
+DEF_INSTANTIATE_TWO_TYPE_ONE_ACCUM(OpConv2d, FP16, FP16, FLOAT);
+DEF_INSTANTIATE_TWO_TYPE_ONE_ACCUM(OpConv2d, FLOAT, FLOAT, FLOAT);
+DEF_INSTANTIATE_TWO_TYPE_ONE_ACCUM(OpConv2d, INT8, INT4, INT32);
+DEF_INSTANTIATE_TWO_TYPE_ONE_ACCUM(OpConv2d, INT8, INT8, INT32);
+DEF_INSTANTIATE_TWO_TYPE_ONE_ACCUM(OpConv2d, INT16, INT8, INT48);
 
-DEF_INSTANTIATE_TWO_TYPE(OpConv3d, FLOAT, FLOAT);
-DEF_INSTANTIATE_TWO_TYPE(OpConv3d, INT8, INT4);
-DEF_INSTANTIATE_TWO_TYPE(OpConv3d, INT8, INT8);
-DEF_INSTANTIATE_TWO_TYPE(OpConv3d, INT16, INT8);
+DEF_INSTANTIATE_TWO_TYPE_ONE_ACCUM(OpConv3d, FP16, FP16, FP16);
+DEF_INSTANTIATE_TWO_TYPE_ONE_ACCUM(OpConv3d, FP16, FP16, FLOAT);
+DEF_INSTANTIATE_TWO_TYPE_ONE_ACCUM(OpConv3d, FLOAT, FLOAT, FLOAT);
+DEF_INSTANTIATE_TWO_TYPE_ONE_ACCUM(OpConv3d, INT8, INT4, INT32);
+DEF_INSTANTIATE_TWO_TYPE_ONE_ACCUM(OpConv3d, INT8, INT8, INT32);
+DEF_INSTANTIATE_TWO_TYPE_ONE_ACCUM(OpConv3d, INT16, INT8, INT48);
 
-DEF_INSTANTIATE_TWO_TYPE(OpDepthwiseConv2d, FLOAT, FLOAT);
-DEF_INSTANTIATE_TWO_TYPE(OpDepthwiseConv2d, INT8, INT4);
-DEF_INSTANTIATE_TWO_TYPE(OpDepthwiseConv2d, INT8, INT8);
-DEF_INSTANTIATE_TWO_TYPE(OpDepthwiseConv2d, INT16, INT8);
+DEF_INSTANTIATE_TWO_TYPE_ONE_ACCUM(OpDepthwiseConv2d, FP16, FP16, FP16);
+DEF_INSTANTIATE_TWO_TYPE_ONE_ACCUM(OpDepthwiseConv2d, FP16, FP16, FLOAT);
+DEF_INSTANTIATE_TWO_TYPE_ONE_ACCUM(OpDepthwiseConv2d, FLOAT, FLOAT, FLOAT);
+DEF_INSTANTIATE_TWO_TYPE_ONE_ACCUM(OpDepthwiseConv2d, INT8, INT4, INT32);
+DEF_INSTANTIATE_TWO_TYPE_ONE_ACCUM(OpDepthwiseConv2d, INT8, INT8, INT32);
+DEF_INSTANTIATE_TWO_TYPE_ONE_ACCUM(OpDepthwiseConv2d, INT16, INT8, INT48);
 
-DEF_INSTANTIATE_TWO_TYPE(OpFullyConnected, FLOAT, FLOAT);
-DEF_INSTANTIATE_TWO_TYPE(OpFullyConnected, INT8, INT4);
-DEF_INSTANTIATE_TWO_TYPE(OpFullyConnected, INT8, INT8);
-DEF_INSTANTIATE_TWO_TYPE(OpFullyConnected, INT16, INT8);
+DEF_INSTANTIATE_TWO_TYPE_ONE_ACCUM(OpFullyConnected, FP16, FP16, FP16);
+DEF_INSTANTIATE_TWO_TYPE_ONE_ACCUM(OpFullyConnected, FP16, FP16, FLOAT);
+DEF_INSTANTIATE_TWO_TYPE_ONE_ACCUM(OpFullyConnected, FLOAT, FLOAT, FLOAT);
+DEF_INSTANTIATE_TWO_TYPE_ONE_ACCUM(OpFullyConnected, INT8, INT4, INT32);
+DEF_INSTANTIATE_TWO_TYPE_ONE_ACCUM(OpFullyConnected, INT8, INT8, INT32);
+DEF_INSTANTIATE_TWO_TYPE_ONE_ACCUM(OpFullyConnected, INT16, INT8, INT48);
 
-DEF_INSTANTIATE_ONE_TYPE(OpMatMul, INT8);
-DEF_INSTANTIATE_ONE_TYPE(OpMatMul, INT16);
-DEF_INSTANTIATE_ONE_TYPE(OpMatMul, FLOAT);
+DEF_INSTANTIATE_ONE_TYPE_ONE_ACCUM(OpMatMul, INT8, INT32);
+DEF_INSTANTIATE_ONE_TYPE_ONE_ACCUM(OpMatMul, INT16, INT48);
+DEF_INSTANTIATE_ONE_TYPE_ONE_ACCUM(OpMatMul, FP16, FP16);
+DEF_INSTANTIATE_ONE_TYPE_ONE_ACCUM(OpMatMul, FP16, FLOAT);
+DEF_INSTANTIATE_ONE_TYPE_ONE_ACCUM(OpMatMul, FLOAT, FLOAT);
 
+DEF_INSTANTIATE_ONE_TYPE(OpMaxPool2d, FP16);
 DEF_INSTANTIATE_ONE_TYPE(OpMaxPool2d, FLOAT);
 DEF_INSTANTIATE_ONE_TYPE(OpMaxPool2d, INT8);
 DEF_INSTANTIATE_ONE_TYPE(OpMaxPool2d, INT16);
 
-DEF_INSTANTIATE_TWO_TYPE(OpTransposeConv2d, FLOAT, FLOAT);
-DEF_INSTANTIATE_TWO_TYPE(OpTransposeConv2d, INT8, INT4);
-DEF_INSTANTIATE_TWO_TYPE(OpTransposeConv2d, INT8, INT8);
-DEF_INSTANTIATE_TWO_TYPE(OpTransposeConv2d, INT16, INT8);
+DEF_INSTANTIATE_TWO_TYPE_ONE_ACCUM(OpTransposeConv2d, FP16, FP16, FP16);
+DEF_INSTANTIATE_TWO_TYPE_ONE_ACCUM(OpTransposeConv2d, FP16, FP16, FLOAT);
+DEF_INSTANTIATE_TWO_TYPE_ONE_ACCUM(OpTransposeConv2d, FLOAT, FLOAT, FLOAT);
+DEF_INSTANTIATE_TWO_TYPE_ONE_ACCUM(OpTransposeConv2d, INT8, INT4, INT32);
+DEF_INSTANTIATE_TWO_TYPE_ONE_ACCUM(OpTransposeConv2d, INT8, INT8, INT32);
+DEF_INSTANTIATE_TWO_TYPE_ONE_ACCUM(OpTransposeConv2d, INT16, INT8, INT48);
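
Note on the instantiation list above: every convolution-style operator now takes the accumulator dtype as an explicit template parameter, and FP16 inputs get both an FP16-accumulating and a FLOAT-accumulating variant. The following standalone sketch (not part of this patch, built only against the bundled half.hpp) shows why the accumulator width is worth modelling separately: an fp16 accumulator rounds after every multiply-add and drifts away from the fp32 result.

#include <cstdio>
#include <vector>
#include "half.hpp"

int main()
{
    using half_float::half;
    std::vector<float> in(2048, 0.01f), w(2048, 0.01f);

    half acc_fp16(0.0f);    // acc_t = FP16: rounded to half precision after every step
    float acc_fp32 = 0.0f;  // acc_t = FLOAT: accumulated in full fp32 precision
    for (size_t i = 0; i < in.size(); i++)
    {
        acc_fp16 += half(in[i]) * half(w[i]);
        acc_fp32 += in[i] * w[i];
    }
    // The fp32 result is roughly 0.2048; the fp16 accumulator lands well away from it.
    printf("fp16 acc = %f, fp32 acc = %f\n", static_cast<float>(acc_fp16), acc_fp32);
    return 0;
}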
diff --git a/reference_model/src/ops/tensor_ops.h b/reference_model/src/ops/tensor_ops.h
index 24eadeb..fd6dd25 100644
--- a/reference_model/src/ops/tensor_ops.h
+++ b/reference_model/src/ops/tensor_ops.h
@@ -1,5 +1,5 @@
 
-// Copyright (c) 2020, ARM Limited.
+// Copyright (c) 2020-2022, ARM Limited.
 //
 //    Licensed under the Apache License, Version 2.0 (the "License");
 //    you may not use this file except in compliance with the License.
@@ -45,7 +45,7 @@
     TosaReference::TensorTemplate<TOut>* output;
 };
 
-template <DType Dtype>
+template <DType Dtype, DType AccDtype>
 class OpAvgPool2d : public GraphNode
 {
 public:
@@ -55,9 +55,8 @@
     virtual int checkTensorAttributes();
     virtual int eval();
 
-    static constexpr DType AccDtype = GetAccDType<Dtype, Dtype>::value;
     using InEigenType               = typename GetEigenType<Dtype>::type;
-    using AccEigenType              = typename GetEigenType<AccDtype>::type;
+    using AccEigenType              = typename GetAccEigenType<AccDtype>::type; // Note: different from GetEigenType
     using OutEigenType              = typename GetEigenType<Dtype>::type;
     using TIn                       = Eigen::Tensor<InEigenType, 4>;
     using TOut                      = Eigen::Tensor<OutEigenType, 4>;
@@ -75,7 +74,7 @@
     ETensor1<int32_t> calculate_div_map_1d(int in_size, int out_size, int kernel_size, int stride, int32_t padding_left, int32_t padding_right);
 };
 
-template <DType InDtype, DType WeightDtype>
+template <DType InDtype, DType WeightDtype, DType AccDtype>
 class OpConv2d : public GraphNode
 {
 public:
@@ -85,15 +84,14 @@
     virtual int checkTensorAttributes() final;
     virtual int eval() final;
 
-    static constexpr DType AccDtype = GetAccDType<InDtype, WeightDtype>::value;
-
     using InEigenType     = typename GetEigenType<InDtype>::type;
     using WeightEigenType = typename GetEigenType<WeightDtype>::type;
-    using AccEigenType    = typename GetEigenType<AccDtype>::type;
+    using AccEigenType    = typename GetAccEigenType<AccDtype>::type; // Note: different from GetEigenType
+    using OutEigenType    = typename GetEigenType<AccDtype>::type;
     using TIn             = Eigen::Tensor<InEigenType, 4>;
     using TWeight         = Eigen::Tensor<WeightEigenType, 4>;
-    using TBias           = Eigen::Tensor<AccEigenType, 1>;
-    using TAcc            = Eigen::Tensor<AccEigenType, 4>;
+    using TBias           = Eigen::Tensor<OutEigenType, 1>;
+    using TOut            = Eigen::Tensor<OutEigenType, 4>;
 
     static constexpr int64_t AccQMin = GetQMin<AccDtype>::value;
     static constexpr int64_t AccQMax = GetQMax<AccDtype>::value;
@@ -102,11 +100,11 @@
     TosaReference::TensorTemplate<TIn>* input;
     TosaReference::TensorTemplate<TWeight>* weight;
     TosaReference::TensorTemplate<TBias>* bias;
-    TosaReference::TensorTemplate<TAcc>* output;
+    TosaReference::TensorTemplate<TOut>* output;
     tosa::TosaConvAttribute* attribute;
 };
 
-template <DType InDtype, DType WeightDtype>
+template <DType InDtype, DType WeightDtype, DType AccDtype>
 class OpConv3d : public GraphNode
 {
 public:
@@ -116,15 +114,14 @@
     virtual int checkTensorAttributes() final;
     virtual int eval() final;
 
-    static constexpr DType AccDtype = GetAccDType<InDtype, WeightDtype>::value;
-
     using InEigenType     = typename GetEigenType<InDtype>::type;
     using WeightEigenType = typename GetEigenType<WeightDtype>::type;
-    using AccEigenType    = typename GetEigenType<AccDtype>::type;
+    using AccEigenType    = typename GetAccEigenType<AccDtype>::type; // Note: different from GetEigenType
+    using OutEigenType    = typename GetEigenType<AccDtype>::type;
     using TIn             = Eigen::Tensor<InEigenType, 5>;
     using TWeight         = Eigen::Tensor<WeightEigenType, 5>;
-    using TBias           = Eigen::Tensor<AccEigenType, 1>;
-    using TAcc            = Eigen::Tensor<AccEigenType, 5>;
+    using TBias           = Eigen::Tensor<OutEigenType, 1>;
+    using TOut            = Eigen::Tensor<OutEigenType, 5>;
 
     static constexpr int64_t AccQMin = GetQMin<AccDtype>::value;
     static constexpr int64_t AccQMax = GetQMax<AccDtype>::value;
@@ -133,11 +130,11 @@
     TosaReference::TensorTemplate<TIn>* input;
     TosaReference::TensorTemplate<TWeight>* weight;
     TosaReference::TensorTemplate<TBias>* bias;
-    TosaReference::TensorTemplate<TAcc>* output;
+    TosaReference::TensorTemplate<TOut>* output;
     tosa::TosaConvAttribute* attribute;
 };
 
-template <DType InDtype, DType WeightDtype>
+template <DType InDtype, DType WeightDtype, DType AccDtype>
 class OpDepthwiseConv2d : public GraphNode
 {
 public:
@@ -147,15 +144,14 @@
     virtual int checkTensorAttributes() final;
     virtual int eval() final;
 
-    static constexpr DType AccDtype = GetAccDType<InDtype, WeightDtype>::value;
-
     using InEigenType     = typename GetEigenType<InDtype>::type;
     using WeightEigenType = typename GetEigenType<WeightDtype>::type;
-    using AccEigenType    = typename GetEigenType<AccDtype>::type;
+    using AccEigenType    = typename GetAccEigenType<AccDtype>::type; // Note: different from GetEigenType
+    using OutEigenType    = typename GetEigenType<AccDtype>::type;
     using TIn             = Eigen::Tensor<InEigenType, 4>;
     using TWeight         = Eigen::Tensor<WeightEigenType, 4>;
-    using TBias           = Eigen::Tensor<AccEigenType, 1>;
-    using TAcc            = Eigen::Tensor<AccEigenType, 4>;
+    using TBias           = Eigen::Tensor<OutEigenType, 1>;
+    using TOut            = Eigen::Tensor<OutEigenType, 4>;
 
     static constexpr int64_t AccQMin = GetQMin<AccDtype>::value;
     static constexpr int64_t AccQMax = GetQMax<AccDtype>::value;
@@ -164,11 +160,11 @@
     TosaReference::TensorTemplate<TIn>* input;
     TosaReference::TensorTemplate<TWeight>* weight;
     TosaReference::TensorTemplate<TBias>* bias;
-    TosaReference::TensorTemplate<TAcc>* output;
+    TosaReference::TensorTemplate<TOut>* output;
     tosa::TosaConvAttribute* attribute;
 };
 
-template <DType InDtype, DType WeightDtype>
+template <DType InDtype, DType WeightDtype, DType AccDtype>
 class OpFullyConnected : public GraphNode
 {
 public:
@@ -178,14 +174,14 @@
     virtual int checkTensorAttributes() final;
     virtual int eval() final;
 
-    static constexpr DType AccDtype = GetAccDType<InDtype, WeightDtype>::value;
     using InEigenType               = typename GetEigenType<InDtype>::type;
     using WeightEigenType           = typename GetEigenType<WeightDtype>::type;
-    using AccEigenType              = typename GetEigenType<AccDtype>::type;
+    using AccEigenType              = typename GetAccEigenType<AccDtype>::type; // Note: different from GetEigenType
+    using OutEigenType              = typename GetEigenType<AccDtype>::type;
     using TIn                       = Eigen::Tensor<InEigenType, 2>;
     using TWeight                   = Eigen::Tensor<WeightEigenType, 2>;
-    using TBias                     = Eigen::Tensor<AccEigenType, 1>;
-    using TAcc                      = Eigen::Tensor<AccEigenType, 2>;
+    using TBias                     = Eigen::Tensor<OutEigenType, 1>;
+    using TOut                      = Eigen::Tensor<OutEigenType, 2>;
 
     static constexpr int64_t AccQMin = GetQMin<AccDtype>::value;
     static constexpr int64_t AccQMax = GetQMax<AccDtype>::value;
@@ -194,12 +190,12 @@
     TosaReference::TensorTemplate<TIn>* input;
     TosaReference::TensorTemplate<TWeight>* weight;
     TosaReference::TensorTemplate<TBias>* bias;
-    TosaReference::TensorTemplate<TAcc>* output;
+    TosaReference::TensorTemplate<TOut>* output;
 
     tosa::TosaFullyConnectedAttribute* attribute;
 };
 
-template <DType Dtype>
+template <DType Dtype, DType AccDtype>
 class OpMatMul : public GraphNode
 {
 public:
@@ -209,11 +205,11 @@
     virtual int checkTensorAttributes() final;
     virtual int eval() final;
 
-    static constexpr DType AccDtype  = GetAccDType<Dtype, Dtype>::value;
     using InEigenType                = typename GetEigenType<Dtype>::type;
-    using AccEigenType               = typename GetEigenType<AccDtype>::type;
+    using AccEigenType               = typename GetAccEigenType<AccDtype>::type; // Note: different from GetEigenType
+    using OutEigenType               = typename GetEigenType<AccDtype>::type;
     using TIn                        = Eigen::Tensor<InEigenType, 3>;
-    using TAcc                       = Eigen::Tensor<AccEigenType, 3>;
+    using TOut                       = Eigen::Tensor<OutEigenType, 3>;
     using TInRank2                   = Eigen::Tensor<InEigenType, 2>;
     using TAccRank2                  = Eigen::Tensor<AccEigenType, 2>;
     static constexpr int64_t AccQMin = GetQMin<AccDtype>::value;
@@ -222,7 +218,7 @@
 protected:
     TosaReference::TensorTemplate<TIn>* a;
     TosaReference::TensorTemplate<TIn>* b;
-    TosaReference::TensorTemplate<TAcc>* output;
+    TosaReference::TensorTemplate<TOut>* output;
     int64_t N;
     int64_t H;
     int64_t W;
@@ -252,7 +248,7 @@
     tosa::TosaPoolAttribute* attribute;
 };
 
-template <DType InDtype, DType WeightDtype>
+template <DType InDtype, DType WeightDtype, DType AccDtype>
 class OpTransposeConv2d : public GraphNode
 {
 public:
@@ -262,15 +258,14 @@
     virtual int checkTensorAttributes() final;
     virtual int eval() final;
 
-    static constexpr DType AccDtype = GetAccDType<InDtype, WeightDtype>::value;
-
     using InEigenType     = typename GetEigenType<InDtype>::type;
     using WeightEigenType = typename GetEigenType<WeightDtype>::type;
-    using AccEigenType    = typename GetEigenType<AccDtype>::type;
+    using AccEigenType    = typename GetAccEigenType<AccDtype>::type; // Note: different from GetEigenType
+    using OutEigenType    = typename GetEigenType<AccDtype>::type;
     using TIn             = Eigen::Tensor<InEigenType, 4>;
     using TWeight         = Eigen::Tensor<WeightEigenType, 4>;
-    using TBias           = Eigen::Tensor<AccEigenType, 1>;
-    using TAcc            = Eigen::Tensor<AccEigenType, 4>;
+    using TBias           = Eigen::Tensor<OutEigenType, 1>;
+    using TOut            = Eigen::Tensor<OutEigenType, 4>;
 
     static constexpr int64_t AccQMin = GetQMin<AccDtype>::value;
     static constexpr int64_t AccQMax = GetQMax<AccDtype>::value;
@@ -279,7 +274,7 @@
     TosaReference::TensorTemplate<TIn>* input;
     TosaReference::TensorTemplate<TWeight>* weight;
     TosaReference::TensorTemplate<TBias>* bias;
-    TosaReference::TensorTemplate<TAcc>* output;
+    TosaReference::TensorTemplate<TOut>* output;
     TosaTransposeConvAttribute* attribute;
 };
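
The repeated "Note: different from GetEigenType" comments mark the central split in this header: GetEigenType is the storage type (float, even for FP16 tensors), while GetAccEigenType is the type the sums are carried in. The trait itself lives in template_types.h and is not part of this hunk; the self-contained sketch below only illustrates the intended mapping, on the assumption that FP16 accumulation uses half_float::half and every other dtype accumulates in its ordinary storage type.

#include <cstdint>
#include <type_traits>
#include "half.hpp"

// Illustrative stand-ins for the real traits in template_types.h (not the actual definitions).
enum DType { DType_FP16, DType_FLOAT, DType_INT32 };

// Storage type: FP16 tensors are held widened to float in memory.
template <DType D> struct GetEigenType { using type = float; };
template <> struct GetEigenType<DType_INT32> { using type = int32_t; };

// Accumulator type: same as storage, except FP16 sums must be rounded to
// half precision after every operation, so they use a genuine half type.
template <DType D> struct GetAccEigenType { using type = typename GetEigenType<D>::type; };
template <> struct GetAccEigenType<DType_FP16> { using type = half_float::half; };

static_assert(std::is_same<GetEigenType<DType_FP16>::type, float>::value,
              "fp16 storage is widened to float");
static_assert(std::is_same<GetAccEigenType<DType_FP16>::type, half_float::half>::value,
              "fp16 accumulation happens in a real half type");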
 
diff --git a/reference_model/src/ops/type_conversion.cc b/reference_model/src/ops/type_conversion.cc
index 52de2e4..50e710a 100644
--- a/reference_model/src/ops/type_conversion.cc
+++ b/reference_model/src/ops/type_conversion.cc
@@ -1,5 +1,5 @@
 
-// Copyright (c) 2020-2021, ARM Limited.
+// Copyright (c) 2020-2022, ARM Limited.
 //
 //    Licensed under the Apache License, Version 2.0 (the "License");
 //    you may not use this file except in compliance with the License.
@@ -17,6 +17,7 @@
 #include "quant_util.h"
 #include "template_types.h"
 #include <cmath>
+#include "half.hpp"
 
 using namespace TosaReference;
 using namespace Eigen;
@@ -287,6 +288,30 @@
 }
 
 template <DType InDtype>
+CastHelper<InDtype, DType_FP16>::CastHelper()
+{
+    fcn = [](InEigenType in) -> float {
+        half_float::half out = half_float::half_cast<half_float::half, InEigenType>(in);  // Cast to half_float
+        return half_float::half_cast<float, half_float::half>(out);  // Cast to float (underlying FP16 EigenType)
+    };
+}
+
+template <DType OutDtype>
+CastHelper<DType_FP16, OutDtype>::CastHelper()
+{
+    // Assuming InEigenType = float.
+    fcn = [](float in) -> OutEigenType {
+        // Perform initial rounding in half-precision then cast back to float
+        half_float::half h = half_float::half_cast<half_float::half, float>(in);
+        h = std::round(h);
+        OutEigenType out = half_float::half_cast<float, half_float::half>(h);
+        out              = std::max<OutEigenType>(out, OutMin);
+        out              = std::min<OutEigenType>(out, OutMax);
+        return out;
+    };
+}
+
+template <DType InDtype>
 CastHelper<InDtype, DType_FLOAT>::CastHelper()
 {
     fcn = [](InEigenType in) -> float {
@@ -313,15 +338,21 @@
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, INT8, BOOL);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, INT8, INT16);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, INT8, INT32);
+DEF_INSTANTIATE_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, INT8, FP16);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, INT8, FLOAT);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, INT16, BOOL);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, INT16, INT8);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, INT16, INT32);
+DEF_INSTANTIATE_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, INT16, FP16);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, INT16, FLOAT);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, INT32, BOOL);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, INT32, INT8);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, INT32, INT16);
+DEF_INSTANTIATE_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, INT32, FP16);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, INT32, FLOAT);
+DEF_INSTANTIATE_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, FP16, INT8);
+DEF_INSTANTIATE_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, FP16, INT16);
+DEF_INSTANTIATE_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, FP16, INT32);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, FLOAT, INT8);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, FLOAT, INT16);
 DEF_INSTANTIATE_RANK0_6_ONE_RANK_TWO_TYPE(OpCast, FLOAT, INT32);
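
The two CastHelper specializations added above cover the whole FP16 cast story: casting to FP16 rounds the value to the nearest representable half and returns it widened back to float (the FP16 storage type); casting FP16 to an integer dtype rounds to the nearest integer in half precision and then clamps to the destination range. A standalone sketch mirroring those lambdas (not wired into the model; the clamp bounds are passed in explicitly rather than taken from GetQMin/GetQMax):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include "half.hpp"

// Cast to FP16: round to the nearest representable half, hand back as float.
static float cast_to_fp16(float in)
{
    half_float::half h = half_float::half_cast<half_float::half, float>(in);
    return half_float::half_cast<float, half_float::half>(h);
}

// Cast FP16 -> integer dtype: round in half precision, then clamp to [out_min, out_max].
static float cast_fp16_to_int(float in, float out_min, float out_max)
{
    half_float::half h = half_float::half_cast<half_float::half, float>(in);
    h                  = std::round(h);
    float out          = half_float::half_cast<float, half_float::half>(h);
    out                = std::max(out, out_min);
    out                = std::min(out, out_max);
    return out;
}

int main()
{
    printf("fp16(0.3f)           = %.8f\n", cast_to_fp16(0.3f));                  // ~0.30004883
    printf("to int8, in = 2.25f  : %.0f\n", cast_fp16_to_int(2.25f, -128, 127));  // 2
    printf("to int8, in = 300.0f : %.0f\n", cast_fp16_to_int(300.0f, -128, 127)); // 127 (clamped)
    return 0;
}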
diff --git a/reference_model/src/ops/type_conversion.h b/reference_model/src/ops/type_conversion.h
index 53470d1..5f197cf 100644
--- a/reference_model/src/ops/type_conversion.h
+++ b/reference_model/src/ops/type_conversion.h
@@ -1,5 +1,5 @@
 
-// Copyright (c) 2020, ARM Limited.
+// Copyright (c) 2020-2022, ARM Limited.
 //
 //    Licensed under the Apache License, Version 2.0 (the "License");
 //    you may not use this file except in compliance with the License.
@@ -101,6 +101,42 @@
 };
 
 template <DType InDtype>
+class CastHelper<InDtype, DType_FP16>
+{
+public:
+    using InEigenType  = typename GetEigenType<InDtype>::type;
+    using OutEigenType = typename GetEigenType<DType_FP16>::type;
+    using FcnType      = std::function<OutEigenType(InEigenType)>;
+    CastHelper();
+    const FcnType& get_fcn() const
+    {
+        return fcn;
+    }
+
+private:
+    FcnType fcn;
+};
+
+template <DType OutDtype>
+class CastHelper<DType_FP16, OutDtype>
+{
+public:
+    using InEigenType               = typename GetEigenType<DType_FP16>::type;
+    using OutEigenType              = typename GetEigenType<OutDtype>::type;
+    using FcnType                   = std::function<OutEigenType(InEigenType)>;
+    static constexpr int32_t OutMin = GetQMin<OutDtype>::value;
+    static constexpr int32_t OutMax = GetQMax<OutDtype>::value;
+    CastHelper();
+    const FcnType& get_fcn() const
+    {
+        return fcn;
+    }
+
+private:
+    FcnType fcn;
+};
+
+template <DType InDtype>
 class CastHelper<InDtype, DType_FLOAT>
 {
 public:
diff --git a/reference_model/src/subgraph_traverser.cc b/reference_model/src/subgraph_traverser.cc
index d0cc6cf..386f0e5 100644
--- a/reference_model/src/subgraph_traverser.cc
+++ b/reference_model/src/subgraph_traverser.cc
@@ -1,5 +1,5 @@
 
-// Copyright (c) 2020, ARM Limited.
+// Copyright (c) 2020-2022, ARM Limited.
 //
 //    Licensed under the Apache License, Version 2.0 (the "License");
 //    you may not use this file except in compliance with the License.
@@ -385,6 +385,14 @@
                     tensor->setTensorValueInt64(i64_data.size(), i64_data.data());
                 }
                 break;
+                case DType_FP16:
+                {
+                    // Interpret f16 data as float
+                    std::vector<float> f16_data;
+                    TosaSerializationHandler::ConvertU8toF16(ts->GetData(), tensor->getElementCount(), f16_data);
+                    tensor->setTensorValueFloat(f16_data.size(), f16_data.data());
+                }
+                break;
                 case DType_FLOAT:
                 {
                     std::vector<float> fp32_data;
@@ -693,7 +701,7 @@
             DType dtype = currTensor->getDtype();
 
             // Float-point disallowed
-            if (dtype == DType_FLOAT)
+            if (dtype == DType_FLOAT || dtype == DType_FP16)
             {
                 WARNING("SubgraphTraverser::validateGraph(): TOSA Base Inference profile selected: All floating point "
                         "disabled, but %s tensor %s found\n",
diff --git a/reference_model/src/tensor.cc b/reference_model/src/tensor.cc
index 7cbeb13..cbe12a9 100644
--- a/reference_model/src/tensor.cc
+++ b/reference_model/src/tensor.cc
@@ -1,5 +1,5 @@
 
-// Copyright (c) 2020-2021, ARM Limited.
+// Copyright (c) 2020-2022, ARM Limited.
 //
 //    Licensed under the Apache License, Version 2.0 (the "License");
 //    you may not use this file except in compliance with the License.
@@ -15,6 +15,7 @@
 
 #include "tensor.h"
 #include "arith_util.h"
+#include "half.hpp"
 
 using namespace TosaReference;
 using namespace Eigen;
@@ -84,6 +85,7 @@
 {
     uint32_t elements   = getElementCount();
     float* fdatabuf     = nullptr;
+    half_float::half* f16databuf = nullptr;
     int32_t* i32databuf = nullptr;
     int64_t* i64databuf = nullptr;
     bool* bdatabuf      = nullptr;
@@ -97,6 +99,14 @@
 
             nperror = NumpyUtilities::readFromNpyFile(filename, elements, fdatabuf);
             break;
+        case DType_FP16:
+            f16databuf = (half_float::half*)calloc(sizeof(half_float::half), elements);
+            ASSERT_MEM(f16databuf);
+            fdatabuf = (float*)calloc(sizeof(float), elements);
+            ASSERT_MEM(fdatabuf);
+
+            nperror = NumpyUtilities::readFromNpyFile(filename, elements, f16databuf);
+            break;
         case DType_INT32:
         case DType_UINT8:
         case DType_INT4:
@@ -146,9 +156,17 @@
 
     switch (getDtype())
     {
+        case DType_FP16:
+            // Convert from fp16 to fp32
+            for (uint32_t i = 0; i < elements; i++) {
+                fdatabuf[i] = half_float::half_cast<float, half_float::half>(f16databuf[i]);
+            }
+            // Fall through to DType_FLOAT case
         case DType_FLOAT:
             if (setTensorValueFloat(elements, fdatabuf))
             {
+                if (f16databuf)
+                    free(f16databuf);
                 free(fdatabuf);
                 return 1;
             }
@@ -187,6 +205,8 @@
 
     if (fdatabuf)
         free(fdatabuf);
+    if (f16databuf)
+        free(f16databuf);
     if (i32databuf)
         free(i32databuf);
     if (i64databuf)
@@ -200,11 +220,12 @@
 int TosaReference::Tensor::writeToNpyFile(const char* filename) const
 {
     float* fdatabuf     = nullptr;
+    half_float::half* f16databuf  = nullptr;
     int32_t* i32databuf = nullptr;
     int64_t* i64databuf = nullptr;
     bool* bdatabuf      = nullptr;
     NumpyUtilities::NPError nperror;
-    int elements = getElementCount();
+    uint32_t elements = getElementCount();
 
     switch (getDtype())
     {
@@ -222,6 +243,27 @@
 
             free(fdatabuf);
             break;
+        case DType_FP16:
+            fdatabuf = (float*)calloc(sizeof(float), elements);
+            ASSERT_MEM(fdatabuf);
+            f16databuf = (half_float::half*)calloc(sizeof(half_float::half), elements);
+            ASSERT_MEM(f16databuf);
+
+            if (getTensorValueFloat(elements, fdatabuf))
+            {
+                free(fdatabuf);
+                free(f16databuf);
+                return 1;
+            }
+            // Convert fp32 to fp16
+            for (uint32_t i = 0; i < elements; i++) {
+                f16databuf[i] = half_float::half_cast<half_float::half, float>(fdatabuf[i]);
+            }
+            nperror = NumpyUtilities::writeToNpyFile(filename, shape, f16databuf);
+
+            free(fdatabuf);
+            free(f16databuf);
+            break;
         case DType_INT32:
         case DType_UINT8:
         case DType_INT4:
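
The .npy path is symmetric: float16 files are read into a half_float::half buffer and widened element-wise into the tensor's float storage, and narrowed back to half on write. A standalone sketch of the write-side narrowing used by writeToNpyFile above; values outside the FP16 range become +/-inf under the default round-to-nearest behaviour.

#include <cstdio>
#include <vector>
#include "half.hpp"

// Narrow float storage to binary16, as done just before the .npy write above.
static std::vector<half_float::half> narrow_to_f16(const std::vector<float>& in)
{
    std::vector<half_float::half> out(in.size());
    for (size_t i = 0; i < in.size(); i++)
        out[i] = half_float::half_cast<half_float::half, float>(in[i]);
    return out;
}

int main()
{
    std::vector<half_float::half> h = narrow_to_f16({ 1.0f, 0.3f, 70000.0f });
    for (size_t i = 0; i < h.size(); i++)
        printf("%f\n", half_float::half_cast<float, half_float::half>(h[i]));    // 1.0, ~0.300049, inf
    return 0;
}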
diff --git a/reference_model/src/tensor.h b/reference_model/src/tensor.h
index 6b7d5f1..78a210e 100644
--- a/reference_model/src/tensor.h
+++ b/reference_model/src/tensor.h
@@ -1,5 +1,5 @@
 
-// Copyright (c) 2020-2021, ARM Limited.
+// Copyright (c) 2020-2022, ARM Limited.
 //
 //    Licensed under the Apache License, Version 2.0 (the "License");
 //    you may not use this file except in compliance with the License.
@@ -643,6 +643,7 @@
         switch (tensorDtype_)
         {
             case DType_FLOAT:
+            case DType_FP16:
                 switch (rank)
                 {
                     case 0:
diff --git a/scripts/json2numpy/json2numpy.py b/scripts/json2numpy/json2numpy.py
index 21b1acd..c04013e 100644
--- a/scripts/json2numpy/json2numpy.py
+++ b/scripts/json2numpy/json2numpy.py
@@ -16,8 +16,10 @@
         """Encode default operation."""
         if isinstance(obj, np.integer):
             return int(obj)
-        elif isinstance(obj, np.floating):
+        elif isinstance(obj, np.float32):
             return float(obj)
+        elif isinstance(obj, np.float16):
+            return np.float16(obj)
         elif isinstance(obj, np.ndarray):
             return obj.tolist()
         return super(NumpyArrayEncoder, self).default(obj)
diff --git a/thirdparty/serialization_lib b/thirdparty/serialization_lib
index c92710d..485a11d 160000
--- a/thirdparty/serialization_lib
+++ b/thirdparty/serialization_lib
@@ -1 +1 @@
-Subproject commit c92710d7259558fb0cd9e9b38d0c78da21c6e2d4
+Subproject commit 485a11d8cb67c8062c632f0987cd31cedbe93d6d
diff --git a/verif/checker/tosa_result_checker.py b/verif/checker/tosa_result_checker.py
index 66864c2..8ae3218 100644
--- a/verif/checker/tosa_result_checker.py
+++ b/verif/checker/tosa_result_checker.py
@@ -147,14 +147,14 @@
         tolerance = 0.0
         # Fall-through to below to add failure values
 
-    elif reference_result.dtype == np.float32:
+    # TODO: update for fp16 tolerance
+    elif reference_result.dtype == np.float32 or reference_result.dtype == np.float16:
         tolerance = float_tolerance
         if np.allclose(reference_result, test_result, atol=tolerance, equal_nan=True):
             print_color(LogColors.GREEN, "Results PASS {}".format(test_name))
             return (TestResult.PASS, tolerance, "")
         msg = "Float result does not match within tolerance of {}".format(tolerance)
         # Fall-through to below to add failure values
-
     else:
         print_color(LogColors.RED, "Results UNSUPPORTED TYPE {}".format(test_name))
         msg = "Unsupported results type: {}".format(reference_result.dtype)
diff --git a/verif/generator/tosa_arg_gen.py b/verif/generator/tosa_arg_gen.py
index a65e220..69968d3 100644
--- a/verif/generator/tosa_arg_gen.py
+++ b/verif/generator/tosa_arg_gen.py
@@ -2,10 +2,13 @@
 # SPDX-License-Identifier: Apache-2.0
 import itertools
 import math
+import warnings
 
 import numpy as np
 from generator.tosa_error_if import ErrorIf
 from generator.tosa_error_if import TosaErrorIfArgGen
+from generator.tosa_utils import get_accum_dtype_from_tgTypes
+from generator.tosa_utils import get_wrong_output_type
 from generator.tosa_utils import MAX_RESIZE_DIMENSION
 from serializer.tosa_serializer import DTypeNames
 from tosa.DType import DType
@@ -773,7 +776,7 @@
             ), "Op.MUL must have 2 placeholders, 0 consts"
 
             tens = []
-            if dtypeList[0] == DType.FLOAT:
+            if dtypeList[0] in (DType.FP16, DType.FLOAT):
                 tens.extend(testGen.buildPlaceholderTensors(shapeList[:], dtypeList[:]))
             else:
                 placeholders = []
@@ -982,7 +985,7 @@
         return axes
 
     @staticmethod
-    def agConv(testGen, opName, shapeList, dtype, error_name=None):
+    def agConv(testGen, opName, shapeList, dtypes, error_name=None):
         arg_list = []
 
         ifm_shape = shapeList[0]
@@ -990,6 +993,8 @@
         # determine the kernel shape from operator name (e.g. "conv2d_3x3" => [3,3])
         k = [int(x) for x in opName.split("_")[-1].split("x")]
 
+        accum_dtype = get_accum_dtype_from_tgTypes(dtypes)
+
         # Check the rank
         rank = 5 if opName.startswith("conv3d") else 4
         if error_name != ErrorIf.WrongRank:
@@ -1089,12 +1094,13 @@
                         ):
                             arg_list.append(
                                 (
-                                    "st{}_pad{}_dilat{}".format(
+                                    "acc{}_st{}_pad{}_dilat{}".format(
+                                        testGen.typeStr(accum_dtype),
                                         "".join([str(x) for x in s]),
                                         "".join([str(x) for x in p]),
                                         "".join([str(x) for x in d]),
                                     ),
-                                    [s, p, d],
+                                    [accum_dtype, s, p, d],
                                 )
                             )
                     n += 1
@@ -1102,12 +1108,55 @@
         return arg_list
 
     @staticmethod
-    def agTransposeConv2D(testGen, opName, shapeList, dtype, error_name=None):
+    def agFullyConnected(testGen, opName, shapeList, dtypes, error_name=None):
+
+        if isinstance(dtypes, list) or isinstance(dtypes, tuple):
+            input_dtype = dtypes[0]
+        else:
+            input_dtype = dtypes
+
+        if error_name == ErrorIf.WrongOutputType:
+            accum_dtype = get_wrong_output_type(opName, testGen.rng, input_dtype)
+        elif error_name == ErrorIf.WrongInputType:
+            # Pick some potentially correct output dtype if input type is incorrect
+            accum_dtype = DType.INT32
+        else:
+            accum_dtype = get_accum_dtype_from_tgTypes(dtypes)
+
+        return [(f"acc{testGen.typeStr(accum_dtype)}", [accum_dtype])]
+
+    @staticmethod
+    def agMatMul(testGen, opName, shapeList, dtype, error_name=None):
+        # Get valid accumulate type(s)
+        if dtype == DType.INT8:
+            accum_dtypes = [DType.INT32]
+        elif dtype == DType.INT16:
+            accum_dtypes = [DType.INT48]
+        elif dtype == DType.FP16:
+            accum_dtypes = [DType.FP16, DType.FLOAT]
+        elif dtype == DType.FLOAT:
+            accum_dtypes = [DType.FLOAT]
+        elif error_name is None:
+            assert False, f"Invalid I/O DType for MatMul: {DTypeNames[dtype]}"
+
+        if error_name == ErrorIf.WrongOutputType:
+            # Get incorrect output dtype for ErrorIf case
+            accum_dtypes = [get_wrong_output_type(opName, testGen.rng, dtype)]
+        elif error_name == ErrorIf.WrongInputType:
+            # Pick some potentially correct output dtype if input type is incorrect
+            accum_dtypes = [DType.INT32]
+
+        return [(f"acc{testGen.typeStr(a)}", [a]) for a in accum_dtypes]
+
+    @staticmethod
+    def agTransposeConv2D(testGen, opName, shapeList, dtypes, error_name=None):
         arg_list = []
 
         ifm_shape = shapeList[0]
         filter_shape = shapeList[1]
 
+        accum_dtype = get_accum_dtype_from_tgTypes(dtypes)
+
         # Must be rank 4
         if error_name != ErrorIf.WrongRank:
             assert len(ifm_shape) == 4
@@ -1169,12 +1218,13 @@
                     os = [ifm_shape[0], oh, ow, filter_shape[0]]
                     arg_list.append(
                         (
-                            "st{}_pad{}_os{}".format(
+                            "acc{}_st{}_pad{}_os{}".format(
+                                testGen.typeStr(accum_dtype),
                                 "".join([str(x) for x in s]),
                                 "".join([str(x) for x in p]),
                                 "x".join([str(x) for x in os]),
                             ),
-                            [s, p, os],
+                            [accum_dtype, s, p, os],
                         )
                     )
                 n += 1
@@ -1199,18 +1249,38 @@
         if dtype in [DType.BOOL, DType.INT8, DType.INT16, DType.INT32]:
             pad_const_int = testGen.getRandNumberDType(dtype)
             pad_const_fp = 0
-        elif dtype == DType.FLOAT:
+        elif dtype in (DType.FP16, DType.FLOAT):
             pad_const_int = 0
             pad_const_fp = testGen.getRandNumberDType(dtype)
         else:
             return []
 
         for paddings in shape_pad_values:
-            name = "pad"
-            for r in range(rank):
-                before, after = paddings[r]
-                name = f"{name}{before}{after}"
-            arg_list.append((name, [np.array(paddings), pad_const_int, pad_const_fp]))
+            paddings = list(paddings)
+            args_valid = True
+
+            if error_name == ErrorIf.PadSmallerZero:
+                # Prevent negative output shapes while still ensuring negative padding is tested
+                for i in range(rank):
+                    dim_after_padding = (
+                        paddings[i][0] + paddings[i][1] + shapeList[0][i]
+                    )
+                    if dim_after_padding < 1:
+                        paddings[i] = (0, 0)
+                if all([p > -1 for pad_pair in paddings for p in pad_pair]):
+                    args_valid = False
+
+            if args_valid:
+                name = "pad"
+                for r in range(rank):
+                    before, after = paddings[r]
+                    name = f"{name}{before}{after}"
+                arg_list.append(
+                    (name, [np.array(paddings), pad_const_int, pad_const_fp])
+                )
+
+        if error_name == ErrorIf.PadSmallerZero and len(arg_list) == 0:
+            warnings.warn(f"No ErrorIf test created for input shape: {shapeList[0]}")
 
         return arg_list
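
A minimal sketch of the PadSmallerZero adjustment above (standalone, not part of the patch; the shape and padding values are illustrative): dimensions whose padded size would fall below 1 are reset to (0, 0), and the combination is only kept while at least one negative pad value remains.

    shape = [1, 2, 3]
    paddings = [(-3, 0), (0, 1), (-1, 0)]
    for i, (before, after) in enumerate(paddings):
        # Reset any dimension that would end up with a non-positive size
        if before + after + shape[i] < 1:
            paddings[i] = (0, 0)
    still_negative = any(p < 0 for pair in paddings for p in pair)
    print(paddings, still_negative)  # [(0, 0), (0, 1), (-1, 0)] True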
 
@@ -1232,6 +1302,21 @@
         k_vals = [x for x in range(2, testGen.args.max_pooling_kernel + 1)]
         kernels = {x for x in itertools.product(*([k_vals] * 2))}
 
+        if opName == "max_pool2d":
+            accum_dtypes = [None]  # max_pool has no accumulate dtype
+        elif dtype == DType.INT8 or dtype == DType.INT16:
+            accum_dtypes = [DType.INT32]
+        elif dtype == DType.FP16:
+            accum_dtypes = [DType.FP16, DType.FLOAT]
+        elif dtype == DType.FLOAT:
+            accum_dtypes = [DType.FLOAT]
+        elif error_name is None:
+            assert False, f"Invalid I/O DType for pooling: {DTypeNames[dtype]}"
+        else:
+            # Pick a default accumulator for the ErrorIf case
+            # which has an incorrect input data-type
+            accum_dtypes = [DType.INT32]
+
         if testGen.args.oversize:
             # add some oversize argument values
             bigStride = 7
@@ -1252,63 +1337,70 @@
         sparsity_factor = 2 if error_name else 500
         sparsity = len(paddings) * len(strides) * len(kernels) // sparsity_factor + 1
 
+        arg_str = (
+            "acc{}_st{}_kern{}_pad{}"
+            if accum_dtypes[0] is not None
+            else "st{}_kern{}_pad{}"
+        )
+
+        def get_arg_list_element(accum, stride, pad, kern):
+            # Return tuple containing the formatted argument string and
+            # the corresponding argument values
+            arg_str_elems = [
+                "".join([str(x) for x in stride]),
+                "".join([str(x) for x in kern]),
+                "".join([str(x) for x in pad]),
+            ]
+            # Note: different order to string
+            arg_val_elems = [stride, pad, kern]
+
+            if accum is not None:
+                arg_str_elems.insert(0, testGen.typeStr(accum))
+                arg_val_elems.insert(0, accum)
+            return (arg_str.format(*arg_str_elems), arg_val_elems)
+
         n = 0
-        for s in sorted(list(strides)):
-            for p in sorted(list(paddings)):
-                for k in sorted(list(kernels)):
-                    if error_name in [
-                        ErrorIf.StrideSmallerOne,
-                        ErrorIf.KernelSmallerOne,
-                        ErrorIf.PadSmallerZero,
-                        ErrorIf.PadLargerEqualKernel,
-                    ]:
-                        sNew, pNew, kNew = TosaErrorIfArgGen.eiPoolingErrorIf(
-                            testGen, error_name, s, p, k
-                        )
-                        if None not in [sNew, pNew, kNew] and n % sparsity == 0:
-                            arg_list.append(
-                                (
-                                    "st{}_kern{}_pad{}".format(
-                                        "".join([str(x) for x in sNew]),
-                                        "".join([str(x) for x in kNew]),
-                                        "".join([str(x) for x in pNew]),
-                                    ),
-                                    [sNew, pNew, kNew],
-                                )
+        for a in accum_dtypes:
+            for s in sorted(list(strides)):
+                for p in sorted(list(paddings)):
+                    for k in sorted(list(kernels)):
+                        if error_name in [
+                            ErrorIf.StrideSmallerOne,
+                            ErrorIf.KernelSmallerOne,
+                            ErrorIf.PadSmallerZero,
+                            ErrorIf.PadLargerEqualKernel,
+                        ]:
+                            sNew, pNew, kNew = TosaErrorIfArgGen.eiPoolingErrorIf(
+                                testGen, error_name, s, p, k
                             )
-                    elif (
-                        n % sparsity == 0
-                        # padding must not exceed the kernel size
-                        and p[0] < k[0]
-                        and p[1] < k[0]
-                        and p[2] < k[1]
-                        and p[3] < k[1]
-                        # the padded shape must exceed the kernel size
-                        and (shape[1] + p[0] + p[1]) > k[0]
-                        and (shape[2] + p[2] + p[3]) > k[1]
-                    ):
-                        remainder_h = (shape[1] + p[0] + p[1] - k[0]) % s[0]
-                        remainder_w = (shape[2] + p[2] + p[3] - k[1]) % s[1]
-                        if (
-                            # the parameters must produce integer exact output
-                            error_name != ErrorIf.PoolingOutputShapeNonInteger
-                            and remainder_h == 0
-                            and remainder_w == 0
-                        ) or (
-                            error_name == ErrorIf.PoolingOutputShapeNonInteger
-                            and (remainder_h != 0 or remainder_w != 0)
+                            if None not in [sNew, pNew, kNew] and n % sparsity == 0:
+                                arg_vals = [a, sNew, pNew, kNew]
+                                arg_list.append(get_arg_list_element(*arg_vals))
+                        elif (
+                            n % sparsity == 0
+                            # padding must not exceed the kernel size
+                            and p[0] < k[0]
+                            and p[1] < k[0]
+                            and p[2] < k[1]
+                            and p[3] < k[1]
+                            # the padded shape must exceed the kernel size
+                            and (shape[1] + p[0] + p[1]) > k[0]
+                            and (shape[2] + p[2] + p[3]) > k[1]
                         ):
-                            arg_list.append(
-                                (
-                                    "st{}_kern{}_pad{}".format(
-                                        "".join([str(x) for x in s]),
-                                        "".join([str(x) for x in k]),
-                                        "".join([str(x) for x in p]),
-                                    ),
-                                    [s, p, k],
-                                )
-                            )
-                    n += 1
+                            remainder_h = (shape[1] + p[0] + p[1] - k[0]) % s[0]
+                            remainder_w = (shape[2] + p[2] + p[3] - k[1]) % s[1]
+                            if (
+                                # the parameters must produce integer exact output
+                                error_name != ErrorIf.PoolingOutputShapeNonInteger
+                                and remainder_h == 0
+                                and remainder_w == 0
+                            ) or (
+                                error_name == ErrorIf.PoolingOutputShapeNonInteger
+                                and (remainder_h != 0 or remainder_w != 0)
+                            ):
+                                arg_vals = [a, s, p, k]
+                                arg_list.append(get_arg_list_element(*arg_vals))
+                        n += 1
 
         return arg_list
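
To illustrate the pooling argument helper above, a standalone sketch (string stand-ins replace the DType enum, and "f16" mirrors the generator's typeStr() spelling): the accumulator, when present, leads both the test-name string and the argument value list.

    def get_arg_list_element(accum, stride, pad, kern):
        arg_str = "acc{}_st{}_kern{}_pad{}" if accum is not None else "st{}_kern{}_pad{}"
        elems = ["".join(str(x) for x in stride),
                 "".join(str(x) for x in kern),
                 "".join(str(x) for x in pad)]
        vals = [stride, pad, kern]  # note: different order to the string
        if accum is not None:
            elems.insert(0, accum)
            vals.insert(0, accum)
        return arg_str.format(*elems), vals

    print(get_arg_list_element("f16", [2, 2], [0, 0, 0, 0], [3, 3]))
    # ('accf16_st22_kern33_pad0000', ['f16', [2, 2], [0, 0, 0, 0], [3, 3]])
    print(get_arg_list_element(None, [2, 2], [0, 0, 0, 0], [3, 3]))
    # ('st22_kern33_pad0000', [[2, 2], [0, 0, 0, 0], [3, 3]])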
 
@@ -1327,6 +1419,8 @@
             dtypeList = [DType.BOOL, DType.INT8, DType.INT16, DType.FLOAT]
         elif inDtype == DType.BOOL:
             dtypeList = [DType.INT8, DType.INT16, DType.INT32]
+        elif inDtype == DType.FP16:
+            dtypeList = [DType.INT8, DType.INT16, DType.INT32]
         elif inDtype == DType.FLOAT:
             dtypeList = [DType.INT8, DType.INT16, DType.INT32]
         elif error_name == ErrorIf.WrongInputType:
@@ -1734,6 +1828,8 @@
                 outputDTypeList = [DType.INT32]
             elif mode == ResizeMode.BILINEAR and dtype == DType.INT16:
                 outputDTypeList = [DType.INT48]
+            elif dtype == DType.FP16:
+                outputDTypeList = [DType.FP16]
             elif dtype == DType.FLOAT:
                 outputDTypeList = [DType.FLOAT]
             elif error_name == ErrorIf.WrongInputType:
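
The convolution, fully_connected and matmul argument generators above all derive the accumulator from the per-test dtype tuple (see get_accum_dtype_from_tgTypes in verif/generator/tosa_utils.py below and the TYPE_CONV entries later in this change). A minimal sketch of that derivation, with string stand-ins for the DType enum:

    def get_accum_dtype_from_tgTypes(dtypes):
        # Last element of an (input, weight, accumulator) tuple, else the dtype itself
        return dtypes[-1] if isinstance(dtypes, (list, tuple)) else dtypes

    assert get_accum_dtype_from_tgTypes(("FP16", "FP16", "FLOAT")) == "FLOAT"
    assert get_accum_dtype_from_tgTypes(("INT8", "INT4", "INT32")) == "INT32"
    assert get_accum_dtype_from_tgTypes("FLOAT") == "FLOAT"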
diff --git a/verif/generator/tosa_error_if.py b/verif/generator/tosa_error_if.py
index f9a00f9..a766803 100644
--- a/verif/generator/tosa_error_if.py
+++ b/verif/generator/tosa_error_if.py
@@ -120,6 +120,7 @@
                     DType.INT32,
                     DType.INT48,
                     DType.FLOAT,
+                    DType.FP16,
                 )
             elif mode == ResizeMode.NEAREST and dtype == DType.INT16:
                 incorrect_types = (
@@ -128,6 +129,7 @@
                     DType.INT32,
                     DType.INT48,
                     DType.FLOAT,
+                    DType.FP16,
                 )
             elif mode == ResizeMode.BILINEAR and dtype == DType.INT8:
                 incorrect_types = (
@@ -136,6 +138,7 @@
                     DType.INT16,
                     DType.INT48,
                     DType.FLOAT,
+                    DType.FP16,
                 )
             elif mode == ResizeMode.BILINEAR and dtype == DType.INT16:
                 incorrect_types = (
@@ -144,6 +147,16 @@
                     DType.INT16,
                     DType.INT32,
                     DType.FLOAT,
+                    DType.FP16,
+                )
+            elif dtype == DType.FP16:
+                incorrect_types = (
+                    DType.INT4,
+                    DType.INT8,
+                    DType.INT16,
+                    DType.INT32,
+                    DType.INT48,
+                    DType.FLOAT,
                 )
             elif dtype == DType.FLOAT:
                 incorrect_types = (
@@ -152,6 +165,7 @@
                     DType.INT16,
                     DType.INT32,
                     DType.INT48,
+                    DType.FP16,
                 )
             outputDType = testGen.rng.choice(a=incorrect_types)
 
@@ -285,8 +299,8 @@
 
     @staticmethod
     def eiCastErrorIf(testGen, input_dtype):
-        if input_dtype in [DType.BOOL, DType.FLOAT]:
-            outputDType = [DType.BOOL, DType.INT48, DType.FLOAT]
+        if input_dtype in [DType.BOOL, DType.FP16, DType.FLOAT]:
+            outputDType = [DType.BOOL, DType.INT48, DType.FP16, DType.FLOAT]
         elif input_dtype in [DType.INT8, DType.INT16, DType.INT32]:
             outputDType = [DType.INT48]
         else:
@@ -400,6 +414,7 @@
                         and input_dtype == DType.INT16
                         and output_dtype != DType.INT48
                     )
+                    or (input_dtype == DType.FP16 and output_dtype != DType.FP16)
                     or (input_dtype == DType.FLOAT and output_dtype != DType.FLOAT)
                 ):
                     error_result = True
@@ -413,19 +428,28 @@
                 if (
                     (input_dtype == DType.INT8 and output_dtype != DType.INT32)
                     or (input_dtype == DType.INT16 and output_dtype != DType.INT48)
+                    or (
+                        input_dtype == DType.FP16
+                        and output_dtype not in (DType.FP16, DType.FLOAT)
+                    )
                     or (input_dtype == DType.FLOAT and output_dtype != DType.FLOAT)
                 ):
                     error_result = True
 
             elif op["op"] == Op.ARGMAX:
                 if (
-                    input_dtype in [DType.INT8, DType.INT16, DType.FLOAT]
+                    input_dtype in [DType.INT8, DType.INT16, DType.FP16, DType.FLOAT]
                     and output_dtype != DType.INT32
                 ):
                     error_result = True
 
             elif op["op"] == Op.MUL:
-                if input_dtype != DType.FLOAT and output_dtype != DType.INT32:
+                if (
+                    input_dtype not in (DType.FP16, DType.FLOAT)
+                    and output_dtype != DType.INT32
+                ):
+                    error_result = True
+                elif input_dtype == DType.FP16 and output_dtype != DType.FP16:
                     error_result = True
                 elif input_dtype == DType.FLOAT and output_dtype != DType.FLOAT:
                     error_result = True
@@ -449,17 +473,39 @@
                     or (
                         input_dtype == DType.INT8
                         and output_dtype
-                        not in [DType.BOOL, DType.INT16, DType.INT32, DType.FLOAT]
+                        not in [
+                            DType.BOOL,
+                            DType.INT16,
+                            DType.INT32,
+                            DType.FLOAT,
+                            DType.FP16,
+                        ]
                     )
                     or (
                         input_dtype == DType.INT16
                         and output_dtype
-                        not in [DType.BOOL, DType.INT8, DType.INT32, DType.FLOAT]
+                        not in [
+                            DType.BOOL,
+                            DType.INT8,
+                            DType.INT32,
+                            DType.FLOAT,
+                            DType.FP16,
+                        ]
                     )
                     or (
                         input_dtype == DType.INT32
                         and output_dtype
-                        not in [DType.BOOL, DType.INT8, DType.INT16, DType.FLOAT]
+                        not in [
+                            DType.BOOL,
+                            DType.INT8,
+                            DType.INT16,
+                            DType.FLOAT,
+                            DType.FP16,
+                        ]
+                    )
+                    or (
+                        input_dtype == DType.FP16
+                        and output_dtype not in [DType.INT8, DType.INT16, DType.INT32]
                     )
                     or (
                         input_dtype == DType.FLOAT
@@ -479,6 +525,8 @@
                     and output_dtype != DType.INT32
                     or input_dtype == DType.INT16
                     and output_dtype != DType.INT48
+                    or input_dtype == DType.FP16
+                    and output_dtype not in (DType.FP16, DType.FLOAT)
                     or input_dtype == DType.FLOAT
                     and output_dtype != DType.FLOAT
                 ):
@@ -2257,12 +2305,13 @@
             return (
                 not (input_dtype == DType.INT8 and output_dtype == DType.INT32)
                 and not (input_dtype == DType.INT16 and output_dtype == DType.INT48)
+                and not (input_dtype == DType.FP16 and output_dtype == DType.FP16)
                 and not (input_dtype == DType.FLOAT and output_dtype == DType.FLOAT)
             )
         elif mode == ResizeMode.NEAREST:
             # Invalid output data type / Invalid input datatype
             return (input_dtype != output_dtype) or (
-                input_dtype not in [DType.INT8, DType.INT16, DType.FLOAT]
+                input_dtype not in [DType.INT8, DType.INT16, DType.FP16, DType.FLOAT]
             )
         else:
             # Invalid resize mode
@@ -2276,8 +2325,11 @@
         input_shape = inputShapes[0]
 
         args = kwargs["args"]
-        strides = args[0]
-        padding = args[1]
+
+        # MaxPool2D has no accum_dtype arg
+        stride_idx, pad_idx = (0, 1) if opName == "max_pool2d" else (1, 2)
+        strides = args[stride_idx]
+        padding = args[pad_idx]
 
         if opName.endswith("pool2d"):
             # avg_pool2d, max_pool2d
@@ -2365,7 +2417,7 @@
     @staticmethod
     def ivNonPositiveOutputShape(**kwargs):
         args = kwargs["args"]
-        output_shape = args[2]
+        output_shape = args[3]
         if output_shape[1] <= 0 or output_shape[2] <= 0:
             # Negative output shape
             return True
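
With the accumulator dtype prepended to the generated argument lists, the invalid-test helpers above read stride, pad and output shape from shifted indices. A standalone sketch (string stand-ins for DType, illustrative values):

    def stride_and_pad(op_name, args):
        # max_pool2d has no accumulator, so stride/pad stay at indices 0/1
        s_idx, p_idx = (0, 1) if op_name == "max_pool2d" else (1, 2)
        return args[s_idx], args[p_idx]

    avg_args = ["FP16", [2, 2], [0, 0, 0, 0], [3, 3]]  # [accum, stride, pad, kernel]
    max_args = [[2, 2], [0, 0, 0, 0], [3, 3]]          # [stride, pad, kernel]
    assert stride_and_pad("avg_pool2d", avg_args) == ([2, 2], [0, 0, 0, 0])
    assert stride_and_pad("max_pool2d", max_args) == ([2, 2], [0, 0, 0, 0])

The same shift moves the transpose_conv2d output shape from args[2] to args[3].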
diff --git a/verif/generator/tosa_test_gen.py b/verif/generator/tosa_test_gen.py
index b76b656..9ff6ec5 100644
--- a/verif/generator/tosa_test_gen.py
+++ b/verif/generator/tosa_test_gen.py
@@ -81,6 +81,8 @@
             return np.int64(
                 self.rng.integers(low=-(1 << 47), high=(1 << 47), size=shape)
             )
+        elif dtype == DType.FP16:
+            return np.float16(self.rng.random(size=shape))
         elif dtype == DType.FLOAT:
             return np.float32(self.rng.random(size=shape))
         else:
@@ -128,6 +130,9 @@
     def getRandNumberDType(self, dtype):
         if dtype == DType.FLOAT:
             return self.rng.random()
+        elif dtype == DType.FP16:
+            rand_f32 = self.rng.random()
+            return np.float16(rand_f32)
         elif dtype == DType.BOOL:
             return self.rng.choice([False, True])
         # TOSA specific INT4 weight range from -7 to 7
@@ -178,13 +183,15 @@
                 return "i32"
             elif t == DType.INT48:
                 return "i48"
+            elif t == DType.FP16:
+                return "f16"
             elif t == DType.FLOAT:
                 return "float"
             else:
                 raise Exception("Unknown dtype, cannot convert to string: {}".format(t))
 
     def typeWidth(self, t):
-        """Get the datatype width for integer types"""
+        """Get the datatype width for data types"""
         if t == DType.INT4:
             return 4
         elif t == DType.INT8:
@@ -199,6 +206,8 @@
             return 32
         elif t == DType.INT48:
             return 48
+        elif t == DType.FP16:
+            return 16
         elif t == DType.FLOAT:
             return 32
         elif t == DType.BOOL:
@@ -346,7 +355,7 @@
 
         # Special for multiply:
         # Force the result to INT32 for INT types
-        if a.dtype != DType.FLOAT:
+        if a.dtype not in (DType.FP16, DType.FLOAT):
             result_tens.setDtype(DType.INT32)
         if error_name == ErrorIf.WrongOutputType:
             all_dtypes = [DType.INT8, DType.INT16, DType.INT48]
@@ -533,6 +542,7 @@
         self,
         op,
         input,
+        accum_dtype,
         stride,
         pad,
         kernel,
@@ -585,17 +595,43 @@
             qinfo = [0, 0]
 
         attr = ts.TosaSerializerAttribute()
-        attr.PoolAttribute(kernel, stride, pad, qinfo[0], qinfo[1])
+        attr.PoolAttribute(kernel, stride, pad, qinfo[0], qinfo[1], accum_dtype)
 
         self.ser.addOperator(op["op"], input_list, output_list, attr)
         return result_tens
 
+    def build_maxpool2d(
+        self,
+        op,
+        input,
+        stride,
+        pad,
+        kernel,
+        validator_fcns=None,
+        error_name=None,
+        qinfo=None,
+    ):
+        # Same as build_pool2d but manually sets accum_dtype value
+        # (maxpool has no accum_dtype)
+        return self.build_pool2d(
+            op,
+            input,
+            DType.UNKNOWN,
+            stride,
+            pad,
+            kernel,
+            validator_fcns,
+            error_name,
+            qinfo,
+        )
+
     def build_conv2d(
         self,
         op,
         ifm,
         filter,
         bias,
+        accum_dtype,
         strides,
         padding,
         dilations,
@@ -605,7 +641,15 @@
     ):
         assert len(padding) == 4
         result_tens = OutputShaper.conv2dOp(
-            self.ser, self.rng, ifm, filter, strides, padding, dilations, error_name
+            self.ser,
+            self.rng,
+            ifm,
+            filter,
+            accum_dtype,
+            strides,
+            padding,
+            dilations,
+            error_name,
         )
 
         # Ensure new output type has correct qinfo
@@ -648,7 +692,7 @@
             return None
 
         attr = ts.TosaSerializerAttribute()
-        attr.ConvAttribute(padding, strides, dilations, qinfo[0], qinfo[1])
+        attr.ConvAttribute(padding, strides, dilations, qinfo[0], qinfo[1], accum_dtype)
 
         self.ser.addOperator(op["op"], input_list, output_list, attr)
         return result_tens
@@ -659,6 +703,7 @@
         ifm,
         filter,
         bias,
+        accum_dtype,
         strides,
         padding,
         dilations,
@@ -668,7 +713,15 @@
     ):
         assert len(padding) == 6
         result_tens = OutputShaper.conv3dOp(
-            self.ser, self.rng, ifm, filter, strides, padding, dilations, error_name
+            self.ser,
+            self.rng,
+            ifm,
+            filter,
+            accum_dtype,
+            strides,
+            padding,
+            dilations,
+            error_name,
         )
 
         # Ensure new output type has correct qinfo
@@ -711,7 +764,7 @@
             return None
 
         attr = ts.TosaSerializerAttribute()
-        attr.ConvAttribute(padding, strides, dilations, qinfo[0], qinfo[1])
+        attr.ConvAttribute(padding, strides, dilations, qinfo[0], qinfo[1], accum_dtype)
 
         self.ser.addOperator(op["op"], input_list, output_list, attr)
         return result_tens
@@ -722,6 +775,7 @@
         ifm,
         filter,
         bias,
+        accum_dtype,
         stride,
         out_pad,
         output_shape,
@@ -731,7 +785,7 @@
     ):
         assert len(out_pad) == 4
         result_tens = OutputShaper.transposeConv2DOp(
-            self.ser, self.rng, ifm, output_shape, error_name
+            self.ser, self.rng, ifm, output_shape, accum_dtype, error_name
         )
 
         # Ensure new output type has correct qinfo
@@ -773,7 +827,9 @@
             return None
 
         attr = ts.TosaSerializerAttribute()
-        attr.TransposeConvAttribute(out_pad, stride, output_shape, qinfo[0], qinfo[1])
+        attr.TransposeConvAttribute(
+            out_pad, stride, output_shape, qinfo[0], qinfo[1], accum_dtype
+        )
 
         self.ser.addOperator(op["op"], input_list, output_list, attr)
         return result_tens
@@ -784,6 +840,7 @@
         ifm,
         filter,
         bias,
+        accum_dtype,
         strides,
         padding,
         dilations,
@@ -792,7 +849,15 @@
         qinfo=None,
     ):
         result_tens = OutputShaper.depthwiseConv2dOp(
-            self.ser, self.rng, ifm, filter, strides, padding, dilations, error_name
+            self.ser,
+            self.rng,
+            ifm,
+            filter,
+            accum_dtype,
+            strides,
+            padding,
+            dilations,
+            error_name,
         )
 
         # Ensure new output type has correct qinfo
@@ -835,16 +900,24 @@
             return None
 
         attr = ts.TosaSerializerAttribute()
-        attr.ConvAttribute(padding, strides, dilations, qinfo[0], qinfo[1])
+        attr.ConvAttribute(padding, strides, dilations, qinfo[0], qinfo[1], accum_dtype)
 
         self.ser.addOperator(op["op"], input_list, output_list, attr)
         return result_tens
 
     def build_fully_connected(
-        self, op, ifm, filter, bias, validator_fcns=None, error_name=None, qinfo=None
+        self,
+        op,
+        ifm,
+        filter,
+        bias,
+        accum_dtype,
+        validator_fcns=None,
+        error_name=None,
+        qinfo=None,
     ):
         result_tens = OutputShaper.fullyConnectedOp(
-            self.ser, self.rng, ifm, filter, error_name
+            self.ser, self.rng, ifm, filter, accum_dtype, error_name
         )
 
         # Invalidate Input/Output list for error if checks.
@@ -871,17 +944,22 @@
             input_list=input_list,
             output_list=output_list,
             num_operands=num_operands,
+            accum_dtype=accum_dtype,
         ):
             return None
 
         attr = ts.TosaSerializerAttribute()
-        attr.FullyConnectedAttribute(qinfo[0], qinfo[1])
+        attr.FullyConnectedAttribute(qinfo[0], qinfo[1], accum_dtype)
 
         self.ser.addOperator(op["op"], input_list, output_list, attr)
         return result_tens
 
-    def build_matmul(self, op, a, b, validator_fcns=None, error_name=None, qinfo=None):
-        result_tens = OutputShaper.matmulOp(self.ser, self.rng, a, b, error_name)
+    def build_matmul(
+        self, op, a, b, accum_dtype, validator_fcns=None, error_name=None, qinfo=None
+    ):
+        result_tens = OutputShaper.matmulOp(
+            self.ser, self.rng, a, b, accum_dtype, error_name
+        )
 
         # Invalidate Input/Output list for error if checks.
         input_list = [a.name, b.name]
@@ -908,11 +986,12 @@
             input_list=input_list,
             output_list=output_list,
             num_operands=num_operands,
+            accum_dtype=accum_dtype,
         ):
             return None
 
         attr = ts.TosaSerializerAttribute()
-        attr.MatMulAttribute(qinfo[0], qinfo[1])
+        attr.MatMulAttribute(qinfo[0], qinfo[1], accum_dtype)
 
         self.ser.addOperator(op["op"], input_list, output_list, attr)
         return result_tens
@@ -995,7 +1074,7 @@
             return None
 
         attr = ts.TosaSerializerAttribute()
-        if a.dtype == DType.FLOAT:
+        if a.dtype in (DType.FP16, DType.FLOAT):
             attr.ClampAttribute(0, 0, min_val, max_val)
         else:
             attr.ClampAttribute(min_val, max_val, 0, 0)
@@ -1811,7 +1890,7 @@
             op["op"], [cond_tens.name, a.name, b.name], [result_tens.name], attr
         )
 
-        if a.dtype in (DType.FLOAT, DType.INT32):
+        if a.dtype in (DType.FLOAT, DType.FP16, DType.INT32):
             then_op, else_op = Op.ADD, Op.SUB
         elif a.dtype in (DType.INT8, DType.INT16):
             then_op, else_op = Op.LOGICAL_RIGHT_SHIFT, Op.LOGICAL_LEFT_SHIFT
@@ -2350,22 +2429,37 @@
     #    if not specified, defaults to (1, 4)
     #  'build_fcn': tuple of the function to (build_operator(), TensorGen function, ArgGen enum)
     #  'types': array of datatypes to be tested
-    TYPE_FP = [DType.FLOAT]
+    TYPE_FP = [DType.FLOAT, DType.FP16]
 
     TYPE_INT = [DType.INT8, DType.INT16, DType.INT32]  # Excludes INT4
-    TYPE_INT_FP = [DType.INT8, DType.INT16, DType.INT32, DType.FLOAT]  # Excludes INT4
+    TYPE_INT_FP = [
+        DType.INT8,
+        DType.INT16,
+        DType.INT32,
+        DType.FP16,
+        DType.FLOAT,
+    ]  # Excludes INT4
 
     TYPE_BOOL = [DType.BOOL]
-    TYPE_FI32 = [DType.FLOAT, DType.INT32]
-    TYPE_FIB = [DType.FLOAT, DType.INT8, DType.INT16, DType.INT32, DType.BOOL]
+    TYPE_FI32 = [DType.FLOAT, DType.FP16, DType.INT32]  # floating-point types and INT32
+    TYPE_FIB = [
+        DType.FP16,
+        DType.FLOAT,
+        DType.INT8,
+        DType.INT16,
+        DType.INT32,
+        DType.BOOL,
+    ]
     TYPE_FI16 = [DType.FLOAT, DType.INT16]
 
-    TYPE_NARROW_INT_FP = [DType.INT8, DType.INT16, DType.FLOAT]
+    TYPE_NARROW_INT_FP = [DType.INT8, DType.INT16, DType.FP16, DType.FLOAT]
 
     TYPE_CONV = [
         [DType.INT8, DType.INT4, DType.INT32],
         [DType.INT8, DType.INT8, DType.INT32],
         [DType.INT16, DType.INT8, DType.INT48],
+        [DType.FP16, DType.FP16, DType.FP16],
+        [DType.FP16, DType.FP16, DType.FLOAT],
         DType.FLOAT,
     ]
 
@@ -2524,7 +2618,7 @@
                 build_fully_connected,
                 TosaTensorGen.tgFullyConnected,
                 TosaTensorValuesGen.tvgDefault,
-                None,
+                TosaArgGen.agFullyConnected,
             ),
             "qgen": TosaQuantGen.qgConv,
             "types": TYPE_CONV,
@@ -2546,7 +2640,7 @@
                 build_matmul,
                 TosaTensorGen.tgMatmul,
                 TosaTensorValuesGen.tvgDefault,
-                None,
+                TosaArgGen.agMatMul,
             ),
             "qgen": TosaQuantGen.qgMatmul,
             "types": TYPE_NARROW_INT_FP,
@@ -2564,7 +2658,7 @@
             "operands": (1, 0),
             "rank": (4, 4),
             "build_fcn": (
-                build_pool2d,
+                build_maxpool2d,
                 TosaTensorGen.tgNHWC,
                 TosaTensorValuesGen.tvgDefault,
                 TosaArgGen.agPooling,
@@ -3384,7 +3478,7 @@
                 TosaTensorValuesGen.tvgReduceSum,
                 TosaArgGen.agAxis,
             ),
-            "types": TYPE_FI32,
+            "types": (DType.FP16, DType.FLOAT, DType.INT32),
             "error_if_validators": (
                 TosaErrorValidator.evAxisLargerRank,
                 TosaErrorValidator.evAxisSmallerZero,
@@ -3571,7 +3665,7 @@
                 TosaTensorValuesGen.tvgDefault,
                 None,
             ),
-            "types": TYPE_INT_FP,
+            "types": (DType.INT8, DType.INT16, DType.INT32, DType.FP16, DType.FLOAT),
             "error_if_validators": (
                 TosaErrorValidator.evWrongInputType,
                 TosaErrorValidator.evWrongOutputType,
@@ -3612,7 +3706,7 @@
                 TosaTensorValuesGen.tvgDefault,
                 TosaArgGen.agResize,
             ),
-            "types": [DType.INT8, DType.INT16, DType.FLOAT],
+            "types": (DType.INT8, DType.INT16, DType.FP16, DType.FLOAT),
             "invalid_test_validators": (
                 TosaInvalidValidator.ivWrongDataTypeOrModeResize,
             ),
@@ -3646,7 +3740,14 @@
                 TosaTensorValuesGen.tvgDefault,
                 TosaArgGen.agCast,
             ),
-            "types": [DType.FLOAT, DType.INT8, DType.INT16, DType.INT32, DType.BOOL],
+            "types": (
+                DType.FP16,
+                DType.FLOAT,
+                DType.INT8,
+                DType.INT16,
+                DType.INT32,
+                DType.BOOL,
+            ),
             "error_if_validators": (
                 TosaErrorValidator.evWrongInputType,
                 TosaErrorValidator.evWrongOutputType,
@@ -3925,7 +4026,9 @@
         return ser.addOutput(shape, outputDType)
 
     @staticmethod
-    def conv2dOp(ser, rng, ifm, filter, strides, padding, dilations, error_name=None):
+    def conv2dOp(
+        ser, rng, ifm, filter, accum_dtype, strides, padding, dilations, error_name=None
+    ):
 
         # IFM:    NHWC
         # Filter: OHWI
@@ -3958,26 +4061,26 @@
 
         ofm_shape = [ifm.shape[0], h, w, filter.shape[0]]
 
-        if ifm.dtype == DType.INT8:
-            out_dtype = DType.INT32
-        elif ifm.dtype == DType.INT16:
-            out_dtype = DType.INT48
-        elif ifm.dtype == DType.FLOAT:
-            out_dtype = DType.FLOAT
-        elif error_name == ErrorIf.WrongInputType:
+        if error_name == ErrorIf.WrongInputType:
             # Pick some potentially correct output dtype if input type is incorrect
             out_dtype = DType.INT32
         else:
-            raise Exception(f"Unsupported input dtype: {ifm.dtype}")
+            out_dtype = accum_dtype
 
         if error_name == ErrorIf.WrongOutputType:
-            wrong_dtypes = list(usableDTypes(excludes=[out_dtype]))
+            if ifm.dtype == DType.FP16:
+                excludes = [DType.FP16, DType.FLOAT]
+            else:
+                excludes = [out_dtype]
+            wrong_dtypes = list(usableDTypes(excludes=excludes))
             out_dtype = rng.choice(wrong_dtypes)
 
         return ser.addOutput(ofm_shape, out_dtype)
 
     @staticmethod
-    def conv3dOp(ser, rng, ifm, filter, strides, padding, dilations, error_name=None):
+    def conv3dOp(
+        ser, rng, ifm, filter, accum_dtype, strides, padding, dilations, error_name=None
+    ):
 
         # IFM:    NDHWC
         # Filter: ODHWI
@@ -4020,27 +4123,25 @@
 
         ofm_shape = [ifm.shape[0], d, h, w, filter.shape[0]]
 
-        if ifm.dtype == DType.INT8:
-            out_dtype = DType.INT32
-        elif ifm.dtype == DType.INT16:
-            out_dtype = DType.INT48
-        elif ifm.dtype == DType.FLOAT:
-            out_dtype = DType.FLOAT
-        elif error_name == ErrorIf.WrongInputType:
+        if error_name == ErrorIf.WrongInputType:
             # Pick some potentially correct output dtype if input type is incorrect
             out_dtype = DType.INT32
         else:
-            raise Exception(f"Unsupported input dtype: {ifm.dtype}")
+            out_dtype = accum_dtype
 
         if error_name == ErrorIf.WrongOutputType:
-            wrong_dtypes = list(usableDTypes(excludes=[out_dtype]))
+            if ifm.dtype == DType.FP16:
+                excludes = [DType.FP16, DType.FLOAT]
+            else:
+                excludes = [out_dtype]
+            wrong_dtypes = list(usableDTypes(excludes=excludes))
             out_dtype = rng.choice(wrong_dtypes)
 
         return ser.addOutput(ofm_shape, out_dtype)
 
     @staticmethod
     def depthwiseConv2dOp(
-        ser, rng, ifm, filter, strides, padding, dilations, error_name=None
+        ser, rng, ifm, filter, accum_dtype, strides, padding, dilations, error_name=None
     ):
         # IFM:    NHWC
         # Filter: HWCM
@@ -4073,20 +4174,18 @@
 
         ofm_shape = [ifm.shape[0], h, w, filter.shape[2] * filter.shape[3]]
 
-        if ifm.dtype == DType.INT8:
-            out_dtype = DType.INT32
-        elif ifm.dtype == DType.INT16:
-            out_dtype = DType.INT48
-        elif ifm.dtype == DType.FLOAT:
-            out_dtype = DType.FLOAT
-        elif error_name == ErrorIf.WrongInputType:
+        if error_name == ErrorIf.WrongInputType:
             # Pick some potentially correct output dtype if input type is incorrect
             out_dtype = DType.INT32
         else:
-            raise Exception(f"Unsupported input dtype: {ifm.dtype}")
+            out_dtype = accum_dtype
 
         if error_name == ErrorIf.WrongOutputType:
-            wrong_dtypes = list(usableDTypes(excludes=[out_dtype]))
+            if ifm.dtype == DType.FP16:
+                excludes = [DType.FP16, DType.FLOAT]
+            else:
+                excludes = [out_dtype]
+            wrong_dtypes = list(usableDTypes(excludes=excludes))
             out_dtype = rng.choice(wrong_dtypes)
 
         return ser.addOutput(ofm_shape, out_dtype)
@@ -4119,6 +4218,7 @@
                 DType.INT32,
                 DType.INT48,
                 DType.FLOAT,
+                DType.FP16,
             ]
             wrong_dtypes = list(set(all_dtypes) - set([ifm.dtype]))
             outputDType = rng.choice(wrong_dtypes)
@@ -4128,55 +4228,20 @@
         return ser.addOutput(ofm_shape, outputDType)
 
     @staticmethod
-    def fullyConnectedOp(ser, rng, input, filter, error_name=None):
+    def fullyConnectedOp(ser, rng, input, filter, accum_dtype, error_name=None):
         # input: N, IC
         # filter: OC, IC
         # output: N, OC
 
         output_shape = [input.shape[0], filter.shape[0]]
 
-        if error_name == ErrorIf.WrongOutputType:
-            if input.dtype == DType.INT8:
-                incorrect_types = (
-                    DType.INT4,
-                    DType.INT8,
-                    DType.INT16,
-                    DType.INT48,
-                    DType.FLOAT,
-                )
-            elif input.dtype == DType.INT16:
-                incorrect_types = (
-                    DType.INT4,
-                    DType.INT8,
-                    DType.INT16,
-                    DType.INT32,
-                    DType.FLOAT,
-                )
-            elif input.dtype == DType.FLOAT:
-                incorrect_types = (
-                    DType.INT4,
-                    DType.INT8,
-                    DType.INT16,
-                    DType.INT32,
-                    DType.INT48,
-                )
-            out_dtype = rng.choice(a=incorrect_types)
-        elif input.dtype == DType.INT8:
-            out_dtype = DType.INT32
-        elif input.dtype == DType.INT16:
-            out_dtype = DType.INT48
-        elif input.dtype == DType.FLOAT:
-            out_dtype = DType.FLOAT
-        elif error_name == ErrorIf.WrongInputType:
-            # Pick some potentially correct output dtype if input type is incorrect
-            out_dtype = DType.INT32
-        else:
-            raise Exception("Unsupported input dtype: {}".format(input.dtype))
+        # The accumulator dtype is validated in arg_gen
+        # (and deliberately made invalid for ErrorIf tests)
+        out_dtype = accum_dtype
 
         return ser.addOutput(output_shape, out_dtype)
 
     @staticmethod
-    def matmulOp(ser, rng, a, b, error_name=None):
+    def matmulOp(ser, rng, a, b, accum_dtype, error_name=None):
         # a: N, H, C
         # b: N, C, W
         # out: N, H, W
@@ -4200,7 +4265,7 @@
                     DType.INT32,
                     DType.FLOAT,
                 )
-            elif a.dtype == DType.FLOAT:
+            elif a.dtype == DType.FLOAT or a.dtype == DType.FP16:
                 incorrect_types = (
                     DType.INT4,
                     DType.INT8,
@@ -4209,17 +4274,11 @@
                     DType.INT48,
                 )
             out_dtype = rng.choice(a=incorrect_types)
-        elif a.dtype == DType.INT8:
-            out_dtype = DType.INT32
-        elif a.dtype == DType.INT16:
-            out_dtype = DType.INT48
-        elif a.dtype == DType.FLOAT:
-            out_dtype = DType.FLOAT
         elif error_name == ErrorIf.WrongInputType:
             # Pick some potentially correct output dtype if input type is incorrect
             out_dtype = DType.INT32
         else:
-            raise Exception("Unsupported input dtype for matmul: {}".format(a.dtype))
+            out_dtype = accum_dtype  # Validated in arg_gen
 
         return ser.addOutput(output_shape, out_dtype)
 
@@ -4269,10 +4328,6 @@
             bad_dim = rng.choice(range(len(output_shape)))
             output_shape[bad_dim] -= rng.choice([1, 2])
 
-        # Fix negative output shape if error_if test causes it
-        if error_name == ErrorIf.PadSmallerZero and min(output_shape) < 1:
-            output_shape = [i if i >= 1 else 1 for i in output_shape]
-
         if error_name == ErrorIf.WrongOutputType:
             all_dtypes = [
                 DType.INT8,
@@ -4280,6 +4335,7 @@
                 DType.INT32,
                 DType.INT48,
                 DType.FLOAT,
+                DType.FP16,
             ]
             wrong_dtypes = list(set(all_dtypes) - set([a.dtype]))
             outputDType = rng.choice(wrong_dtypes)
@@ -4546,7 +4602,7 @@
         return ser.addOutput(val.shape, out_dtype)
 
     @staticmethod
-    def transposeConv2DOp(ser, rng, ifm, output_shape, error_name=None):
+    def transposeConv2DOp(ser, rng, ifm, output_shape, accum_dtype, error_name=None):
         if error_name == ErrorIf.ConvOutputShapeMismatch:
             choices = [1, 2, 3]
             change = rng.choice(choices)
@@ -4555,20 +4611,18 @@
             if change in [2, 3]:
                 output_shape[2] = output_shape[2] + rng.choice(choices)
 
-        if ifm.dtype == DType.INT8:
-            out_dtype = DType.INT32
-        elif ifm.dtype == DType.INT16:
-            out_dtype = DType.INT48
-        elif ifm.dtype == DType.FLOAT:
-            out_dtype = DType.FLOAT
-        elif error_name == ErrorIf.WrongInputType:
+        if error_name == ErrorIf.WrongInputType:
             # Pick some potentially correct output dtype if input type is incorrect
             out_dtype = DType.INT32
         else:
-            raise Exception(f"Unsupported input dtype: {ifm.dtype}")
+            out_dtype = accum_dtype
 
         if error_name == ErrorIf.WrongOutputType:
-            wrong_dtypes = list(usableDTypes(excludes=[out_dtype]))
+            if ifm.dtype == DType.FP16:
+                excludes = [DType.FP16, DType.FLOAT]
+            else:
+                excludes = [out_dtype]
+            wrong_dtypes = list(usableDTypes(excludes=excludes))
             out_dtype = rng.choice(wrong_dtypes)
 
         return ser.addOutput(output_shape, out_dtype)
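
The new FP16 branches in the random data helpers above (getRandNumberDType and the tensor generator) mirror the float32 path but narrow to numpy float16. A standalone sketch (np.random.default_rng here is an assumption; the generator uses its own seeded rng instance):

    import numpy as np

    rng = np.random.default_rng(0)
    fp16_tensor = np.float16(rng.random(size=(2, 3)))  # uniform [0, 1) cast to half
    fp16_scalar = np.float16(rng.random())
    assert fp16_tensor.dtype == np.float16
    assert fp16_tensor.dtype.itemsize * 8 == 16        # matches typeWidth(FP16)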
diff --git a/verif/generator/tosa_utils.py b/verif/generator/tosa_utils.py
index 6a689d0..7fa31e7 100644
--- a/verif/generator/tosa_utils.py
+++ b/verif/generator/tosa_utils.py
@@ -84,3 +84,42 @@
     for n in shape:
         value *= n
     return value
+
+
+def get_accum_dtype_from_tgTypes(dtypes):
+    # Get the accumulator data-type from the test generator's defined dtypes
+    if isinstance(dtypes, list) or isinstance(dtypes, tuple):
+        return dtypes[-1]
+    else:
+        return dtypes
+
+
+def get_wrong_output_type(op_name, rng, input_dtype):
+    if op_name == "fully_connected" or op_name == "matmul":
+        if input_dtype == DType.INT8:
+            incorrect_types = (
+                DType.INT4,
+                DType.INT8,
+                DType.INT16,
+                DType.INT48,
+                DType.FLOAT,
+                DType.FP16,
+            )
+        elif input_dtype == DType.INT16:
+            incorrect_types = (
+                DType.INT4,
+                DType.INT8,
+                DType.INT16,
+                DType.INT32,
+                DType.FLOAT,
+                DType.FP16,
+            )
+        elif input_dtype == DType.FLOAT or input_dtype == DType.FP16:
+            incorrect_types = (
+                DType.INT4,
+                DType.INT8,
+                DType.INT16,
+                DType.INT32,
+                DType.INT48,
+            )
+    return rng.choice(a=incorrect_types)
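
A standalone stand-in for the fp16/fp32 branch of get_wrong_output_type above (string stand-ins for DType): every candidate "wrong" output dtype is an integer type, so neither valid accumulator (FP16 or FLOAT) can be drawn for matmul or fully_connected ErrorIf tests.

    import random

    def wrong_output_type_fp(rng):
        incorrect = ("INT4", "INT8", "INT16", "INT32", "INT48")
        return rng.choice(incorrect)

    assert wrong_output_type_fp(random.Random(0)) not in ("FP16", "FLOAT")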
diff --git a/verif/tests/test_tosa_result_checker.py b/verif/tests/test_tosa_result_checker.py
index efee23b..d78d158 100644
--- a/verif/tests/test_tosa_result_checker.py
+++ b/verif/tests/test_tosa_result_checker.py
@@ -40,7 +40,7 @@
         (np.uint16, trc.TestResult.MISMATCH),
         (np.uint32, trc.TestResult.MISMATCH),
         (np.uint64, trc.TestResult.MISMATCH),
-        (np.float16, trc.TestResult.MISMATCH),
+        (np.float16, trc.TestResult.PASS),
         (np.float32, trc.TestResult.PASS),
         (np.float64, trc.TestResult.MISMATCH),
         (bool, trc.TestResult.PASS),