TOSA: Support for TABLE operator (int8)

Added support to map TABLE operator to LUT.
Limitations:
- Only supported for int8
- TABLE input must be constant

This also adds support for TFLite legalisation of
Tanh/Sigmoid (int8/uint8).
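
For reference, the int8 TABLE op is a plain 256-entry lookup, which is
what lets it map directly onto the LUT activation. Illustrative sketch
of the semantics (not code from this change; numpy only):

    import numpy as np

    def apply_int8_table(ifm, table):
        # An int8 input value (-128..127) indexes a 256-entry int8 table;
        # this is what the LUT activation evaluates on the no-op Add
        # produced by convert_to_lut().
        assert ifm.dtype == np.int8 and table.shape == (256,)
        return table[ifm.astype(np.int32) + 128].astype(np.int8)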

Signed-off-by: Patrik Gustavsson <patrik.gustavsson@arm.com>
Change-Id: I1a95f61fb02fdd42c4a690494418cc0765c8b275
diff --git a/ethosu/vela/graph_optimiser_util.py b/ethosu/vela/graph_optimiser_util.py
index dafd284..d2d3d83 100644
--- a/ethosu/vela/graph_optimiser_util.py
+++ b/ethosu/vela/graph_optimiser_util.py
@@ -19,6 +19,7 @@
 
 import numpy as np
 
+from . import lut
 from .data_type import DataType
 from .debug_database import DebugDatabase
 from .errors import UnsupportedFeatureError
@@ -26,6 +27,8 @@
 from .operation import Op
 from .operation_util import create_avgpool_nop
 from .shape4d import Shape4D
+from .tensor import create_const_tensor
+from .tensor import QuantizationParameters
 
 memory_only_ops = (
     Op.Reshape,
@@ -320,3 +323,31 @@
             )
         DebugDatabase.add_optimised(op, op)
     return op
+
+
+def convert_to_lut(op, lut_values, lut_name):
+    # Rewrite the operation by Add with scalar 0 + LUT activation
+    ifm = op.inputs[0]
+    if ifm is None:
+        return op
+    assert ifm.dtype.size_in_bytes() == 1
+    op.type = Op.Add
+    op.name = op.name + "_lut_" + lut_name
+    # Mark as no-op to enable potential fusing optimizations
+    op.attrs["is_nop"] = True
+    # Create an input tensor containing scalar zero
+    quantization = QuantizationParameters(0.0, 255.0)
+    quantization.scale_f32 = ifm.quantization.scale_f32
+    quantization.zero_point = 0
+    tens = create_const_tensor(op.inputs[0].name + "_scalar0", [], ifm.dtype, [0], np.uint8, quantization=quantization)
+    op.add_input_tensor(tens)
+    op.ifm_shapes.append(Shape4D(tens.shape))  # TODO: is a shape entry needed for the scalar input?
+
+    # The LUT must be applied without any preceding rescaling (the LUT itself performs the rescale),
+    # so even if the OFM has a different scale than the IFM, the generated OFM scale instructions
+    # should be the same as the IFM
+    op.forced_output_quantization = ifm.quantization
+    lut_tensor = lut.create_lut_tensor(op.name + "_values", lut_values, DataType.int8)
+    op.set_activation_lut(lut_tensor)
+    op.set_ifm_ofm_shapes()
+    return op
diff --git a/ethosu/vela/operation.py b/ethosu/vela/operation.py
index e9d364e..1558b94 100644
--- a/ethosu/vela/operation.py
+++ b/ethosu/vela/operation.py
@@ -281,6 +281,7 @@
     SubgraphInput = OperatorInfo()  # Only used in CPU subgraphs
     Sum = OperatorInfo()
     Svdf = OperatorInfo()
+    Table = OperatorInfo(indices=NNG_IFM_INDICES)
     Tanh = OperatorInfo(indices=NNG_IFM_INDICES)
     Tile = OperatorInfo()
     TopKV2 = OperatorInfo()
diff --git a/ethosu/vela/tflite_graph_optimiser.py b/ethosu/vela/tflite_graph_optimiser.py
index b48cc7a..cf211de 100644
--- a/ethosu/vela/tflite_graph_optimiser.py
+++ b/ethosu/vela/tflite_graph_optimiser.py
@@ -22,7 +22,6 @@
 import numpy as np
 
 from . import fp_math
-from . import lut
 from . import rewrite_graph
 from . import scaling
 from .api import NpuRoundingMode
@@ -33,6 +32,7 @@
 from .graph_optimiser_util import bypass_memory_only_ops
 from .graph_optimiser_util import calc_explicit_padding
 from .graph_optimiser_util import convert_depthwise_to_conv
+from .graph_optimiser_util import convert_to_lut
 from .graph_optimiser_util import fix_sg_input_output
 from .graph_optimiser_util import memory_only_ops
 from .graph_optimiser_util import move_splitsliceread_to_consumer
@@ -858,34 +858,6 @@
     return op
 
 
-def convert_to_lut(op, lut_values, lut_name):
-    # Rewrite the operation by Add with scalar 0 + LUT activation
-    ifm = op.inputs[0]
-    if ifm is None:
-        return op
-    assert ifm.dtype.size_in_bytes() == 1
-    op.type = Op.Add
-    op.name = op.name + "_lut_" + lut_name
-    # Mark as no-op to enable potential fusing optimizations
-    op.attrs["is_nop"] = True
-    # Create an input tensor containing scalar zero
-    quantization = QuantizationParameters(0.0, 255.0)
-    quantization.scale_f32 = ifm.quantization.scale_f32
-    quantization.zero_point = 0
-    tens = create_const_tensor(op.inputs[0].name + "_scalar0", [], ifm.dtype, [0], np.uint8, quantization=quantization)
-    op.add_input_tensor(tens)
-    op.ifm_shapes.append(Shape4D(tens.shape))
-
-    # The LUT must be applied without any preceding rescaling (the LUT itself performs the rescale),
-    # so even if the OFM has a different scale than the IFM, the generated OFM scale instructions
-    # should be the same as the IFM
-    op.forced_output_quantization = ifm.quantization
-    lut_tensor = lut.create_lut_tensor(op.name + "_values", lut_values, DataType.int8)
-    op.set_activation_lut(lut_tensor)
-    op.set_ifm_ofm_shapes()
-    return op
-
-
 def convert_to_lut8(op, fn, fn_name):
     # Converts op to a no-op + int8/uint8 LUT which is generated with the given function.
     # fn is a function(real) -> real
diff --git a/ethosu/vela/tosa_graph_optimiser.py b/ethosu/vela/tosa_graph_optimiser.py
index a298ddb..1ef0444 100644
--- a/ethosu/vela/tosa_graph_optimiser.py
+++ b/ethosu/vela/tosa_graph_optimiser.py
@@ -24,6 +24,7 @@
 from .graph_optimiser_util import bypass_memory_only_ops
 from .graph_optimiser_util import calc_explicit_padding
 from .graph_optimiser_util import convert_depthwise_to_conv
+from .graph_optimiser_util import convert_to_lut
 from .graph_optimiser_util import move_splitsliceread_to_consumer
 from .graph_optimiser_util import needed_total_padding
 from .graph_optimiser_util import set_ifm_ofm_op_shapes
@@ -490,13 +491,26 @@
     return add_op
 
 
+def convert_table_to_lut(op, arch, nng):
+    # Converts table op to a no-op + LUT
+    if op.type is not Op.Table:
+        return op
+
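+    # The table values arrive as the second input tensor; detach it so only
+    # the IFM remains before handing over to the generic LUT conversion.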
+    table = op.inputs[1]
+    op.inputs.remove(table)
+    op.set_ifm_ofm_shapes()
+
+    return convert_to_lut(op, table.values, "table")
+
+
 def fixup_quantization(op, arch, nng):
     if op.ifm and op.ifm.quantization.zero_point is None:
         op.ifm.quantization.zero_point = 0
     if op.ifm2 and op.ifm2.quantization.zero_point is None:
-        op.ifm.quantization.zero_point = 0
-    if op.ofm and op.ofm.quantization.zero_point is None:
-        op.ofm.quantization.zero_point = 0
+        op.ifm2.quantization.zero_point = 0
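+    # Leave the OFM quantization untouched when an output quantization has
+    # been forced, e.g. by convert_to_lut(), which reuses the IFM quantization.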
+    if not op.forced_output_quantization:
+        if op.ofm and op.ofm.quantization and op.ofm.quantization.zero_point is None:
+            op.ofm.quantization.zero_point = 0
     return op
 
 
@@ -547,7 +561,7 @@
         )
 
     # Rewite Operators step
-    op_rewrite_list = [set_tensor_equivalence, rewrite_rescale, convert_depthwise_to_conv]
+    op_rewrite_list = [set_tensor_equivalence, rewrite_rescale, convert_depthwise_to_conv, convert_table_to_lut]
 
     for idx, sg in enumerate(nng.subgraphs):
         nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
diff --git a/ethosu/vela/tosa_mapping.py b/ethosu/vela/tosa_mapping.py
index ebbaa0a..f80a915 100644
--- a/ethosu/vela/tosa_mapping.py
+++ b/ethosu/vela/tosa_mapping.py
@@ -196,7 +196,6 @@
     TosaOp.MAXIMUM,
     TosaOp.MINIMUM,
     TosaOp.POW,
-    TosaOp.TABLE,
     TosaOp.ABS,
     TosaOp.BITWISE_NOT,
     TosaOp.CEIL,
@@ -274,7 +273,8 @@
     TosaOp.MUL: (Op.Mul, mul_attrs, None, TOSA_IFM_IFM2_INDICES),
     # TODO TosaOp.POW
     TosaOp.SUB: (Op.Sub, None, None, TOSA_IFM_IFM2_INDICES),
-    # TODO TosaOp.TABLE
+    # TODO is table content in input[1] always constant?
+    TosaOp.TABLE: (Op.Table, None, None, TOSA_IFM_INDICES),
     # TODO TosaOp.ABS
     # TODO TosaOp.BITWISE_NOT
     # TODO TosaOp.CEIL
diff --git a/ethosu/vela/tosa_supported_operators.py b/ethosu/vela/tosa_supported_operators.py
index a4f822e..98df27e 100644
--- a/ethosu/vela/tosa_supported_operators.py
+++ b/ethosu/vela/tosa_supported_operators.py
@@ -42,7 +42,7 @@
     binary_elem_wise_add_mul_sub = set((Op.Add, Op.Mul, Op.RescaleMul, Op.Sub,))
     type_conversion_ops = set((Op.Rescale,))
     relu_ops = set((Op.Clamp, Op.ReluN,))
-    activation_ops = relu_ops
+    activation_ops = relu_ops | set((Op.Table,))
     pad_ops = set((Op.Pad,))
 
     npu_post_ops = activation_ops
@@ -68,6 +68,8 @@
 
         self.specific_constraints[Op.Transpose].append(TosaSupportedOperators.constraint_ifm_producer)
         self.specific_constraints[Op.Pad].append(TosaSupportedOperators.constraint_padding_producer)
+        self.specific_constraints[Op.Table].append(TosaSupportedOperators.constraint_table_dtype)
+        self.specific_constraints[Op.Table].append(TosaSupportedOperators.constraint_table_producer)
 
         # Depthwise Conv specific checks:
         for op_type in TosaSupportedOperators.depthwise_convolution_ops:
@@ -200,3 +202,23 @@
             )
             return valid, extra
         return True, "Op has depth_multiplier=1"
+
+    # TODO Table operator support limited to int8 for now.
+    # For TFLite it is assumed to be constant.
+    @staticmethod
+    def constraint_table_dtype(op):
+        "Only supported is int8"
+        valid = True
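+        # op.inputs[1] holds the table values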
+        tensors = [op.ifm, op.ofm, op.inputs[1]]
+        for tens in tensors:
+            if tens.dtype != DataType.int8:
+                valid = False
+        return valid, "Table operator with non int8 tensor"
+
+    # TODO limit table to be constant data for now.
+    # Can it be non-constant?
+    @staticmethod
+    def constraint_table_producer(op):
+        "Input must be constant data"
+        valid = op.inputs[1].ops and op.inputs[1].ops[0].type == Op.Const
+        return valid, "Table Op with non-constant table input"