MLBEDSW-8042: MLCE: Add SQUARED_DIFFERENCE support

- Added SQUARED_DIFFERENCE support
- Updated SUPPORTED_OPS.md

Change-Id: Id83d9d92129e645390c7979759dfdeff7a14c2ee
Signed-off-by: Johan Alfven <johan.alfven@arm.com>
diff --git a/SUPPORTED_OPS.md b/SUPPORTED_OPS.md
index 0fef738..0d60c67 100644
--- a/SUPPORTED_OPS.md
+++ b/SUPPORTED_OPS.md
@@ -19,7 +19,7 @@
 # Supported Ops
 
 This file was automatically generated by Vela using the `--supported-ops-report` parameter.  
-Vela version: `3.9.1.dev2+gc02eaa3.d20230904`
+Vela version: `3.9.1.dev7+g3a3f35e.d20230912`
 
 This file complies with
 [**Gitiles Markdown syntax**](https://github.com/google/gitiles/blob/master/Documentation/markdown.md)
@@ -70,6 +70,7 @@
 | SOFTMAX | [Generic](#tflite-generic-constraints), [Specific](#tflite-softmax-constraints) |
 | SPLIT | [Generic](#tflite-generic-constraints), [Specific](#tflite-split-constraints) |
 | SPLIT_V | [Generic](#tflite-generic-constraints), [Specific](#tflite-split_v-constraints) |
+| SQUARED_DIFFERENCE | [Generic](#tflite-generic-constraints), [Specific](#tflite-squared_difference-constraints) |
 | SQUEEZE | [Generic](#tflite-generic-constraints), [Specific](#tflite-squeeze-constraints) |
 | STRIDED_SLICE | [Generic](#tflite-generic-constraints), [Specific](#tflite-strided_slice-constraints) |
 | SUB | [Generic](#tflite-generic-constraints), [Specific](#tflite-sub-constraints) |
@@ -367,6 +368,12 @@
 
 - Only one size is allowed to be inferred
 
+### TFLite SQUARED_DIFFERENCE Constraints
+
+This is a list of constraints that the SQUARED_DIFFERENCE operator must satisfy in order to be scheduled on the NPU.
+
+- At least one Input's shape must match the OFM's shape
+
 ### TFLite SQUEEZE Constraints
 
 This is a list of constraints that the SQUEEZE operator must satisfy in order to be scheduled on the NPU.
diff --git a/ethosu/vela/operation.py b/ethosu/vela/operation.py
index 94d256c..c9a30b2 100644
--- a/ethosu/vela/operation.py
+++ b/ethosu/vela/operation.py
@@ -286,7 +286,7 @@
     SplitV = OperatorInfo(indices=NNG_IFM_INDICES)
     Sqrt = OperatorInfo()
     Square = OperatorInfo()
-    SquaredDifference = OperatorInfo()
+    SquaredDifference = OperatorInfo(block_type=NpuBlockType.ElementWise, indices=NNG_IFM_IFM2_INDICES)
     Squeeze = OperatorInfo(indices=NNG_IFM_INDICES)
     StridedSlice = OperatorInfo(indices=NNG_IFM_INDICES)
     Sub = OperatorInfo(block_type=NpuBlockType.ElementWise, indices=NNG_IFM_IFM2_INDICES)
diff --git a/ethosu/vela/operation_util.py b/ethosu/vela/operation_util.py
index ef4949f..44a80b2 100644
--- a/ethosu/vela/operation_util.py
+++ b/ethosu/vela/operation_util.py
@@ -98,7 +98,8 @@
 
     c = ifm.shape[-1]
 
-    shape = [1, 1, 1, c]
+    # Weight shape is in format [h, w, c, b]
+    shape = [1, 1, c, 1]
     kernel = np.dstack([1] * c)
     identity_quant = QuantizationParameters(scale_f32=1.0, zero_point=0)
     op.add_input_tensor(
diff --git a/ethosu/vela/tflite_graph_optimiser.py b/ethosu/vela/tflite_graph_optimiser.py
index 2fb75e6..794a6ec 100644
--- a/ethosu/vela/tflite_graph_optimiser.py
+++ b/ethosu/vela/tflite_graph_optimiser.py
@@ -1986,6 +1986,115 @@
         return check_asymmetric_weights
 
 
+def convert_squared_difference(op, arch, nng):
+    if op.type == Op.SquaredDifference and op.run_on_npu:
+        ifm, ifm2, ofm = op.get_ifm_ifm2_ofm()
+
+        identity_quant = QuantizationParameters(scale_f32=1.0, zero_point=0)
+
+        # All the calculations/parameters same as reference kernel
+        twice_max_input_scale = np.double(2.0 * max(ifm.quantization.scale_f32, ifm2.quantization.scale_f32))
+        real_input1_multiplier = np.double(ifm.quantization.scale_f32) / twice_max_input_scale
+        real_input2_multiplier = np.double(ifm2.quantization.scale_f32) / twice_max_input_scale
+
+        left_shift = 0 if op.ifm.dtype == DataType.int16 else 7
+
+        real_output_multiplier = (twice_max_input_scale * twice_max_input_scale) / (
+            np.double((1 << (left_shift * 2)) * ofm.quantization.scale_f32)
+        )
+
+        input1_multiplier, input1_shift = quantise_scale(real_input1_multiplier)
+        input2_multiplier, input2_shift = quantise_scale(real_input2_multiplier)
+        output_multiplier, output_shift = quantise_scale(real_output_multiplier)
+
+        input1_multiplier_const = create_const_tensor(
+            op.name + "_input1_multiplier", [1], DataType.int32, [input1_multiplier], quantization=identity_quant
+        )
+        input2_multiplier_const = create_const_tensor(
+            op.name + "_input2_multiplier", [1], DataType.int32, [input2_multiplier], quantization=identity_quant
+        )
+        output_multiplier_const = create_const_tensor(
+            op.name + "_output_multiplier", [1], DataType.int32, [output_multiplier], quantization=identity_quant
+        )
+
+        # Convert ifm to 32 bit
+        ifm_32bit_shifted = ifm.clone(suffix="_ifm_32bit_shifted", set_unique=True)
+        ifm_32bit_shifted.dtype = DataType.int32
+        ifm_32bit_shifted.quantization = identity_quant
+        cast_op = create_cast_op(op.name + "_ifm_32bit_shifted", ifm, ifm_32bit_shifted)
+        # Use explicit scaling (multiplier) for the left shift
+        cast_op.explicit_scaling = ExplicitScaling(False, [0], [1 << left_shift])
+        DebugDatabase.add_optimised(op, cast_op)
+
+        # 32 bit Mul op does not scale the value so the input has to be multiplied with the "multiplier" calculated above
+        ifm_scaled = ifm.clone(suffix="_scaled", set_unique=True)
+        ifm_scaled.dtype = DataType.int32
+        ifm_scaled.quantization = identity_quant
+        mul_op = Operation(Op.Mul, op.name + "_scaled_input1")
+        mul_op.add_input_tensor(ifm_32bit_shifted)
+        mul_op.add_input_tensor(input1_multiplier_const)
+        mul_op.set_output_tensor(ifm_scaled)
+        # Use explicit scaling for the shift (multiplier not actually used for int32, but value can not be empty)
+        mul_op.explicit_scaling = ExplicitScaling(False, [input1_shift], [input1_multiplier])
+        mul_op.set_ifm_ofm_shapes()
+        DebugDatabase.add_optimised(op, mul_op)
+
+        # Convert ifm2 to 32 bit
+        ifm2_32bit_shifted = ifm2.clone(suffix="_ifm2_32bit_shifted", set_unique=True)
+        ifm2_32bit_shifted.dtype = DataType.int32
+        ifm2_32bit_shifted.quantization = identity_quant
+        cast_op = create_cast_op(op.name + "_ifm2_32bit_shifted", ifm2, ifm2_32bit_shifted)
+        # Use explicit scaling (multiplier) for the left shift
+        cast_op.explicit_scaling = ExplicitScaling(False, [0], [1 << left_shift])
+        DebugDatabase.add_optimised(op, cast_op)
+
+        # 32 bit Mul op does not scale the value so the input has to be multiplied with the "multiplier" calculated above
+        ifm2_scaled = ifm2.clone(suffix="_scaled", set_unique=True)
+        ifm2_scaled.dtype = DataType.int32
+        ifm2_scaled.quantization = identity_quant
+        mul_op = Operation(Op.Mul, op.name + "_scaled_input2")
+        mul_op.add_input_tensor(ifm2_32bit_shifted)
+        mul_op.add_input_tensor(input2_multiplier_const)
+        mul_op.set_output_tensor(ifm2_scaled)
+        # Use explicit scaling for the shift (multiplier not actually used for int32, but value can not be empty)
+        mul_op.explicit_scaling = ExplicitScaling(False, [input2_shift], [input2_multiplier])
+        mul_op.set_ifm_ofm_shapes()
+        DebugDatabase.add_optimised(op, mul_op)
+
+        # Calculate the raw diff
+        raw_diff = ifm.clone(suffix="_raw_diff", set_unique=True)
+        raw_diff.dtype = DataType.int32
+        raw_diff.quantization = None
+        sub_op = Operation(Op.Sub, op.name + "_raw_diff")
+        sub_op.add_input_tensor(ifm_scaled)
+        sub_op.add_input_tensor(ifm2_scaled)
+        sub_op.set_output_tensor(raw_diff)
+        sub_op.set_ifm_ofm_shapes()
+        DebugDatabase.add_optimised(op, sub_op)
+
+        # Calculate the squared diff
+        squared_raw = ifm.clone(suffix="_squared_raw", set_unique=True)
+        squared_raw.dtype = DataType.int32
+        squared_raw.quantization = None
+        mul_op = Operation(Op.Mul, op.name + "_squared_raw")
+        mul_op.add_input_tensor(raw_diff)
+        mul_op.add_input_tensor(raw_diff)
+        mul_op.set_output_tensor(squared_raw)
+        mul_op.set_ifm_ofm_shapes()
+        DebugDatabase.add_optimised(op, mul_op)
+
+        # 32 bit Mul op does not scale the value so the output has to be multiplied with the "multiplier" calculated above
+        op.set_input_tensor(squared_raw, 0)
+        op.set_input_tensor(output_multiplier_const, 1)
+        op.type = Op.Mul
+        # Use explicit scaling for the shift (multiplier not actually used for int32, but value can not be empty)
+        op.explicit_scaling = ExplicitScaling(False, [output_shift], [output_multiplier])
+        op.set_ifm_ofm_shapes()
+        DebugDatabase.add_optimised(op, op)
+
+    return op
+
+
 def convert_mean_to_depthwise_conv(op, arch, nng):
     """
     When h x w <= 4096     When h x w > 4096 there is a need to split into several ops.
@@ -2669,6 +2778,7 @@
     op_rewrite_list = [
         set_tensor_equivalence,
         convert_ops_to_lut,
+        convert_squared_difference,
         convert_mean_to_depthwise_conv,
         convert_depthwise_to_conv,
         convert_conv_to_fc,
diff --git a/ethosu/vela/tflite_mapping.py b/ethosu/vela/tflite_mapping.py
index 647430e..b1e0eae 100644
--- a/ethosu/vela/tflite_mapping.py
+++ b/ethosu/vela/tflite_mapping.py
@@ -848,7 +848,7 @@
     BuiltinOperator.SQUARED_DIFFERENCE: (
         Op.SquaredDifference,
         OptionsSerializer("SquaredDifferenceOptions"),
-        TFLITE_NO_INDICES,
+        TFLITE_IFM_IFM2_INDICES,
     ),
     BuiltinOperator.MIRROR_PAD: (Op.MirrorPad, OptionsSerializer("MirrorPadOptions", ("mode",)), TFLITE_NO_INDICES),
     BuiltinOperator.ABS: (Op.Abs, OptionsSerializer("AbsOptions"), TFLITE_IFM_INDICES),
diff --git a/ethosu/vela/tflite_model_semantic.py b/ethosu/vela/tflite_model_semantic.py
index d2e0ba5..258af93 100644
--- a/ethosu/vela/tflite_model_semantic.py
+++ b/ethosu/vela/tflite_model_semantic.py
@@ -76,7 +76,7 @@
         )
     )
     binary_elem_wise_main_ops = binary_elem_wise_min_max_ops | binary_elem_wise_add_mul_sub | binary_elem_wise_shift_ops
-    elem_wise_main_ops = binary_elem_wise_main_ops | unary_elem_wise_main_ops
+    elem_wise_main_ops = binary_elem_wise_main_ops | unary_elem_wise_main_ops | set((Op.SquaredDifference,))
     shapeless_input_ops = binary_elem_wise_main_ops | set(
         (Op.Split, Op.SplitV, Op.Mean, Op.ExpandDims, Op.Quantize, Op.ArgMax)
     )
diff --git a/ethosu/vela/tflite_supported_operators.py b/ethosu/vela/tflite_supported_operators.py
index 52b0485..3dbde84 100644
--- a/ethosu/vela/tflite_supported_operators.py
+++ b/ethosu/vela/tflite_supported_operators.py
@@ -106,7 +106,7 @@
         )
     )
     binary_elem_wise_main_ops = binary_elem_wise_min_max_ops | binary_elem_wise_add_mul_sub | binary_elem_wise_shift_ops
-    elem_wise_main_ops = binary_elem_wise_main_ops | unary_elem_wise_main_ops
+    elem_wise_main_ops = binary_elem_wise_main_ops | unary_elem_wise_main_ops | set((Op.SquaredDifference,))
     pad_ops = set((Op.Pad,))
     supported_int32_tensor_ops = (
         set((Op.ReduceSum, Op.CLZ, Op.Shape, Op.ArgMax)) | binary_elem_wise_add_mul_sub | binary_elem_wise_shift_ops