MLBEDSW-839: Code generation using external API

Added an external API for generating register command streams.

The existing code generation has been refactored to make use of
this API.
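
As a rough illustration, a list of high level commands can be
translated into public-API operations as sketched below. The helper
name and its inputs are hypothetical; only convert_command_to_npu_op,
NpuOperation and ArchitectureFeatures come from this patch.

    from typing import List

    from ethosu.vela.api import NpuOperation
    from ethosu.vela.architecture_features import ArchitectureFeatures
    from ethosu.vela.high_level_command_to_npu_op import convert_command_to_npu_op


    # Hypothetical helper, for illustration only
    def to_npu_ops(cmds, arch: ArchitectureFeatures) -> List[NpuOperation]:
        # Each high level command (DMA or NpuStripe) maps to one
        # public-API NpuOperation; the resulting list can then be fed
        # to the external register command stream generator.
        return [convert_command_to_npu_op(cmd, arch) for cmd in cmds]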

Change-Id: Ibb4c2b167809869f16470b14da24f08a65c82b7b
Signed-off-by: Louis Verhaard <louis.verhaard@arm.com>
diff --git a/ethosu/vela/high_level_command_to_npu_op.py b/ethosu/vela/high_level_command_to_npu_op.py
new file mode 100644
index 0000000..7750121
--- /dev/null
+++ b/ethosu/vela/high_level_command_to_npu_op.py
@@ -0,0 +1,497 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Description:
+# Conversion from high level command to NpuOperation
+from enum import IntEnum
+from typing import List
+from typing import Optional
+
+from .api import NpuActivation
+from .api import NpuActivationOp
+from .api import NpuAddressRange
+from .api import NpuBlockOperation
+from .api import NpuBlockTraversal
+from .api import NpuConv2DOperation
+from .api import NpuConvDepthWiseOperation
+from .api import NpuDataType
+from .api import NpuDmaOperation
+from .api import NpuElementWiseOp
+from .api import NpuElementWiseOperation
+from .api import NpuFeatureMap
+from .api import NpuKernel
+from .api import NpuLayout
+from .api import NpuOperation
+from .api import NpuPadding
+from .api import NpuPoolingOp
+from .api import NpuPoolingOperation
+from .api import NpuQuantization
+from .api import NpuResamplingMode
+from .api import NpuRoundingMode
+from .api import NpuShape3D
+from .api import NpuTileBox
+from .architecture_features import ArchitectureFeatures
+from .data_type import DataType
+from .high_level_command_stream import Box
+from .high_level_command_stream import Command
+from .high_level_command_stream import CommandType
+from .high_level_command_stream import DMA
+from .high_level_command_stream import NpuStripe
+from .operation import Kernel
+from .operation import NpuBlockType
+from .operation import Op
+from .operation import Operation
+from .tensor import MemType
+from .tensor import Tensor
+from .tensor import TensorBlockTraversal
+from .tensor import TensorFormat
+from .tensor import TensorPurpose
+
+
+unary_elementwise_ops = {NpuElementWiseOp.ABS, NpuElementWiseOp.LRELU, NpuElementWiseOp.CLZ}
+
+
+class BasePointerIndex(IntEnum):
+    WeightTensor = 0  # base address index for the Weight tensor
+    ScratchTensor = 1  # base address index for the Scratch_tensor in the TensorArena
+    ScratchFastTensor = 2  # base address index for the Scratch_fast_tensor
+    Mem2Mem = (1 << 8) | (3 << 0)  # base address slot for memory-to-memory transfer
+
+
+dtype_map = {
+    DataType.uint8: NpuDataType.UINT8,
+    DataType.int8: NpuDataType.INT8,
+    DataType.uint16: NpuDataType.UINT16,
+    DataType.int16: NpuDataType.INT16,
+    DataType.int32: NpuDataType.INT32,
+}
+
+
+block_traversal_map = {
+    TensorBlockTraversal.DepthFirst: NpuBlockTraversal.DEPTH_FIRST,
+    TensorBlockTraversal.PartKernelFirst: NpuBlockTraversal.PART_KERNEL_FIRST,
+}
+
+
+# Maps an elementwise op type to an elementwise_mode enum value used by NPU_OP_ELEMENTWISE
+elementwise_op_map = {
+    Op.Mul: NpuElementWiseOp.MUL,
+    Op.Add: NpuElementWiseOp.ADD,
+    Op.Sub: NpuElementWiseOp.SUB,
+    Op.Minimum: NpuElementWiseOp.MIN,
+    Op.Maximum: NpuElementWiseOp.MAX,
+    Op.LeakyRelu: NpuElementWiseOp.LRELU,
+    Op.Abs: NpuElementWiseOp.ABS,
+    Op.CLZ: NpuElementWiseOp.CLZ,
+    Op.SHR: NpuElementWiseOp.SHR,
+    Op.SHL: NpuElementWiseOp.SHL,
+}
+
+
+def to_npu_kernel(kernel: Kernel) -> NpuKernel:
+    """Converts the given internally used kernel object to NpuKernel (of public API)"""
+    return NpuKernel(
+        kernel.width, kernel.height, kernel.stride.x, kernel.stride.y, kernel.dilation.x, kernel.dilation.y
+    )
+
+
+def to_kernel(kernel: Optional[NpuKernel]) -> Kernel:
+    """Converts the given public API object to Kernel (used internally)"""
+    if kernel is None:
+        return Kernel(1, 1)
+    return Kernel(kernel.width, kernel.height, kernel.stride_x, kernel.stride_y, kernel.dilation_x, kernel.dilation_y)
+
+
+def ifm_ifm2_correct_order(ifm_shape: List[int], ifm2_shape: List[int]) -> bool:
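+    """Returns True if IFM1/IFM2 are in the correct order, i.e. any scalar or broadcast operand is already IFM2"""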
+    if ifm_shape == []:
+        # Scalar needs to be in IFM2
+        return False
+    if ifm2_shape == []:
+        return True
+
+    for ifm, ifm2 in zip(ifm_shape, ifm2_shape):
+        if ifm != ifm2 and ifm == 1:
+            # Broadcasted FM needs to be in IFM2
+            return False
+    return True
+
+
+def get_rounding_mode(op: Operation) -> NpuRoundingMode:
+    """Specifies type of rounding to be used"""
+    rounding_mode = NpuRoundingMode.TFL
+    if op.type == Op.ResizeBilinear:
+        rounding_mode = NpuRoundingMode.TRUNCATE
+    elif (
+        op.type.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise)
+        and op.ifm.dtype == DataType.int16
+    ):
+        rounding_mode = NpuRoundingMode.NATURAL
+    elif op.type.is_avgpool_op() and op.memory_function == Op.ConcatSliceWrite and op.kernel.elements_wh() == 1:
+        rounding_mode = NpuRoundingMode.NATURAL
+    rounding_mode = op.attrs.get("rounding_mode", rounding_mode)
+    return rounding_mode
+
+
+def create_padding(cmd: NpuStripe, primary_op: Operation) -> NpuPadding:
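+    """Creates the padding for the stripe, taking IFM streaming into account"""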
+    if primary_op.type.npu_block_type == NpuBlockType.VectorProduct:
+        return NpuPadding(top=0, left=0, bottom=0, right=0)
+    explicit_padding = list(primary_op.attrs["explicit_padding"])  # (top, left, bottom, right)
+
+    # Check if this is for horizontal ifm streaming
+    if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe):
+        explicit_padding[0] = cmd.pad_top
+        explicit_padding[2] = cmd.pad_bottom
+
+    # Indexing from the end, since a 1x1 AvgPool might have been added with a non-4-dimensional input/output
+    # because an activation function needed to be fused.
+    if cmd.ifm_box.start_coord[-2] > 0:
+        explicit_padding[1] = 0
+    if cmd.ifm_box.end_coord[-2] < cmd.ifm_tensor.shape[-2]:
+        explicit_padding[3] = 0
+    return NpuPadding(
+        top=explicit_padding[0], left=explicit_padding[1], bottom=explicit_padding[2], right=explicit_padding[3]
+    )
+
+
+def get_region(tens: Tensor, arch: ArchitectureFeatures) -> int:
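+    """Returns the base pointer index (region) to be used for the given tensor"""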
+    if arch.feature_map_storage_mem_area == arch.fast_storage_mem_area:
+        base_ptr_idx_map = {
+            MemType.Permanent_NPU: BasePointerIndex.WeightTensor,
+            MemType.Permanent_CPU: BasePointerIndex.WeightTensor,
+            MemType.Scratch: BasePointerIndex.ScratchTensor,
+            MemType.Scratch_fast: BasePointerIndex.ScratchTensor,
+        }
+    else:
+        base_ptr_idx_map = {
+            MemType.Permanent_NPU: BasePointerIndex.WeightTensor,
+            MemType.Permanent_CPU: BasePointerIndex.WeightTensor,
+            MemType.Scratch: BasePointerIndex.ScratchTensor,
+            MemType.Scratch_fast: BasePointerIndex.ScratchFastTensor,
+        }
+    return int(base_ptr_idx_map[tens.mem_type])
+
+
+def get_upscale(op: Operation) -> NpuResamplingMode:
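+    """Returns the upscaling mode to be used for the operation's IFM"""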
+    upscale = NpuResamplingMode.NONE
+    if op.type == Op.ResizeBilinear:
+        # perform nearest neighbor upscale
+        upscale = NpuResamplingMode.NEAREST
+    elif op.type == Op.Conv2DBackpropInputSwitchedBias:
+        # perform insert zero upscale
+        upscale = NpuResamplingMode.TRANSPOSE
+    return upscale
+
+
+def get_ifm_depth(npu_block_type: NpuBlockType, ifm_box: Box, ofm_box: Box) -> int:
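+    """Returns the IFM depth; taken from the IFM box for convolution-like operations, otherwise from the OFM box"""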
+    if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
+        shape = ifm_box.get_size_shape()
+    else:
+        shape = ofm_box.get_size_shape()
+    return shape[-1]
+
+
+def use_zero_point_0(ps, tens: Tensor, is_ifm_tensor: bool) -> bool:
+    """Checks if quantization should use 0 as zero point"""
+    if tens.dtype == DataType.int32 and is_ifm_tensor:
+        return True
+    if ps.primary_op.type not in (Op.AvgPool, Op.ResizeBilinear, Op.CLZ, Op.SHL):
+        return False
+    fused_quantize = any(op.type == Op.Quantize for op in ps.ops)
+    forced_ofm_quantization = ps.primary_op.forced_output_quantization
+    use_0 = (
+        (ps.primary_op.activation is None or forced_ofm_quantization is not None)
+        and (ps.primary_op.memory_function != Op.ConcatSliceWrite)
+        and not fused_quantize
+    )
+    return use_0
+
+
+def get_ifm_or_ifm2_quantization(ps, tens: Tensor) -> Optional[NpuQuantization]:
+    """Gets quantization for IFM/IFM2"""
+    if tens.quantization is None:
+        return None
+    if use_zero_point_0(ps, tens, True):
+        zero_point = 0
+    else:
+        zero_point = int(tens.quantization.zero_point)
+    return NpuQuantization(scale_f32=tens.quantization.scale_f32, zero_point=zero_point)
+
+
+def get_ofm_quantization(ps, tens: Tensor) -> Optional[NpuQuantization]:
+    """Gets quantization for OFM"""
+    op = ps.primary_op
+    # Check if the operation's forced output quantization should be used instead of the output tensor's quantization
+    # (used in LUTs)
+    ofm_quant = op.forced_output_quantization if op.forced_output_quantization is not None else tens.quantization
+    if ofm_quant is None:
+        return None
+    if use_zero_point_0(ps, tens, False):
+        zero_point = 0
+    else:
+        zero_point = int(ofm_quant.zero_point)
+    return NpuQuantization(scale_f32=ofm_quant.scale_f32, zero_point=zero_point)
+
+
+def create_feature_map(tens: Tensor, box: Box, arch: ArchitectureFeatures) -> NpuFeatureMap:
+    """Creates feature map with common fields populated"""
+    fm = NpuFeatureMap()
+    fm.region = get_region(tens, arch)
+    fm.data_type = dtype_map[tens.dtype]
+    if tens.format == TensorFormat.NHWC:
+        fm.layout = NpuLayout.NHWC
+    elif tens.format == TensorFormat.NHCWB16:
+        fm.layout = NpuLayout.NHCWB16
+    else:
+        assert 0, "Incorrect tensor format"
+    height_0, height_1, width_0, addresses = tens.addresses_for_rolling_buffer(box.start_coord, box.end_coord)
+    for idx, addr in enumerate(addresses):
+        if addr is None:
+            addresses[idx] = 0
+    fm.tiles = NpuTileBox(
+        height_0=height_0, height_1=height_1, width_0=width_0, addresses=[int(addr) for addr in addresses]
+    )
+    strides = tens.get_strides()
+    fm.strides = NpuShape3D(height=int(strides[2]), width=int(strides[3]), depth=int(strides[1]))
+    return fm
+
+
+def create_weights(weight_tensor: Tensor, weight_box: Box, arch: ArchitectureFeatures) -> List[NpuAddressRange]:
+    """Returns address ranges for weights"""
+    weights = []
+    stream_index = weight_tensor.compressed_stream_index_from_coord(weight_box.start_coord)
+    weight_substream_offsets = weight_tensor.compressed_values_substream_offsets[stream_index]
+    substreams = len(weight_substream_offsets) - 1  # Offset list must terminate with full stream length
+
+    # Extract weight substream offsets and calculate their lengths
+    assert len(weight_substream_offsets) > 1 and (weight_substream_offsets[0] == 0)
+    weight_addr = weight_tensor.address_for_coordinate(weight_box.start_coord)
+    region = get_region(weight_tensor, arch)
+    for core in range(substreams):
+        address = weight_addr + weight_substream_offsets[core]
+        length = weight_substream_offsets[core + 1] - weight_substream_offsets[core]
+        addr_range = NpuAddressRange(region, int(address), int(length))
+        weights.append(addr_range)
+    return weights
+
+
+def create_biases(
+    weight_tensor: Tensor, scale_tensor: Tensor, weight_box: Box, arch: ArchitectureFeatures
+) -> List[NpuAddressRange]:
+    """Returns address ranges for biases"""
+    biases = []
+    stream_index = weight_tensor.compressed_stream_index_from_coord(weight_box.start_coord)
+    scale_substream_offsets = scale_tensor.compressed_values_substream_offsets[stream_index]
+    substreams = len(scale_substream_offsets) - 1  # Offset list must terminate with full stream length
+
+    # Extract scale substream offsets and calculate their lengths
+    assert len(scale_substream_offsets) > 1 and (scale_substream_offsets[0] == 0)
+    scale_addr = scale_tensor.address_for_coordinate(weight_box.start_coord[-1:])
+
+    region = get_region(scale_tensor, arch)
+    for core in range(substreams):
+        address = scale_addr + scale_substream_offsets[core]
+        length = scale_substream_offsets[core + 1] - scale_substream_offsets[core]
+        addr_range = NpuAddressRange(region, int(address), int(length))
+        biases.append(addr_range)
+    return biases
+
+
+def create_npu_activation(op: Operation) -> NpuActivation:
+    """Creates fused activation function"""
+    if op.activation is None:
+        return NpuActivation(NpuActivationOp.NONE_OR_RELU)
+    faf = op.activation.op_type
+    act_op = NpuActivationOp.NONE_OR_RELU
+    if faf == Op.Tanh:
+        act_op = NpuActivationOp.TANH
+    elif faf == Op.Sigmoid:
+        act_op = NpuActivationOp.SIGMOID
+    elif faf == Op.LUT:
+        act_op = NpuActivationOp.TABLE_LOOKUP
+    elif not faf.is_relu_op():
+        raise Exception(f"Unsupported fused_activation_function = {faf.name}")
+
+    act = NpuActivation(act_op)
+    act.min = op.activation.min
+    act.max = op.activation.max
+    act.lookup_table_index = op.activation.lut_index
+    return act
+
+
+def set_common_op_fields(npu_op: NpuBlockOperation, cmd: NpuStripe, arch: ArchitectureFeatures):
+    """Sets common fields of the given operation"""
+    ps = cmd.ps
+    op = ps.primary_op
+    in_shape = cmd.ifm_box.get_size_shape()
+    out_shape = cmd.ofm_box.get_size_shape()
+    ofm_height = out_shape[-3] if len(out_shape) >= 4 else 1
+    ofm_width = out_shape[-2] if len(out_shape) >= 2 else 1
+    ofm_depth = out_shape[-1] if len(out_shape) >= 1 else 1
+    ifm_height = in_shape[-3] if len(in_shape) >= 4 else 1
+    if op.type.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
+        ifm_depth = in_shape[-1] if len(in_shape) >= 1 else 1
+    else:
+        ifm_depth = ofm_depth
+
+    npu_op.ifm = create_feature_map(cmd.ifm_tensor, cmd.ifm_box, arch)
+    npu_op.ifm.shape = NpuShape3D(height=ifm_height, width=cmd.ifm_tensor.shape[-2], depth=ifm_depth)
+    npu_op.ifm.quantization = get_ifm_or_ifm2_quantization(ps, cmd.ifm_tensor)
+    npu_op.ofm = create_feature_map(cmd.ofm_tensor, cmd.ofm_box, arch)
+    npu_op.ofm.shape = NpuShape3D(height=ofm_height, width=ofm_width, depth=ofm_depth)
+    npu_op.ofm.quantization = get_ofm_quantization(ps, cmd.ofm_tensor)
+
+    if cmd.weight_tensor is not None:
+        npu_op.weights = create_weights(cmd.weight_tensor, cmd.weight_box, arch)
+        if cmd.scale_tensor is not None:
+            npu_op.biases = create_biases(cmd.weight_tensor, cmd.scale_tensor, cmd.weight_box, arch)
+    npu_op.activation = create_npu_activation(op)
+    npu_op.rounding_mode = get_rounding_mode(op)
+    npu_op.block_config = NpuShape3D(height=ps.block_config[0], width=ps.block_config[1], depth=ps.block_config[3])
+
+    if not op.type.is_elementwise_op():
+        npu_op.padding = create_padding(cmd, op)
+        npu_op.kernel = to_npu_kernel(op.kernel)
+    npu_op.ifm_upscale = get_upscale(op)
+    npu_op.fused_quantize = any(op.type == Op.Quantize for op in ps.ops)
+    return npu_op
+
+
+def create_npu_conv2d_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuConv2DOperation:
+    """Converts the command to NpuConv2DOperation"""
+    npu_op = NpuConv2DOperation()
+    set_common_op_fields(npu_op, cmd, arch)
+    if cmd.ps.primary_op.type.npu_block_type == NpuBlockType.VectorProduct:
+        npu_op.block_traversal = NpuBlockTraversal.DEPTH_FIRST
+    else:
+        npu_op.block_traversal = block_traversal_map[cmd.weight_tensor.block_traversal]
+    return npu_op
+
+
+def create_npu_conv_depthwise_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuConvDepthWiseOperation:
+    """Converts the command to NpuConvDepthWiseOperation"""
+    npu_op = NpuConvDepthWiseOperation()
+    set_common_op_fields(npu_op, cmd, arch)
+    return npu_op
+
+
+def create_npu_pool_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuPoolingOperation:
+    """Converts the command to NpuPoolingOperation"""
+    ps = cmd.ps
+    op = ps.primary_op
+    pool_op = NpuPoolingOp.AVERAGE
+    if op.type.is_maxpool_op():
+        pool_op = NpuPoolingOp.MAX
+    elif op.type.is_avgpool_op() or op.type == Op.ResizeBilinear:
+        pool_op = NpuPoolingOp.AVERAGE
+    elif op.type == Op.ReduceSum:
+        pool_op = NpuPoolingOp.REDUCE_SUM
+    else:
+        assert 0, f"Unknown pool type {op.type}"
+    npu_op = NpuPoolingOperation(pool_op)
+    set_common_op_fields(npu_op, cmd, arch)
+    # Pooling specific info
+    if op.type == Op.ResizeBilinear and "rescale" in op.attrs:
+        npu_op.rescale = op.attrs["rescale"]
+    return npu_op
+
+
+def create_npu_elementwise_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuElementWiseOperation:
+    """Converts the command to NpuElementWiseOperation"""
+    ps = cmd.ps
+    op = ps.primary_op
+    assert op.type in elementwise_op_map, f"Unknown elementwise type {op.type}"
+    elemwise_op = elementwise_op_map[op.type]
+    npu_op = NpuElementWiseOperation(elemwise_op)
+    if elemwise_op not in unary_elementwise_ops:
+        if not ifm_ifm2_correct_order(cmd.ifm_tensor.shape, cmd.ifm2_tensor.shape):
+            # The scalar/broadcast feature map has to be the IFM2 tensor, so swap the IFMs
+            cmd.ifm_tensor, cmd.ifm2_tensor = cmd.ifm2_tensor, cmd.ifm_tensor
+            cmd.ifm_box, cmd.ifm2_box = cmd.ifm2_box, cmd.ifm_box
+            npu_op.reversed_operands = True
+        npu_op.ifm2 = create_feature_map(cmd.ifm2_tensor, cmd.ifm2_box, arch)
+        npu_op.ifm2.quantization = get_ifm_or_ifm2_quantization(ps, cmd.ifm2_tensor)
+        if cmd.ifm2_tensor.shape == []:
+            # scalar
+            assert cmd.ifm2_tensor.quant_values.size == 1
+            npu_op.ifm2_scalar = cmd.ifm2_tensor.values.item(0)
+            npu_op.ifm2.shape = NpuShape3D(height=0, width=0, depth=0)
+        else:
+            box_shp = cmd.ifm2_box.get_size_shape()
+            height = box_shp[-3] if len(box_shp) >= 3 else 1
+            npu_op.ifm2.shape = NpuShape3D(height=height, width=cmd.ifm2_tensor.shape[-2], depth=box_shp[-1])
+    set_common_op_fields(npu_op, cmd, arch)
+    # Check if output scale needs to be overridden
+    output_scale = None
+    if op.type == Op.Add and "resizebilinear" in op.attrs:
+        # Force the output scale to be the same as the input scale for a
+        # resize bilinear 1x1 that has been converted to an add
+        output_scale = npu_op.ifm2.quantization.scale_f32
+    if op.type == Op.LeakyRelu:
+        output_scale = op.attrs["alpha"]
+    if op.type in (Op.Add, Op.Sub) and "rescale" in op.attrs:
+        npu_op.rescale = op.attrs.get("rescale")
+    if op.type in (Op.Add, Op.Mul, Op.Sub):
+        if op.activation is not None and op.activation.op_type in (Op.Sigmoid, Op.Tanh):
+            output_scale = 1 / 0x3000
+    if output_scale is not None:
+        npu_op.ofm.quantization = NpuQuantization(scale_f32=output_scale, zero_point=npu_op.ofm.quantization.zero_point)
+    return npu_op
+
+
+def create_dma_op(cmd: DMA, arch: ArchitectureFeatures) -> NpuDmaOperation:
+    """Converts the command to NpuDmaOperation"""
+    src_region = get_region(cmd.in_tensor, arch)
+    if cmd.out_tensor.purpose == TensorPurpose.LUT:
+        dest_region = BasePointerIndex.Mem2Mem
+    else:
+        dest_region = get_region(cmd.out_tensor, arch)
+
+    start_coord = cmd.box.start_coord
+    src_addr = cmd.in_tensor.address_for_coordinate(start_coord)
+    dest_addr = cmd.out_tensor.address_for_coordinate(start_coord)
+
+    if cmd.in_tensor.compressed_values is not None:
+        if cmd.out_tensor.purpose == TensorPurpose.FSBias:
+            sz = cmd.in_tensor.storage_size()
+        else:
+            stream_index = cmd.in_tensor.compressed_stream_index_from_coord(start_coord)
+            sz = cmd.in_tensor.size_of_compressed_stream(stream_index)
+    else:
+        sz = cmd.in_tensor.address_for_coordinate(cmd.box.end_coord, is_top_box=True) - src_addr
+    src = NpuAddressRange(src_region, int(src_addr), int(sz))
+    dest = NpuAddressRange(dest_region, int(dest_addr), int(sz))
+    return NpuDmaOperation(src, dest)
+
+
+def convert_command_to_npu_op(cmd: Command, arch: ArchitectureFeatures) -> NpuOperation:
+    """Converts the high level command to NpuOperation"""
+    if cmd.cmdtype == CommandType.DMA:
+        npu_op = create_dma_op(cmd, arch)
+    elif cmd.cmdtype == CommandType.NpuStripe:
+        npu_block_type = cmd.ps.primary_op.type.npu_block_type
+        if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct):
+            npu_op = create_npu_conv2d_op(cmd, arch)
+        elif npu_block_type == NpuBlockType.ConvolutionDepthWise:
+            npu_op = create_npu_conv_depthwise_op(cmd, arch)
+        elif npu_block_type in (NpuBlockType.Pooling, NpuBlockType.ReduceSum):
+            npu_op = create_npu_pool_op(cmd, arch)
+        elif npu_block_type == NpuBlockType.ElementWise:
+            npu_op = create_npu_elementwise_op(cmd, arch)
+        else:
+            assert 0, f"Unknown NPU block type {npu_block_type}"
+    else:
+        assert 0, f"Unknown command type {cmd.cmdtype}"
+    # Add a link to the high level command for debugging purposes
+    npu_op.cmd = cmd
+    return npu_op