ethosu/vela/register_command_stream_generator.py - ml/ethos-u/ethos-u-vela - Gitiles

 # Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the License); you may
 # not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an AS IS BASIS, WITHOUT
 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # Description:
 # Register level (low-level) command stream generation for Ethos-U55. Takes a high-level command stream and generates
 # all the register settings. Calculates dependencies between commands and inserts wait operations. And generates a bit
 # stream suitable for interpretation by the Ethos-U55 processor.
 from collections import defaultdict
 from collections import namedtuple
 from enum import Enum
 from enum import IntEnum

 import numpy as np

 from . import scaling
 from .architecture_features import ArchitectureFeatures
 from .architecture_features import Block
 from .architecture_features import Rect
 from .architecture_features import SharedBufferArea
 from .architecture_features import SHRAMElements
 from .data_type import BaseType
 from .data_type import DataType
 from .debug_database import DebugDatabase
 from .ethos_u55_regs.ethos_u55_regs import acc_format
 from .ethos_u55_regs.ethos_u55_regs import activation
 from .ethos_u55_regs.ethos_u55_regs import cmd0
 from .ethos_u55_regs.ethos_u55_regs import cmd1
 from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
 from .ethos_u55_regs.ethos_u55_regs import ifm_precision
 from .ethos_u55_regs.ethos_u55_regs import pooling_mode
 from .ethos_u55_regs.ethos_u55_regs import resampling_mode
 from .ethos_u55_regs.ethos_u55_regs import rounding
 from .high_level_command_stream import CommandType
 from .numeric_util import clamp_sigmoid
 from .numeric_util import clamp_tanh
 from .numeric_util import full_shape
 from .numeric_util import quantise_float32
 from .numeric_util import round_away_zero
 from .numeric_util import round_up_to_int
 from .operation import NpuBlockType
 from .operation import Op
 from .tensor import MemType
 from .tensor import TensorBlockTraversal
 from .tensor import TensorFormat
 from .tensor import TensorPurpose


 class RegisterMachine:
     def __init__(self):
         self.n_banks = 1
         self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
         self.bank_idx = 0

     def set_register(self, reg, value):
         is_changed = self.registers[self.bank_idx][reg] != value
         self.registers[self.bank_idx][reg] = value
         # is_changed = True # force command
         return is_changed

     def switch_bank(self):
         self.bank_idx = (self.bank_idx + 1) % self.n_banks


 class CmdMode(IntEnum):
     NoPayload = 0x0000
     Payload32 = 0x4000
     Mask = 0xC000
     CmdOpMask = 0x03FF


 class BasePointerIndex(IntEnum):
     WeightTensor = 0  # base address index for the Weight tensor
     ScratchTensor = 1  # base address index for the Scratch_tensor in the TensorArena
     ScratchFastTensor = 2  # base address for the Scratch_fast_tensor
     Mem2Mem = (1 << 8) | (3 << 0)  # base address slot for memory 2 memory transfer


 # TODO: Replace with definitions from ethos_u55_regs
 class IFM2Broadcast(IntEnum):
     BroadcastHdim = 1 << 0
     BroadcastWdim = 1 << 1
     BroadcastCdim = 1 << 2
     ReverseOperandOrder = 1 << 6
     UseIFM2Scalar = 1 << 7


 class CommandStreamEmitter:
     WORD_SIZE = 4

     def __init__(self):
         self.cmd_stream = []
         self.reg_machine = [RegisterMachine(), RegisterMachine()]
         self.last_absolute_wait = defaultdict(int)
         self.offset = 0

     def get_reg_machine(self, cmd):
         if "DMA" in cmd.name:
             return self.reg_machine[1]
         else:
             return self.reg_machine[0]

     def size_in_bytes(self):
         sz = 0
         for cmd in self.cmd_stream:
             sz += len(cmd) * CommandStreamEmitter.WORD_SIZE
         return sz

     def to_list(self):
         return [elem for cmd in self.cmd_stream for elem in cmd]

     def print_cmds(self):
         print("Code:    Command:                       Param: Payload:")
         for words_for_one_command in self.cmd_stream:
             code = words_for_one_command[0] & 0x0000FFFF  # lower 16 bits
             param = words_for_one_command[0] >> 16  # higher 16 bits

             payload_mode = CmdMode(code & CmdMode.Mask)

             # code and command
             s = "  0x%04x " % code
             if payload_mode == CmdMode.NoPayload:
                 s += str(cmd0(code & CmdMode.CmdOpMask))
             else:
                 s += str(cmd1(code & CmdMode.CmdOpMask))

             s = s.ljust(40)
             s += "%5d" % param

             # payload
             if payload_mode == CmdMode.Payload32:
                 s += "   0x%08x (%d)" % (words_for_one_command[1], words_for_one_command[1])
             else:
                 s += "   -"

             print(s)

     def cmd0_with_param(self, cmd, param):
         if isinstance(param, Enum):
             param = int(param.value)
         else:
             param = int(param)
         param = param & 0xFFFF
         command = cmd.value | (param << 16)
         if not self.get_reg_machine(cmd).set_register(cmd, (command, param)):
             return

         # This is not a redundant command, actually write it
         self.cmd_stream.append((command,))
         self.offset += CommandStreamEmitter.WORD_SIZE

     def cmd1_with_offset(self, cmd, offset, param=0x0):
         offset = int(offset) & 0xFFFFFFFFF
         command = cmd.value | CmdMode.Payload32.value | (param << 16)

         if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
             return

         # This is not a redundant command, actually write it
         self.cmd_stream.append((command, offset))
         self.offset += CommandStreamEmitter.WORD_SIZE * 2

     def cmd_wait(self, cmd, channel, outstanding_count):
         param = (16 * channel) + outstanding_count
         command = ((param & 0xFFFF) << 16) | cmd.value
         self.cmd_stream.append((command,))
         self.offset += CommandStreamEmitter.WORD_SIZE

     def cmd_do_operation(self, cmd, param=0):
         param = int(param)
         command = ((param & 0xFFFF) << 16) | cmd.value

         self.cmd_stream.append((command,))
         self.offset += CommandStreamEmitter.WORD_SIZE
         self.get_reg_machine(cmd).switch_bank()


 Watermark = namedtuple("Watermark", ["npu", "dma"])


 def get_cmd_wait_dependency(arch, cmd_stream, memory_accesses, cmd_index, watermark: Watermark):
     cmd = cmd_stream[cmd_index]
     cmd_access = memory_accesses[cmd]
     index = cmd_index - 1

     # NPU dependency tracking
     npu_outstanding = -1
     npu_ops = 0
     npu_index = watermark.npu

     # DMA dependency tracking
     dma_outstanding = -1
     dma_ops = 0
     dma_index = watermark.dma

     # Seek back in the command stream looking for NPU or DMA dependencies
     # but only as far as the first dependency or the watermarks (dependencies
     # before this point have been satisfied already).
     # The watermark moves to after the latest element we must wait for, not
     # the command that issues the wait.
     # NPU->NPU dependency is handled via blockdep.
     while (index >= npu_index) or (index >= dma_index):
         prev_cmd = cmd_stream[index]
         prev_access = memory_accesses[prev_cmd]

         # Check DMA consuming NPU output
         if prev_cmd.cmdtype == CommandType.NpuStripe:
             if index >= npu_index:
                 if (cmd.cmdtype == CommandType.DMA) and (npu_outstanding == -1) and prev_access.conflicts(cmd_access):
                     npu_outstanding = npu_ops
                 npu_ops = npu_ops + 1  # Count NPU ops in the pipeline
                 if npu_ops >= arch.max_outstanding_kernels:
                     npu_index = max(index + 1, npu_index)

         # Check NPU consuming DMA output
         elif prev_cmd.cmdtype == CommandType.DMA:
             if index >= dma_index:
                 if cmd.cmdtype == CommandType.NpuStripe:
                     if (dma_outstanding == -1) and prev_access.conflicts(cmd_access):
                         dma_outstanding = dma_ops
                 dma_ops = dma_ops + 1  # Count DMA ops in the pipeline
                 if dma_ops >= arch.max_outstanding_dma:
                     dma_index = max(index + 1, dma_index)

         index = index - 1

     # Update DMA watermark if we didn't see any and the NPU pipeline is full
     if (dma_ops == 0) and (npu_ops >= arch.max_outstanding_kernels):
         dma_index = cmd_index

     # Bring the search watermark forwards as we complete for those dependencies
     watermark = Watermark(npu_index, dma_index)
     outstanding = Watermark(npu_outstanding, dma_outstanding)

     return watermark, outstanding


 def has_prev_op_dependency(prev_cmd, cmd):
     if prev_cmd is None:
         return False
     if (prev_cmd.cmdtype == cmd.cmdtype == CommandType.NpuStripe) and (prev_cmd.ps != cmd.ps):
         if prev_cmd.ofm_tensor.equivalent(cmd.ifm_tensor):
             return True
         elif cmd.ifm2_tensor is not None:
             return prev_cmd.ofm_tensor.equivalent(cmd.ifm2_tensor)
     return False


 def get_op_ofm_rect(cmd):
     start = full_shape(4, cmd.ofm_box.start_coord, 0)
     end = full_shape(4, cmd.ofm_box.end_coord, 1)
     return Rect(start[-2], start[-3], start[-1], end[-2] - 1, end[-3] - 1, end[-1] - 1)


 def get_op_ifm_rect(cmd):
     start = full_shape(4, cmd.ifm_box.start_coord, 0)
     end = full_shape(4, cmd.ifm_box.end_coord, 1)
     return Rect(start[-2], start[-3], start[-1], end[-2] - 1, end[-3] - 1, end[-1] - 1)


 def get_op_ifmofm_block_depth(arch, cmd):
     # Note: NOT equivalent to the normal ifm block depth calculation since
     # it takes into account 'depthless' block operations by returning full
     # depth
     if cmd.ps.npu_block_type in (
         NpuBlockType.ConvolutionDepthWise,
         NpuBlockType.Pooling,
         NpuBlockType.ElementWise,
         NpuBlockType.ReduceSum,
     ):
         return cmd.ofm_box.get_size_shape()[-1]

     return arch.calc_ifm_block_depth(cmd.ifm_box.get_size_shape()[-1], cmd.ifm_tensor.dtype.bits)


 def get_op_padding_lt(cmd):
     if cmd.ps.npu_block_type not in (
         NpuBlockType.ConvolutionDepthWise,
         NpuBlockType.Pooling,
         NpuBlockType.ConvolutionMxN,
         NpuBlockType.ReduceSum,
     ):
         return (0, 0)

     explicit_padding = list(cmd.ps.primary_op.attrs["explicit_padding"])  # (top, left, bottom, right)

     # Check if this is for horizontal ifm streaming
     if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe):
         explicit_padding[0] = cmd.pad_top
         explicit_padding[2] = cmd.pad_bottom

     return (explicit_padding[1], explicit_padding[0])


 def ifm_ifm2_correct_order(ifm_shape, ifm2_shape):
     if ifm_shape == []:
         # Scalar needs to be in IFM2
         return False
     elif ifm2_shape == []:
         return True

     for ifm, ifm2 in zip(ifm_shape, ifm2_shape):
         if ifm != ifm2 and ifm == 1:
             # Broadcasted FM needs to be in IFM2
             return False

     return True


 def generate_register_command_stream(nng, sg, arch, verbose=False):
     emit = CommandStreamEmitter()

     if arch.feature_map_storage_mem_area == arch.fast_storage_mem_area:
         base_ptr_idx_map = {
             MemType.Permanent_NPU: BasePointerIndex.WeightTensor,
             MemType.Permanent_CPU: BasePointerIndex.WeightTensor,
             MemType.Scratch: BasePointerIndex.ScratchTensor,
             MemType.Scratch_fast: BasePointerIndex.ScratchTensor,
         }
     else:
         base_ptr_idx_map = {
             MemType.Permanent_NPU: BasePointerIndex.WeightTensor,
             MemType.Permanent_CPU: BasePointerIndex.WeightTensor,
             MemType.Scratch: BasePointerIndex.ScratchTensor,
             MemType.Scratch_fast: BasePointerIndex.ScratchFastTensor,
         }

     # Maps an AccumulatorType enum to the corresponding acc_format value
     acc_format_map = {
         SHRAMElements.Acc16: acc_format.FP_S5_10.value,
         SHRAMElements.Acc32: acc_format.INT_32BIT.value,
         SHRAMElements.Acc40: acc_format.INT_40BIT.value,
     }

     # Maps an elementwise op type to an elementwise_mode enum value used by NPU_OP_ELEMENTWISE
     elementwise_mode_map = {
         Op.Mul: elementwise_mode.MUL.value,
         Op.Add: elementwise_mode.ADD.value,
         Op.Sub: elementwise_mode.SUB.value,
         Op.Minimum: elementwise_mode.MIN.value,
         Op.Maximum: elementwise_mode.MAX.value,
         Op.LeakyRelu: elementwise_mode.LRELU.value,
         Op.Abs: elementwise_mode.ABS.value,
         Op.CLZ: elementwise_mode.CLZ.value,
         Op.SHR: elementwise_mode.SHR.value,
         Op.SHL: elementwise_mode.SHL.value,
     }

     cmd_stream = []
     memory_accesses = {}
     for cmd in sg.high_level_command_stream:
         if cmd.cmdtype == CommandType.NpuStripe and cmd.ps.npu_block_type == NpuBlockType.Default:
             print("Warning: Skipping register command stream generation for", cmd.ps)
         else:
             cmd_stream.append(cmd)
             memory_accesses[cmd] = cmd.get_memory_accesses()

     def emit_cmd_waits(cmd_waits):
         if cmd_waits.npu >= 0:
             emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, 0, cmd_waits.npu)

         if cmd_waits.dma >= 0:
             emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, 0, cmd_waits.dma)

     # Initialise operator dependency state
     prev_ifm_rect = cur_ifm_rect = None
     prev_ifm_block_depth = cur_ifm_block_depth = None
     prev_ofm_rect = cur_ofm_rect = None
     prev_ofm_block = cur_ofm_block = None
     prev_kernel = cur_kernel = None
     prev_cmd = None

     if arch.is_yoda_system:
         emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)

     dep_watermark = Watermark(0, 0)

     stream_id = DebugDatabase.add_stream(sg)
     DebugDatabase.set_stream_offset(sg, 0)  # Default to zero, can only set during file writing

     for cmd_index, cmd in enumerate(cmd_stream):
         dep_watermark, cmd_waits = get_cmd_wait_dependency(arch, cmd_stream, memory_accesses, cmd_index, dep_watermark)

         if cmd.cmdtype == CommandType.DMA:
             start_coord = cmd.box.start_coord

             src_addr = cmd.in_tensor.address_for_coordinate(start_coord)
             dst_addr = cmd.out_tensor.address_for_coordinate(start_coord)

             if cmd.in_tensor.compressed_values is not None:
                 if cmd.out_tensor.purpose == TensorPurpose.FSBias:
                     sz = cmd.in_tensor.storage_size()
                 else:
                     stream_index = cmd.in_tensor.compressed_stream_index_from_coord(start_coord)
                     sz = cmd.in_tensor.size_of_compressed_stream(stream_index)
             else:
                 sz = cmd.in_tensor.address_for_coordinate(cmd.box.end_coord, is_top_box=True) - src_addr

             emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, base_ptr_idx_map[cmd.in_tensor.mem_type])
             emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_SRC, src_addr)
             if cmd.out_tensor.purpose == TensorPurpose.LUT:
                 emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, BasePointerIndex.Mem2Mem)
             else:
                 emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, base_ptr_idx_map[cmd.out_tensor.mem_type])

             emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_DST, dst_addr)
             emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_LEN, sz)
             dma_channel = 0
             mode = 0  # From external to external

             emit_cmd_waits(cmd_waits)
             emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, dma_channel * 16 + mode)

         elif cmd.cmdtype == CommandType.NpuStripe:

             ps = cmd.ps
             primary_op = ps.primary_op
             npu_block_type = ps.npu_block_type
             # Specifies if global scale from the NPU_SET_OFM_SCALE register should be used instead of per-channel scale
             use_global_scale = False
             # Specifies type of rounding to be used.
             rounding_mode = (
                 rounding.NATURAL if primary_op.attrs.get("rounding_mode", "") == b"NATURAL" else rounding.TFL
             )
             if primary_op.type == Op.ResizeBilinear:
                 rounding_mode = rounding.TRUNCATE
             fmf = primary_op.memory_function
             faf = primary_op.activation
             fused_quantize = any(op.type == Op.Quantize for op in ps.ops)
             # Force output scale, used in operations with fused LUT
             # Note: with current LUT support, forced_ofm_quantization is always equal to cmd.ofm_tensor.quantization
             # except when primary_op is AddAct + 0 (no-op) + LUT
             forced_ofm_quantization = primary_op.forced_output_quantization
             ofm_quant = cmd.ofm_tensor.quantization
             if forced_ofm_quantization is not None:
                 ofm_quant = forced_ofm_quantization

             # Specifies which operand to apply scaling to in bitexact elementwise ADD/SUB
             op_to_scale = 0

             # Update state history
             prev_ifm_rect = cur_ifm_rect
             prev_ifm_block_depth = cur_ifm_block_depth
             prev_ofm_rect = cur_ofm_rect
             prev_ofm_block = cur_ofm_block
             prev_kernel = cur_kernel
             cur_kernel = ps.primary_op.kernel if ps.primary_op else None

             block_config = ps.block_config
             emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config[0] - 1)
             emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config[1] - 1)
             emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config[3] - 1)

             shared_buffer = ps.shared_buffer

             if npu_block_type == NpuBlockType.ElementWise:
                 ifm2_broadcast = 0

                 if cmd.ifm2_tensor and not ifm_ifm2_correct_order(cmd.ifm_tensor.shape, cmd.ifm2_tensor.shape):
                     # The scalar has to be the ifm2 tensor so switch the ifms
                     cmd.ifm_tensor, cmd.ifm2_tensor = cmd.ifm2_tensor, cmd.ifm_tensor
                     cmd.ifm_box, cmd.ifm2_box = cmd.ifm2_box, cmd.ifm_box

                     # Set ReverseOperandOrder bit to IFM2_BROADCAST
                     ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder

                 # Calculate scales needed for arithmetic elementwise operators
                 if primary_op.type in set((Op.Add, Op.Mul, Op.Sub,)):
                     input_scale = cmd.ifm_tensor.quantization.scale_f32 if cmd.ifm_tensor.quantization else None
                     input2_scale = cmd.ifm2_tensor.quantization.scale_f32 if cmd.ifm2_tensor.quantization else None
                     output_scale = ofm_quant.scale_f32 if ofm_quant else None
                     use_global_scale = True

                     if output_scale is not None and faf in (Op.Sigmoid, Op.Tanh):
                         output_scale = 1 / 0x3000

                     if primary_op.type == Op.Mul:
                         if None in (input_scale, input2_scale, output_scale):
                             ofm_scale = 1
                             shift = 0
                         else:
                             ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
                         emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
                     else:  # AddAct/SubAct
                         # Force output scale same as the input scale for
                         # resizebilinear 1x1 that is converted to add
                         if "resizebilinear" in primary_op.attrs:
                             output_scale = input2_scale

                         if None in (input_scale, input2_scale, output_scale):
                             opa_scale = opb_scale = ofm_scale = 1
                             opa_shift = shift = 0
                             ofm_scale, shift = primary_op.attrs.get("rescale", [1, 0])
                         elif input_scale == input2_scale:
                             opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                                 input_scale, input2_scale, output_scale
                             )
                             opa_shift = 0  # Unused for this case
                         else:
                             # Use advanced implementation only when input scales differ
                             bitdepth = cmd.ifm_tensor.dtype.bits
                             (
                                 opa_scale,
                                 opa_shift,
                                 ofm_scale,
                                 shift,
                                 op_to_scale,
                             ) = scaling.advanced_elementwise_add_sub_scale(
                                 input_scale, input2_scale, output_scale, bitdepth
                             )
                             opb_scale = 0  # Unused for this case
                             if ifm2_broadcast & IFM2Broadcast.ReverseOperandOrder:
                                 # If the operand order is reversed we also have to swap which operand is scaled
                                 if op_to_scale == scaling.OperandToScale.OPa:
                                     op_to_scale = scaling.OperandToScale.OPb
                                 else:
                                     op_to_scale = scaling.OperandToScale.OPa

                         emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
                         emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
                         emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)

                 elif primary_op.type in set((Op.LeakyRelu, Op.Abs,)):
                     output_scale = ofm_quant.scale_f32
                     use_global_scale = True

                     if primary_op.type == Op.LeakyRelu:
                         output_scale = primary_op.attrs["alpha"]

                     ofm_scale, shift = scaling.quantise_scale(output_scale)
                     emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
                 else:
                     emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0)

                 # For elementwise set the required SHRAM to be equal to the total size of available SHRAM
                 uses_lut = primary_op.activation_lut is not None
                 shram_required = arch.available_shram_banks(uses_lut)
                 emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_required)

                 # Acc buffers not needed so set AB_START to size of SHRAM
                 emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_required)

                 # Is not a unary operator
                 if cmd.ifm2_tensor is not None:
                     if cmd.ifm2_tensor.shape == []:
                         # IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST
                         ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar
                     else:
                         ifm_box_shape = cmd.ifm_box.get_size_shape()
                         ifm2_box_shape = cmd.ifm2_box.get_size_shape()

                         if len(cmd.ifm_tensor.shape) > 1 and ifm_box_shape[1] != ifm2_box_shape[1]:
                             # Broadcast in 'H' dimension
                             assert cmd.ifm2_tensor.shape[1] == 1
                             ifm2_broadcast |= IFM2Broadcast.BroadcastHdim

                         if len(cmd.ifm_tensor.shape) > 2 and ifm_box_shape[2] != ifm2_box_shape[2]:
                             # Broadcast in 'W' dimension
                             assert cmd.ifm2_tensor.shape[2] == 1
                             ifm2_broadcast |= IFM2Broadcast.BroadcastWdim

                         if len(cmd.ifm_tensor.shape) > 3 and ifm_box_shape[3] != ifm2_box_shape[3]:
                             # Broadcast in 'C' dimension
                             assert cmd.ifm2_tensor.shape[3] == 1
                             ifm2_broadcast |= IFM2Broadcast.BroadcastCdim

                         # Set IFM2_IB_START to the latter half of the IB space
                         ifm_ib_start = shared_buffer.bank_locations[SharedBufferArea.IFM]
                         emit.cmd0_with_param(
                             cmd0.NPU_SET_IFM2_IB_START,
                             (shram_required - ifm_ib_start) // shared_buffer.ifm_count + ifm_ib_start,
                         )

                     emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast)

             else:
                 emit.cmd0_with_param(
                     cmd0.NPU_SET_IFM_IB_END,
                     shared_buffer.bank_locations[SharedBufferArea.IFM]
                     + shared_buffer.banks_required[SharedBufferArea.IFM],
                 )
                 emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shared_buffer.bank_locations[SharedBufferArea.Accumulators])

             emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])

             if primary_op.type == Op.ResizeBilinear:
                 # perform nearest neighbor upscale
                 emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.NEAREST)
             elif primary_op.type == Op.Conv2DBackpropInputSwitchedBias:
                 # perform insert zero upscale
                 emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.TRANSPOSE)
             else:
                 emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.NONE)

             if npu_block_type in set(
                 (
                     NpuBlockType.ConvolutionMxN,
                     NpuBlockType.ConvolutionDepthWise,
                     NpuBlockType.Pooling,
                     NpuBlockType.ReduceSum,
                 )
             ):
                 # Set up padding
                 explicit_padding = list(primary_op.attrs["explicit_padding"])  # (top, left, bottom, right)

                 # Check if this is for horizontal ifm streaming
                 if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe):
                     explicit_padding[0] = cmd.pad_top
                     explicit_padding[2] = cmd.pad_bottom

                 # Indexing from end since a 1x1 Avgpool might have been added with non 4-dimensional input/output,
                 # because of activation function needed to be fused.
                 if cmd.ifm_box.start_coord[-2] > 0:
                     explicit_padding[1] = 0
                 if cmd.ifm_box.end_coord[-2] < cmd.ifm_tensor.shape[-2]:
                     explicit_padding[3] = 0
                 emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, explicit_padding[0])
                 emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, explicit_padding[1])
                 emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, explicit_padding[2])
                 emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, explicit_padding[3])

                 # set kernel x stride low bit
                 stride = primary_op.attrs["strides"][2] - 1 & 1
                 # set kernel y stride low bit
                 stride |= (primary_op.attrs["strides"][1] - 1 & 1) << 1
                 # set kernel x stride extension bits
                 stride |= (primary_op.attrs["strides"][2] - 1 >> 1) << 6
                 # set kernel y stride extension bits
                 stride |= (primary_op.attrs["strides"][1] - 1 >> 1) << 9

                 if npu_block_type in set((NpuBlockType.Pooling, NpuBlockType.ReduceSum)):
                     k_height, k_width = primary_op.attrs["ksize"][1:3]
                     emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, k_height - 1)
                     emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, k_width - 1)

                     valid_padding = sum(explicit_padding) == 0

                     if primary_op.type in set((Op.AvgPool, Op.ResizeBilinear, Op.ReduceSum)) and valid_padding:
                         # For valid padding vela has to output scaling values
                         if faf == Op.Sigmoid or faf == Op.Tanh:
                             rescale = 0x3000 * cmd.ifm_tensor.quantization.scale_f32
                             if cmd.ifm_tensor.dtype == DataType.int16:
                                 # Calculate scale and shift for the output scale of 1/(3*4096)
                                 shift = 0
                                 max_rescale = np.iinfo(np.int16).max / 2
                                 while rescale <= max_rescale and shift <= 30:
                                     shift += 1
                                     rescale *= 2
                                 scale = int(rescale)
                             else:
                                 rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                                 scale, shift = scaling.quantise_pooling_scale(k_height * k_width, rescale_bits)
                                 scale = int(round_away_zero(scale * rescale))
                         elif fused_quantize:
                             # Quantize op requires different scaling
                             ifm_scale_f64 = np.double(cmd.ifm_tensor.quantization.scale_f32)
                             ofm_scale_f64 = np.double(ofm_quant.scale_f32)
                             scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
                         elif primary_op.type == Op.ResizeBilinear and "rescale" in primary_op.attrs:
                             rescale = primary_op.attrs["rescale"]
                             rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                             scale, shift = scaling.quantise_pooling_scale(k_height * k_width, rescale_bits)
                             scale = int(round_away_zero(scale * rescale))
                         else:
                             # In case avg pool fused with concat or other memory operation, rescaling might be needed.
                             # k_height == k_width == 1 is allways true in this case
                             # Normally the scale is maximised, to get maximum precision, which means that
                             # if rescale != 1, scale need to consider the number of bits needed for rescaling
                             if None not in (ofm_quant.scale_f32, cmd.ifm_tensor.quantization.scale_f32,):
                                 rescale = cmd.ifm_tensor.quantization.scale_f32 / ofm_quant.scale_f32
                                 rescale_bits = 0
                                 if k_height == k_width == 1:
                                     if fmf == Op.ConcatSliceWrite:
                                         rounding_mode = rounding.NATURAL
                                     if rescale > 1:
                                         rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                                     elif rescale < 1:
                                         rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
                                 scale, shift = scaling.quantise_pooling_scale(k_height * k_width, rescale_bits)
                                 scale = int(round_away_zero(scale * rescale))
                             else:
                                 scale = 1
                                 shift = 0

                         emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
                         # Valid-padded average pool should use the global scale from
                         # NPU_SET_OFM_SCALE register, which is set above.
                         use_global_scale = True

                 else:  # Convolution
                     assert cmd.weight_tensor.block_traversal != TensorBlockTraversal.Default
                     # Reduced precision quantization and natural rounding used for int16
                     if cmd.ifm_tensor.dtype == DataType.int16:
                         rounding_mode = rounding.NATURAL
                     stride |= (cur_kernel.dilation.y - 1) << 4
                     stride |= (cur_kernel.dilation.x - 1) << 3
                     emit.cmd0_with_param(
                         cmd0.NPU_SET_KERNEL_HEIGHT_M1, cur_kernel.dilation.y * (cmd.weight_tensor.shape[0] - 1)
                     )
                     emit.cmd0_with_param(
                         cmd0.NPU_SET_KERNEL_WIDTH_M1, cur_kernel.dilation.x * (cmd.weight_tensor.shape[1] - 1)
                     )
                     if cmd.weight_tensor.block_traversal == TensorBlockTraversal.PartKernelFirst:
                         # Part-kernel-first weight ordering
                         assert npu_block_type == NpuBlockType.ConvolutionMxN
                         stride |= 1 << 2

                 emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)

             elif npu_block_type in set((NpuBlockType.VectorProduct,)):
                 # Vector product is implemented using a 1x1 convolution so need
                 # to setup the appropriate padding and kernel info
                 emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, 0)
                 emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, 0)
                 emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, 0)
                 emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, 0)

                 # kernel stride reg = 0 means stride(1,1) + depth first weight
                 # order + dilation(0,0) + kernel_split_size=8
                 emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, 0)

                 emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, 0)
                 emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, 0)

             if npu_block_type in set(
                 (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct)
             ):
                 # Emit Weight base address commands, only maps the area required for
                 # this command's weights from the larger tensor.
                 stream_index = cmd.weight_tensor.compressed_stream_index_from_coord(cmd.weight_box.start_coord)
                 weight_substream_offsets = cmd.weight_tensor.compressed_values_substream_offsets[stream_index]
                 substreams = len(weight_substream_offsets) - 1  # Offset list must terminate with full stream length

                 # Extract weight substream offsets and calculate their lengths
                 assert len(weight_substream_offsets) > 1 and (weight_substream_offsets[0] == 0)
                 weight_addr = cmd.weight_tensor.address_for_coordinate(cmd.weight_box.start_coord)

                 # Set weights sources for active and present cores
                 for core, param in enumerate(
                     [
                         (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
                         (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
                     ]
                 ):
                     if core < substreams:
                         emit.cmd1_with_offset(param[0], weight_addr + weight_substream_offsets[core])
                         emit.cmd1_with_offset(
                             param[1], weight_substream_offsets[core + 1] - weight_substream_offsets[core]
                         )
                     elif core < arch.ncores:
                         emit.cmd1_with_offset(param[0], weight_addr)
                         emit.cmd1_with_offset(param[1], 0)

                 weight_region = base_ptr_idx_map[cmd.weight_tensor.mem_type]
                 emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weight_region)

                 # Emit Scale & Bias base address commands, with length matching the amount required by
                 # the weight tensors.
                 if cmd.scale_tensor is not None:
                     scale_substream_offsets = cmd.scale_tensor.compressed_values_substream_offsets[stream_index]
                     substreams = len(scale_substream_offsets) - 1  # Offset list must terminate with full stream length

                     # Extract scale substream offsets and calculate their lengths
                     assert len(scale_substream_offsets) > 1 and (scale_substream_offsets[0] == 0)
                     scale_addr = cmd.scale_tensor.address_for_coordinate(cmd.weight_box.start_coord[-1:])

                     # Set scale sources for active and present cores
                     for core, param in enumerate(
                         [
                             (cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH),
                             (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH),
                         ]
                     ):
                         if core < substreams:
                             emit.cmd1_with_offset(param[0], scale_addr + scale_substream_offsets[core])
                             emit.cmd1_with_offset(
                                 param[1], scale_substream_offsets[core + 1] - scale_substream_offsets[core]
                             )
                         elif core < arch.ncores:
                             emit.cmd1_with_offset(param[0], scale_addr)
                             emit.cmd1_with_offset(param[1], 0)

                     # Emit base address for NPU to access scale & bias data
                     scale_region = base_ptr_idx_map[cmd.scale_tensor.mem_type]
                     emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, scale_region)

             ofm_quant_qmin = ofm_quant.quant_min if ofm_quant else np.iinfo(np.int16).min
             ofm_quant_qmax = ofm_quant.quant_max if ofm_quant else np.iinfo(np.int16).max
             ifm_min = cmd.ifm_tensor.quantization.min if cmd.ifm_tensor.quantization else np.iinfo(np.int16).min
             ifm_max = cmd.ifm_tensor.quantization.max if cmd.ifm_tensor.quantization else np.iinfo(np.int16).max

             # Emit commands for any fused activation function
             if faf is None:
                 emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
                 # Even if no activation function, values need to be set to override previous values
                 faf_min = ofm_quant_qmin
                 faf_max = ofm_quant_qmax
             elif faf == Op.Relu:
                 emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
                 faf_min = quantise_float32(0.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                 faf_max = ofm_quant_qmax
             elif faf == Op.Relu6:
                 emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
                 faf_min = quantise_float32(0.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                 faf_max = quantise_float32(6.0, ofm_quant.scale_f32, ofm_quant.zero_point)
             elif faf == Op.ReluN1To1:
                 emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
                 faf_min = quantise_float32(-1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                 faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
             elif faf == Op.Tanh:
                 emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.TANH)
                 if primary_op.type in set((Op.AvgPool, Op.ResizeBilinear)):
                     faf_min = quantise_float32(-1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                     faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                 else:
                     faf_min = quantise_float32(clamp_tanh(ifm_min), ofm_quant.scale_f32, ofm_quant.zero_point)
                     faf_max = quantise_float32(clamp_tanh(ifm_max), ofm_quant.scale_f32, ofm_quant.zero_point)
             elif faf == Op.Sigmoid:
                 emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.SIGMOID)
                 if primary_op.type in set((Op.AvgPool, Op.ResizeBilinear)):
                     faf_min = quantise_float32(0, ofm_quant.scale_f32, ofm_quant.zero_point)
                     faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                 else:
                     faf_min = quantise_float32(clamp_sigmoid(ifm_min), ofm_quant.scale_f32, ofm_quant.zero_point)
                     faf_max = quantise_float32(clamp_sigmoid(ifm_max), ofm_quant.scale_f32, ofm_quant.zero_point)
             elif faf == Op.LUT:
                 lut_index = int(activation.LUT_START.value) + primary_op.attrs.get("lut_index", -1)
                 assert activation.LUT_START.value <= lut_index <= activation.LUT_END.value, "LUT index out of range."
                 if cmd.ofm_tensor.dtype == DataType.int32:
                     lut_index |= 3 << 12  # Force I8 range
                 emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, lut_index)
                 faf_min = ofm_quant_qmin
                 faf_max = ofm_quant_qmax
             else:
                 raise Exception("Unsupported fused_activation_function = " + faf.name)

             # Activation range needs to be set based upon the quantisation range and the fused activation range
             emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, max(ofm_quant_qmin, faf_min))
             emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, min(ofm_quant_qmax, faf_max))

             out_shape = cmd.ofm_box.get_size_shape()
             if len(out_shape) >= 4:
                 emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, out_shape[-3] - 1)
             else:
                 emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, 0)
             if len(out_shape) >= 2:
                 emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, out_shape[-2] - 1)
             else:
                 emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, 0)
             emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, out_shape[-1] - 1)

             if npu_block_type in set((NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum)):
                 in_shape = cmd.ifm_box.get_size_shape()
                 emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, in_shape[-1] - 1)
             else:
                 emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, out_shape[-1] - 1)

             for tens, box, region_op, ptr_ops, stride_ops, zero_point_op in (
                 (
                     cmd.ifm_tensor,
                     cmd.ifm_box,
                     cmd0.NPU_SET_IFM_REGION,
                     (cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3),
                     (cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X),
                     cmd0.NPU_SET_IFM_ZERO_POINT,
                 ),
                 (
                     cmd.ifm2_tensor,
                     cmd.ifm2_box,
                     cmd0.NPU_SET_IFM2_REGION,
                     (
                         cmd1.NPU_SET_IFM2_BASE0,
                         cmd1.NPU_SET_IFM2_BASE1,
                         cmd1.NPU_SET_IFM2_BASE2,
                         cmd1.NPU_SET_IFM2_BASE3,
                     ),
                     (cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X),
                     cmd0.NPU_SET_IFM2_ZERO_POINT,
                 ),
                 (
                     cmd.ofm_tensor,
                     cmd.ofm_box,
                     cmd0.NPU_SET_OFM_REGION,
                     (cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3),
                     (cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X),
                     cmd0.NPU_SET_OFM_ZERO_POINT,
                 ),
             ):

                 if tens is None:
                     continue

                 need_zero_point = (
                     (faf is not None and forced_ofm_quantization is None)
                     or (fmf == Op.ConcatSliceWrite)
                     or fused_quantize
                 )
                 if (
                     (primary_op.type in set((Op.AvgPool, Op.ResizeBilinear, Op.CLZ, Op.SHL)) and not need_zero_point)
                     or (
                         tens.dtype == DataType.int32
                         and zero_point_op in (cmd0.NPU_SET_IFM_ZERO_POINT, cmd0.NPU_SET_IFM2_ZERO_POINT)
                     )
                     or tens.quantization is None
                 ):
                     # Actual integer operation, just set scale to 1 and zero point to 0
                     emit.cmd0_with_param(zero_point_op, 0)
                 else:
                     assert tens.quantization.zero_point is not None, "need an actual zero point set"
                     if cmd0.NPU_SET_OFM_ZERO_POINT == zero_point_op and forced_ofm_quantization is not None:
                         zero_point = forced_ofm_quantization.zero_point
                     elif (
                         "resizebilinear" in primary_op.attrs
                         and primary_op.type == Op.Add
                         and cmd0.NPU_SET_OFM_ZERO_POINT == zero_point_op
                     ):
                         # Force output zero point same as the input zero point
                         # for resizebilinear 1x1 that is converted to add
                         zero_point = cmd.ifm2_tensor.quantization.zero_point
                     else:
                         zero_point = tens.quantization.zero_point
                     emit.cmd0_with_param(zero_point_op, int(zero_point))

                 if tens.shape == []:
                     # Empty shape, elementwise constant
                     ifm2_scalar = tens.quant_values
                     assert ifm2_scalar.size == 1
                     emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, int(ifm2_scalar.item(0)))
                     continue

                 height_0, height_1, width_0, addresses = tens.addresses_for_rolling_buffer(
                     box.start_coord, box.end_coord
                 )
                 if npu_block_type != NpuBlockType.VectorProduct:
                     if tens == cmd.ifm_tensor:
                         emit.cmd0_with_param(cmd0.NPU_SET_IFM_HEIGHT0_M1, height_0 - 1)
                         emit.cmd0_with_param(cmd0.NPU_SET_IFM_HEIGHT1_M1, height_1 - 1)
                         emit.cmd0_with_param(cmd0.NPU_SET_IFM_WIDTH0_M1, width_0 - 1)
                     elif tens == cmd.ofm_tensor:
                         emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT0_M1, height_0 - 1)
                         emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT1_M1, height_1 - 1)
                         emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH0_M1, width_0 - 1)
                     if tens == cmd.ifm2_tensor:
                         emit.cmd0_with_param(cmd0.NPU_SET_IFM2_HEIGHT0_M1, height_0 - 1)
                         emit.cmd0_with_param(cmd0.NPU_SET_IFM2_HEIGHT1_M1, height_1 - 1)
                         emit.cmd0_with_param(cmd0.NPU_SET_IFM2_WIDTH0_M1, width_0 - 1)
                 else:
                     if len(out_shape) == 2:
                         assert out_shape[0] == 1
                         if tens == cmd.ifm_tensor:
                             emit.cmd0_with_param(cmd0.NPU_SET_IFM_WIDTH0_M1, 0)
                         elif tens == cmd.ofm_tensor:
                             emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH0_M1, 0)
                     else:
                         assert False

                 emit.cmd0_with_param(region_op, base_ptr_idx_map[tens.mem_type])

                 for idx, addr in enumerate(addresses):
                     if addr is None:
                         addresses[idx] = 0

                 emit.cmd1_with_offset(ptr_ops[0], addresses[0])
                 emit.cmd1_with_offset(ptr_ops[1], addresses[1])
                 emit.cmd1_with_offset(ptr_ops[2], addresses[2])
                 emit.cmd1_with_offset(ptr_ops[3], addresses[3])

                 strides = tens.get_strides()
                 emit.cmd1_with_offset(stride_ops[0], strides[1])  # stride between 16-byte channel blocks (C)
                 emit.cmd1_with_offset(stride_ops[2], strides[3])  # stride between horisontal values (W)
                 emit.cmd1_with_offset(stride_ops[1], strides[2])  # stride between vertical values (H)

                 if tens.format == TensorFormat.NHCWB16:
                     # Check that all BasePointer addresses are aligned to 16 bytes
                     assert (int(addresses[0]) % 16) == 0
                     assert (int(addresses[1]) % 16) == 0
                     assert (int(addresses[2]) % 16) == 0
                     assert (int(addresses[3]) % 16) == 0

             ofm_dtype = cmd.ofm_tensor.dtype
             assert ofm_dtype.type & BaseType.Int
             prec = 0
             if ofm_dtype.size_in_bits() == 8:
                 prec = 0
             elif ofm_dtype.size_in_bits() == 16:
                 prec = 2
             elif ofm_dtype.size_in_bits() == 32:
                 prec = 4
             else:
                 assert 0

             if ofm_dtype.type & BaseType.Signed:
                 prec += 1

             if use_global_scale:
                 # Set global scale bit, as opposed to using per channel scale
                 prec |= 1 << 8

             if cmd.ofm_tensor.format == TensorFormat.NHCWB16:
                 prec |= 1 << 6

             prec |= rounding_mode.value << 14

             emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)

             prec = None
             weight_bits = 8
             if cmd.weight_tensor is not None:
                 weight_bits = cmd.weight_tensor.dtype.size_in_bits()

             ifm_dtype = cmd.ifm_tensor.dtype

             assert weight_bits == 8, "Unsupported weight bit depth"
             assert (
                 ifm_dtype.size_in_bits() in {8, 16}
                 or ifm_dtype.size_in_bits() == 32
                 and npu_block_type in (NpuBlockType.ElementWise, NpuBlockType.ReduceSum)
             ), "Unsupported ifm bit depth"

             if ifm_dtype.size_in_bits() == 8:
                 if ifm_dtype.type & BaseType.Signed:
                     prec = ifm_precision.S8
                 else:
                     prec = ifm_precision.U8
             elif ifm_dtype.size_in_bits() == 16:
                 if ifm_dtype.type & BaseType.Signed:
                     prec = ifm_precision.S16
                 else:
                     prec = ifm_precision.U16
             elif ifm_dtype == DataType.int32:
                 prec = ifm_precision.S32

             ifm_prec = prec.value
             ifm2_prec = ifm_prec

             if cmd.ifm_tensor.format == TensorFormat.NHCWB16:
                 ifm_prec |= 1 << 6

             ifm_prec |= op_to_scale << 8

             emit.cmd0_with_param(cmd0.NPU_SET_IFM_PRECISION, ifm_prec)

             if cmd.ifm2_tensor is not None:
                 if cmd.ifm2_tensor.format == TensorFormat.NHCWB16:
                     ifm2_prec |= 1 << 6
                 emit.cmd0_with_param(cmd0.NPU_SET_IFM2_PRECISION, ifm2_prec)

             # Get op parameters
             cur_ifm_block_depth = get_op_ifmofm_block_depth(arch, cmd)
             cur_ofm_block = Block(ps.block_config[1], ps.block_config[0], ps.block_config[3])
             cur_ofm_rect = get_op_ofm_rect(cmd)
             cur_ifm_rect = get_op_ifm_rect(cmd)
             cur_padLT = get_op_padding_lt(cmd)
             if (prev_kernel is not None) and (cur_kernel is not None) and has_prev_op_dependency(prev_cmd, cmd):
                 if cmd.ifm_tensor.shape == prev_cmd.ofm_tensor.shape:
                     blockdep = arch.calc_block_dep(
                         prev_ifm_rect,
                         prev_ofm_rect,
                         prev_ifm_block_depth,
                         prev_ofm_block,
                         prev_kernel,
                         cur_ifm_rect,
                         cur_ofm_rect,
                         cur_ifm_block_depth,
                         cur_ofm_block,
                         cur_kernel,
                         cur_padLT,
                     )
                 else:
                     blockdep = 0
             else:
                 blockdep = ArchitectureFeatures.MAX_BLOCKDEP

             # Set between every op (dependent or not)
             blockdep = min(blockdep, arch.max_blockdep)
             emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
             prev_cmd = cmd

             emit_cmd_waits(cmd_waits)
             DebugDatabase.add_command(stream_id, emit.offset, primary_op)

             if npu_block_type == NpuBlockType.ConvolutionMxN:
                 emit.cmd_do_operation(cmd0.NPU_OP_CONV)
             elif npu_block_type == NpuBlockType.ConvolutionDepthWise:
                 emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
             elif npu_block_type == NpuBlockType.VectorProduct:
                 # Vector product is implemented using a 1x1 convolution
                 emit.cmd_do_operation(cmd0.NPU_OP_CONV)
             elif npu_block_type == NpuBlockType.Pooling:
                 param = pooling_mode.MAX.value if primary_op.type.is_maxpool_op() else pooling_mode.AVERAGE.value
                 emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=param)
             elif npu_block_type == NpuBlockType.ReduceSum:
                 emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_mode.REDUCE_SUM.value)
             elif npu_block_type == NpuBlockType.ElementWise:
                 param = elementwise_mode_map[primary_op.type]
                 emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param)
             else:
                 print("Warning: Skipping register command stream generation for", ps)

     # Fill in final part of command stream:
     emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)

     sg.register_command_stream = emit.to_list()
     if verbose:
         emit.print_cmds()
         print("number of commands", len(emit.cmd_stream))
         print("command stream length in words", len(sg.register_command_stream))