MLBEDSW-5383 npu_find_block_configs() differs between v2.1.1 and v3.1.0

* 1D optimised block_config was incorrectly being set to the ArchitectureBlockConfig in try_block_config()
* Write external API test for the reduced block height case (on H256)

Signed-off-by: James Ward <james.ward@arm.com>
Change-Id: I9ced7eb31b23730e4423aabbaf769bc72fac8fc9
diff --git a/ethosu/vela/architecture_allocator.py b/ethosu/vela/architecture_allocator.py
index 30e1c87..65a684c 100644
--- a/ethosu/vela/architecture_allocator.py
+++ b/ethosu/vela/architecture_allocator.py
@@ -47,7 +47,7 @@
     def __init__(self):
         self.layout = SHRAMLayout()
         self.ifm_block = Shape4D()
-        self.ofm_block = Shape4D()
+        self.ofm_block = Shape4D()  # non-1D-optimised block
         self.acc_type = SHRAMElements.Acc32
         self.is_partkernel = False
         self.bank_size = 0
@@ -414,10 +414,10 @@
         ifm_block = ifm_block.with_depth(ifm_blockdepth)
 
     # 256/512 Conv1D optimisation (ratio of IFM:Accumulators changes)
-    block_config = fit_block_for_ofm(arch, ofm_shape, kernel, block_config)
+    block_config_opt = fit_block_for_ofm(arch, ofm_shape, kernel, block_config)
 
     layout = _try_block_config(
-        arch.shram, ew_usage, block_config, ifm_block, ifm_bits, ifm_granule, acc_bits, acc_granule, lut_banks
+        arch.shram, ew_usage, block_config_opt, ifm_block, ifm_bits, ifm_granule, acc_bits, acc_granule, lut_banks
     )
     if layout is None:
         return None
diff --git a/ethosu/vela/test/extapi/test_extapi_find_block_configs.py b/ethosu/vela/test/extapi/test_extapi_find_block_configs.py
index 07cb9cb..a768f18 100644
--- a/ethosu/vela/test/extapi/test_extapi_find_block_configs.py
+++ b/ethosu/vela/test/extapi/test_extapi_find_block_configs.py
@@ -61,3 +61,39 @@
     check_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, op.block_config.height - 1)
     check_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_WIDTH_M1, op.block_config.width - 1)
     check_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_DEPTH_M1, op.block_config.depth - 1)
+
+
+def test_conv2d_block_height_1():
+    """Test npu_find_block_configs returns valid config in the special case of reduced ublock height (H256)."""
+    # Create a Conv2D operation
+    op = NpuConv2DOperation()
+    op.ifm = create_feature_map(
+        NpuShape3D(height=1, width=1, depth=1024),
+        1,
+        512,
+        quant=NpuQuantization(scale_f32=0.023528477177023888, zero_point=0),
+    )
+    op.ofm = create_feature_map(
+        NpuShape3D(height=1, width=1, depth=1001),
+        1,
+        0x14E40,
+        quant=NpuQuantization(scale_f32=0.16609922051429749, zero_point=66),
+    )
+    op.kernel = NpuKernel(1, 1, 1, 1, 1, 1)
+    op.padding = NpuPadding(top=0, left=0, right=0, bottom=0)
+    op.block_traversal = NpuBlockTraversal.PART_KERNEL_FIRST
+
+    # Find valid block configs
+    accelerator = NpuAccelerator.Ethos_U55_256
+    block_configs = npu_find_block_configs(op, accelerator)
+    # Select the last one
+    op.block_config = block_configs[-1]
+    # Note: the weights should be encoded with op.block_config.depth (not shown here)
+    op.weights = [NpuAddressRange(region=0, address=0, length=7696)]
+
+    # Check that generating register commands succeeds
+    cmds = npu_generate_register_command_stream([op], accelerator)
+    # Check that the selected block config was used
+    check_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, op.block_config.height - 1)
+    check_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_WIDTH_M1, op.block_config.width - 1)
+    check_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_DEPTH_M1, op.block_config.depth - 1)