Add Vela codebase

 - Added modules ethosu.vela and ethosu.mlw_codec.
 - Added README and various configuration files.

Change-Id: I3690f8c8f5966306ecddaeb2793c30ca9c6e2eee
diff --git a/Pipfile b/Pipfile
new file mode 100644
index 0000000..300bef6
--- /dev/null
+++ b/Pipfile
@@ -0,0 +1,9 @@
+[[source]]
+name = "pypi"
+url = "https://pypi.org/simple"
+verify_ssl = true
+
+[dev-packages]
+
+[packages]
+ethos-u-vela = {editable = true,path = "."}
diff --git a/Pipfile.lock b/Pipfile.lock
new file mode 100644
index 0000000..6fa0154
--- /dev/null
+++ b/Pipfile.lock
@@ -0,0 +1,56 @@
+{
+    "_meta": {
+        "hash": {
+            "sha256": "2d930644f3f81f11dae3317cae890fe083479342c80da44161b46ac83d6972d5"
+        },
+        "pipfile-spec": 6,
+        "requires": {},
+        "sources": [
+            {
+                "name": "pypi",
+                "url": "https://pypi.org/simple",
+                "verify_ssl": true
+            }
+        ]
+    },
+    "default": {
+        "ethos-u-vela": {
+            "editable": true,
+            "path": "."
+        },
+        "flatbuffers": {
+            "hashes": [
+                "sha256:776a959c5f70b41819fa75de44ed14fd984fa1a79b378f27e6f4fff338cbdca2",
+                "sha256:f24185db54193540e3d684dc98aa7c2d89882341641548ceb36fd2589fef6c4e"
+            ],
+            "version": "==1.11.0"
+        },
+        "numpy": {
+            "hashes": [
+                "sha256:1598a6de323508cfeed6b7cd6c4efb43324f4692e20d1f76e1feec7f59013448",
+                "sha256:1b0ece94018ae21163d1f651b527156e1f03943b986188dd81bc7e066eae9d1c",
+                "sha256:2e40be731ad618cb4974d5ba60d373cdf4f1b8dcbf1dcf4d9dff5e212baf69c5",
+                "sha256:4ba59db1fcc27ea31368af524dcf874d9277f21fd2e1f7f1e2e0c75ee61419ed",
+                "sha256:59ca9c6592da581a03d42cc4e270732552243dc45e87248aa8d636d53812f6a5",
+                "sha256:5e0feb76849ca3e83dd396254e47c7dba65b3fa9ed3df67c2556293ae3e16de3",
+                "sha256:6d205249a0293e62bbb3898c4c2e1ff8a22f98375a34775a259a0523111a8f6c",
+                "sha256:6fcc5a3990e269f86d388f165a089259893851437b904f422d301cdce4ff25c8",
+                "sha256:82847f2765835c8e5308f136bc34018d09b49037ec23ecc42b246424c767056b",
+                "sha256:87902e5c03355335fc5992a74ba0247a70d937f326d852fc613b7f53516c0963",
+                "sha256:9ab21d1cb156a620d3999dd92f7d1c86824c622873841d6b080ca5495fa10fef",
+                "sha256:a1baa1dc8ecd88fb2d2a651671a84b9938461e8a8eed13e2f0a812a94084d1fa",
+                "sha256:a244f7af80dacf21054386539699ce29bcc64796ed9850c99a34b41305630286",
+                "sha256:a35af656a7ba1d3decdd4fae5322b87277de8ac98b7d9da657d9e212ece76a61",
+                "sha256:b1fe1a6f3a6f355f6c29789b5927f8bd4f134a4bd9a781099a7c4f66af8850f5",
+                "sha256:b5ad0adb51b2dee7d0ee75a69e9871e2ddfb061c73ea8bc439376298141f77f5",
+                "sha256:ba3c7a2814ec8a176bb71f91478293d633c08582119e713a0c5351c0f77698da",
+                "sha256:cd77d58fb2acf57c1d1ee2835567cd70e6f1835e32090538f17f8a3a99e5e34b",
+                "sha256:cdb3a70285e8220875e4d2bc394e49b4988bdb1298ffa4e0bd81b2f613be397c",
+                "sha256:deb529c40c3f1e38d53d5ae6cd077c21f1d49e13afc7936f7f868455e16b64a0",
+                "sha256:e7894793e6e8540dbeac77c87b489e331947813511108ae097f1715c018b8f3d"
+            ],
+            "version": "==1.18.2"
+        }
+    },
+    "develop": {}
+}
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..03ad7fe
--- /dev/null
+++ b/README.md
@@ -0,0 +1,112 @@
+# Vela
+This tool is used to compile a [TensorFlow Lite for Microcontrollers](https://www.tensorflow.org/lite/microcontrollers) neural network model into an optimised version that can run on an embedded system containing an [Ethos-U55 NPU](https://www.arm.com/products/silicon-ip-cpu/ethos/ethos-u55).
+
+The optimised model will contain TensorFlow Lite Custom operators for those parts of the model that can be accelerated by the Ethos-U55. Parts of the model that cannot be accelerated are left unchanged and will instead run on the Cortex-M series CPU using appropriate kernels (such as the [Arm](https://www.arm.com) optimised [CMSIS-NN](https://github.com/ARM-software/CMSIS_5/tree/develop/CMSIS/NN) kernels).
+
+After compilation the optimised model can only be run on an Ethos-U55 NPU embedded system.
+
+The tool will also generate performance estimates (EXPERIMENTAL) for the compiled model.
+
+## Environment
+Vela runs on the Linux operating system.
+
+## Prerequisites
+The following should be installed prior to the installation of Vela:
+ - Python >= 3.6
+ - GNU toolchain (GCC, Binutils and libraries) or alternative C compiler/linker toolchain
+
+## Installation
+Before running, the Vela package must be installed along with all its dependencies. To do this, first change to the directory that contains this README.md file. Then use the following commands:
+```
+pip3 install -U "setuptools>=40.1.0"
+pip3 install .
+```
+
+Or, if you use the `pipenv` virtual environment tool:
+```
+pipenv install .
+```
+
+## Running
+Vela is run with an input `.tflite` file passed on the command line. This file contains the neural network to be compiled. The tool then outputs an optimised version with a `_vela.tflite` file suffix, along with the performance estimate (EXPERIMENTAL) CSV files, all to the output directory.
+
+If you use the `pipenv` virtual environment tool then first start by spawning a shell in the virtual environment:
+```
+pipenv shell
+```
+After that, running Vela is the same whether or not you are in a virtual environment.
+
+Example usage:
+1) Compile the network `my_model.tflite`. The optimised version will be output to `./output/my_model_vela.tflite`.
+```
+vela my_model.tflite
+```
+2) Compile the network `/path/to/my_model.tflite` and specify the output to go in the directory `./results_dir/`.
+```
+vela --output-dir ./results_dir /path/to/my_model.tflite
+```
+3) To get a list of all available options:
+```
+vela --help
+```
+4) To specify information about the embedded system's configuration, use Vela's system configuration file. The following command selects the `MySysConfig` settings that are described in the `sys_cfg_vela.ini` system configuration file. More details can be found in the next section.
+```
+vela --config sys_cfg_vela.ini --system-config MySysConfig my_model.tflite
+```
+
+### Vela's System Configuration file
+This is used to describe various properties of the embedded system that the network will run in.
+
+Example of a Vela system configuration file.
+```
+; File: sys_cfg_vela.ini
+; The file contains two parts: a system config part and a CPU operator
+; performance part.
+
+; System config
+; Specifies properties such as the core clock speed, the size and speed of the
+; four potential memory areas, and for various types of data which memory area
+; is used to store them. The cpu property is used to link with the CPU operator
+; performance.
+; The four potential memory areas are: Sram, Dram, OnChipFlash, OffChipFlash.
+
+[SysConfig.MySysConfig]
+npu_freq=500e6
+cpu=MyCpu
+Sram_clock_scale=1
+Sram_port_width=64
+Dram_clock_scale=1
+Dram_port_width=64
+OnChipFlash_clock_scale=1
+OnChipFlash_port_width=64
+OffChipFlash_clock_scale=0.25
+OffChipFlash_port_width=32
+permanent_storage_mem_area=OffChipFlash
+feature_map_storage_mem_area=Sram
+fast_storage_mem_area=Sram
+
+; CPU operator performance
+; Specifies properties that are used by a linear model to estimate the
+; performance for any operations that will be run on the CPU (such as those not
+; supported by the NPU). Setting the intercept and slope to 0 will result in
+; the operator being excluded from the performance estimation. This is the same
+; as not specifying the operator. If an explicit cpu is specified rather than
+; using the default then the cpu name must match the cpu specified in the
+; SysConfig.<system config name> section.
+
+[CpuPerformance.MyCpuOperator]
+default.intercept=0.0
+default.slope=1.0
+
+MyCpu.intercept=0.0
+MyCpu.slope=1.0
+```
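+
+The CPU operator performance values feed a simple linear model. As a rough sketch
+(the exact measure of per-operator work that the slope multiplies is an assumption
+here, not something defined by this file):
+```
+# Hypothetical sketch of the linear CPU performance estimate
+def estimate_cpu_cycles(intercept, slope, work):
+    # 'work' is assumed to be a per-operator size measure (e.g. element count)
+    return intercept + slope * work
+
+# Using the MyCpu values from the example configuration above
+print(estimate_cpu_cycles(0.0, 1.0, 1024))  # -> 1024.0
+```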
+
+## Contribution Guidelines and Pull Requests
+Contributions are accepted under [Apache License 2.0](LICENSE.txt). Only submit contributions where you have authored all of the code.
+
+## Resources
+* [Ethos-U55](https://www.arm.com/products/silicon-ip-cpu/ethos/ethos-u55)
+
+## License
+Vela is licensed under [Apache License 2.0](LICENSE.txt).
diff --git a/ethosu/mlw_codec/makefile b/ethosu/mlw_codec/makefile
new file mode 100644
index 0000000..6eb418d
--- /dev/null
+++ b/ethosu/mlw_codec/makefile
@@ -0,0 +1,49 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Makefile to build mlw_codec
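+#
+# Typical use (sketch): run "make" for an optimised build of the standalone
+# mlw_codec executable, or "make DEBUG=1" for a debug build.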
+
+UNAME=$(shell uname -o)
+
+CFLAGS=-Wall -Wno-unused-function -Wno-unused-variable
+
+ifeq ($(DEBUG),1)
+    CFLAGS+=-g -O0 -DDEBUG
+else
+    CFLAGS+=-O3
+endif
+
+LIBSRCS=mlw_encode.c mlw_decode.c
+LIBHDRS=mlw_encode.h mlw_decode.h mlw_common.h
+
+ifeq ($(UNAME),Cygwin)
+    MLWEXE=mlw_codec.exe
+else
+    MLWEXE=mlw_codec
+endif
+
+all: mlwexe
+
+.PHONY: mlwexe
+mlwexe: $(MLWEXE)
+
+clean:
+	rm -f $(MLWEXE)
+
+$(MLWEXE): mlw_main.c $(LIBSRCS) $(LIBHDRS) makefile
+	gcc $(CFLAGS) mlw_main.c $(LIBSRCS) -o $(MLWEXE) -lm
diff --git a/ethosu/mlw_codec/mlw_codecmodule.c b/ethosu/mlw_codec/mlw_codecmodule.c
new file mode 100644
index 0000000..de945ab
--- /dev/null
+++ b/ethosu/mlw_codec/mlw_codecmodule.c
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2020 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
+#include "mlw_decode.h"
+#include "mlw_encode.h"
+
+/* C extension wrapper for mlw_encode
+ *
+ * This method is exposed directly in Python with a prototype of the
+ * form:
+ *
+ * output = mlw_codec.encode(input, verbose=0)
+ *
+ * input: [int]
+ * verbose: int
+ * output: bytearray
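+ *
+ * Example (sketch, once the extension module has been built):
+ *
+ *   import mlw_codec
+ *   stream = mlw_codec.encode([0, 0, 1, -1, 2])   # -> bytearray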
+ */
+
+static PyObject *
+method_encode (PyObject *self, PyObject *args)
+{
+  /* Object to hold the input integer list. */
+  PyObject *input_list_object;
+
+  /* The input verbosity integer; the verbose argument is optional so
+   * it defaults to 0.
+   */
+  int verbose = 0;
+
+  /* Arguments to the method are delivered as a tuple, unpack the
+   * tuple to get the individual arguments, note the second is
+   * optional.
+   */
+  if (!PyArg_ParseTuple(args, "O|i", &input_list_object, &verbose))
+    return NULL;
+
+  /* Unpack the length of the input integer list.  */
+  int input_length = PyObject_Length (input_list_object);
+  if (input_length < 0)
+    input_length = 0;
+
+  /* We need to marshal the integer list into an input buffer suitable
+   * for mlw_encode; use a temporary heap-allocated buffer for that
+   * purpose.
+   */
+  int16_t *input_buffer = (int16_t *) malloc(sizeof(int16_t) * input_length);
+  if (input_buffer == NULL)
+    return PyErr_NoMemory();
+
+  /* Unpack the input integer list into the temporary buffer.
+   */
+  for (int i = 0; i < input_length; i++)
+    {
+      PyObject *item;
+      item = PyList_GetItem(input_list_object, i);
+      if (!PyLong_Check(item))
+        input_buffer[i] = 0;
+      else
+        input_buffer[i] = PyLong_AsLong(item);
+    }
+
+  /* We don't know the output length required, we guess worst case,
+   * the mlw_encode call will do a resize (downwards) anyway.
+   */
+  uint8_t *output_buffer = malloc(input_length);
+  if (output_buffer == NULL)
+    return PyErr_NoMemory();
+
+  int output_length = mlw_encode(input_buffer, input_length, &output_buffer, verbose);
+
+  PyObject *output_byte_array = PyByteArray_FromStringAndSize ((char *) output_buffer, output_length);
+
+  /* Discard the temporary input and output buffers.  */
+  free (input_buffer);
+  free (output_buffer);
+
+  return output_byte_array;
+}
+
+/* C extension wrapper for mlw_decode
+ *
+ * This method is exposed directly in Python with a prototype of the
+ * form:
+ *
+ * output = mlw_codec.decode(input, verbose=0)
+ *
+ * input: bytearray
+ * verbose: int
+ * output: [int]
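+ *
+ * Example (sketch, continuing the encode example above; decoding should
+ * recover the original weight list):
+ *
+ *   weights = mlw_codec.decode(stream)   # -> [0, 0, 1, -1, 2]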
+ */
+
+static PyObject *
+method_decode(PyObject *self, PyObject *args)
+{
+  /* Object to hold the input bytearray. */
+  PyObject *input_bytearray_object;
+
+  /* The input verbosity integer; the verbose argument is optional so
+   * it defaults to 0.
+   */
+  int verbose = 0;
+
+  /* Arguments to the method are delivered as a tuple, unpack the
+   * tuple to get the individual arguments, note the second is
+   * optional.
+   */
+  if (!PyArg_ParseTuple(args, "Y|i", &input_bytearray_object, &verbose))
+    return NULL;
+
+  /* Unpack the input buffer and length from the bytearray object.  */
+  uint8_t *input_buffer = (uint8_t *) PyByteArray_AsString(input_bytearray_object);
+  int input_length = PyByteArray_Size(input_bytearray_object);
+
+  /* We don't know the output length required, we guess, but the guess
+   * will be too small, the mlw_decode call will do a resize (upwards)
+   * anyway.
+   */
+  int16_t *output_buffer = malloc (input_length);
+  if (output_buffer == NULL)
+    return PyErr_NoMemory();
+
+  int output_length = mlw_decode (input_buffer, input_length, &output_buffer, verbose);
+
+  /* Construct a new integer list and marshall the output buffer
+   * contents into the list.  */
+  PyObject *output_list = PyList_New(output_length);
+  for (int i = 0; i < output_length; i++)
+    PyList_SetItem (output_list, i, PyLong_FromLong (output_buffer[i]));
+
+  free (output_buffer);
+
+  return output_list;
+}
+
+/* mlw_codec method descriptors.
+ */
+
+static PyMethodDef mlw_methods[] = {
+    {"decode", method_decode, METH_VARARGS, "Python interface for decode"},
+    {"encode", method_encode, METH_VARARGS, "Python interface for encode"},
+    {NULL, NULL, 0, NULL}
+};
+
+/* mlw_codec module descriptor.
+ */
+
+static struct PyModuleDef mlw_codecmodule = {
+    PyModuleDef_HEAD_INIT,
+    "mlw_codec",
+    "Python interface for the mlw encoder",
+    -1,
+    mlw_methods
+};
+
+PyMODINIT_FUNC PyInit_mlw_codec(void) {
+    return PyModule_Create(&mlw_codecmodule);
+}
diff --git a/ethosu/mlw_codec/mlw_common.h b/ethosu/mlw_codec/mlw_common.h
new file mode 100644
index 0000000..008473a
--- /dev/null
+++ b/ethosu/mlw_codec/mlw_common.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2020 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdint.h>
+
+#ifndef __MLW_COMMON_H__
+#define __MLW_COMMON_H__
+
+#define ZDIV_DISABLE        6   // not alternating mode
+#define ZDIV_EOS            7   // indicates end of stream
+
+#define WDIV_UNCOMPRESSED   7   // indicates uncompressed weights
+
+#endif
diff --git a/ethosu/mlw_codec/mlw_decode.c b/ethosu/mlw_codec/mlw_decode.c
new file mode 100644
index 0000000..92aaea6
--- /dev/null
+++ b/ethosu/mlw_codec/mlw_decode.c
@@ -0,0 +1,300 @@
+/*
+ * Copyright (c) 2020 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <string.h>
+#include <assert.h>
+#include <math.h>
+#include <stdarg.h>
+#include <math.h>
+#include "mlw_common.h"
+#include "mlw_decode.h"
+
+
+/////////////////////////////// Read from bitstream
+
+typedef struct bitbuf {
+    uint8_t *buf;
+    int buf_size;               // in bytes
+    int pos;                    // bit pos of next bit
+    int log_symbols;
+} bitbuf_t;
+
+
+// size in bytes
+static void bitbuf_init( bitbuf_t *bb, uint8_t *buf, int size, int log_symbols) {
+    bb->buf  = buf;
+    bb->pos  = 0;
+    bb->buf_size = size;
+    bb->log_symbols = log_symbols;
+}
+
+static int bitbuf_getbit( bitbuf_t *bb) {
+    int byte_pos = bb->pos>>3;
+    int bit_pos = bb->pos&7;
+    if ( byte_pos < 0 || byte_pos >= bb->buf_size ) {
+        printf("bitbuf_getbit: underrun, bit_pos %3d byte_pos %3d buf_size %3d\n", bit_pos, byte_pos, bb->buf_size);
+        exit(1);
+    }
+    int bit = bb->buf[ byte_pos ] & (1<<bit_pos) ? 1 : 0;
+    bb->pos++;
+    return bit;
+}
+
+static int bitbuf_get( bitbuf_t *bb, const char *name, int len) {
+    int i, data=0, save_pos=bb->pos;
+    if (len>0) {
+        for(i=0; i<len; i++) {
+            data |= bitbuf_getbit(bb)<<i;
+        }
+        if (bb->log_symbols)
+            printf("bitbuf: pos %3d %7s len %d data %x\n", save_pos, name, len, data);
+    }
+    return data;
+}
+
+// Decode the given weight stream
+//      inbuf       compressed bitstream
+//      inbuf_size  size of compressed bitstream in bytes
+//      outbuf      uncompressed 9bit signed weights, buffer malloced
+//      verbose     if non-zero, printf log
+// Return value is the number of uncompressed weights
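+//
+// Example (sketch):
+//      int16_t *weights;
+//      int n = mlw_decode( stream, stream_size, &weights, 0 );
+//      // ... use weights[0..n-1], then free(weights) ...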
+int mlw_decode( uint8_t *inbuf, int inbuf_size, int16_t **outbuf, int verbose) {
+    int nvalues;
+    int w_grc_div;
+    int w_grc_trunc;
+    int w_uncompressed;
+    int z_grc_div, z_prev_grc_div=0;
+    int new_palette;
+    int palsize=0, palbits=0;
+    int direct_offset=0;
+    int16_t palette[512];
+    int first=1;
+    int use_zero_run, i, j;
+    int outbuf_size=0;
+    int nchunks=0;
+
+    *outbuf=0;
+
+    bitbuf_t bitbuf_s, *bb=&bitbuf_s;
+    bitbuf_init( bb, inbuf, inbuf_size, (verbose&2)?1:0 );
+
+    // Loop over all slices
+    while(1) {
+        // Decode slice header
+        z_grc_div = bitbuf_get( bb, "ZDIV", 3 );
+        while(z_grc_div==ZDIV_EOS) {                    // TODO: change to ZDIV_PAD
+            // End of stream
+            // Byte align
+            bitbuf_get( bb, "BYTEALIGN", (8-(bb->pos&7))&7 );
+            first=1;
+            if ( (bb->pos/8) == inbuf_size) {
+                // Quit if we actually reached end of input stream
+                break;
+            }
+            z_grc_div = bitbuf_get( bb, "ZDIV", 3 );
+        }
+        if ( (bb->pos/8) == inbuf_size) {
+            break;  // reached end of input stream
+        }
+        assert(z_grc_div<4 || z_grc_div==ZDIV_DISABLE);
+        use_zero_run = z_grc_div!=ZDIV_DISABLE;    // alternating grc
+        nvalues = bitbuf_get( bb, "SLICELEN", 15 )+1;
+        w_grc_div = bitbuf_get( bb, "WDIV", 3 );
+        w_grc_trunc = bitbuf_get( bb, "WTRUNC", 1 );
+        new_palette = bitbuf_get( bb, "NEWPAL", 1 );
+        if (first) {
+            // the first slice must have a palette/direct mode setup
+            assert(new_palette);
+            first=0;
+        }
+        if (!new_palette) {
+            // At the moment it is not supported to change between alternating
+            // and non-alternating without redefining the palette (this is because
+            // the zero is not included in the palette in case of alternating)
+            int prev_use_zero_run = z_prev_grc_div!=ZDIV_DISABLE;
+            (void)(prev_use_zero_run);
+            assert( use_zero_run == prev_use_zero_run);
+        }
+        z_prev_grc_div = z_grc_div;
+        if (new_palette) {
+            direct_offset = bitbuf_get( bb, "DIROFS", 5 );
+            palsize = bitbuf_get( bb, "PALSIZE", 5 );
+            if (palsize>0)
+                palsize++;
+            palbits = bitbuf_get( bb, "PALBITS", 3 )+2;
+            for(i=0; i<palsize; i++) {
+                palette[i] = bitbuf_get( bb, "PALETTE", palbits );
+            }
+        }
+
+        if (w_grc_div==WDIV_UNCOMPRESSED) {
+            // Uncompressed mode
+            w_uncompressed = 1;
+            int uncompressed_bits;
+            if (palsize>0) {
+                // Uncompressed bits is given by palette size.
+                uncompressed_bits=0;
+                while( (1<<uncompressed_bits) < palsize )
+                    uncompressed_bits++;
+            } else {
+                // No palette. PALBITS is used to specify uncompressed bits.
+                uncompressed_bits=palbits;
+            }
+            // In uncompressed mode there's only a remainder part (no unary)
+            // This is achieved by setting w_grc_div to index bit width
+            w_grc_div = uncompressed_bits;
+        } else {
+            w_uncompressed = 0;
+            assert(w_grc_div<6);
+        }
+
+        // Decode the slice
+        int z_nvalues = nvalues + (new_palette?1:0);
+        int *w_value = malloc( nvalues*sizeof(int) );
+        int *z_value = malloc( z_nvalues*sizeof(int) );
+        int w_pos=0, z_pos=0;
+        int w_prev_pos=0, z_prev_pos=0;
+        int w_unary0=0, w_unary1=0, w_unary1_len=0, w_q[12]={0}, w_carry=0;
+        int z_unary=0, z_q[12]={0}, z_carry=0;
+        int w_nsymbols=0;
+        int w_prev_enable=0, w_prev_nsymbols=0, w_prev_q[12]={0};
+        int z_nsymbols=0;
+        int z_prev_enable=0, z_prev_nsymbols=0, z_prev_q[12]={0};
+        int total_zcnt=0;
+        int z_unary_len = z_grc_div<3 ? 12 : 8;
+
+        // Loop over all chunks in the slice
+        do {
+            // Flow control to possibly throttle either the weights or zero-runs
+            int balance = use_zero_run ? w_pos - z_pos : 0;
+            int w_enable = (balance<8 || !use_zero_run) && w_pos<nvalues;
+            int z_enable = balance>=0 && use_zero_run && z_pos<z_nvalues;
+            if (w_enable) {
+                if (!w_uncompressed)
+                    w_unary0 = bitbuf_get( bb, "WUNARY0", 12 );
+                else
+                    w_unary0 = 0;
+            }
+            if (z_enable) {
+                z_unary = bitbuf_get( bb, "ZUNARY", z_unary_len );
+                z_nsymbols=0;
+                int cnt = z_carry;
+                for(i=0; i<z_unary_len; i++) {
+                    if (z_unary & (1<<i)) {
+                        cnt++;
+                    } else {
+                        z_q[z_nsymbols++] = cnt;
+                        cnt=0;
+                    }
+                }
+                z_carry = cnt;
+                z_pos += z_nsymbols;
+            }
+            if (w_enable) {
+                w_unary1_len=0;
+                int max_symbols = w_uncompressed && w_grc_div>5 ? 8 : 12;
+                for(i=0; i<max_symbols; i++) {
+                    if (w_unary0&(1<<i))
+                        w_unary1_len++;
+                }
+                w_unary1 = bitbuf_get( bb, "WUNARY1", w_unary1_len );
+                w_nsymbols=0;
+                int cnt = w_carry;
+                for(i=0; i<max_symbols; i++) {
+                    int code=0;
+                    if (w_unary0 & (1<<i)) {
+                        code++;
+                        if (w_unary1&1) {
+                            code++;
+                        }
+                        w_unary1 = w_unary1>>1;
+                    }
+                    cnt += code;
+                    if (code<2 || w_grc_trunc) {
+                        w_q[w_nsymbols++] = cnt;
+                        cnt=0;
+                    }
+                }
+                w_carry = cnt;
+                w_pos += w_nsymbols;
+            }
+            if (w_prev_enable) {
+                for(i=0; i<w_prev_nsymbols && w_prev_pos<nvalues; i++, w_prev_pos++) {
+                    int remain = bitbuf_get( bb, "WREMAIN", w_grc_div );
+                    w_value[w_prev_pos] = (w_prev_q[i]<<w_grc_div) + remain;
+                }
+            }
+            if (z_prev_enable) {
+                for(i=0; i<z_prev_nsymbols && z_prev_pos<z_nvalues; i++, z_prev_pos++) {
+                    int remain = bitbuf_get( bb, "ZREMAIN", z_grc_div );
+                    z_value[z_prev_pos] = (z_prev_q[i]<<z_grc_div) + remain;
+                    total_zcnt += z_value[z_prev_pos];
+                }
+            }
+            w_prev_enable = w_enable;
+            w_prev_nsymbols = w_nsymbols;
+            memcpy( w_prev_q, w_q, sizeof(w_prev_q));
+            z_prev_enable = z_enable;
+            z_prev_nsymbols = z_nsymbols;
+            memcpy( z_prev_q, z_q, sizeof(z_prev_q));
+            nchunks++;
+        } while( w_prev_enable || z_prev_enable );
+
+        // Interleave non-zeros and zeros into the output buffer
+        // Increase the output buffer to fit the new slice
+        *outbuf = realloc( *outbuf, (outbuf_size + nvalues + total_zcnt)*sizeof(int16_t));
+
+        int k=outbuf_size;
+
+        // Insert initial zeros
+        // (slices redefining the palette may start with zeros)
+        if (new_palette && use_zero_run) {
+            for(j=0; j<z_value[0]; j++) {
+                (*outbuf)[k++] = 0;
+            }
+        }
+
+        // Loop over all weights and insert zeros in-between
+        for(i=0; i<nvalues; i++) {
+            int val;
+            assert(w_value[i]<512); // HW supports 9bit
+            if (w_value[i]<palsize) {
+                val = palette[w_value[i]];
+            } else {
+                val = w_value[i]-palsize+direct_offset;
+            }
+            int sign = val&1;
+            int mag  = val>>1;
+            (*outbuf)[k++] = sign ? -mag : mag;
+            if (use_zero_run) {
+                for(j=0; j<z_value[i+(new_palette?1:0)]; j++) {
+                    (*outbuf)[k++] = 0;
+                }
+            }
+        }
+
+        outbuf_size = k;
+        free(w_value);
+        free(z_value);
+    }
+    return outbuf_size;
+}
diff --git a/ethosu/mlw_codec/mlw_decode.h b/ethosu/mlw_codec/mlw_decode.h
new file mode 100644
index 0000000..a15261a
--- /dev/null
+++ b/ethosu/mlw_codec/mlw_decode.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2020 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdint.h>
+
+#ifndef __MLW_DECODE_H__
+#define __MLW_DECODE_H__
+
+#ifdef _MSC_VER
+  #define EXPORTED __declspec(dllexport)
+#else
+  #define EXPORTED __attribute__((visibility("default")))
+#endif
+
+#if __cplusplus
+extern "C"
+{
+#endif
+
+EXPORTED
+int mlw_decode(uint8_t *inbuf, int inbuf_size, int16_t **outbuf, int verbose);
+
+#if __cplusplus
+}
+#endif
+
+#endif
diff --git a/ethosu/mlw_codec/mlw_encode.c b/ethosu/mlw_codec/mlw_encode.c
new file mode 100644
index 0000000..ac25fc5
--- /dev/null
+++ b/ethosu/mlw_codec/mlw_encode.c
@@ -0,0 +1,874 @@
+/*
+ * Copyright (c) 2020 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <string.h>
+#include <assert.h>
+#include <math.h>
+#include <stdarg.h>
+#include <math.h>
+#include "mlw_common.h"
+#include "mlw_encode.h"
+
+#define DPRINTF(...)
+//#define DPRINTF(...) printf(__VA_ARGS__)
+
+#define ZERO_RUN_THRES  4
+
+#define min(a,b) ((a)<(b)?(a):(b))
+#define max(a,b) ((a)>(b)?(a):(b))
+
+typedef struct palette {
+    int16_t lut[32];
+    int16_t inv_lut[512];
+    int palsize;    // number of palette entries
+    int palbits;    // bit width of palette entries
+    int use_zero_runs;    // zeros are coded separately
+    int only_palette;   // no values outside the palette
+    int direct_offset;  // added to the decoded weight index before direct conversion to sign/mag
+    int only_zeros;     // special case that the section is all zeros
+} palette_t;
+
+static int is_power_of_two( int x ) {
+    return ((x-1) & x)==0;
+}
+
+static int get_palette_index_bits( int size ) {
+    int i;
+    for(i=7; i>=0; i--)
+        if (size > (1<<i) )
+            return i+1;
+    return 0;
+}
+
+// Search the stream for suitable palette restart positions
+// Return the number of restarts
+static int search_palette_sections( int16_t *buf, int size, int **palette_restart_positions ) {
+    int i,j,got_palette,restart_i,palette_size=0, last_restart_idx, zero_cnt;
+    int prev_idx[512];  // For each value, keep track of the index of the previous occurrence
+    int *restart_pos;
+    int max_palettes = size/64;
+
+    // Preliminary allocation of sufficient size
+    restart_pos = (int*)malloc( max_palettes*sizeof(int) );
+    last_restart_idx=0;
+    got_palette=0;
+    restart_i=1;
+    restart_pos[0] = 0;
+    zero_cnt=0;
+    memset( prev_idx, -1, sizeof(prev_idx));
+    for(i=0; i<size; i++) {
+        // Guess if zeros should be excluded from the palette
+        int exclude_zero = zero_cnt > (i-last_restart_idx)/4;
+
+        if (got_palette) {
+            // Check if the next value is not covered by the current palette
+            if ( prev_idx[ buf[i]+256 ] < last_restart_idx ) {
+                // New value: increase the palette size
+                palette_size++;
+                DPRINTF("Note: at pos %d extend palette to size %d\n", i, palette_size);
+                if ( is_power_of_two(palette_size-1-exclude_zero) ) {
+                    if ( (i - last_restart_idx - zero_cnt) > 512 || (palette_size-exclude_zero)>32 ) {
+                        // create a new palette because we extend a long lasting palette to require one more index bit
+                        DPRINTF("Note: at pos %d create new palette because previous has to increase one more index bit. last_restart_idx %d n %d zero_cnt %d\n", i, last_restart_idx, i - last_restart_idx, zero_cnt );
+                        assert( restart_i < max_palettes );
+                        DPRINTF("restart %d pos %d\n", restart_i, i);
+                        restart_pos[restart_i++] = i;
+                        last_restart_idx = i;
+                        got_palette=0;
+                        zero_cnt=0;
+                    }
+                }
+            }
+        }
+
+        prev_idx[ buf[i]+256 ] = i;
+        if (buf[i]==0)
+            zero_cnt++;
+
+        static const int window_sizes[5][2] = {{32,1}, {64,1}, {128,1}, {256,1}, {512,1}};
+        int k;
+        // loop over window sizes
+        for(k=0; k<5; k++) {
+            // Every Nth non-zero value, count what would be the size of a palette covering the last N NZ.
+            int N = window_sizes[k][0] * (got_palette?2:1);
+            if ( (i - last_restart_idx - zero_cnt) > 0 && ((i - last_restart_idx - zero_cnt) % N)==0 ) {
+                // Search backward to the position N nonzero values earlier
+                int nzcnt=0;
+                for( j=i; j>last_restart_idx; j--) {
+                    if ( buf[j]!=0 ) {
+                        if (nzcnt==N+1)
+                            break;
+                        nzcnt++;
+                    }
+                }
+                int restart_idx = j;
+
+                // Calculate the size of a new palette (starting at restart_idx)
+                int new_palette_size=0;
+                for(j=0; j<512; j++) {
+                    if ( prev_idx[j] >= restart_idx ) {
+                        new_palette_size++;
+                    }
+                }
+
+                int create_new_palette=0;
+                if (got_palette) {
+                    int new_size_bits = get_palette_index_bits( new_palette_size - exclude_zero );
+                    int old_size_bits = get_palette_index_bits( palette_size - exclude_zero );
+                    int savings = N*(old_size_bits*15-new_size_bits*15)/16 - new_palette_size*8 - 20;
+                    if ( savings>0 ) {
+                        // Create new palette because it can be smaller than the existing palette
+                        create_new_palette=1;
+                        DPRINTF("Note: at pos %d restart smaller palette\n", restart_idx);
+                    }
+                } else {
+                    if ( (new_palette_size-exclude_zero) <= 32) {
+                        int new_size_bits = get_palette_index_bits( new_palette_size - exclude_zero );
+                        // estimate if we will make savings by using palette mode
+                        int savings = N*(90-new_size_bits*15)/16 - new_palette_size*8 - 20;
+                        create_new_palette = savings>0;
+                    }
+                }
+                if (create_new_palette) {
+                    palette_size=new_palette_size;
+                    got_palette=1;
+                    last_restart_idx = restart_idx;
+                    DPRINTF("Note: at pos %d create palette of size %d\n", last_restart_idx, new_palette_size);
+                    if ( restart_pos[restart_i-1] != last_restart_idx) {
+                        assert( restart_i < max_palettes );
+                        restart_pos[restart_i++] = last_restart_idx;
+                    }
+                    zero_cnt=0;
+                    for( j=last_restart_idx; j<=i; j++)
+                        if (buf[j]==0)
+                            zero_cnt++;
+                }
+            }
+        }
+    }
+    // Reallocate to actual size
+    *palette_restart_positions = (int*)realloc( restart_pos, restart_i*sizeof(int) );
+    return restart_i;
+}
+
+// Calculate frequency table
+static void calc_freq( const int16_t *buf, int size, int freq[512] ) {
+    int i;
+    memset(freq, 0, 512*sizeof(int));
+    for(i=0; i<size; i++) {
+        freq[buf[i]+256]++;
+    }
+}
+
+static int cmp_uint64(const void * a, const void * b) {
+   uint64_t aa = *(uint64_t*)a;
+   uint64_t bb = *(uint64_t*)b;
+   return  aa>bb ? -1 : aa<bb ? 1 : 0;
+}
+
+// Create palette from the given frequencies
+// Freq index 0-511 correspond to weights -256..255
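+// Weights are mapped to palette values by sign/magnitude interleaving,
+// palval = (abs(w)<<1) | (w<0), e.g. 0->0, +1->2, -1->3, +2->4, -2->5, ...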
+static void create_palette( int freq[512],
+                           int use_zero_runs,
+                           palette_t *p ) {
+    uint64_t freq64[512];
+    int i,all_cnt,all_max_val;
+
+    // Pair the frequency with the value so that
+    // the array can be sorted on frequency while keeping
+    // track of the corresponding palette value
+    memset(freq64, 0, sizeof(freq64));
+    all_cnt=0;
+    all_max_val=0;
+    for(i=-255; i<256; i++) {
+        if (i==0 && use_zero_runs)
+            continue;
+        int sign = i<0;
+        int mag = abs(i);
+        int palval = (mag<<1) | sign;
+
+        // Store palette value in 16 LSB bits, which will not affect the sorting
+        freq64[palval] = (((uint64_t)freq[i+256])<<16) | palval;
+        all_cnt+=freq[i+256];
+
+        if (freq[i+256]>0) {
+          all_max_val = max(all_max_val, palval);
+        }
+    }
+
+    // Count number of non-used weight values around zero (0, -1, +1, -2, +2 etc)
+    for(i=0; i<31; i++) {
+        if ((freq64[i]>>16)!=0)
+            break;
+    }
+    p->direct_offset = i;
+
+    // Sort in descending frequency order
+    qsort(freq64, 512, sizeof(uint64_t), cmp_uint64);
+
+    // Identify special case that there are no weights to code
+    // in the weight index stream (i.e. all weights are zeros)
+    p->only_zeros = (freq64[0]>>16)==0;
+    if (p->only_zeros) {
+        p->direct_offset=0;
+    }
+
+    // Check if all weights fit into the palette (and the palette is not empty)
+    p->only_palette = (freq64[0]>>16)>0 && (freq64[32]>>16)==0;
+
+    int max_palette_size;
+    if (p->only_palette) {
+        max_palette_size = 32;
+    } else {
+        // For direct-lut we must make sure that the encoded weight
+        // index is not > 511. We do that by limiting the palette size
+        // such that the greatest value can be reached after subtracting
+        // the palette size.
+        max_palette_size = min(32, 511-all_max_val);
+        if (max_palette_size==1) {
+            max_palette_size=0; // because palette of size 1 is not supported
+        }
+    }
+
+    // Setup the 32 entry palette
+    int palette_max_val = 0, val, cnt, pal_cnt=0;
+    for(i=0; i<max_palette_size; i++) {
+        cnt = freq64[i]>>16;
+        val = freq64[i]&0xffff;
+        if ( cnt==0 )
+            break;
+        p->lut[i] = val;
+        palette_max_val = max(palette_max_val, val);
+        pal_cnt+=cnt;
+    }
+    if (i==1)
+        i++;    // palette size of 1 is not supported, make it 2
+
+    // Heuristic for when to use the palette. If more than half of the
+    // weights are in the palette then we use it. This ensures we don't
+    // use palette for e.g. rectangular distributions.
+    int palbits_val;
+    if (pal_cnt > all_cnt/2) {
+        p->palsize  =  i;
+        palbits_val = palette_max_val;
+    } else {
+        // No palette
+        p->palsize  =  0;
+        // If no palette, then palbits is used to specify the
+        // number of bits required for uncompressed mode, i.e.
+        // the number of bits for the greatest weight value
+        palbits_val = all_max_val;
+    }
+
+    // the palette entry bit width
+    // minimum 2bits (because PALBITS is in range 2..9)
+    int palbits=2;
+    while( (1<<palbits) <= palbits_val )
+        palbits++;
+    assert(palbits<=9);
+    p->palbits  = palbits;
+    p->use_zero_runs  = use_zero_runs;
+}
+
+// Return 1 if zero runs should be used
+// If palette_size is 512, then the palette is not used (in that case the palette is set up
+// with the standard alternating unsigned to signed mapping)
+static int find_palette( const int16_t *inbuf, int inbuf_size, palette_t *p) {
+    int freq[512], i;
+
+    // Calculate frequencies of the given weight stream
+    calc_freq( inbuf, inbuf_size, freq);
+
+    // Find two most common values
+    int most_common_freq[2]={0}, most_common_val[2]={0};
+    for(i=0; i<512; i++) {
+        if ( freq[i] > most_common_freq[0] ) {
+            most_common_freq[1] = most_common_freq[0];
+            most_common_val[1]  = most_common_val[0];
+            most_common_freq[0] = freq[i];
+            most_common_val[0]  = i-256;
+        } else if ( freq[i] > most_common_freq[1] ) {
+            most_common_freq[1] = freq[i];
+            most_common_val[1]  = i-256;
+        }
+    }
+
+    // Decide if zero-runs (alternating mode) should be used:
+    // * zero should be the most common symbol
+    // * zero should be sufficiently more common than the second most common symbol
+    int use_zero_runs = most_common_val[0]==0 && most_common_freq[0] > ZERO_RUN_THRES*most_common_freq[1];
+
+    // Create the palette
+    create_palette( freq, use_zero_runs, p);
+
+    return use_zero_runs;
+}
+
+static void create_inverse_palette( palette_t *p) {
+    int i;
+    memset( p->inv_lut, 0, sizeof(p->inv_lut));
+    for(i=0; i<512; i++) {
+        int val  = i;
+        int sign = val&1;
+        int mag  = val>>1;
+        int weight = sign ? -mag : mag;
+        if (weight+256 < 512)
+            p->inv_lut[ weight+256 ] = i + p->palsize - p->direct_offset;
+    }
+    for(i=0; i<p->palsize; i++) {
+        int val = p->lut[i];
+        int sign = val&1;
+        int mag  = val>>1;
+        int weight = sign ? -mag : mag;
+        if (weight+256 < 512)
+            p->inv_lut[ weight+256 ] = i;
+    }
+}
+
+#define NWCFG 13
+#define NZCFG 4 // restrict search to ZDIV=0..3
+#define MAX_ZWCFG (max(NWCFG,NZCFG))
+
+// search state
+typedef struct search_state {
+    int bitcnt;             // number of bits to reach this state
+    uint8_t prev_cfg;       // previous grc parameter config
+} search_state_t;
+
+// (trunc<<4) | div, 0x20 means uncompressed
+static const char w_grc_params[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x20 };
+static const char z_grc_params[] = { 0x00, 0x01, 0x02, 0x03, 0x04 };
+
+
+
+// An algorithm similar to the Viterbi algorithm is used to search for a
+// good GRC parameter sequence for the given input value sequence.
+// The inval buffer can contain weights, weight indices or runs.
+// The return value is the resulting number of bitstream sections.
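+//
+// Cost model illustration: the search prices a value v under Golomb-Rice
+// divisor div as (v>>div)+1 quotient bits plus div remainder bits (e.g.
+// div=2 codes v=11 in 5 bits), and a parameter change mid-stream adds a
+// ~40-bit command cost.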
+static int search_grc_params( const int *inval_buf,
+                              int n_inval,
+                              int zrun_mode,
+                              int uncompressed_bits,
+                              uint8_t *grc_param_cfg,
+                              int *grc_param_pos,
+                              int max_grc_param_cfg,
+                              int *existing_grc_param_pos,
+                              int n_existing_grc_param_pos,
+                              int *bitcnt )
+{
+    int n_cfg = zrun_mode ? NZCFG : NWCFG;
+    const char *grc_params = zrun_mode ? z_grc_params : w_grc_params;
+    int i,j;
+
+    search_state_t *state[MAX_ZWCFG];
+    for(i=0; i<n_cfg; i++) {
+        state[i] = malloc( sizeof(search_state_t) * (n_inval+1) );
+        state[i][0].bitcnt=0;
+        state[i][0].prev_cfg=i;
+    }
+
+    // Loop over inval_buf
+    int existing_idx=0;
+    for(i=0; i<n_inval; i++) {
+        int value = inval_buf[i];
+
+        // Best GRC parameter so far
+        int best_bitcnt=0x7fffffff, best_cfg=0;
+        for(j=0; j<n_cfg; j++) {
+            if (state[j][i].bitcnt < best_bitcnt) {
+                best_bitcnt = state[j][i].bitcnt;
+                best_cfg = j;
+            }
+        }
+
+        int cmd_cost = 40;
+        if (existing_idx < n_existing_grc_param_pos && existing_grc_param_pos[existing_idx] == (i+1)) {
+            // free transition, because the weight stream already inserted a command at this position
+            cmd_cost = 0;
+            existing_idx++;
+        }
+
+        // Loop over GRC parameters, calculate bits to code value, and then update the search state
+        for(j=0; j<n_cfg; j++) {
+            int div = grc_params[j]&15;
+            int trunc = grc_params[j]>>4;
+            int q = value>>div;
+            int bits = trunc ? min(q+1,2) + div : q+1+div;
+            if (!zrun_mode && ((trunc && q>2) || q>31))
+                bits=10000;  // it's not possible to code the current value; give it a high cost
+            if (trunc==2)
+                bits=uncompressed_bits;
+
+            if ( best_bitcnt + cmd_cost < state[j][i].bitcnt ) {
+                // Change GRC parameters
+                state[j][i+1].prev_cfg  = best_cfg;
+                state[j][i+1].bitcnt    = best_bitcnt + cmd_cost + bits;
+            } else {
+                // Keep same GRC parameters
+                state[j][i+1].prev_cfg  = j;
+                state[j][i+1].bitcnt    = state[j][i].bitcnt + bits;
+            }
+        }
+    }
+
+
+    // Best GRC parameter
+    int best_bitcnt=0x7fffffff, best_cfg=0;
+    for(j=0; j<n_cfg; j++) {
+        if (state[j][n_inval].bitcnt < best_bitcnt) {
+            best_bitcnt = state[j][n_inval].bitcnt;
+            best_cfg = j;
+        }
+    }
+
+    int cfg = best_cfg;
+    int n_cmds=0;
+    for(i=n_inval; i>=0; i--) {
+        if (state[cfg][i].prev_cfg != cfg || i==0) {
+            n_cmds++;
+            cfg = state[cfg][i].prev_cfg;
+        }
+    }
+
+    (void)(max_grc_param_cfg);
+    assert(n_cmds<=max_grc_param_cfg);
+
+    cfg = best_cfg;
+    j=n_cmds-1;
+    int endpos=n_inval;
+    for(i=n_inval; i>=0; i--) {
+        if (state[cfg][i].prev_cfg != cfg || i==0) {
+            grc_param_cfg[j] = cfg;
+            grc_param_pos[j] = endpos;
+            j--;
+            cfg = state[cfg][i].prev_cfg;
+            endpos = i-1;
+        }
+    }
+    assert(j==-1);
+
+    for(i=0; i<n_cfg; i++) {
+        free(state[i]);
+    }
+
+    *bitcnt = best_bitcnt;
+
+    return n_cmds;
+}
+
+
+/////////////////////////////// Write to bitstream
+
+typedef struct bitbuf {
+    uint8_t *buf;
+    int buf_size;               // in bytes
+    int pos;                    // bit pos of next bit
+    int log_symbols;
+} bitbuf_t;
+
+// size in bytes
+static void bitbuf_init( bitbuf_t *bb, uint8_t *buf, int size, int log_symbols ) {
+    bb->buf  = buf;
+    bb->pos  = 0;
+    bb->buf_size = size;
+    bb->log_symbols = log_symbols;
+}
+
+static void bitbuf_putbit( bitbuf_t *bb, int bit) {
+    int byte_pos = bb->pos>>3;
+    int bit_pos = bb->pos&7;
+    assert( byte_pos >= 0 );
+    assert( byte_pos < bb->buf_size );
+    bb->buf[ byte_pos ] = (bb->buf[ byte_pos ] & ~(1<<bit_pos)) | (bit<<bit_pos);
+    bb->pos += 1;
+}
+
+static void bitbuf_put( bitbuf_t *bb, const char *name, int len, int data) {
+    int i;
+    if (len>0) {
+        if (bb->log_symbols)
+            printf("bitbuf: pos %3d %7s len %d data %x\n", bb->pos, name, len, data);
+        for(i=0; i<len; i++) {
+            bitbuf_putbit(bb, (data>>i)&1);
+        }
+    }
+}
+
+// Return new bitpos
+static int encode_slice( const int *w_value,
+                         const int *z_value,
+                         int nvalues,
+                         palette_t *p,
+                         int new_palette,
+                         int uncompressed_bits,
+                         int w_cfg,
+                         int z_cfg,
+                         uint8_t *bitbuf,
+                         int bitbuf_size,
+                         int bitpos,
+                         int verbose )
+{
+    int i,j;
+    bitbuf_t bitbuf_s, *bb=&bitbuf_s;
+    bitbuf_init( bb, bitbuf, bitbuf_size, verbose&2?1:0 );
+    bb->pos = bitpos;
+
+    assert(nvalues<32768);
+    // GRC parameters for this slice
+    int w_grc_div       = w_grc_params[w_cfg] & 15;
+    int w_grc_trunc     = (w_grc_params[w_cfg] >> 4)==1;
+    int w_uncompressed  = (w_grc_params[w_cfg] >> 4)==2;
+    int z_grc_div       = z_grc_params[z_cfg] & 15;
+
+    if (w_uncompressed) {
+        w_grc_div = uncompressed_bits;
+    }
+
+    int zdiv = p->use_zero_runs ? z_grc_div : ZDIV_DISABLE;
+    int wdiv = !w_uncompressed ? w_grc_div : WDIV_UNCOMPRESSED;
+
+    if (verbose&1) {
+        printf("slice: bitoffset %7d slicelen %5d zdiv %d wdiv %d wtrunc %d newpal %d palbits %d palsize %2d\n",
+                bb->pos, nvalues, zdiv, wdiv, w_grc_trunc, new_palette, p->palbits, p->palsize);
+    }
+
+    // Write slice header
+    bitbuf_put( bb, "ZDIV", 3, zdiv);
+    bitbuf_put( bb, "SLICELEN", 15, nvalues-1 );
+    bitbuf_put( bb, "WDIV", 3, wdiv);
+    bitbuf_put( bb, "WTRUNC", 1, w_grc_trunc );
+    bitbuf_put( bb, "NEWPAL", 1, new_palette );
+    if (new_palette) {
+        bitbuf_put( bb, "DIROFS", 5, p->direct_offset );
+        bitbuf_put( bb, "PALSIZE", 5, max(0, p->palsize-1));
+        bitbuf_put( bb, "PALBITS", 3, p->palbits-2 );
+        for(i=0; i<p->palsize; i++) {
+            bitbuf_put( bb, "PALETTE", p->palbits, p->lut[i] );
+        }
+    }
+
+    int z_nvalues = nvalues + (new_palette?1:0);
+    int w_pos=0, z_pos=0;
+    int w_unary0=0, w_unary1=0, w_unary1_len=0, w_q=-1, w_r=0;
+    int z_unary=0, z_q=-1, z_r=0;
+    int w_nsymbols=0, w_remain[12]={0};
+    int w_prev_enable=0, w_prev_nsymbols=0, w_prev_remain[12]={0};
+    int z_nsymbols=0, z_remain[12]={0};
+    int z_prev_enable=0, z_prev_nsymbols=0, z_prev_remain[12]={0};
+    int z_unary_len = z_grc_div<3 ? 12 : 8;
+    do {
+        int balance = p->use_zero_runs ? w_pos - z_pos : 0;
+        int w_enable = balance<8 && w_pos<nvalues;
+        int z_enable = balance>=0 && p->use_zero_runs && z_pos<z_nvalues;
+        if (w_enable) {
+            // Encode chunk (weights)
+            j=0;
+            w_nsymbols=0;
+            w_unary0=0;
+            w_unary1=0;
+            w_unary1_len=0;
+            int max_symbols = w_uncompressed && w_grc_div>5 ? 8 : 12;
+            while(j<max_symbols) {
+                if (w_q<0) {
+                    if (w_pos<nvalues) {
+                        int value = w_value[w_pos];
+                        assert(value<512);
+                        w_q = value>>w_grc_div;
+                        w_r = value&((1<<w_grc_div)-1);
+                        assert( w_q<=31 && (!w_grc_trunc || w_q<=2));
+                    } else {
+                        w_q = 0;
+                        w_r = -1;   // don't send remainder
+                    }
+                }
+                while( w_q>=0 && j<max_symbols) {
+                    w_unary0 |= w_q>0 ? (1<<j) : 0;
+                    if (w_q>0) {
+                        w_unary1 |= w_q>1 ? (1<<w_unary1_len) : 0;
+                        w_unary1_len++;
+                    }
+                    j++;
+                    w_q-=2;
+                    if (w_grc_trunc)
+                        w_q--;
+                }
+                if (w_q<0 && w_r>=0) {
+                    w_remain[w_nsymbols] = w_r;
+                    w_nsymbols++;
+                    w_pos++;
+                }
+            }
+        }
+
+        if (z_enable) {
+            // Encode chunk (zrun)
+            j=0;
+            z_nsymbols=0;
+            z_unary=0;
+            while(j<z_unary_len) {
+                if (z_q<0) {
+                    if (z_pos<z_nvalues) {
+                        int value = z_value[z_pos];
+                        z_q = value>>z_grc_div;
+                        z_r = value&((1<<z_grc_div)-1);
+                    } else {
+                        z_q = 0;
+                        z_r = -1;
+                    }
+                }
+                while( z_q>=0 && j<z_unary_len) {
+                    z_unary |= z_q>0 ? (1<<j) : 0;
+                    j++;
+                    z_q--;
+                }
+                if (z_q<0 && z_r>=0) {
+                    z_remain[z_nsymbols] = z_r;
+                    z_nsymbols++;
+                    z_pos++;
+                }
+            }
+        }
+
+        // Write chunk to bitstream
+        if (w_enable && !w_uncompressed) {
+            bitbuf_put( bb, "WUNARY0", 12, w_unary0);
+        }
+        if (z_enable) {
+            bitbuf_put( bb, "ZUNARY", z_unary_len, z_unary);
+        }
+        if (w_enable && !w_uncompressed) {
+            bitbuf_put( bb, "WUNARY1", w_unary1_len, w_unary1);
+        }
+        if (w_prev_enable) {
+            for(i=0; i<w_prev_nsymbols; i++) {
+                bitbuf_put( bb, "WREMAIN", w_grc_div, w_prev_remain[i]);
+            }
+        }
+        if (z_prev_enable) {
+            for(i=0; i<z_prev_nsymbols; i++) {
+                bitbuf_put( bb, "ZREMAIN", z_grc_div, z_prev_remain[i]);
+            }
+        }
+        w_prev_enable = w_enable;
+        w_prev_nsymbols = w_nsymbols;
+        memcpy( w_prev_remain, w_remain, sizeof(w_prev_remain));
+        z_prev_enable = z_enable;
+        z_prev_nsymbols = z_nsymbols;
+        memcpy( z_prev_remain, z_remain, sizeof(z_prev_remain));
+    } while( w_prev_enable || z_prev_enable );
+
+    return bb->pos;
+}
+
+
+// return new bitpos
+static int encode_section( const int16_t *inbuf,
+                           int size,
+                           palette_t *p,
+                           uint8_t *bitbuf,
+                           int bitbuf_size,
+                           int bitpos,
+                           int verbose )
+{
+    int uncompressed_bits;
+
+    // Uncompressed mode can only be used if either all weights
+    // are in the palette OR if the palette is not used.
+    if (p->only_palette) {
+        // Uncompressed bits derived from palette size
+        uncompressed_bits=0;
+        while( (1<<uncompressed_bits) < p->palsize )
+            uncompressed_bits++;
+    } else if (p->palsize==0) {
+        // Uncompressed bits is palbits (which is the bitdepth of the greatest weight)
+        uncompressed_bits = p->palbits;
+    } else {
+        // Don't use uncompressed
+        uncompressed_bits = 100;
+    }
+
+    int *weight_values = malloc( size*sizeof(int) );
+    int *zrun_values = malloc( size*sizeof(int) );
+
+    // Get weights (or weight indices) AND zero-runs from the input weight stream.
+    int i=0, n_weights = 0, zcnt;
+    while(1) {
+        if (p->use_zero_runs) {
+            zcnt=0;
+            // Count zero run
+            // Special case: if all weights in the section are zero, we must
+            // still ensure we have one coded weight so that the slice length
+            // doesn't become 0. Therefore we skip the first zero run and code
+            // the zero explicitly as a weight value instead
+            if (!p->only_zeros || i>0) {
+                while( i<size && inbuf[i]==0) {
+                    zcnt++;
+                    i++;
+                }
+            }
+            zrun_values[n_weights] = zcnt;
+        }
+        if (i==size)
+            break;
+        int value = p->inv_lut[inbuf[i]+256];
+        weight_values[n_weights] = value;
+        n_weights++;
+        i++;
+    }
+
+    // Search for good GRC parameters for the weight stream
+    int n_w_slice, w_bitcnt;
+    uint8_t *w_slice_cfg;
+    int *w_slice_pos;
+    w_slice_cfg = malloc( size );
+    w_slice_pos = malloc( size*sizeof(int) );
+    n_w_slice = search_grc_params( weight_values, n_weights, 0, uncompressed_bits, w_slice_cfg, w_slice_pos, size, 0, 0, &w_bitcnt);
+    if (n_weights==0)
+        n_w_slice = 0;
+
+    // Search for good GRC parameters for the zrun stream
+    int n_z_slice=0, z_bitcnt=0;
+    uint8_t *z_slice_cfg=0;
+    int *z_slice_pos=0;
+    if (p->use_zero_runs) {
+        z_slice_cfg = malloc( size );
+        z_slice_pos = malloc( size*sizeof(int) );
+        n_z_slice = search_grc_params( zrun_values, n_weights+1, 1, 0, z_slice_cfg, z_slice_pos, size, w_slice_pos, n_w_slice, &z_bitcnt);
+    }
+
+    // Encode bitstream slice
+    int pos=0, i_w_slice=0, i_z_slice=0, new_palette=1;
+    while(pos<n_weights || new_palette) {
+        int endpos=pos+32767;   // max slice length
+
+        if (i_w_slice<n_w_slice && w_slice_pos[i_w_slice]<endpos) {
+            endpos = w_slice_pos[i_w_slice];
+        }
+
+        if (i_z_slice<n_z_slice && z_slice_pos[i_z_slice]<endpos) {
+            endpos = z_slice_pos[i_z_slice];
+        }
+
+        if (n_weights < endpos) {
+            endpos = n_weights;
+        }
+
+        // The first slice (when new_palette is 1) encodes zero runs both at the
+    // beginning and end (i.e. the number of zero runs is len+1).
+        // The following slices only encode zero runs at the end (there cannot be
+        // any zeros in the beginning since they are encoded by the previous slice)
+        int len = endpos - pos;
+        int *zrun_buf = p->use_zero_runs ? zrun_values+pos+(!new_palette) : 0;
+        bitpos = encode_slice( weight_values+pos, zrun_buf, len,
+                               p, new_palette, uncompressed_bits,
+                               w_slice_cfg[i_w_slice], p->use_zero_runs ? z_slice_cfg[i_z_slice] : 0,
+                               bitbuf, bitbuf_size, bitpos, verbose );
+        new_palette = 0;
+
+        if (i_w_slice<n_w_slice && w_slice_pos[i_w_slice]==endpos) {
+            i_w_slice++;
+        }
+        if (i_z_slice<n_z_slice && z_slice_pos[i_z_slice]==endpos) {
+            i_z_slice++;
+        }
+        pos = endpos;
+    }
+
+    // Free temporary buffers
+    free(w_slice_cfg);
+    free(w_slice_pos);
+    if (p->use_zero_runs) {
+        free(z_slice_cfg);
+        free(z_slice_pos);
+    }
+    free(weight_values);
+    free(zrun_values);
+
+    return bitpos;
+}
+
+// Encode the given weight stream
+//      inbuf       uncompressed 9bit signed weights
+//      inbuf_size  number of weights
+//      outbuf      compressed bitstream, buffer is malloced
+//      verbose     if non-zero, printf log
+// Return value is the size in bytes of the compressed output
+// Return -1 if error
+int mlw_encode( int16_t *inbuf, int inbuf_size, uint8_t **outbuf, int verbose) {
+    int i;
+    // Range check
+    for(i=0; i<inbuf_size; i++) {
+        if (inbuf[i]<-255 || inbuf[i]>255) {
+            printf("ERROR: weight out of range at index %d, weight value is %d (valid range is -255..255)\n", i, inbuf[i]);
+            return -1;
+        }
+    }
+
+    int bitbuf_size = inbuf_size*2+1024;
+    *outbuf = malloc( bitbuf_size );
+
+    // Analyse input data to find palette re-programming points
+    int n_restarts;
+    int *palette_restart_pos;
+    n_restarts = search_palette_sections( inbuf, inbuf_size, &palette_restart_pos);
+
+    // Compress each section (using a single palette) separately
+    int bitpos=0;
+    for(i=0; i<n_restarts; i++) {
+        palette_t palette;
+        int pos, size;
+        pos = palette_restart_pos[i];
+        size   = (i<n_restarts-1 ? palette_restart_pos[i+1] : inbuf_size) - pos;
+        find_palette( inbuf+pos, size, &palette);
+        create_inverse_palette( &palette);
+        bitpos = encode_section( inbuf+pos, size, &palette,
+                                 *outbuf, bitbuf_size, bitpos, verbose );
+    }
+
+
+    // Add end of stream marker and align to 128bit
+    {
+        bitbuf_t bitbuf_s, *bb=&bitbuf_s;
+        bitbuf_init( bb, *outbuf, bitbuf_size, verbose&2?1:0 );
+        bb->pos = bitpos;
+        bitbuf_put( bb, "ZDIV", 3, ZDIV_EOS);
+        bitbuf_put( bb, "BYTEALIGN", (8-(bb->pos&7))&7, 0xff );
+
+        // Pad with 0xff until 128-bit aligned
+        while( bb->pos & 127 ) {
+          bitbuf_put( bb, "PAD", 8, 0xff );
+        }
+        bitpos = bb->pos;
+    }
+    assert((bitpos&127)==0);
+    int outbuf_size = bitpos/8;
+    *outbuf = realloc( *outbuf, outbuf_size);
+
+    free(palette_restart_pos);
+
+    return outbuf_size;
+}
+
+void mlw_free_outbuf( uint8_t *outbuf ) {
+    if (outbuf)
+        free(outbuf);
+}
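+
+/* Minimal usage sketch (illustrative only; see mlw_main.c for the real driver):
+ *
+ *     int16_t weights[] = { 0, 1, -2, 0, 0, 3 };
+ *     uint8_t *stream = NULL;
+ *     int nbytes = mlw_encode( weights, 6, &stream, 0 );
+ *     if (nbytes >= 0) {
+ *         // ... use nbytes bytes of compressed data in stream ...
+ *         mlw_free_outbuf( stream );
+ *     }
+ */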
diff --git a/ethosu/mlw_codec/mlw_encode.h b/ethosu/mlw_codec/mlw_encode.h
new file mode 100644
index 0000000..a995ac6
--- /dev/null
+++ b/ethosu/mlw_codec/mlw_encode.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2020 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdint.h>
+
+#ifndef __MLW_ENCODE_H__
+#define __MLW_ENCODE_H__
+
+#ifdef _MSC_VER
+  #define EXPORTED __declspec(dllexport)
+#else
+  #define EXPORTED __attribute__((visibility("default")))
+#endif
+
+#if __cplusplus
+extern "C"
+{
+#endif
+
+EXPORTED
+int mlw_encode(int16_t *inbuf, int inbuf_size, uint8_t **outbuf, int verbose);
+
+EXPORTED
+void mlw_free_outbuf(uint8_t *outbuf);
+
+#if __cplusplus
+}
+#endif
+
+#endif
diff --git a/ethosu/mlw_codec/mlw_main.c b/ethosu/mlw_codec/mlw_main.c
new file mode 100644
index 0000000..9f72049
--- /dev/null
+++ b/ethosu/mlw_codec/mlw_main.c
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2020 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <string.h>
+#include <assert.h>
+#include <math.h>
+#include <getopt.h>
+#include <stdarg.h>
+#include "mlw_encode.h"
+#include "mlw_decode.h"
+
+static void fatal_error(const char *format, ...) {
+  va_list ap;
+  va_start (ap, format);
+  vfprintf(stderr, format, ap);
+  va_end(ap);
+  exit(1);
+}
+
+static void print_usage(void) {
+    printf("Usage:\n");
+    printf("    Encode: ./mlw_codec [<options>] [-o <outfile.mlw>] infiles.bin\n");
+    printf("    Decode: ./mlw_codec [<options>] -d [-o <outfile.bin>] infiles.mlw\n");
+    printf("\n");
+    printf("Options:\n");
+    printf("    -w      The uncompressed weight file is an int16_t (word) stream.\n");
+    printf("            This is to support 9bit signed weights. Little endian is assuemd.\n");
+    printf("            The default format is int8_t (byte) stream (if -w is not specified)\n");
+    printf("\n");
+}
+
+// Read file into allocated buffer. Return length in bytes.
+static int read_file( FILE *f, uint8_t **buf) {
+
+    fseek(f, 0, SEEK_END);
+    int size = ftell(f);
+    fseek(f, 0, SEEK_SET);
+    *buf = malloc(size);
+    assert(*buf);
+    int rsize = fread(*buf, 1, size, f);
+    assert(rsize==size);
+    fclose(f);
+    return size;
+}
+
+
+#define MAX_INFILES 1000
+
+int main(int argc, char *argv[])
+{
+    int c, decode=0, inbuf_size, outbuf_size;
+    char *infile_name[MAX_INFILES], *outfile_name=0;
+    uint8_t *inbuf=0, *outbuf=0;
+    FILE *infile, *outfile=0;
+    int verbose=0, infile_idx=0;
+    int int16_format=0;
+
+    if (argc==1) {
+        print_usage();
+        exit(1);
+    }
+
+    // Parse command line options
+    while( optind < argc) {
+        // Parse options
+        while ((c = getopt (argc, argv, "di:o:v:w?")) != -1) {
+            switch (c) {
+            case 'd':
+                decode=1;
+                break;
+            case 'i':
+                assert(infile_idx<MAX_INFILES);
+                infile_name[infile_idx++]=optarg;
+                break;
+            case 'o':
+                outfile_name=optarg;
+                break;
+            case 'v':
+                verbose=atoi(optarg);
+                break;
+            case 'w':
+                int16_format=1;
+                break;
+            case '?':
+                print_usage();
+                exit(0);
+            }
+        }
+
+        if (optind<argc) {
+            assert(infile_idx<MAX_INFILES);
+            infile_name[infile_idx++]=argv[optind];
+            optind++;
+
+        }
+    }
+
+    if (outfile_name) {
+        outfile=fopen(outfile_name, "wb");
+        if (!outfile)
+            fatal_error("ERROR: cannot open outfile %s\n", outfile_name);
+    }
+
+    // Loop over input files
+    int nbr_of_infiles=infile_idx;
+    for(infile_idx=0; infile_idx<nbr_of_infiles; infile_idx++) {
+        infile=fopen(infile_name[infile_idx], "rb");
+        if (!infile)
+            fatal_error("ERROR: cannot open infile %s\n", infile_name[infile_idx]);
+
+        // Read infile to buffer
+        inbuf_size = read_file(infile, &inbuf);
+
+        if (!decode) {
+            // Encode
+            int i, n = int16_format ? inbuf_size/sizeof(int16_t) : inbuf_size;
+            int16_t *weights = malloc( n * sizeof(int16_t) );
+            for(i=0; i<n; i++) {
+                weights[i] = int16_format ? ((int16_t*)inbuf)[i] : ((int8_t*)inbuf)[i];
+            }
+            outbuf_size = mlw_encode( weights, n, &outbuf, verbose);
+            free(weights);
+            printf("Input size %d output size %d bpw %4.2f\n", n, outbuf_size, outbuf_size*8.0/n);
+        } else {
+            // Decode
+            int i, n;
+            int16_t *weights;
+            n = mlw_decode( inbuf, inbuf_size, &weights, verbose);
+            outbuf_size = int16_format ? n*sizeof(int16_t) : n;
+            outbuf = malloc( outbuf_size );
+            assert(outbuf);
+            for(i=0; i<n; i++) {
+                if (int16_format)
+                    ((int16_t*)outbuf)[i] = weights[i];
+                else
+                    outbuf[i] = weights[i];
+            }
+            free(weights);
+            printf("Input size %d output size %d bpw %4.2f\n", inbuf_size, n, inbuf_size*8.0/n);
+
+        }
+
+        if (outfile) {
+            fwrite(outbuf, 1, outbuf_size, outfile);
+        }
+
+        if (inbuf)
+            free(inbuf);
+        if (outbuf)
+            free(outbuf);
+    }
+
+    if (outfile) {
+        fclose(outfile);
+    }
+
+    return 0;
+}
diff --git a/ethosu/mlw_codec/test_mlw_codec.py b/ethosu/mlw_codec/test_mlw_codec.py
new file mode 100644
index 0000000..b868721
--- /dev/null
+++ b/ethosu/mlw_codec/test_mlw_codec.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Simple example of the usage of mlw_codec.
+
+import sys
+
+from ethosu import mlw_codec
+
+
+# Simple example
+if __name__ == "__main__":
+    weights = [0, 2, 3, 0, -1, -2, -3, 0, 0, 0, 1, -250, 240] * 3
+    print("Original weights    :", weights)
+
+    compressed_weights = mlw_codec.encode(weights)
+    print("Compressed weights  :", len(compressed_weights), compressed_weights)
+
+    uncompressed_weights = mlw_codec.decode(compressed_weights)
+    print("Uncompressed weights:", uncompressed_weights)
+
+    if weights != uncompressed_weights:
+        print("TEST FAILED")
+        sys.exit(1)
+    else:
+        print("TEST PASSED")
+        sys.exit(0)
diff --git a/ethosu/vela/__init__.py b/ethosu/vela/__init__.py
new file mode 100644
index 0000000..07d8d79
--- /dev/null
+++ b/ethosu/vela/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ._version import __version__
+from .vela import main
+
+__all__ = ["main", "__version__"]
diff --git a/ethosu/vela/__main__.py b/ethosu/vela/__main__.py
new file mode 100644
index 0000000..9bf74c7
--- /dev/null
+++ b/ethosu/vela/__main__.py
@@ -0,0 +1,22 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+
+from .vela import main
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/ethosu/vela/_version.py b/ethosu/vela/_version.py
new file mode 100644
index 0000000..f3888c3
--- /dev/null
+++ b/ethosu/vela/_version.py
@@ -0,0 +1,19 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pkg_resources
+
+__version__ = pkg_resources.get_distribution("ethos-u-vela").version
\ No newline at end of file
diff --git a/ethosu/vela/architecture_features.py b/ethosu/vela/architecture_features.py
new file mode 100644
index 0000000..4a03d0e
--- /dev/null
+++ b/ethosu/vela/architecture_features.py
@@ -0,0 +1,618 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Holds a container for Ethos-U55/System architecture parameters.
+
+from .nn_graph import MemArea, TensorPurpose, NpuBlockType, TensorFormat
+from .numeric_util import round_up, round_up_divide
+from collections import namedtuple
+from configparser import ConfigParser
+from .supported_operators import SupportedOperators
+import numpy as np
+import enum
+
+PointXY = namedtuple("PointXY", "x y")
+PointXYZ = namedtuple("PointXYZ", "x y z")
+
+
+class Block:
+    def __init__(self, w, h, d):
+        self.width = w
+        self.height = h
+        self.depth = d
+
+    def __eq__(self, other):
+        if self.width == other.width and self.height == other.height and self.depth == other.depth:
+            return True
+        else:
+            return False
+
+    def __repr__(self):
+        return "<Block: {0},{1},{2}>".format(self.width, self.height, self.depth)
+
+    @classmethod
+    def from_string(cls, s):
+        w, h, c = (int(v) for v in s.split("x"))
+        return cls(w, h, c)
+
+
+class Rect:
+    def __init__(self, x, y, z, x2, y2, z2):
+        self.x = x
+        self.y = y
+        self.z = z
+        self.x2 = x2
+        self.y2 = y2
+        self.z2 = z2
+
+    def start(self):
+        return PointXYZ(self.x, self.y, self.z)
+
+    def end(self):
+        return PointXYZ(self.x2, self.y2, self.z2)
+
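+    # Coordinates are inclusive, so e.g. Rect(0, 0, 0, 7, 7, 15).size()
+    # is an 8x8x16 Block.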
+    def size(self):
+        return Block(self.x2 - self.x + 1, self.y2 - self.y + 1, self.z2 - self.z + 1)
+
+    def __repr__(self):
+        return "<Rect: ({0},{1},{2}) ({3},{4},{5})>".format(self.x, self.y, self.z, self.x2, self.y2, self.z2)
+
+
+class Kernel:
+    def __init__(self, w, h, sx=1, sy=1, dx=1, dy=1):
+        assert sx > 0 and sy > 0
+        assert dx > 0 and dy > 0
+        self.width = w
+        self.height = h
+        self.stride = PointXY(sx, sy)
+        self.dilation = PointXY(dx, dy)
+
+
+class SHRAMElements:
+    IFM8 = 0
+    IFM16 = 1
+    IFM8_Elementwise = 2
+    IFM16_Elementwise = 3
+    Acc16 = 4
+    Acc32 = 5
+    Acc40 = 6
+    Last = Acc40
+    BitSizes = np.array([8, 16, 8, 16, 16, 32, 40], np.int32)
+
+
+class SHRAMBlockConfig:
+    def __init__(self, sizes, banks):
+        assert len(banks) == SHRAMElements.Last + 1
+        self.sizes = sizes
+        self.banks = banks
+
+
+# Area indices must match Ethos-U55 SHRAM layout spec
+class SharedBufferArea(enum.IntEnum):
+    OFM = 0
+    Weights = 1
+    IFM = 2
+    Accumulators = 3
+    Size = Accumulators + 1
+
+
+class ArchitectureFeatures:
+    """This class is a container for various parameters of the Ethos-U55 core
+and system configuration that can be tuned, either by command line
+parameters or by the Ethos-U55 architects. The class is often passed
+around to passes that need to do architecture-dependent actions.
+
+Note the difference between ArchitectureFeatures and CompilerOptions
+- ArchitectureFeatures is for changing the Ethos-U55 and system architecture
+- CompilerOptions is for changing the behaviour of the compiler
+
+"""
+
+    ArchitectureConfig = namedtuple(
+        "ArchitectureConfig", "macs cores ofm_ublock ifm_ublock shram_banks shram_granules elem_units"
+    )
+    accelerator_configs = {
+        "ethos-u55-256": ArchitectureConfig(256, 1, Block(2, 2, 8), Block(2, 2, 8), 48, [8, 8, 8, 8, 8, 16, 20], 8),
+        "ethos-u55-128": ArchitectureConfig(128, 1, Block(2, 1, 8), Block(2, 2, 8), 24, [4, 4, 4, 4, 4, 8, 12], 4),
+        "ethos-u55-64": ArchitectureConfig(64, 1, Block(1, 1, 8), Block(1, 1, 8), 16, [2, 2, 2, 2, 4, 4, 8], 2),
+        "ethos-u55-32": ArchitectureConfig(32, 1, Block(1, 1, 4), Block(1, 1, 8), 16, [2, 2, 2, 2, 4, 4, 4], 1),
+    }
+
+    OFMSplitDepth = 16
+
+    def __init__(
+        self,
+        vela_config: ConfigParser,
+        accelerator_config,
+        system_config,
+        permanent_storage,
+        inter_pass_cycle_delay,
+        dram_bandwidth,
+        override_block_config,
+        block_config_limit,
+        global_memory_clock_scale,
+        max_blockdep,
+    ):
+        accelerator_config = accelerator_config.lower()
+        self.vela_config = vela_config
+        self.accelerator_config = accelerator_config
+        if self.accelerator_config not in ArchitectureFeatures.accelerator_configs:
+            raise Exception("Unknown accelerator configuration " + self.accelerator_config)
+        accel_config = ArchitectureFeatures.accelerator_configs[self.accelerator_config]
+        self.config = accel_config
+
+        self.system_config = system_config
+
+        is_yoda_system = "yoda-" in self.accelerator_config
+
+        if is_yoda_system:
+            self.sram_size = 256 * 1024
+        else:
+            self.sram_size = 200 * 1024 * 1024
+
+        self.ncores = accel_config.cores
+        self.ofm_ublock = accel_config.ofm_ublock
+        self.ifm_ublock = accel_config.ifm_ublock
+        self.subkernel_max = Block(8, 8, 65536)
+        self.ofm_block_max = Block(64, 32, 128)
+        self.override_block_config = override_block_config
+        self.block_config_limit = block_config_limit
+
+        self.global_memory_clock_scale = global_memory_clock_scale
+        if self.global_memory_clock_scale <= 0.0 or self.global_memory_clock_scale > 1.0:
+            raise Exception(
+                "Invalid global_memory_clock_scale = "
+                + str(self.global_memory_clock_scale)
+                + " (must be > 0.0 and <= 1.0)"
+            )
+
+        self.max_blockdep = max_blockdep
+
+        dpu_min_height = accel_config.ofm_ublock.height
+        dpu_min_width = accel_config.ofm_ublock.width
+        dpu_dot_product_width = 8
+        dpu_min_ofm_channels = accel_config.ofm_ublock.depth
+
+        self.num_elem_wise_units = accel_config.elem_units
+        self.num_macs_per_cycle = dpu_min_height * dpu_min_width * dpu_dot_product_width * dpu_min_ofm_channels
+
+        self.memory_clock_scales = np.zeros(MemArea.Size)
+        self.memory_port_widths = np.zeros(MemArea.Size)
+
+        # Get system configuration
+        self.__read_sys_config()
+
+        # apply the global memory clock scales to the individual ones from the system config
+        for mem in MemArea.all():
+            self.memory_clock_scales[mem] *= self.global_memory_clock_scale
+
+        self.memory_clocks = self.memory_clock_scales * self.npu_clock
+        self.memory_bandwidths_per_cycle = self.memory_port_widths * self.memory_clock_scales / 8
+
+        if dram_bandwidth != 0:
+            self.memory_bandwidths_per_cycle[MemArea.Dram] = dram_bandwidth * 1e9 / self.npu_clock
+
+        self.memory_bandwidths_per_second = self.memory_bandwidths_per_cycle * self.npu_clock
+
+        # Sizes as N x H x W x C. We need to round up to these when allocating storage
+        self.storage_rounding_quantums = {
+            TensorFormat.Unknown: (1, 1, 1, 1),
+            TensorFormat.WeightsCompressed: (1, 1, 1, 1),
+            TensorFormat.NHWC: (1, 1, 1, 1),
+            TensorFormat.NHCWB16: (1, 1, 1, 16),
+        }
+
+        # brick sizes as N x H x W x C. We have to fetch whole bricks at a time
+        self.brick_sizes = {
+            TensorFormat.Unknown: (1, 1, 1, 1),
+            TensorFormat.WeightsCompressed: (1, 1, 1, 1),
+            TensorFormat.NHWC: (1, 1, 1, 1),
+            TensorFormat.NHCWB16: (1, 1, 1, 16),
+        }
+
+        self.inter_pass_cycle_delay = inter_pass_cycle_delay
+
+        self.default_weight_format = TensorFormat.WeightsCompressed
+        self.default_feature_map_format = TensorFormat.NHWC
+
+        if permanent_storage != MemArea.OffChipFlash:
+            self.permanent_storage_mem_area = permanent_storage
+
+        self.tensor_storage_mem_area = {
+            # permanent mem_area
+            TensorPurpose.Weights: self.permanent_storage_mem_area,
+            TensorPurpose.FeatureMap: self.feature_map_storage_mem_area,
+        }
+
+        self.tensor_load_mem_area = dict(self.tensor_storage_mem_area)
+
+        if self.tensor_storage_mem_area[TensorPurpose.Weights] in (MemArea.OffChipFlash,):
+            self.tensor_load_mem_area[TensorPurpose.Weights] = MemArea.Sram
+
+        self.min_block_sizes = {
+            NpuBlockType.Default: (dpu_min_height, dpu_min_width),
+            NpuBlockType.VectorProduct: (1, 1),
+            NpuBlockType.ConvolutionMxN: (dpu_min_height, dpu_min_width),
+            NpuBlockType.Pooling: (dpu_min_height, dpu_min_width),
+            NpuBlockType.ConvolutionDepthWise: (dpu_min_height, dpu_min_width),
+            NpuBlockType.ElementWise: (1, 1),
+        }
+
+        self.sub_kernel_limits = {
+            NpuBlockType.Default: (8, 8),
+            NpuBlockType.VectorProduct: (1, 1),
+            NpuBlockType.ConvolutionMxN: (8, 8),
+            NpuBlockType.Pooling: (8, 8),
+            NpuBlockType.ConvolutionDepthWise: (8, 8),
+            NpuBlockType.ElementWise: (1, 1),
+        }
+
+        # weights for scheduler search
+        from .npu_performance import make_bandwidth_array
+
+        self.bandwidth_weights = make_bandwidth_array()
+        self.bandwidth_weights[MemArea.Sram] = 1.0
+        self.bandwidth_weights[MemArea.Dram] = 10.0
+        self.bandwidth_weights[MemArea.OnChipFlash] = 2.0
+        self.bandwidth_weights[MemArea.OffChipFlash] = 20.0
+        self.cycles_weight = 40
+        self.max_sram_used_weight = 1000
+
+        if is_yoda_system:
+            self.max_sram_used_weight = 0
+
+        # Shared Buffer Block allocations
+        self.shram_bank_size = 1024  # bytes
+        self.shram_size_bytes = accel_config.shram_banks * self.shram_bank_size
+        self.shram_reserved_output_banks = 2
+        self.shram_reserved_weight_banks = 0
+        self.shram_reserved_unused_banks = 2 if accel_config.shram_banks > 16 else 0
+        self.shram_total_banks = accel_config.shram_banks - self.shram_reserved_unused_banks
+        self.shram_bank_granules = np.array(accel_config.shram_granules, np.int32)
+
+        # Build a map of acceptable IFM/OFM block configurations up to the maximum
+        # IFM/OFM block size.
+        ifm_block_max = self.get_ifm_block_size(32, self.ofm_block_max, Kernel(8, 8))
+        self.block_config_map = dict()
+        self.generate_block_config_map(Block(ifm_block_max.width, ifm_block_max.height, 128))
+
+        # Setup supported operators and restriction checkers class
+        self.supported_operators = SupportedOperators()
+
+    # Calculate block configuration for ALL known IFM operations and
+    # accumulator sizes. Consumers will need to select their preferred
+    # operation and bit-width at read-time.
+    def generate_block_config(self, width, height, depth):
+        # Number of bytes required for any SRAM element for a FM of given dimensions
+        size_bytes = (SHRAMElements.BitSizes * (height * width * depth)) // 8
+        # Convert byte size (rounded) to size in banks
+        size_banks = round_up_divide(size_bytes, self.shram_bank_size)
+        size_banks *= 2  # Double buffer the IFM/Acc (need twice as many banks)
+        # Round bank requirement to bank granularity
+        required_banks = round_up(size_banks, self.shram_bank_granules)
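+        # e.g. on ethos-u55-256 an 8x8x8 block needs 512 bytes for IFM8
+        # (one 1 KB bank), doubled to 2 banks for double buffering, then
+        # rounded up to the 8-bank IFM8 granule.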
+        return SHRAMBlockConfig(size_bytes, required_banks)
+
+    @staticmethod
+    def make_block_config_key(width, height, depth):
+        return (int(height), int(width), int(depth))
+
+    def get_block_config(self, width, height, depth):
+        assert depth <= self.ofm_block_max.depth
+        key = ArchitectureFeatures.make_block_config_key(width, height, depth)
+        config = self.block_config_map.get(key, None)
+        return config
+
+    # Generate a key:value map of possible block configurations, where the
+    # key is compounded from the block dimensions: 0x00HHWWCC
+    def generate_block_config_map(self, block: Block):
+        for h in range(1, block.height + 1):
+            for w in range(1, block.width + 1):
+                # All possible IFM/OFM depth values
+                for c in [4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128]:
+                    key = ArchitectureFeatures.make_block_config_key(w, h, c)
+                    self.block_config_map[key] = self.generate_block_config(w, h, c)
+
+    def calc_ifm_block_depth(self, ifm_depth, ifm_bits):
+        assert ifm_bits == 8 or ifm_bits == 16
+        assert ifm_depth > 0
+        ifm_depth = round_up(ifm_depth, self.ifm_ublock.depth)
+        max_block_depth = 32 if ifm_bits == 8 else 16
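+        # e.g. with an 8-channel IFM ublock, an 8-bit IFM of depth 20 is
+        # rounded up to 24 and capped at the 32-channel maximum block depth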
+        return min(max_block_depth, ifm_depth)
+
+    # Calculate the size of the IFM block given a depth, target OFM block and a kernel
+    def get_ifm_block_size(
+        self, ifm_block_depth, ofm_block: Block, kernel: Kernel, subkernel: Block = Block(8, 8, 65536)
+    ):
+        upscaling = 1
+        # Height
+        ifm_odd_2x_height_enable = 0
+        dilated_kernel_height = ((kernel.height - 1) * kernel.dilation.y) + 1
+        ifm_block_height = (
+            (ofm_block.height - 1) * kernel.stride.y
+            + min(subkernel.height, dilated_kernel_height)
+            + ifm_odd_2x_height_enable
+        ) // upscaling
+
+        if kernel.stride.y == 1:
+            ifm_block_height = round_up(ifm_block_height, self.ofm_ublock.height)
+        elif kernel.stride.y == 2:
+            if (self.ofm_ublock.height == 2) and (ifm_block_height % 4 == 2):
+                ifm_block_height = ifm_block_height + 2
+            else:
+                ifm_block_height = round_up(ifm_block_height, self.ofm_ublock.height)
+        else:
+            assert False
+
+        # Width
+        ifm_odd_2x_width_enable = 0
+        dilated_kernel_width = ((kernel.width - 1) * kernel.dilation.x) + 1
+        ifm_block_width = (
+            (ofm_block.width - 1) * kernel.stride.x
+            + min(subkernel.width, dilated_kernel_width)
+            + ifm_odd_2x_width_enable
+        ) // upscaling
+
+        if kernel.stride.x == 1:
+            ifm_block_width = round_up(ifm_block_width, self.ofm_ublock.width)
+        elif kernel.stride.x == 2:
+            if (self.ofm_ublock.width == 2) and (ifm_block_width % 4 == 2):
+                ifm_block_width = ifm_block_width + 2
+            else:
+                ifm_block_width = round_up(ifm_block_width, self.ofm_ublock.width)
+        else:
+            assert False
+
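+        # Example: a 3x3 stride-1 kernel with an 8x8 OFM block needs an IFM
+        # block of height (8-1)*1 + 3 = 10 and width 10 (already multiples of
+        # the 2x2 OFM ublock on ethos-u55-256).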
+        return Block(ifm_block_width, ifm_block_height, ifm_block_depth)
+
+    @staticmethod
+    def intersects(start_a, end_a, start_b, end_b):
+        start_x = max(start_a[0], start_b[0])
+        end_x = min(end_a[0], end_b[0])
+        start_y = max(start_a[1], start_b[1])
+        end_y = min(end_a[1], end_b[1])
+        start_z = max(start_a[2], start_b[2])
+        end_z = min(end_a[2], end_b[2])
+        return ((end_x - start_x) > 0) and ((end_y - start_y) > 0) and ((end_z - start_z) > 0)
+
+    # Block job dependency:
+    # Does the VOLUME of IFMs for block job B(0) overlap with VOLUME of OFMs block jobs A(8,9,10)
+    #
+    #  A                    | B
+    # ----------------------+------------------
+    # .... 3,4,5,6,7,8,9,10 | 0,1,2,3,4,5,6,8 10 < JOB NUMBER
+    #               |<------->| dependency offset
+    #
+    MAX_BLOCKDEP = 3
+
+    # Get the coordinates of a block offset from either the end (negative)
+    # or the start (zero or positive) of the given 3d area
+    def get_offset_block_coords(self, area: Rect, block: Block, offset):
+        size = area.size()
+        # Dimensions of the region, in blocks
+        width_blocks = round_up_divide(size.width, block.width)
+        height_blocks = round_up_divide(size.height, block.height)
+        depth_blocks = round_up_divide(size.depth, block.depth)
+        total_blocks = width_blocks * height_blocks * depth_blocks
+        if offset < 0:
+            index = total_blocks + offset
+        else:
+            index = offset
+
+        if index >= total_blocks:
+            return None
+
+        # Coordinates of the indexed block
+        coord_z = block.depth * (index % depth_blocks)
+        coord_y = block.height * (index // (depth_blocks * width_blocks))
+        coord_x = block.width * ((index // depth_blocks) % width_blocks)
+
+        return (coord_x + area.x, coord_y + area.y, coord_z + area.z)
+
+    def get_first_job_input_volume(
+        self, ifm: Rect, ofm: Rect, ifm_block_depth, ofm_block: Block, kernel: Kernel, padLT, block_offset
+    ):
+        # Get ifm block size (jobs are invisibly decomposed into subkernels)
+        ifm_block = self.get_ifm_block_size(ifm_block_depth, ofm_block, kernel, self.ofm_block_max)
+        ifm_depth_blocks = round_up_divide(ifm.size().depth, ifm_block_depth)
+
+        # Which OFM block are we calculating
+        ofm_coord = self.get_offset_block_coords(ofm, ofm_block, block_offset // ifm_depth_blocks)
+        if ofm_coord is None:
+            return None
+
+        # Coordinate of the source IFM block
+        ifm_coord_x = max(0, ofm_coord[0] * kernel.stride.x - padLT[0])
+        ifm_coord_y = max(0, ofm_coord[1] * kernel.stride.y - padLT[1])
+        ifm_coord_z = ifm.z + (block_offset % ifm_depth_blocks) * ifm_block.depth
+
+        # IFM block that will be sampled for the FIRST+block_offset job in the next operator's OFM
+        start_coord = (ifm_coord_x, ifm_coord_y, ifm_coord_z)
+        end_coord = (
+            start_coord[0] + ifm_block.width,
+            start_coord[1] + ifm_block.height,
+            start_coord[2] + ifm_block.depth,
+        )
+
+        return (start_coord, end_coord, 1)  # start, end, total jobs
+
+    def get_prev_job_output_volume(
+        self, ifm: Block, ofm: Rect, ifm_block_depth, ofm_block: Block, kernel: Kernel, block_offset
+    ):
+        assert block_offset >= 0
+
+        # Get OFM block's volume coordinates
+        start_coord = self.get_offset_block_coords(ofm, ofm_block, -1 - block_offset)
+        if start_coord is None:
+            return None
+        end_coord = (
+            start_coord[0] + ofm_block.width,
+            start_coord[1] + ofm_block.height,
+            start_coord[2] + ofm_block.depth,
+        )
+
+        # Calculate how many IFM blocks this OFM block requires (i.e. how many jobs)
+        ifm_block = self.get_ifm_block_size(ifm_block_depth, ofm_block, kernel, self.ofm_block_max)
+        ifm_depth_blocks = round_up_divide(ifm.size().depth, ifm_block_depth)
+        ifm_depth_blocks = 1  # Overwrite with 1 to force OFM block dependency, not IFM
+
+        return (start_coord, end_coord, ifm_depth_blocks)  # start, end, total jobs for this OFM block
+
+    def calc_block_dep(
+        self,
+        prev_ifm: Block,
+        prev_ofm: Block,
+        prev_ifm_block_depth,
+        prev_ofm_block: Block,
+        prev_kernel: Kernel,
+        ifm: Block,
+        ofm: Block,
+        ifm_block_depth,
+        ofm_block: Block,
+        kernel: Kernel,
+        padLT,
+    ):
+
+        blockdep = ArchitectureFeatures.MAX_BLOCKDEP
+
+        # Iterate over the next BLOCKDEP inputs, checking to see if a sliding window
+        # of IFM area overlaps with any previous OFM block generation.
+        elapsed_jobs = 0
+        ifm_depth = ifm.size().depth
+        for forward_offset in range(ArchitectureFeatures.MAX_BLOCKDEP):
+            # This is the IFM block we want to sample from
+            in_area = self.get_first_job_input_volume(
+                ifm, ofm, ifm_block_depth, ofm_block, kernel, padLT, forward_offset
+            )
+            if in_area is None:
+                break
+
+            # Try several previous-OFM blocks in the past (they still might comprise multiple IFM jobs)
+            outstanding_jobs = 0
+            for block_offset in range(ArchitectureFeatures.MAX_BLOCKDEP):
+                # This is the OFM block being generated by the previous op
+                out_area = self.get_prev_job_output_volume(
+                    prev_ifm, prev_ofm, prev_ifm_block_depth, prev_ofm_block, prev_kernel, block_offset
+                )
+                if out_area is None:
+                    break
+
+                # Block dependency is the max number of allowed outstanding jobs
+                # in the pipeline. Selected by determining how many jobs occur
+                # in between two operators' overlapping OFM->IFM block volumes
+                if ArchitectureFeatures.intersects(in_area[0], in_area[1], out_area[0], out_area[1]):
+                    break
+                # Early exit if no intersections and we've seen enough jobs in the pipeline
+                elif outstanding_jobs > ArchitectureFeatures.MAX_BLOCKDEP:
+                    break
+
+                # This OFM had this many jobs (accumulate over multiple OFM blocks)
+                outstanding_jobs += out_area[2]
+
+            blockdep = min(blockdep, elapsed_jobs + outstanding_jobs)
+            elapsed_jobs += in_area[2]
+            # Early exit if no intersections and we've seen enough jobs in the pipeline
+            if elapsed_jobs > ArchitectureFeatures.MAX_BLOCKDEP:
+                break
+
+        return blockdep
+
+    def cpu_cycle_estimate(self, op):
+        """
+        Gets the estimated performance of a CPU operation, based on a linear model (intercept and
+        slope) specified in the vela config file, which uses the ConfigParser (.ini) file format.
+        Example configuration snippet:
+        [CpuPerformance.MyOperationType]
+        Cortex-Mx.intercept=<some float value>
+        Cortex-Mx.slope=<some float value>
+        """
+        section = "CpuPerformance." + op.type
+        if self.vela_config is not None and section in self.vela_config:
+            op_config = self.vela_config[section]
+            try:
+                intercept = float(op_config.get(self.cpu_config + ".intercept", op_config["default.intercept"]))
+                slope = float(op_config.get(self.cpu_config + ".slope", op_config["default.slope"]))
+                n_elements = op.inputs[0].elements()
+                cycles = intercept + n_elements * slope
+                return cycles
+            except:
+                print("Error: Reading CPU cycle estimate in vela configuration file, section {}".format(section))
+                raise
+
+        print("Warning: No configured CPU performance estimate for", op.type)
+        return 0
+
+    def __read_sys_config(self):
+        """
+        Gets the system configuration with the given name from the vela configuration file
+        Example configuration snippet:
+        [SysConfig.MyConfigName]
+        npu_freq=<some float value>
+        cpu=Cortex-Mx
+        ...
+        """
+        # Get system configuration from the vela configuration file
+        if self.vela_config is None:
+            print("Warning: Using default values for system configuration")
+        else:
+            section_key = "SysConfig." + self.system_config
+            if section_key not in self.vela_config:
+                raise Exception("Unknown system configuration " + self.system_config)
+
+        try:
+            self.npu_clock = float(self.__sys_config("npu_freq", "500e6"))
+            self.cpu_config = self.__sys_config("cpu", "Cortex-M7")
+
+            self.memory_clock_scales[MemArea.Sram] = float(self.__sys_config("Sram_clock_scale", "1"))
+            self.memory_port_widths[MemArea.Sram] = int(self.__sys_config("Sram_port_width", "64"))
+
+            self.memory_clock_scales[MemArea.OnChipFlash] = float(self.__sys_config("OnChipFlash_clock_scale", "1"))
+            self.memory_port_widths[MemArea.OnChipFlash] = int(self.__sys_config("OnChipFlash_port_width", "64"))
+
+            self.memory_clock_scales[MemArea.OffChipFlash] = float(
+                self.__sys_config("OffChipFlash_clock_scale", "0.25")
+            )
+            self.memory_port_widths[MemArea.OffChipFlash] = int(self.__sys_config("OffChipFlash_port_width", "32"))
+
+            self.memory_clock_scales[MemArea.Dram] = float(self.__sys_config("Dram_clock_scale", "1"))
+            self.memory_port_widths[MemArea.Dram] = int(self.__sys_config("Dram_port_width", "32"))
+
+            self.fast_storage_mem_area = MemArea[self.__sys_config("fast_storage_mem_area", "Sram")]
+            self.feature_map_storage_mem_area = MemArea[self.__sys_config("feature_map_storage_mem_area", "Sram")]
+            self.permanent_storage_mem_area = MemArea[self.__sys_config("permanent_storage_mem_area", "OffChipFlash")]
+            if self.permanent_storage_mem_area not in set((MemArea.OnChipFlash, MemArea.OffChipFlash)):
+                raise Exception(
+                    "Invalid permanent_storage_mem_area = "
+                    + str(self.permanent_storage_mem_area)
+                    + " (must be 'OnChipFlash' or 'OffChipFlash'). To store the weights and other constant data in SRAM"
+                    " select 'OnChipFlash'"
+                )
+        except:
+            print("Error: Reading System Configuration in vela configuration file, section {}".format(section_key))
+            raise
+
+    def __sys_config(self, key, default_value):
+        """
+        Gets the system configuration value with the given key from the vela config file.
+        """
+        if self.vela_config is None:
+            return default_value
+        section = "SysConfig." + self.system_config
+        result = self.vela_config[section].get(key, None)
+        if result is None:
+            raise Exception("Error: System Configuration Missing key {} in section [{}] ".format(key, section))
+        return result
diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py
new file mode 100644
index 0000000..7f8c4ca
--- /dev/null
+++ b/ethosu/vela/compiler_driver.py
@@ -0,0 +1,204 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Contains the main sequencing of the compiler.
+
+from . import graph_optimiser
+from . import mark_tensors
+from . import insert_dma
+from . import pass_packing
+from . import scheduler
+from . import tensor_allocation
+from . import npu_performance
+import time
+
+from . import high_level_command_stream
+from . import high_level_command_stream_generator
+from . import register_command_stream_generator
+from . import extract_npu_subgraphs
+from . import npu_serialisation
+from . import weight_compressor
+from . import live_range
+from .tensor import MemArea
+from .nn_graph import TensorAllocator, PassPlacement
+from .rewrite_graph import verify_graph_health, verify_subgraph_health
+
+
+class CompilerOptions:
+    """Set of options to change compiler behaviour - verbosity, targets, turning off passes.
+
+Note the difference between ArchitectureFeatures and CompilerOptions
+- ArchitectureFeatures is for changing the Ethos-U55 and system architecture
+- CompilerOptions is for changing the behaviour of the compiler
+"""
+
+    def __init__(
+        self,
+        verbose_graph=False,
+        verbose_quantization=False,
+        verbose_packing=False,
+        verbose_tensor_purpose=False,
+        verbose_tensor_format=False,
+        verbose_allocation=False,
+        verbose_high_level_command_stream=False,
+        verbose_register_command_stream=False,
+        verbose_operators=False,
+        show_minimum_possible_allocation=False,
+        show_cpu_operations=False,
+        tensor_allocator=TensorAllocator.Greedy,
+        timing=False,
+        output_dir="outputs",
+    ):
+
+        self.verbose_graph = verbose_graph
+        self.verbose_quantization = verbose_quantization
+        self.verbose_packing = verbose_packing
+        self.verbose_tensor_purpose = verbose_tensor_purpose
+        self.verbose_tensor_format = verbose_tensor_format
+        self.verbose_allocation = verbose_allocation
+        self.verbose_high_level_command_stream = verbose_high_level_command_stream
+        self.verbose_register_command_stream = verbose_register_command_stream
+        self.verbose_operators = verbose_operators
+        self.show_minimum_possible_allocation = show_minimum_possible_allocation
+        self.show_cpu_operations = show_cpu_operations
+        self.tensor_allocator = tensor_allocator
+        self.timing = timing
+        self.output_dir = output_dir
+
+    def __str__(self):
+        return type(self).__name__ + ": " + str(self.__dict__)
+
+    __repr__ = __str__
+
+
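+# A minimal invocation sketch (argument names assumed from this module only):
+#
+#     options = CompilerOptions(verbose_graph=True, output_dir="outputs")
+#     compiler_driver(nng, arch, options, scheduler_options)
+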
+def compiler_driver(nng, arch, options, scheduler_options):
+    assert verify_graph_health(nng)
+    nng = graph_optimiser.optimise_graph_a(nng, arch, options.verbose_graph)
+    assert verify_graph_health(nng)
+
+    if options.verbose_quantization:
+        nng.print_graph_with_tensor_quantization()
+
+    nng = graph_optimiser.optimise_graph_b(nng, arch, options.verbose_graph)
+    assert verify_graph_health(nng)
+
+    nng = mark_tensors.mark_tensor_purpose(nng, arch, options.verbose_tensor_purpose)
+    assert verify_graph_health(nng)
+    nng = insert_dma.insert_dma_commands(nng, arch, options.verbose_graph)
+    assert verify_graph_health(nng)
+    pass_packing.pack_into_passes(nng, arch, options.verbose_packing)
+    assert verify_graph_health(nng)
+
+    extract_npu_subgraphs.extract_npu_subgraphs(nng, arch)
+
+    mark_tensors.mark_tensor_format(nng, arch, options.verbose_tensor_format)
+    assert verify_graph_health(nng)
+    if options.timing:
+        start = time.time()
+
+    # Run the scheduler
+    scheduler.schedule_passes(nng, arch, scheduler_options)
+
+    if options.timing:
+        stop = time.time()
+        print("Scheduling took %f s" % (stop - start))
+        start = time.time()
+
+    # Update the compressed weights now that we have determined the
+    # block config, and calc and pack the scales and biases
+    weight_compressor.update_pass_weight_and_scale_tensors(nng, arch)
+
+    # Memory area for all non-constant tensors (Cpu and Npu)
+    non_const_mem_area = MemArea.Sram
+
+    # LiveRanges for constant tensors for all Npu subgraphs
+    permanent_storage = arch.permanent_storage_mem_area
+    lr_graph_flash = live_range.LiveRangeGraph()
+
+    # Placeholders for scratch and flash tensors that are common for all Npu subgraphs
+    scratch_tens = None
+    flash_tens = None
+
+    # Calculate live ranges for all constant Npu tensors, in permanent storage
+    for sg in nng.subgraphs:
+        if sg.placement == PassPlacement.Npu:
+            lr_graph_flash = live_range.extract_live_ranges_from_cascaded_passes(
+                sg, permanent_storage, ignore_subgraph_input_output_tensors=True, lr_graph=lr_graph_flash
+            )
+
+    # Allocate all Npu constant tensors to the first Npu subgraph since it is
+    # processed first during serialization into tensors
+    first_npu_sg = nng.subgraphs[1]
+    assert first_npu_sg.placement == PassPlacement.Npu
+    tensor_allocation.allocate_tensors(
+        nng,
+        first_npu_sg,
+        arch,
+        permanent_storage,
+        scheduler_options.use_ifm_ofm_overlap,
+        options.tensor_allocator,
+        options.verbose_allocation,
+        options.show_minimum_possible_allocation,
+        lr_graph_flash,
+    )
+
+    # Allocate all non-constant tensors to the root, i.e. Cpu, subgraph. This step
+    # will start at the root subgraph's input and traverse from top to bottom. When
+    # it comes across an Npu-op it will extract live ranges for its corresponding
+    # Npu subgraph and add them to the root's live range graph. Finally, all of the
+    # non-constant tensors are allocated together
+    root_sg = nng.get_root_subgraph()
+    tensor_allocation.allocate_tensors(
+        nng,
+        root_sg,
+        arch,
+        non_const_mem_area,
+        scheduler_options.use_ifm_ofm_overlap,
+        options.tensor_allocator,
+        options.verbose_allocation,
+        options.show_minimum_possible_allocation,
+    )
+
+    # Generate command streams and serialise Npu-ops into tensors
+    for sg in nng.subgraphs:
+        high_level_command_stream_generator.generate_high_level_command_stream(
+            nng, sg, arch, options.verbose_high_level_command_stream
+        )
+        register_command_stream_generator.generate_register_command_stream(
+            nng, sg, arch, options.verbose_register_command_stream
+        )
+        scratch_tens, flash_tens = npu_serialisation.serialise_npu_subgraph_into_tensors(
+            nng, sg, arch, scratch_tens, flash_tens
+        )
+
+    npu_serialisation.rewrite_npu_call_ops(nng, root_sg, arch)
+
+    # Allocate all Cpu constant tensors, this is done last because the Npu-ops
+    # have to be serialized into flash and scratch tensors first
+    tensor_allocation.allocate_tensors(
+        nng,
+        root_sg,
+        arch,
+        permanent_storage,
+        scheduler_options.use_ifm_ofm_overlap,
+        options.tensor_allocator,
+        options.verbose_allocation,
+        options.show_minimum_possible_allocation,
+    )
+
+    npu_performance.calc_performance_for_network(nng, arch)
diff --git a/ethosu/vela/data_type.py b/ethosu/vela/data_type.py
new file mode 100644
index 0000000..1d3e94e
--- /dev/null
+++ b/ethosu/vela/data_type.py
@@ -0,0 +1,116 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Defines the basic numeric type classes for tensors.
+
+from .numeric_util import round_up_divide
+import enum
+
+
+class BaseType(enum.Flag):
+    Signed = 1
+    Unsigned = 2
+    Asymmetric = 4
+    Int = 8
+    SignedInt = Int | Signed
+    UnsignedInt = Int | Unsigned
+    AsymmSInt = Int | Asymmetric | Signed
+    AsymmUInt = Int | Asymmetric | Unsigned
+    Float = 16
+    BFloat = 32
+    Bool = 64
+    String = 128
+    Resource = 256
+    Variant = 512
+
+
+class DataType:
+    """Defines a data type. Consists of a base type, and the number of bits used for this type"""
+
+    __slots__ = "type", "bits"
+
+    def __init__(self, type_, bits):
+        self.type = type_
+        self.bits = bits
+
+    def __eq__(self, other):
+        return self.type == other.type and self.bits == other.bits
+
+    def __hash__(self):
+        return hash((self.type, self.bits))
+
+    def size_in_bytes(self):
+        return round_up_divide(self.bits, 8)
+
+    def size_in_bits(self):
+        return self.bits
+
+    def __str__(self):
+        stem, needs_format = DataType.stem_name[self.type]
+        if not needs_format:
+            return stem
+        else:
+            return stem % (self.bits,)
+
+    __repr__ = __str__
+
+    stem_name = {
+        BaseType.UnsignedInt: ("uint%s", True),
+        BaseType.SignedInt: ("int%s", True),
+        BaseType.AsymmUInt: ("quint%s", True),
+        BaseType.AsymmSInt: ("qint%s", True),
+        BaseType.Float: ("float%s", True),
+        BaseType.BFloat: ("bfloat%s", True),
+        BaseType.Bool: ("bool", False),
+        BaseType.String: ("string", False),
+        BaseType.Resource: ("resource", False),
+        BaseType.Variant: ("variant", False),
+    }
+
+
+# generate the standard set of data types
+DataType.int8 = DataType(BaseType.SignedInt, 8)
+DataType.int16 = DataType(BaseType.SignedInt, 16)
+DataType.int32 = DataType(BaseType.SignedInt, 32)
+DataType.int64 = DataType(BaseType.SignedInt, 64)
+
+DataType.uint8 = DataType(BaseType.UnsignedInt, 8)
+DataType.uint16 = DataType(BaseType.UnsignedInt, 16)
+DataType.uint32 = DataType(BaseType.UnsignedInt, 32)
+DataType.uint64 = DataType(BaseType.UnsignedInt, 64)
+
+DataType.quint4 = DataType(BaseType.AsymmUInt, 4)
+DataType.quint8 = DataType(BaseType.AsymmUInt, 8)
+DataType.quint12 = DataType(BaseType.AsymmUInt, 12)
+DataType.quint16 = DataType(BaseType.AsymmUInt, 16)
+DataType.quint32 = DataType(BaseType.AsymmUInt, 32)
+
+DataType.qint4 = DataType(BaseType.AsymmSInt, 4)
+DataType.qint8 = DataType(BaseType.AsymmSInt, 8)
+DataType.qint12 = DataType(BaseType.AsymmSInt, 12)
+DataType.qint16 = DataType(BaseType.AsymmSInt, 16)
+DataType.qint32 = DataType(BaseType.AsymmSInt, 32)
+
+DataType.float16 = DataType(BaseType.Float, 16)
+DataType.float32 = DataType(BaseType.Float, 32)
+DataType.float64 = DataType(BaseType.Float, 64)
+
+DataType.string = DataType(BaseType.String, 64)
+DataType.bool = DataType(BaseType.Bool, 8)
+DataType.resource = DataType(BaseType.Resource, 8)
+DataType.variant = DataType(BaseType.Variant, 8)
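+
+# Example: DataType.int16.size_in_bytes() == 2 and str(DataType.quint8) == "quint8".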
diff --git a/ethosu/vela/driver_actions.py b/ethosu/vela/driver_actions.py
new file mode 100644
index 0000000..86c4a36
--- /dev/null
+++ b/ethosu/vela/driver_actions.py
@@ -0,0 +1,107 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Creates driver actions that are embedded in the custom operator payload.
+
+import numpy as np
+from typing import List
+from .ethos_u55_regs.ethos_u55_regs import *
+
+
+class DACommands:
+    Reserved = 0x00
+    Config = 0x01
+    Config_PatchShift = 4
+    CmdStream = 0x02
+    ReadAPB = 0x03
+    ReadAPB_CountShift = 12
+    ReadAPB_IndexMask = (1 << ReadAPB_CountShift) - 1
+    DumpSHRAM = 0x04
+    NOP = 0x05
+
+
+def make_da_tag(id: int, reserved: int, param: int) -> int:
+    tag: int = id
+    tag |= reserved << 8
+    tag |= param << 16
+    return tag
+
+
+def emit_fourcc(data: List[int], fourcc: str):
+    assert data != None
+    assert fourcc != None
+    assert len(fourcc) == 4
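+    # Pack the four characters little-endian, e.g. "COP1" becomes 0x31504F43
+    # with 'C' in the least significant byte.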
+    value: int = 0
+    value = fourcc[0].encode()[0]
+    value |= fourcc[1].encode()[0] << 8
+    value |= fourcc[2].encode()[0] << 16
+    value |= fourcc[3].encode()[0] << 24
+    data.append(value)
+
+
+def build_id_word():
+    arch_major_rev, arch_minor_rev, arch_patch_rev = (int(x) for x in ARCH_VER.split("."))
+    n = id_r()
+    n.set_arch_major_rev(arch_major_rev)
+    n.set_arch_minor_rev(arch_minor_rev)
+    n.set_arch_patch_rev(arch_patch_rev)
+    return n.word
+
+
+def build_config_word(arch):
+    macs_cc = arch.config.macs
+    log2_macs_cc = int(np.log2(macs_cc) + 0.5)
+    shram_size = int(arch.shram_size_bytes / 1024)
+    n = config_r()
+    n.set_shram_size(shram_size)
+    n.set_cmd_stream_version(0)  # may be incremented in the future
+    n.set_macs_per_cc(log2_macs_cc)
+    return n.word
+
+
+def emit_config(data: List[int], rel: int, patch: int, arch):
+    assert data != None
+    data.append(make_da_tag(DACommands.Config, 0, (patch << DACommands.Config_PatchShift) | rel))
+    data.append(build_config_word(arch))
+    data.append(build_id_word())
+
+
+def emit_cmd_stream_header(data: List[int], length: int):
+    assert data != None
+    # Insert NOPs to align start of command stream to 16 bytes
+    num_nops = 4 - ((len(data) + 1) % 4)
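+    # e.g. with two words already in data, (2 + 1) % 4 == 3, so one NOP is
+    # emitted and the payload following the CmdStream tag starts at byte 16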
+    for _ in range(num_nops):
+        data.append(make_da_tag(DACommands.NOP, 0, 0))
+
+    # Use the reserved 8 bit as the length high
+    length_high = (length & 0x00FF0000) >> 16
+    length_low = length & 0x0000FFFF
+    data.append(make_da_tag(DACommands.CmdStream, length_high, length_low))
+
+
+def emit_reg_read(data: List[int], reg_index: int, reg_count: int = 1):
+    assert data != None
+    assert reg_index >= 0
+    assert reg_count >= 1
+    payload: int = (reg_index & DACommands.ReadAPB_IndexMask) | ((reg_count << DACommands.ReadAPB_CountShift) - 1)
+    data.append(make_da_tag(DACommands.ReadAPB, 0, payload))
+
+
+def emit_dump_shram(data: List[int]):
+    assert data != None
+    data.append(make_da_tag(DACommands.DumpSHRAM, 0, 0))
diff --git a/ethosu/vela/ethos_u55_regs/ethos_u55_regs.py b/ethosu/vela/ethos_u55_regs/ethos_u55_regs.py
new file mode 100644
index 0000000..37f7a67
--- /dev/null
+++ b/ethosu/vela/ethos_u55_regs/ethos_u55_regs.py
@@ -0,0 +1,3138 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ctypes import *
+from enum import Enum
+
+ARCH_VER = '0.154.0'
+
+
+class DEBUG_INTERNAL(Enum):
+    SHARED_BUFFER0 = 0x0400
+    SHARED_BUFFER1 = 0x0404
+    SHARED_BUFFER2 = 0x0408
+    SHARED_BUFFER3 = 0x040C
+    SHARED_BUFFER4 = 0x0410
+    SHARED_BUFFER5 = 0x0414
+    SHARED_BUFFER6 = 0x0418
+    SHARED_BUFFER7 = 0x041C
+    SHARED_BUFFER8 = 0x0420
+    SHARED_BUFFER9 = 0x0424
+    SHARED_BUFFER10 = 0x0428
+    SHARED_BUFFER11 = 0x042C
+    SHARED_BUFFER12 = 0x0430
+    SHARED_BUFFER13 = 0x0434
+    SHARED_BUFFER14 = 0x0438
+    SHARED_BUFFER15 = 0x043C
+    SHARED_BUFFER16 = 0x0440
+    SHARED_BUFFER17 = 0x0444
+    SHARED_BUFFER18 = 0x0448
+    SHARED_BUFFER19 = 0x044C
+    SHARED_BUFFER20 = 0x0450
+    SHARED_BUFFER21 = 0x0454
+    SHARED_BUFFER22 = 0x0458
+    SHARED_BUFFER23 = 0x045C
+    SHARED_BUFFER24 = 0x0460
+    SHARED_BUFFER25 = 0x0464
+    SHARED_BUFFER26 = 0x0468
+    SHARED_BUFFER27 = 0x046C
+    SHARED_BUFFER28 = 0x0470
+    SHARED_BUFFER29 = 0x0474
+    SHARED_BUFFER30 = 0x0478
+    SHARED_BUFFER31 = 0x047C
+    SHARED_BUFFER32 = 0x0480
+    SHARED_BUFFER33 = 0x0484
+    SHARED_BUFFER34 = 0x0488
+    SHARED_BUFFER35 = 0x048C
+    SHARED_BUFFER36 = 0x0490
+    SHARED_BUFFER37 = 0x0494
+    SHARED_BUFFER38 = 0x0498
+    SHARED_BUFFER39 = 0x049C
+    SHARED_BUFFER40 = 0x04A0
+    SHARED_BUFFER41 = 0x04A4
+    SHARED_BUFFER42 = 0x04A8
+    SHARED_BUFFER43 = 0x04AC
+    SHARED_BUFFER44 = 0x04B0
+    SHARED_BUFFER45 = 0x04B4
+    SHARED_BUFFER46 = 0x04B8
+    SHARED_BUFFER47 = 0x04BC
+    SHARED_BUFFER48 = 0x04C0
+    SHARED_BUFFER49 = 0x04C4
+    SHARED_BUFFER50 = 0x04C8
+    SHARED_BUFFER51 = 0x04CC
+    SHARED_BUFFER52 = 0x04D0
+    SHARED_BUFFER53 = 0x04D4
+    SHARED_BUFFER54 = 0x04D8
+    SHARED_BUFFER55 = 0x04DC
+    SHARED_BUFFER56 = 0x04E0
+    SHARED_BUFFER57 = 0x04E4
+    SHARED_BUFFER58 = 0x04E8
+    SHARED_BUFFER59 = 0x04EC
+    SHARED_BUFFER60 = 0x04F0
+    SHARED_BUFFER61 = 0x04F4
+    SHARED_BUFFER62 = 0x04F8
+    SHARED_BUFFER63 = 0x04FC
+    SHARED_BUFFER64 = 0x0500
+    SHARED_BUFFER65 = 0x0504
+    SHARED_BUFFER66 = 0x0508
+    SHARED_BUFFER67 = 0x050C
+    SHARED_BUFFER68 = 0x0510
+    SHARED_BUFFER69 = 0x0514
+    SHARED_BUFFER70 = 0x0518
+    SHARED_BUFFER71 = 0x051C
+    SHARED_BUFFER72 = 0x0520
+    SHARED_BUFFER73 = 0x0524
+    SHARED_BUFFER74 = 0x0528
+    SHARED_BUFFER75 = 0x052C
+    SHARED_BUFFER76 = 0x0530
+    SHARED_BUFFER77 = 0x0534
+    SHARED_BUFFER78 = 0x0538
+    SHARED_BUFFER79 = 0x053C
+    SHARED_BUFFER80 = 0x0540
+    SHARED_BUFFER81 = 0x0544
+    SHARED_BUFFER82 = 0x0548
+    SHARED_BUFFER83 = 0x054C
+    SHARED_BUFFER84 = 0x0550
+    SHARED_BUFFER85 = 0x0554
+    SHARED_BUFFER86 = 0x0558
+    SHARED_BUFFER87 = 0x055C
+    SHARED_BUFFER88 = 0x0560
+    SHARED_BUFFER89 = 0x0564
+    SHARED_BUFFER90 = 0x0568
+    SHARED_BUFFER91 = 0x056C
+    SHARED_BUFFER92 = 0x0570
+    SHARED_BUFFER93 = 0x0574
+    SHARED_BUFFER94 = 0x0578
+    SHARED_BUFFER95 = 0x057C
+    SHARED_BUFFER96 = 0x0580
+    SHARED_BUFFER97 = 0x0584
+    SHARED_BUFFER98 = 0x0588
+    SHARED_BUFFER99 = 0x058C
+    SHARED_BUFFER100 = 0x0590
+    SHARED_BUFFER101 = 0x0594
+    SHARED_BUFFER102 = 0x0598
+    SHARED_BUFFER103 = 0x059C
+    SHARED_BUFFER104 = 0x05A0
+    SHARED_BUFFER105 = 0x05A4
+    SHARED_BUFFER106 = 0x05A8
+    SHARED_BUFFER107 = 0x05AC
+    SHARED_BUFFER108 = 0x05B0
+    SHARED_BUFFER109 = 0x05B4
+    SHARED_BUFFER110 = 0x05B8
+    SHARED_BUFFER111 = 0x05BC
+    SHARED_BUFFER112 = 0x05C0
+    SHARED_BUFFER113 = 0x05C4
+    SHARED_BUFFER114 = 0x05C8
+    SHARED_BUFFER115 = 0x05CC
+    SHARED_BUFFER116 = 0x05D0
+    SHARED_BUFFER117 = 0x05D4
+    SHARED_BUFFER118 = 0x05D8
+    SHARED_BUFFER119 = 0x05DC
+    SHARED_BUFFER120 = 0x05E0
+    SHARED_BUFFER121 = 0x05E4
+    SHARED_BUFFER122 = 0x05E8
+    SHARED_BUFFER123 = 0x05EC
+    SHARED_BUFFER124 = 0x05F0
+    SHARED_BUFFER125 = 0x05F4
+    SHARED_BUFFER126 = 0x05F8
+    SHARED_BUFFER127 = 0x05FC
+    SHARED_BUFFER128 = 0x0600
+    SHARED_BUFFER129 = 0x0604
+    SHARED_BUFFER130 = 0x0608
+    SHARED_BUFFER131 = 0x060C
+    SHARED_BUFFER132 = 0x0610
+    SHARED_BUFFER133 = 0x0614
+    SHARED_BUFFER134 = 0x0618
+    SHARED_BUFFER135 = 0x061C
+    SHARED_BUFFER136 = 0x0620
+    SHARED_BUFFER137 = 0x0624
+    SHARED_BUFFER138 = 0x0628
+    SHARED_BUFFER139 = 0x062C
+    SHARED_BUFFER140 = 0x0630
+    SHARED_BUFFER141 = 0x0634
+    SHARED_BUFFER142 = 0x0638
+    SHARED_BUFFER143 = 0x063C
+    SHARED_BUFFER144 = 0x0640
+    SHARED_BUFFER145 = 0x0644
+    SHARED_BUFFER146 = 0x0648
+    SHARED_BUFFER147 = 0x064C
+    SHARED_BUFFER148 = 0x0650
+    SHARED_BUFFER149 = 0x0654
+    SHARED_BUFFER150 = 0x0658
+    SHARED_BUFFER151 = 0x065C
+    SHARED_BUFFER152 = 0x0660
+    SHARED_BUFFER153 = 0x0664
+    SHARED_BUFFER154 = 0x0668
+    SHARED_BUFFER155 = 0x066C
+    SHARED_BUFFER156 = 0x0670
+    SHARED_BUFFER157 = 0x0674
+    SHARED_BUFFER158 = 0x0678
+    SHARED_BUFFER159 = 0x067C
+    SHARED_BUFFER160 = 0x0680
+    SHARED_BUFFER161 = 0x0684
+    SHARED_BUFFER162 = 0x0688
+    SHARED_BUFFER163 = 0x068C
+    SHARED_BUFFER164 = 0x0690
+    SHARED_BUFFER165 = 0x0694
+    SHARED_BUFFER166 = 0x0698
+    SHARED_BUFFER167 = 0x069C
+    SHARED_BUFFER168 = 0x06A0
+    SHARED_BUFFER169 = 0x06A4
+    SHARED_BUFFER170 = 0x06A8
+    SHARED_BUFFER171 = 0x06AC
+    SHARED_BUFFER172 = 0x06B0
+    SHARED_BUFFER173 = 0x06B4
+    SHARED_BUFFER174 = 0x06B8
+    SHARED_BUFFER175 = 0x06BC
+    SHARED_BUFFER176 = 0x06C0
+    SHARED_BUFFER177 = 0x06C4
+    SHARED_BUFFER178 = 0x06C8
+    SHARED_BUFFER179 = 0x06CC
+    SHARED_BUFFER180 = 0x06D0
+    SHARED_BUFFER181 = 0x06D4
+    SHARED_BUFFER182 = 0x06D8
+    SHARED_BUFFER183 = 0x06DC
+    SHARED_BUFFER184 = 0x06E0
+    SHARED_BUFFER185 = 0x06E4
+    SHARED_BUFFER186 = 0x06E8
+    SHARED_BUFFER187 = 0x06EC
+    SHARED_BUFFER188 = 0x06F0
+    SHARED_BUFFER189 = 0x06F4
+    SHARED_BUFFER190 = 0x06F8
+    SHARED_BUFFER191 = 0x06FC
+    SHARED_BUFFER192 = 0x0700
+    SHARED_BUFFER193 = 0x0704
+    SHARED_BUFFER194 = 0x0708
+    SHARED_BUFFER195 = 0x070C
+    SHARED_BUFFER196 = 0x0710
+    SHARED_BUFFER197 = 0x0714
+    SHARED_BUFFER198 = 0x0718
+    SHARED_BUFFER199 = 0x071C
+    SHARED_BUFFER200 = 0x0720
+    SHARED_BUFFER201 = 0x0724
+    SHARED_BUFFER202 = 0x0728
+    SHARED_BUFFER203 = 0x072C
+    SHARED_BUFFER204 = 0x0730
+    SHARED_BUFFER205 = 0x0734
+    SHARED_BUFFER206 = 0x0738
+    SHARED_BUFFER207 = 0x073C
+    SHARED_BUFFER208 = 0x0740
+    SHARED_BUFFER209 = 0x0744
+    SHARED_BUFFER210 = 0x0748
+    SHARED_BUFFER211 = 0x074C
+    SHARED_BUFFER212 = 0x0750
+    SHARED_BUFFER213 = 0x0754
+    SHARED_BUFFER214 = 0x0758
+    SHARED_BUFFER215 = 0x075C
+    SHARED_BUFFER216 = 0x0760
+    SHARED_BUFFER217 = 0x0764
+    SHARED_BUFFER218 = 0x0768
+    SHARED_BUFFER219 = 0x076C
+    SHARED_BUFFER220 = 0x0770
+    SHARED_BUFFER221 = 0x0774
+    SHARED_BUFFER222 = 0x0778
+    SHARED_BUFFER223 = 0x077C
+    SHARED_BUFFER224 = 0x0780
+    SHARED_BUFFER225 = 0x0784
+    SHARED_BUFFER226 = 0x0788
+    SHARED_BUFFER227 = 0x078C
+    SHARED_BUFFER228 = 0x0790
+    SHARED_BUFFER229 = 0x0794
+    SHARED_BUFFER230 = 0x0798
+    SHARED_BUFFER231 = 0x079C
+    SHARED_BUFFER232 = 0x07A0
+    SHARED_BUFFER233 = 0x07A4
+    SHARED_BUFFER234 = 0x07A8
+    SHARED_BUFFER235 = 0x07AC
+    SHARED_BUFFER236 = 0x07B0
+    SHARED_BUFFER237 = 0x07B4
+    SHARED_BUFFER238 = 0x07B8
+    SHARED_BUFFER239 = 0x07BC
+    SHARED_BUFFER240 = 0x07C0
+    SHARED_BUFFER241 = 0x07C4
+    SHARED_BUFFER242 = 0x07C8
+    SHARED_BUFFER243 = 0x07CC
+    SHARED_BUFFER244 = 0x07D0
+    SHARED_BUFFER245 = 0x07D4
+    SHARED_BUFFER246 = 0x07D8
+    SHARED_BUFFER247 = 0x07DC
+    SHARED_BUFFER248 = 0x07E0
+    SHARED_BUFFER249 = 0x07E4
+    SHARED_BUFFER250 = 0x07E8
+    SHARED_BUFFER251 = 0x07EC
+    SHARED_BUFFER252 = 0x07F0
+    SHARED_BUFFER253 = 0x07F4
+    SHARED_BUFFER254 = 0x07F8
+    SHARED_BUFFER255 = 0x07FC
+    SIZE = 0x0800
+
+class HW_DEBUG_INTERNAL(Enum):
+    CLKFORCE = 0x0140
+    DEBUG = 0x0144
+    DEBUG2 = 0x0148
+    DEBUGCORE = 0x014C
+    SIZE = 0x0150
+
+class NPU_BP(Enum):
+    BASEP0 = 0x0080
+    BASEP1 = 0x0084
+    BASEP2 = 0x0088
+    BASEP3 = 0x008C
+    BASEP4 = 0x0090
+    BASEP5 = 0x0094
+    BASEP6 = 0x0098
+    BASEP7 = 0x009C
+    BASEP8 = 0x00A0
+    BASEP9 = 0x00A4
+    BASEP10 = 0x00A8
+    BASEP11 = 0x00AC
+    BASEP12 = 0x00B0
+    BASEP13 = 0x00B4
+    BASEP14 = 0x00B8
+    BASEP15 = 0x00BC
+    SIZE = 0x00C0
+
+class NPU_IDS(Enum):
+    REVISION = 0x0FC0
+    PID4 = 0x0FD0
+    PID5 = 0x0FD4
+    PID6 = 0x0FD8
+    PID7 = 0x0FDC
+    PID0 = 0x0FE0
+    PID1 = 0x0FE4
+    PID2 = 0x0FE8
+    PID3 = 0x0FEC
+    CID0 = 0x0FF0
+    CID1 = 0x0FF4
+    CID2 = 0x0FF8
+    CID3 = 0x0FFC
+    SIZE = 0x1000
+
+class NPU_REG(Enum):
+    ID = 0x0000
+    STATUS = 0x0004
+    CMD = 0x0008
+    RESET = 0x000C
+    QBASE0 = 0x0010
+    QBASE1 = 0x0014
+    QREAD = 0x0018
+    QCONFIG = 0x001C
+    QSIZE = 0x0020
+    PROT = 0x0024
+    CONFIG = 0x0028
+    LOCK = 0x002C
+    REGIONCFG = 0x003C
+    AXI_LIMIT0 = 0x0040
+    AXI_LIMIT1 = 0x0044
+    AXI_LIMIT2 = 0x0048
+    AXI_LIMIT3 = 0x004C
+    SIZE = 0x0050
+
+class PMU_INTERNAL(Enum):
+    PMCR = 0x0180
+    PMCNTENSET = 0x0184
+    PMCNTENCLR = 0x0188
+    PMOVSSET = 0x018C
+    PMOVSCLR = 0x0190
+    PMINTSET = 0x0194
+    PMINTCLR = 0x0198
+    PMCCNTR_LO = 0x01A0
+    PMCCNTR_HI = 0x01A4
+    PMCCNTR_CFG = 0x01A8
+    PMCAXI_CHAN = 0x01AC
+    PMEVCNTR0 = 0x0300
+    PMEVCNTR1 = 0x0304
+    PMEVCNTR2 = 0x0308
+    PMEVCNTR3 = 0x030C
+    PMEVTYPER0 = 0x0380
+    PMEVTYPER1 = 0x0384
+    PMEVTYPER2 = 0x0388
+    PMEVTYPER3 = 0x038C
+    SIZE = 0x0390
+
+class TSU_DEBUG_INTERNAL(Enum):
+    IFM_PAD_TOP = 0x0800
+    IFM_PAD_LEFT = 0x0804
+    IFM_PAD_RIGHT = 0x0808
+    IFM_PAD_BOTTOM = 0x080C
+    IFM_DEPTH_M1 = 0x0810
+    IFM_PRECISION = 0x0814
+    IFM_UPSCALE = 0x081C
+    IFM_ZERO_POINT = 0x0824
+    IFM_WIDTH0_M1 = 0x0828
+    IFM_HEIGHT0_M1 = 0x082C
+    IFM_HEIGHT1_M1 = 0x0830
+    IFM_IB_END = 0x0834
+    IFM_REGION = 0x083C
+    OFM_WIDTH_M1 = 0x0844
+    OFM_HEIGHT_M1 = 0x0848
+    OFM_DEPTH_M1 = 0x084C
+    OFM_PRECISION = 0x0850
+    OFM_BLK_WIDTH_M1 = 0x0854
+    OFM_BLK_HEIGHT_M1 = 0x0858
+    OFM_BLK_DEPTH_M1 = 0x085C
+    OFM_ZERO_POINT = 0x0860
+    OFM_WIDTH0_M1 = 0x0868
+    OFM_HEIGHT0_M1 = 0x086C
+    OFM_HEIGHT1_M1 = 0x0870
+    OFM_REGION = 0x087C
+    KERNEL_WIDTH_M1 = 0x0880
+    KERNEL_HEIGHT_M1 = 0x0884
+    KERNEL_STRIDE = 0x0888
+    PARALLEL_MODE = 0x088C
+    ACC_FORMAT = 0x0890
+    ACTIVATION = 0x0894
+    ACTIVATION_MIN = 0x0898
+    ACTIVATION_MAX = 0x089C
+    WEIGHT_REGION = 0x08A0
+    SCALE_REGION = 0x08A4
+    AB_START = 0x08B4
+    BLOCKDEP = 0x08BC
+    DMA0_SRC_REGION = 0x08C0
+    DMA0_DST_REGION = 0x08C4
+    DMA0_SIZE0 = 0x08C8
+    DMA0_SIZE1 = 0x08CC
+    IFM2_BROADCAST = 0x0900
+    IFM2_SCALAR = 0x0904
+    IFM2_PRECISION = 0x0914
+    IFM2_ZERO_POINT = 0x0924
+    IFM2_WIDTH0_M1 = 0x0928
+    IFM2_HEIGHT0_M1 = 0x092C
+    IFM2_HEIGHT1_M1 = 0x0930
+    IFM2_IB_START = 0x0934
+    IFM2_REGION = 0x093C
+    IFM_BASE0 = 0x0A00
+    IFM_BASE0_HI = 0x0A04
+    IFM_BASE1 = 0x0A08
+    IFM_BASE1_HI = 0x0A0C
+    IFM_BASE2 = 0x0A10
+    IFM_BASE2_HI = 0x0A14
+    IFM_BASE3 = 0x0A18
+    IFM_BASE3_HI = 0x0A1C
+    IFM_STRIDE_X = 0x0A20
+    IFM_STRIDE_X_HI = 0x0A24
+    IFM_STRIDE_Y = 0x0A28
+    IFM_STRIDE_Y_HI = 0x0A2C
+    IFM_STRIDE_C = 0x0A30
+    IFM_STRIDE_C_HI = 0x0A34
+    OFM_BASE0 = 0x0A40
+    OFM_BASE0_HI = 0x0A44
+    OFM_BASE1 = 0x0A48
+    OFM_BASE1_HI = 0x0A4C
+    OFM_BASE2 = 0x0A50
+    OFM_BASE2_HI = 0x0A54
+    OFM_BASE3 = 0x0A58
+    OFM_BASE3_HI = 0x0A5C
+    OFM_STRIDE_X = 0x0A60
+    OFM_STRIDE_X_HI = 0x0A64
+    OFM_STRIDE_Y = 0x0A68
+    OFM_STRIDE_Y_HI = 0x0A6C
+    OFM_STRIDE_C = 0x0A70
+    OFM_STRIDE_C_HI = 0x0A74
+    WEIGHT_BASE = 0x0A80
+    WEIGHT_BASE_HI = 0x0A84
+    WEIGHT_LENGTH = 0x0A88
+    WEIGHT_LENGTH_HI = 0x0A8C
+    SCALE_BASE = 0x0A90
+    SCALE_BASE_HI = 0x0A94
+    SCALE_LENGTH = 0x0A98
+    OFM_SCALE = 0x0AA0
+    OFM_SCALE_SHIFT = 0x0AA4
+    OPA_SCALE = 0x0AA8
+    OPA_SCALE_SHIFT = 0x0AAC
+    OPB_SCALE = 0x0AB0
+    DMA0_SRC = 0x0AC0
+    DMA0_SRC_HI = 0x0AC4
+    DMA0_DST = 0x0AC8
+    DMA0_DST_HI = 0x0ACC
+    DMA0_LEN = 0x0AD0
+    DMA0_LEN_HI = 0x0AD4
+    DMA0_SKIP0 = 0x0AD8
+    DMA0_SKIP0_HI = 0x0ADC
+    DMA0_SKIP1 = 0x0AE0
+    DMA0_SKIP1_HI = 0x0AE4
+    IFM2_BASE0 = 0x0B00
+    IFM2_BASE0_HI = 0x0B04
+    IFM2_BASE1 = 0x0B08
+    IFM2_BASE1_HI = 0x0B0C
+    IFM2_BASE2 = 0x0B10
+    IFM2_BASE2_HI = 0x0B14
+    IFM2_BASE3 = 0x0B18
+    IFM2_BASE3_HI = 0x0B1C
+    IFM2_STRIDE_X = 0x0B20
+    IFM2_STRIDE_X_HI = 0x0B24
+    IFM2_STRIDE_Y = 0x0B28
+    IFM2_STRIDE_Y_HI = 0x0B2C
+    IFM2_STRIDE_C = 0x0B30
+    IFM2_STRIDE_C_HI = 0x0B34
+    WEIGHT1_BASE = 0x0B40
+    WEIGHT1_BASE_HI = 0x0B44
+    WEIGHT1_LENGTH = 0x0B48
+    WEIGHT1_LENGTH_HI = 0x0B4C
+    SCALE1_BASE = 0x0B50
+    SCALE1_BASE_HI = 0x0B54
+    SCALE1_LENGTH = 0x0B58
+    SIZE = 0x0B5C
+
+class TSU_DEBUG_RO_INTERNAL(Enum):
+    KERNEL_X = 0x0200
+    KERNEL_Y = 0x0204
+    KERNEL_W_M1 = 0x0208
+    KERNEL_H_M1 = 0x020C
+    OFM_CBLK_WIDTH_M1 = 0x0210
+    OFM_CBLK_HEIGHT_M1 = 0x0214
+    OFM_CBLK_DEPTH_M1 = 0x0218
+    IFM_CBLK_DEPTH_M1 = 0x021C
+    OFM_X = 0x0220
+    OFM_Y = 0x0224
+    OFM_Z = 0x0228
+    IFM_Z = 0x022C
+    PAD_TOP = 0x0230
+    PAD_LEFT = 0x0234
+    IFM_CBLK_WIDTH = 0x0238
+    IFM_CBLK_HEIGHT = 0x023C
+    DMA_IFM_SRC = 0x0240
+    DMA_IFM_SRC_HI = 0x0244
+    DMA_IFM_DST = 0x0248
+    DMA_OFM_SRC = 0x024C
+    DMA_OFM_DST = 0x0250
+    DMA_OFM_DST_HI = 0x0254
+    DMA_WEIGHT_SRC = 0x0258
+    DMA_WEIGHT_SRC_HI = 0x025C
+    DMA_CMD_SRC = 0x0260
+    DMA_CMD_SRC_HI = 0x0264
+    DMA_CMD_SIZE = 0x0268
+    DMA_M2M_SRC = 0x026C
+    DMA_M2M_SRC_HI = 0x0270
+    DMA_M2M_DST = 0x0274
+    DMA_M2M_DST_HI = 0x0278
+    CURRENT_QREAD = 0x027C
+    DMA_SCALE_SRC = 0x0280
+    DMA_SCALE_SRC_HI = 0x0284
+    CURRENT_CMD = 0x02BC
+    SIZE = 0x02C0
+
+
+
+class acc_format(Enum):
+    INT_32BIT = 0
+    INT_40BIT = 1
+    FP_S5_10 = 2
+
+class activation(Enum):
+    NONE = 0
+    TANH = 3
+    SIGMOID = 4
+    LUT_START = 16
+    LUT_END = 23
+
+class clip_range(Enum):
+    OFM_PRECISION = 0
+    FORCE_UINT8 = 2
+    FORCE_INT8 = 3
+    FORCE_INT16 = 5
+
+class cmd0(Enum):
+    NPU_OP_STOP = 0x000
+    NPU_OP_IRQ = 0x001
+    NPU_OP_CONV = 0x002
+    NPU_OP_DEPTHWISE = 0x003
+    NPU_OP_POOL = 0x005
+    NPU_OP_ELEMENTWISE = 0x006
+    NPU_OP_DMA_START = 0x010
+    NPU_OP_DMA_WAIT = 0x011
+    NPU_OP_KERNEL_WAIT = 0x012
+    NPU_OP_PMU_MASK = 0x013
+    NPU_SET_IFM_PAD_TOP = 0x100
+    NPU_SET_IFM_PAD_LEFT = 0x101
+    NPU_SET_IFM_PAD_RIGHT = 0x102
+    NPU_SET_IFM_PAD_BOTTOM = 0x103
+    NPU_SET_IFM_DEPTH_M1 = 0x104
+    NPU_SET_IFM_PRECISION = 0x105
+    NPU_SET_IFM_UPSCALE = 0x107
+    NPU_SET_IFM_ZERO_POINT = 0x109
+    NPU_SET_IFM_WIDTH0_M1 = 0x10A
+    NPU_SET_IFM_HEIGHT0_M1 = 0x10B
+    NPU_SET_IFM_HEIGHT1_M1 = 0x10C
+    NPU_SET_IFM_IB_END = 0x10D
+    NPU_SET_IFM_REGION = 0x10F
+    NPU_SET_OFM_WIDTH_M1 = 0x111
+    NPU_SET_OFM_HEIGHT_M1 = 0x112
+    NPU_SET_OFM_DEPTH_M1 = 0x113
+    NPU_SET_OFM_PRECISION = 0x114
+    NPU_SET_OFM_BLK_WIDTH_M1 = 0x115
+    NPU_SET_OFM_BLK_HEIGHT_M1 = 0x116
+    NPU_SET_OFM_BLK_DEPTH_M1 = 0x117
+    NPU_SET_OFM_ZERO_POINT = 0x118
+    NPU_SET_OFM_WIDTH0_M1 = 0x11A
+    NPU_SET_OFM_HEIGHT0_M1 = 0x11B
+    NPU_SET_OFM_HEIGHT1_M1 = 0x11C
+    NPU_SET_OFM_REGION = 0x11F
+    NPU_SET_KERNEL_WIDTH_M1 = 0x120
+    NPU_SET_KERNEL_HEIGHT_M1 = 0x121
+    NPU_SET_KERNEL_STRIDE = 0x122
+    NPU_SET_PARALLEL_MODE = 0x123
+    NPU_SET_ACC_FORMAT = 0x124
+    NPU_SET_ACTIVATION = 0x125
+    NPU_SET_ACTIVATION_MIN = 0x126
+    NPU_SET_ACTIVATION_MAX = 0x127
+    NPU_SET_WEIGHT_REGION = 0x128
+    NPU_SET_SCALE_REGION = 0x129
+    NPU_SET_AB_START = 0x12D
+    NPU_SET_BLOCKDEP = 0x12F
+    NPU_SET_DMA0_SRC_REGION = 0x130
+    NPU_SET_DMA0_DST_REGION = 0x131
+    NPU_SET_DMA0_SIZE0 = 0x132
+    NPU_SET_DMA0_SIZE1 = 0x133
+    NPU_SET_IFM2_BROADCAST = 0x180
+    NPU_SET_IFM2_SCALAR = 0x181
+    NPU_SET_IFM2_PRECISION = 0x185
+    NPU_SET_IFM2_ZERO_POINT = 0x189
+    NPU_SET_IFM2_WIDTH0_M1 = 0x18A
+    NPU_SET_IFM2_HEIGHT0_M1 = 0x18B
+    NPU_SET_IFM2_HEIGHT1_M1 = 0x18C
+    NPU_SET_IFM2_IB_START = 0x18D
+    NPU_SET_IFM2_REGION = 0x18F
+
+class cmd1(Enum):
+    NPU_SET_IFM_BASE0 = 0x000
+    NPU_SET_IFM_BASE1 = 0x001
+    NPU_SET_IFM_BASE2 = 0x002
+    NPU_SET_IFM_BASE3 = 0x003
+    NPU_SET_IFM_STRIDE_X = 0x004
+    NPU_SET_IFM_STRIDE_Y = 0x005
+    NPU_SET_IFM_STRIDE_C = 0x006
+    NPU_SET_OFM_BASE0 = 0x010
+    NPU_SET_OFM_BASE1 = 0x011
+    NPU_SET_OFM_BASE2 = 0x012
+    NPU_SET_OFM_BASE3 = 0x013
+    NPU_SET_OFM_STRIDE_X = 0x014
+    NPU_SET_OFM_STRIDE_Y = 0x015
+    NPU_SET_OFM_STRIDE_C = 0x016
+    NPU_SET_WEIGHT_BASE = 0x020
+    NPU_SET_WEIGHT_LENGTH = 0x021
+    NPU_SET_SCALE_BASE = 0x022
+    NPU_SET_SCALE_LENGTH = 0x023
+    NPU_SET_OFM_SCALE = 0x024
+    NPU_SET_OPA_SCALE = 0x025
+    NPU_SET_OPB_SCALE = 0x026
+    NPU_SET_DMA0_SRC = 0x030
+    NPU_SET_DMA0_DST = 0x031
+    NPU_SET_DMA0_LEN = 0x032
+    NPU_SET_DMA0_SKIP0 = 0x033
+    NPU_SET_DMA0_SKIP1 = 0x034
+    NPU_SET_IFM2_BASE0 = 0x080
+    NPU_SET_IFM2_BASE1 = 0x081
+    NPU_SET_IFM2_BASE2 = 0x082
+    NPU_SET_IFM2_BASE3 = 0x083
+    NPU_SET_IFM2_STRIDE_X = 0x084
+    NPU_SET_IFM2_STRIDE_Y = 0x085
+    NPU_SET_IFM2_STRIDE_C = 0x086
+    NPU_SET_WEIGHT1_BASE = 0x090
+    NPU_SET_WEIGHT1_LENGTH = 0x091
+    NPU_SET_SCALE1_BASE = 0x092
+    NPU_SET_SCALE1_LENGTH = 0x093
+
+class data_format(Enum):
+    NHWC = 0
+    NHCWB16 = 1
+
+class elementwise_mode(Enum):
+    MUL = 0
+    ADD = 1
+    SUB = 2
+    MIN = 3
+    MAX = 4
+    LRELU = 5
+    ABS = 6
+    CLZ = 7
+    SHR = 8
+    SHL = 9
+
+class ifm_precision(Enum):
+    W8_U8 = 0
+    W8_S8 = 1
+    W8_U16 = 4
+    W8_S16 = 5
+    W8_S32 = 9
+
+class ifm_scale_mode(Enum):
+    SCALE_16BIT = 0
+    SCALE_OPA_32BIT = 1
+    SCALE_OPB_32BIT = 2
+
+class memory_type(Enum):
+    AXI0_OUTSTANDING_COUNTER0 = 0
+    AXI0_OUTSTANDING_COUNTER1 = 1
+    AXI1_OUTSTANDING_COUNTER2 = 2
+    AXI1_OUTSTANDING_COUNTER3 = 3
+
+class ofm_precision(Enum):
+    U8 = 0
+    S8 = 1
+    U16 = 2
+    S16 = 3
+    S32 = 5
+
+class pmu_event_type(Enum):
+    CYCLE = 0x11
+    NPU_IDLE = 0x20
+    MAC_ACTIVE = 0x30
+    MAC_ACTIVE_8BIT = 0x31
+    MAC_ACTIVE_16BIT = 0x32
+    MAC_DPU_ACTIVE = 0x33
+    MAC_STALLED_BY_WD_ACC = 0x34
+    MAC_STALLED_BY_WD = 0x35
+    MAC_STALLED_BY_ACC = 0x36
+    MAC_STALLED_BY_IB = 0x37
+    AO_ACTIVE = 0x40
+    AO_ACTIVE_8BIT = 0x41
+    AO_ACTIVE_16BIT = 0x42
+    AO_STALLED_BY_OFMP_OB = 0x43
+    AO_STALLED_BY_OFMP = 0x44
+    AO_STALLED_BY_OB = 0x45
+    AO_STALLED_BY_ACC_IB = 0x46
+    AO_STALLED_BY_ACC = 0x47
+    AO_STALLED_BY_IB = 0x48
+    WD_ACTIVE = 0x50
+    WD_STALLED = 0x51
+    WD_STALLED_BY_WS = 0x52
+    WD_STALLED_BY_WD_BUF = 0x53
+    WD_PARSE_ACTIVE = 0x54
+    WD_PARSE_STALLED = 0x55
+    WD_PARSE_STALLED_IN = 0x56
+    WD_PARSE_STALLED_OUT = 0x57
+    AXI0_RD_TRANS_ACCEPTED = 0x80
+    AXI0_RD_TRANS_COMPLETED = 0x81
+    AXI0_RD_DATA_BEAT_RECEIVED = 0x82
+    AXI0_RD_TRAN_REQ_STALLED = 0x83
+    AXI0_WR_TRANS_ACCEPTED = 0x84
+    AXI0_WR_TRANS_COMPLETED_M = 0x85
+    AXI0_WR_TRANS_COMPLETED_S = 0x86
+    AXI0_WR_DATA_BEAT_WRITTEN = 0x87
+    AXI0_WR_TRAN_REQ_STALLED = 0x88
+    AXI0_WR_DATA_BEAT_STALLED = 0x89
+    AXI0_ENABLED_CYCLES = 0x8c
+    AXI0_RD_STALL_LIMIT = 0x8e
+    AXI0_WR_STALL_LIMIT = 0x8f
+    AXI1_RD_TRANS_ACCEPTED = 0x180
+    AXI1_RD_TRANS_COMPLETED = 0x181
+    AXI1_RD_DATA_BEAT_RECEIVED = 0x182
+    AXI1_RD_TRAN_REQ_STALLED = 0x183
+    AXI1_WR_TRANS_ACCEPTED = 0x184
+    AXI1_WR_TRANS_COMPLETED_M = 0x185
+    AXI1_WR_TRANS_COMPLETED_S = 0x186
+    AXI1_WR_DATA_BEAT_WRITTEN = 0x187
+    AXI1_WR_TRAN_REQ_STALLED = 0x188
+    AXI1_WR_DATA_BEAT_STALLED = 0x189
+    AXI1_ENABLED_CYCLES = 0x18c
+    AXI1_RD_STALL_LIMIT = 0x18e
+    AXI1_WR_STALL_LIMIT = 0x18f
+    AXI_LATENCY_ANY = 0xa0
+    AXI_LATENCY_32 = 0xa1
+    AXI_LATENCY_64 = 0xa2
+    AXI_LATENCY_128 = 0xa3
+    AXI_LATENCY_256 = 0xa4
+    AXI_LATENCY_512 = 0xa5
+    AXI_LATENCY_1024 = 0xa6
+
+class pooling_mode(Enum):
+    MAX = 0
+    AVERAGE = 1
+    REDUCE_SUM = 2
+
+class privilege_level(Enum):
+    USER = 0
+    PRIVILEGED = 1
+
+class product(Enum):
+    ETHOS_U55 = 0
+
+class resampling_mode(Enum):
+    NONE = 0
+    NEAREST = 1
+    TRANSPOSE = 2
+
+class rounding(Enum):
+    TFL = 0
+    TRUNCATE = 1
+    NATURAL = 2
+
+class security_level(Enum):
+    SECURE = 0
+    NON_SECURE = 1
+
+class state(Enum):
+    STOPPED = 0
+    RUNNING = 1
+
+class stride_mode(Enum):
+    STRIDE_MODE_1D = 0
+    STRIDE_MODE_2D = 1
+    STRIDE_MODE_3D = 2
+
+
+class clkforce_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("top_level_clk", c_uint32, 1),
+            ("cc_clk", c_uint32, 1),
+            ("dma_clk", c_uint32, 1),
+            ("mac_clk", c_uint32, 1),
+            ("ao_clk", c_uint32, 1),
+            ("wd_clk", c_uint32, 1),
+            ("reserved0", c_uint32, 26),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_top_level_clk(self, value): self.bits.top_level_clk = value
+    def get_top_level_clk(self): value = self.bits.top_level_clk; return value
+    def set_cc_clk(self, value): self.bits.cc_clk = value
+    def get_cc_clk(self): value = self.bits.cc_clk; return value
+    def set_dma_clk(self, value): self.bits.dma_clk = value
+    def get_dma_clk(self): value = self.bits.dma_clk; return value
+    def set_mac_clk(self, value): self.bits.mac_clk = value
+    def get_mac_clk(self): value = self.bits.mac_clk; return value
+    def set_ao_clk(self, value): self.bits.ao_clk = value
+    def get_ao_clk(self): value = self.bits.ao_clk; return value
+    def set_wd_clk(self, value): self.bits.wd_clk = value
+    def get_wd_clk(self): value = self.bits.wd_clk; return value
+
+
+class basep0_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("addr_word", c_uint32, 32),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_addr_word(self, value): self.bits.addr_word = value
+    def get_addr_word(self): value = self.bits.addr_word; return value
+
+
+class basep1_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("addr_word", c_uint32, 32),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_addr_word(self, value): self.bits.addr_word = value
+    def get_addr_word(self): value = self.bits.addr_word; return value
+
+
+class basep2_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("addr_word", c_uint32, 32),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_addr_word(self, value): self.bits.addr_word = value
+    def get_addr_word(self): value = self.bits.addr_word; return value
+
+
+class basep3_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("addr_word", c_uint32, 32),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_addr_word(self, value): self.bits.addr_word = value
+    def get_addr_word(self): value = self.bits.addr_word; return value
+
+
+class basep4_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("addr_word", c_uint32, 32),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_addr_word(self, value): self.bits.addr_word = value
+    def get_addr_word(self): value = self.bits.addr_word; return value
+
+
+class basep5_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("addr_word", c_uint32, 32),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_addr_word(self, value): self.bits.addr_word = value
+    def get_addr_word(self): value = self.bits.addr_word; return value
+
+
+class basep6_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("addr_word", c_uint32, 32),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_addr_word(self, value): self.bits.addr_word = value
+    def get_addr_word(self): value = self.bits.addr_word; return value
+
+
+class basep7_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("addr_word", c_uint32, 32),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_addr_word(self, value): self.bits.addr_word = value
+    def get_addr_word(self): value = self.bits.addr_word; return value
+
+
+class basep8_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("addr_word", c_uint32, 32),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_addr_word(self, value): self.bits.addr_word = value
+    def get_addr_word(self): value = self.bits.addr_word; return value
+
+
+class basep9_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("addr_word", c_uint32, 32),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_addr_word(self, value): self.bits.addr_word = value
+    def get_addr_word(self): value = self.bits.addr_word; return value
+
+
+class basep10_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("addr_word", c_uint32, 32),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_addr_word(self, value): self.bits.addr_word = value
+    def get_addr_word(self): value = self.bits.addr_word; return value
+
+
+class basep11_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("addr_word", c_uint32, 32),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_addr_word(self, value): self.bits.addr_word = value
+    def get_addr_word(self): value = self.bits.addr_word; return value
+
+
+class basep12_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("addr_word", c_uint32, 32),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_addr_word(self, value): self.bits.addr_word = value
+    def get_addr_word(self): value = self.bits.addr_word; return value
+
+
+class basep13_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("addr_word", c_uint32, 32),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_addr_word(self, value): self.bits.addr_word = value
+    def get_addr_word(self): value = self.bits.addr_word; return value
+
+
+class basep14_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("addr_word", c_uint32, 32),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_addr_word(self, value): self.bits.addr_word = value
+    def get_addr_word(self): value = self.bits.addr_word; return value
+
+
+class basep15_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("addr_word", c_uint32, 32),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_addr_word(self, value): self.bits.addr_word = value
+    def get_addr_word(self): value = self.bits.addr_word; return value
+
+
+class pid4_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("pid4", c_uint32, 32),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_pid4(self, value): self.bits.pid4 = value
+    def get_pid4(self): value = self.bits.pid4; return value
+
+
+class pid5_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("pid5", c_uint32, 32),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_pid5(self, value): self.bits.pid5 = value
+    def get_pid5(self): value = self.bits.pid5; return value
+
+
+class pid6_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("pid6", c_uint32, 32),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_pid6(self, value): self.bits.pid6 = value
+    def get_pid6(self): value = self.bits.pid6; return value
+
+
+class pid7_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("pid7", c_uint32, 32),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_pid7(self, value): self.bits.pid7 = value
+    def get_pid7(self): value = self.bits.pid7; return value
+
+
+class pid0_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("pid0", c_uint32, 32),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_pid0(self, value): self.bits.pid0 = value
+    def get_pid0(self): value = self.bits.pid0; return value
+
+
+class pid1_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("pid1", c_uint32, 32),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_pid1(self, value): self.bits.pid1 = value
+    def get_pid1(self): value = self.bits.pid1; return value
+
+
+class pid2_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("pid2", c_uint32, 32),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_pid2(self, value): self.bits.pid2 = value
+    def get_pid2(self): value = self.bits.pid2; return value
+
+
+class pid3_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("pid3", c_uint32, 32),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_pid3(self, value): self.bits.pid3 = value
+    def get_pid3(self): value = self.bits.pid3; return value
+
+
+class cid0_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("cid0", c_uint32, 32),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_cid0(self, value): self.bits.cid0 = value
+    def get_cid0(self): value = self.bits.cid0; return value
+
+
+class cid1_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("cid1", c_uint32, 32),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_cid1(self, value): self.bits.cid1 = value
+    def get_cid1(self): value = self.bits.cid1; return value
+
+
+class cid2_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("cid2", c_uint32, 32),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_cid2(self, value): self.bits.cid2 = value
+    def get_cid2(self): value = self.bits.cid2; return value
+
+
+class cid3_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("cid3", c_uint32, 32),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_cid3(self, value): self.bits.cid3 = value
+    def get_cid3(self): value = self.bits.cid3; return value
+
+
+class id_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("version_status", c_uint32, 4),
+            ("version_minor", c_uint32, 4),
+            ("version_major", c_uint32, 4),
+            ("product_major", c_uint32, 4),
+            ("arch_patch_rev", c_uint32, 4),
+            ("arch_minor_rev", c_uint32, 8),
+            ("arch_major_rev", c_uint32, 4),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_version_status(self, value): self.bits.version_status = value
+    def get_version_status(self): value = self.bits.version_status; return value
+    def set_version_minor(self, value): self.bits.version_minor = value
+    def get_version_minor(self): value = self.bits.version_minor; return value
+    def set_version_major(self, value): self.bits.version_major = value
+    def get_version_major(self): value = self.bits.version_major; return value
+    def set_product_major(self, value): self.bits.product_major = value
+    def get_product_major(self): value = self.bits.product_major; return value
+    def set_arch_patch_rev(self, value): self.bits.arch_patch_rev = value
+    def get_arch_patch_rev(self): value = self.bits.arch_patch_rev; return value
+    def set_arch_minor_rev(self, value): self.bits.arch_minor_rev = value
+    def get_arch_minor_rev(self): value = self.bits.arch_minor_rev; return value
+    def set_arch_major_rev(self, value): self.bits.arch_major_rev = value
+    def get_arch_major_rev(self): value = self.bits.arch_major_rev; return value
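+
+# Illustrative round trip through the union view (values chosen to match ARCH_VER
+# above, purely as an example): field setters update the packed word and vice versa.
+#
+#     n = id_r()
+#     n.set_arch_minor_rev(154)    # "0.154.0" -> minor revision in bits [27:20]
+#     hex(n.word)                  # -> '0x9a00000'
+#     n.word = 0x09A00000
+#     n.get_arch_minor_rev()       # -> 154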
+
+
+class status_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("state", c_uint32, 1),
+            ("irq_raised", c_uint32, 1),
+            ("bus_status", c_uint32, 1),
+            ("reset_status", c_uint32, 1),
+            ("cmd_parse_error", c_uint32, 1),
+            ("cmd_end_reached", c_uint32, 1),
+            ("pmu_irq_raised", c_uint32, 1),
+            ("wd_fault", c_uint32, 1),
+            ("reserved0", c_uint32, 3),
+            ("faulting_interface", c_uint32, 1),
+            ("faulting_channel", c_uint32, 4),
+            ("irq_history_mask", c_uint32, 16),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_state(self, value): self.bits.state = value
+    def get_state(self): value = self.bits.state; return value
+    def set_irq_raised(self, value): self.bits.irq_raised = value
+    def get_irq_raised(self): value = self.bits.irq_raised; return value
+    def set_bus_status(self, value): self.bits.bus_status = value
+    def get_bus_status(self): value = self.bits.bus_status; return value
+    def set_reset_status(self, value): self.bits.reset_status = value
+    def get_reset_status(self): value = self.bits.reset_status; return value
+    def set_cmd_parse_error(self, value): self.bits.cmd_parse_error = value
+    def get_cmd_parse_error(self): value = self.bits.cmd_parse_error; return value
+    def set_cmd_end_reached(self, value): self.bits.cmd_end_reached = value
+    def get_cmd_end_reached(self): value = self.bits.cmd_end_reached; return value
+    def set_pmu_irq_raised(self, value): self.bits.pmu_irq_raised = value
+    def get_pmu_irq_raised(self): value = self.bits.pmu_irq_raised; return value
+    def set_wd_fault(self, value): self.bits.wd_fault = value
+    def get_wd_fault(self): value = self.bits.wd_fault; return value
+    def set_faulting_interface(self, value): self.bits.faulting_interface = value
+    def get_faulting_interface(self): value = self.bits.faulting_interface; return value
+    def set_faulting_channel(self, value): self.bits.faulting_channel = value
+    def get_faulting_channel(self): value = self.bits.faulting_channel; return value
+    def set_irq_history_mask(self, value): self.bits.irq_history_mask = value
+    def get_irq_history_mask(self): value = self.bits.irq_history_mask; return value
+
+
+class cmd_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("transition_to_running_state", c_uint32, 1),
+            ("clear_irq", c_uint32, 1),
+            ("clock_q_enable", c_uint32, 1),
+            ("power_q_enable", c_uint32, 1),
+            ("stop_request", c_uint32, 1),
+            ("reserved0", c_uint32, 11),
+            ("clear_irq_history", c_uint32, 16),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_transition_to_running_state(self, value): self.bits.transition_to_running_state = value
+    def get_transition_to_running_state(self): value = self.bits.transition_to_running_state; return value
+    def set_clear_irq(self, value): self.bits.clear_irq = value
+    def get_clear_irq(self): value = self.bits.clear_irq; return value
+    def set_clock_q_enable(self, value): self.bits.clock_q_enable = value
+    def get_clock_q_enable(self): value = self.bits.clock_q_enable; return value
+    def set_power_q_enable(self, value): self.bits.power_q_enable = value
+    def get_power_q_enable(self): value = self.bits.power_q_enable; return value
+    def set_stop_request(self, value): self.bits.stop_request = value
+    def get_stop_request(self): value = self.bits.stop_request; return value
+    def set_clear_irq_history(self, value): self.bits.clear_irq_history = value
+    def get_clear_irq_history(self): value = self.bits.clear_irq_history; return value
+
+
+class reset_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("pending_cpl", c_uint32, 1),
+            ("pending_csl", c_uint32, 1),
+            ("reserved0", c_uint32, 30),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_pending_cpl(self, value): self.bits.pending_cpl = value
+    def get_pending_cpl(self): value = self.bits.pending_cpl; return value
+    def set_pending_csl(self, value): self.bits.pending_csl = value
+    def get_pending_csl(self): value = self.bits.pending_csl; return value
+
+
+class qbase0_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("qbase0", c_uint32, 32),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_qbase0(self, value): self.bits.qbase0 = value
+    def get_qbase0(self): value = self.bits.qbase0; return value
+
+
+class qbase1_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("qbase1", c_uint32, 32),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_qbase1(self, value): self.bits.qbase1 = value
+    def get_qbase1(self): value = self.bits.qbase1; return value
+
+
+class qread_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("qread", c_uint32, 32),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_qread(self, value): self.bits.qread = value
+    def get_qread(self): value = self.bits.qread; return value
+
+
+class qconfig_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("qconfig", c_uint32, 32),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_qconfig(self, value): self.bits.qconfig = value
+    def get_qconfig(self): value = self.bits.qconfig; return value
+
+
+class qsize_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("qsize", c_uint32, 32),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_qsize(self, value): self.bits.qsize = value
+    def get_qsize(self): value = self.bits.qsize; return value
+
+
+class prot_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("active_cpl", c_uint32, 1),
+            ("active_csl", c_uint32, 1),
+            ("reserved0", c_uint32, 30),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_active_cpl(self, value): self.bits.active_cpl = value
+    def get_active_cpl(self): value = self.bits.active_cpl; return value
+    def set_active_csl(self, value): self.bits.active_csl = value
+    def get_active_csl(self): value = self.bits.active_csl; return value
+
+
+class config_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("macs_per_cc", c_uint32, 4),
+            ("cmd_stream_version", c_uint32, 4),
+            ("shram_size", c_uint32, 8),
+            ("reserved0", c_uint32, 12),
+            ("product", c_uint32, 4),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_macs_per_cc(self, value): self.bits.macs_per_cc = value
+    def get_macs_per_cc(self): value = self.bits.macs_per_cc; return value
+    def set_cmd_stream_version(self, value): self.bits.cmd_stream_version = value
+    def get_cmd_stream_version(self): value = self.bits.cmd_stream_version; return value
+    def set_shram_size(self, value): self.bits.shram_size = value
+    def get_shram_size(self): value = self.bits.shram_size; return value
+    def set_product(self, value): self.bits.product = value
+    def get_product(self): value = self.bits.product; return value
+
+
+class lock_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("lock", c_uint32, 32),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_lock(self, value): self.bits.lock = value
+    def get_lock(self): value = self.bits.lock; return value
+
+
+class regioncfg_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("region0", c_uint32, 2),
+            ("region1", c_uint32, 2),
+            ("region2", c_uint32, 2),
+            ("region3", c_uint32, 2),
+            ("region4", c_uint32, 2),
+            ("region5", c_uint32, 2),
+            ("region6", c_uint32, 2),
+            ("region7", c_uint32, 2),
+            ("reserved0", c_uint32, 16),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_region0(self, value): self.bits.region0 = value
+    def get_region0(self): value = self.bits.region0; return value
+    def set_region1(self, value): self.bits.region1 = value
+    def get_region1(self): value = self.bits.region1; return value
+    def set_region2(self, value): self.bits.region2 = value
+    def get_region2(self): value = self.bits.region2; return value
+    def set_region3(self, value): self.bits.region3 = value
+    def get_region3(self): value = self.bits.region3; return value
+    def set_region4(self, value): self.bits.region4 = value
+    def get_region4(self): value = self.bits.region4; return value
+    def set_region5(self, value): self.bits.region5 = value
+    def get_region5(self): value = self.bits.region5; return value
+    def set_region6(self, value): self.bits.region6 = value
+    def get_region6(self): value = self.bits.region6; return value
+    def set_region7(self, value): self.bits.region7 = value
+    def get_region7(self): value = self.bits.region7; return value
+
+
+class axi_limit0_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("max_beats", c_uint32, 2),
+            ("reserved0", c_uint32, 2),
+            ("memtype", c_uint32, 4),
+            ("reserved1", c_uint32, 8),
+            ("max_outstanding_read_m1", c_uint32, 8),
+            ("max_outstanding_write_m1", c_uint32, 8),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_max_beats(self, value): self.bits.max_beats = value
+    def get_max_beats(self): value = self.bits.max_beats; return value
+    def set_memtype(self, value): self.bits.memtype = value
+    def get_memtype(self): value = self.bits.memtype; return value
+    def set_max_outstanding_read_m1(self, value): self.bits.max_outstanding_read_m1 = value
+    def get_max_outstanding_read_m1(self): value = self.bits.max_outstanding_read_m1; return value
+    def set_max_outstanding_write_m1(self, value): self.bits.max_outstanding_write_m1 = value
+    def get_max_outstanding_write_m1(self): value = self.bits.max_outstanding_write_m1; return value
+
+
+class axi_limit1_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("max_beats", c_uint32, 2),
+            ("reserved0", c_uint32, 2),
+            ("memtype", c_uint32, 4),
+            ("reserved1", c_uint32, 8),
+            ("max_outstanding_read_m1", c_uint32, 8),
+            ("max_outstanding_write_m1", c_uint32, 8),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_max_beats(self, value): self.bits.max_beats = value
+    def get_max_beats(self): value = self.bits.max_beats; return value
+    def set_memtype(self, value): self.bits.memtype = value
+    def get_memtype(self): value = self.bits.memtype; return value
+    def set_max_outstanding_read_m1(self, value): self.bits.max_outstanding_read_m1 = value
+    def get_max_outstanding_read_m1(self): value = self.bits.max_outstanding_read_m1; return value
+    def set_max_outstanding_write_m1(self, value): self.bits.max_outstanding_write_m1 = value
+    def get_max_outstanding_write_m1(self): value = self.bits.max_outstanding_write_m1; return value
+
+
+class axi_limit2_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("max_beats", c_uint32, 2),
+            ("reserved0", c_uint32, 2),
+            ("memtype", c_uint32, 4),
+            ("reserved1", c_uint32, 8),
+            ("max_outstanding_read_m1", c_uint32, 8),
+            ("max_outstanding_write_m1", c_uint32, 8),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_max_beats(self, value): self.bits.max_beats = value
+    def get_max_beats(self): value = self.bits.max_beats; return value
+    def set_memtype(self, value): self.bits.memtype = value
+    def get_memtype(self): value = self.bits.memtype; return value
+    def set_max_outstanding_read_m1(self, value): self.bits.max_outstanding_read_m1 = value
+    def get_max_outstanding_read_m1(self): value = self.bits.max_outstanding_read_m1; return value
+    def set_max_outstanding_write_m1(self, value): self.bits.max_outstanding_write_m1 = value
+    def get_max_outstanding_write_m1(self): value = self.bits.max_outstanding_write_m1; return value
+
+
+class axi_limit3_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("max_beats", c_uint32, 2),
+            ("reserved0", c_uint32, 2),
+            ("memtype", c_uint32, 4),
+            ("reserved1", c_uint32, 8),
+            ("max_outstanding_read_m1", c_uint32, 8),
+            ("max_outstanding_write_m1", c_uint32, 8),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_max_beats(self, value): self.bits.max_beats = value
+    def get_max_beats(self): value = self.bits.max_beats; return value
+    def set_memtype(self, value): self.bits.memtype = value
+    def get_memtype(self): value = self.bits.memtype; return value
+    def set_max_outstanding_read_m1(self, value): self.bits.max_outstanding_read_m1 = value
+    def get_max_outstanding_read_m1(self): value = self.bits.max_outstanding_read_m1; return value
+    def set_max_outstanding_write_m1(self, value): self.bits.max_outstanding_write_m1 = value
+    def get_max_outstanding_write_m1(self): value = self.bits.max_outstanding_write_m1; return value
+
+
+class pmcr_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("cnt_en", c_uint32, 1),
+            ("event_cnt_rst", c_uint32, 1),
+            ("cycle_cnt_rst", c_uint32, 1),
+            ("mask_en", c_uint32, 1),
+            ("reserved0", c_uint32, 7),
+            ("num_event_cnt", c_uint32, 5),
+            ("reserved1", c_uint32, 16),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_cnt_en(self, value): self.bits.cnt_en = value
+    def get_cnt_en(self): value = self.bits.cnt_en; return value
+    def set_event_cnt_rst(self, value): self.bits.event_cnt_rst = value
+    def get_event_cnt_rst(self): value = self.bits.event_cnt_rst; return value
+    def set_cycle_cnt_rst(self, value): self.bits.cycle_cnt_rst = value
+    def get_cycle_cnt_rst(self): value = self.bits.cycle_cnt_rst; return value
+    def set_mask_en(self, value): self.bits.mask_en = value
+    def get_mask_en(self): value = self.bits.mask_en; return value
+    def set_num_event_cnt(self, value): self.bits.num_event_cnt = value
+    def get_num_event_cnt(self): value = self.bits.num_event_cnt; return value
+
+
+class pmcntenset_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("event_cnt_0", c_uint32, 1),
+            ("event_cnt_1", c_uint32, 1),
+            ("event_cnt_2", c_uint32, 1),
+            ("event_cnt_3", c_uint32, 1),
+            ("reserved0", c_uint32, 27),
+            ("cycle_cnt", c_uint32, 1),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_event_cnt_0(self, value): self.bits.event_cnt_0 = value
+    def get_event_cnt_0(self): value = self.bits.event_cnt_0; return value
+    def set_event_cnt_1(self, value): self.bits.event_cnt_1 = value
+    def get_event_cnt_1(self): value = self.bits.event_cnt_1; return value
+    def set_event_cnt_2(self, value): self.bits.event_cnt_2 = value
+    def get_event_cnt_2(self): value = self.bits.event_cnt_2; return value
+    def set_event_cnt_3(self, value): self.bits.event_cnt_3 = value
+    def get_event_cnt_3(self): value = self.bits.event_cnt_3; return value
+    def set_cycle_cnt(self, value): self.bits.cycle_cnt = value
+    def get_cycle_cnt(self): value = self.bits.cycle_cnt; return value
+
+
+class pmcntenclr_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("event_cnt_0", c_uint32, 1),
+            ("event_cnt_1", c_uint32, 1),
+            ("event_cnt_2", c_uint32, 1),
+            ("event_cnt_3", c_uint32, 1),
+            ("reserved0", c_uint32, 27),
+            ("cycle_cnt", c_uint32, 1),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_event_cnt_0(self, value): self.bits.event_cnt_0 = value
+    def get_event_cnt_0(self): value = self.bits.event_cnt_0; return value
+    def set_event_cnt_1(self, value): self.bits.event_cnt_1 = value
+    def get_event_cnt_1(self): value = self.bits.event_cnt_1; return value
+    def set_event_cnt_2(self, value): self.bits.event_cnt_2 = value
+    def get_event_cnt_2(self): value = self.bits.event_cnt_2; return value
+    def set_event_cnt_3(self, value): self.bits.event_cnt_3 = value
+    def get_event_cnt_3(self): value = self.bits.event_cnt_3; return value
+    def set_cycle_cnt(self, value): self.bits.cycle_cnt = value
+    def get_cycle_cnt(self): value = self.bits.cycle_cnt; return value
+
+
+class pmovsset_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("event_cnt_0_ovf", c_uint32, 1),
+            ("event_cnt_1_ovf", c_uint32, 1),
+            ("event_cnt_2_ovf", c_uint32, 1),
+            ("event_cnt_3_ovf", c_uint32, 1),
+            ("reserved0", c_uint32, 27),
+            ("cycle_cnt_ovf", c_uint32, 1),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_event_cnt_0_ovf(self, value): self.bits.event_cnt_0_ovf = value
+    def get_event_cnt_0_ovf(self): value = self.bits.event_cnt_0_ovf; return value
+    def set_event_cnt_1_ovf(self, value): self.bits.event_cnt_1_ovf = value
+    def get_event_cnt_1_ovf(self): value = self.bits.event_cnt_1_ovf; return value
+    def set_event_cnt_2_ovf(self, value): self.bits.event_cnt_2_ovf = value
+    def get_event_cnt_2_ovf(self): value = self.bits.event_cnt_2_ovf; return value
+    def set_event_cnt_3_ovf(self, value): self.bits.event_cnt_3_ovf = value
+    def get_event_cnt_3_ovf(self): value = self.bits.event_cnt_3_ovf; return value
+    def set_cycle_cnt_ovf(self, value): self.bits.cycle_cnt_ovf = value
+    def get_cycle_cnt_ovf(self): value = self.bits.cycle_cnt_ovf; return value
+
+
+class pmovsclr_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("event_cnt_0_ovf", c_uint32, 1),
+            ("event_cnt_1_ovf", c_uint32, 1),
+            ("event_cnt_2_ovf", c_uint32, 1),
+            ("event_cnt_3_ovf", c_uint32, 1),
+            ("reserved0", c_uint32, 27),
+            ("cycle_cnt_ovf", c_uint32, 1),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_event_cnt_0_ovf(self, value): self.bits.event_cnt_0_ovf = value
+    def get_event_cnt_0_ovf(self): value = self.bits.event_cnt_0_ovf; return value
+    def set_event_cnt_1_ovf(self, value): self.bits.event_cnt_1_ovf = value
+    def get_event_cnt_1_ovf(self): value = self.bits.event_cnt_1_ovf; return value
+    def set_event_cnt_2_ovf(self, value): self.bits.event_cnt_2_ovf = value
+    def get_event_cnt_2_ovf(self): value = self.bits.event_cnt_2_ovf; return value
+    def set_event_cnt_3_ovf(self, value): self.bits.event_cnt_3_ovf = value
+    def get_event_cnt_3_ovf(self): value = self.bits.event_cnt_3_ovf; return value
+    def set_cycle_cnt_ovf(self, value): self.bits.cycle_cnt_ovf = value
+    def get_cycle_cnt_ovf(self): value = self.bits.cycle_cnt_ovf; return value
+
+
+class pmintset_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("event_cnt_0_int", c_uint32, 1),
+            ("event_cnt_1_int", c_uint32, 1),
+            ("event_cnt_2_int", c_uint32, 1),
+            ("event_cnt_3_int", c_uint32, 1),
+            ("reserved0", c_uint32, 27),
+            ("cycle_cnt_int", c_uint32, 1),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_event_cnt_0_int(self, value): self.bits.event_cnt_0_int = value
+    def get_event_cnt_0_int(self): value = self.bits.event_cnt_0_int; return value
+    def set_event_cnt_1_int(self, value): self.bits.event_cnt_1_int = value
+    def get_event_cnt_1_int(self): value = self.bits.event_cnt_1_int; return value
+    def set_event_cnt_2_int(self, value): self.bits.event_cnt_2_int = value
+    def get_event_cnt_2_int(self): value = self.bits.event_cnt_2_int; return value
+    def set_event_cnt_3_int(self, value): self.bits.event_cnt_3_int = value
+    def get_event_cnt_3_int(self): value = self.bits.event_cnt_3_int; return value
+    def set_cycle_cnt_int(self, value): self.bits.cycle_cnt_int = value
+    def get_cycle_cnt_int(self): value = self.bits.cycle_cnt_int; return value
+
+
+class pmintclr_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("event_cnt_0_int", c_uint32, 1),
+            ("event_cnt_1_int", c_uint32, 1),
+            ("event_cnt_2_int", c_uint32, 1),
+            ("event_cnt_3_int", c_uint32, 1),
+            ("reserved0", c_uint32, 27),
+            ("cycle_cnt_int", c_uint32, 1),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_event_cnt_0_int(self, value): self.bits.event_cnt_0_int = value
+    def get_event_cnt_0_int(self): value = self.bits.event_cnt_0_int; return value
+    def set_event_cnt_1_int(self, value): self.bits.event_cnt_1_int = value
+    def get_event_cnt_1_int(self): value = self.bits.event_cnt_1_int; return value
+    def set_event_cnt_2_int(self, value): self.bits.event_cnt_2_int = value
+    def get_event_cnt_2_int(self): value = self.bits.event_cnt_2_int; return value
+    def set_event_cnt_3_int(self, value): self.bits.event_cnt_3_int = value
+    def get_event_cnt_3_int(self): value = self.bits.event_cnt_3_int; return value
+    def set_cycle_cnt_int(self, value): self.bits.cycle_cnt_int = value
+    def get_cycle_cnt_int(self): value = self.bits.cycle_cnt_int; return value
+
+
+class pmccntr_lo_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("cycle_cnt_lo", c_uint32, 32),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_cycle_cnt_lo(self, value): self.bits.cycle_cnt_lo = value
+    def get_cycle_cnt_lo(self): value = self.bits.cycle_cnt_lo; return value
+
+
+class pmccntr_hi_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("cycle_cnt_hi", c_uint32, 16),
+            ("reserved0", c_uint32, 16),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_cycle_cnt_hi(self, value): self.bits.cycle_cnt_hi = value
+    def get_cycle_cnt_hi(self): value = self.bits.cycle_cnt_hi; return value
+
+
+class pmccntr_cfg_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("cycle_cnt_cfg_start", c_uint32, 10),
+            ("reserved0", c_uint32, 6),
+            ("cycle_cnt_cfg_stop", c_uint32, 10),
+            ("reserved1", c_uint32, 6),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_cycle_cnt_cfg_start(self, value): self.bits.cycle_cnt_cfg_start = value
+    def get_cycle_cnt_cfg_start(self): value = self.bits.cycle_cnt_cfg_start; return value
+    def set_cycle_cnt_cfg_stop(self, value): self.bits.cycle_cnt_cfg_stop = value
+    def get_cycle_cnt_cfg_stop(self): value = self.bits.cycle_cnt_cfg_stop; return value
+
+
+class pmcaxi_chan_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("axi_chan", c_uint32, 4),
+            ("reserved0", c_uint32, 3),
+            ("rw", c_uint32, 1),
+            ("axi_cnt", c_uint32, 2),
+            ("reserved1", c_uint32, 22),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_axi_chan(self, value): self.bits.axi_chan = value
+    def get_axi_chan(self): value = self.bits.axi_chan; return value
+    def set_rw(self, value): self.bits.rw = value
+    def get_rw(self): value = self.bits.rw; return value
+    def set_axi_cnt(self, value): self.bits.axi_cnt = value
+    def get_axi_cnt(self): value = self.bits.axi_cnt; return value
+
+
+class pmevtyper0_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("ev_type", c_uint32, 10),
+            ("reserved0", c_uint32, 22),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_ev_type(self, value): self.bits.ev_type = value
+    def get_ev_type(self): value = self.bits.ev_type; return value
+
+
+class pmevtyper1_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("ev_type", c_uint32, 10),
+            ("reserved0", c_uint32, 22),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_ev_type(self, value): self.bits.ev_type = value
+    def get_ev_type(self): value = self.bits.ev_type; return value
+
+
+class pmevtyper2_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("ev_type", c_uint32, 10),
+            ("reserved0", c_uint32, 22),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_ev_type(self, value): self.bits.ev_type = value
+    def get_ev_type(self): value = self.bits.ev_type; return value
+
+
+class pmevtyper3_r(Union):
+    class _bitfield(Structure):
+        _fields_ = [
+            ("ev_type", c_uint32, 10),
+            ("reserved0", c_uint32, 22),
+        ]
+    _fields_ = [("bits", _bitfield),
+                ("word", c_uint32)]
+    def set_ev_type(self, value): self.bits.ev_type = value
+    def get_ev_type(self): value = self.bits.ev_type; return value
+
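+# A minimal usage sketch: each *_r class above is a ctypes Union of named bitfields
+# and a raw 32-bit word, so a register image can be assembled field by field and then
+# read back as one packed value. The event numbers below are arbitrary examples, not
+# values required by the hardware.
+#
+#   cfg = pmccntr_cfg_r()
+#   cfg.set_cycle_cnt_cfg_start(0x10)  # example start-event number
+#   cfg.set_cycle_cnt_cfg_stop(0x11)   # example stop-event number
+#   raw = cfg.word                     # packed 32-bit value for the PMCCNTR_CFG register
+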
+class command_no_payload_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return self.must_be_zero0==0
+    def get_cmd_code(self): return self.cmd_code
+    def set_cmd_code(self, value): self.cmd_code = value
+    def get_param(self): return self.param
+    def set_param(self, value): self.param = value
+
+class command_with_payload_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero", c_uint32, 4),
+        ("payload_size", c_uint32, 2),
+        ("param", c_uint32, 16),
+        ("data", c_uint32, 32),
+    ]
+    def valid(self): return self.must_be_zero==0 and self.payload_size>=1 and self.payload_size<=2
+    def get_cmd_code(self): return self.cmd_code
+    def set_cmd_code(self, value): self.cmd_code = value
+    def get_data(self): return self.data
+    def set_data(self, value): self.data = value
+    def get_param(self): return self.param
+    def set_param(self, value): self.param = value
+    def get_payload_size(self): return self.payload_size
+    def set_payload_size(self, value): self.payload_size = value
+
+class npu_op_stop_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("mask", c_uint32, 16),
+    ]
+    def valid(self): return self.cmd_code==cmd0.NPU_OP_STOP and self.must_be_zero0==0
+    def get_cmd_code(self): return self.cmd_code
+    def set_cmd_code(self, value): self.cmd_code = value
+    def get_mask(self): return self.mask
+    def set_mask(self, value): self.mask = value
+
+class npu_op_irq_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("mask", c_uint32, 16),
+    ]
+    def valid(self): return self.cmd_code==cmd0.NPU_OP_IRQ and self.must_be_zero0==0
+    def get_cmd_code(self): return self.cmd_code
+    def set_cmd_code(self, value): self.cmd_code = value
+    def get_mask(self): return self.mask
+    def set_mask(self, value): self.mask = value
+
+class npu_op_conv_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("reserved0", c_uint32, 16),
+    ]
+    def valid(self): return self.cmd_code==cmd0.NPU_OP_CONV and self.must_be_zero0==0
+    def get_cmd_code(self): return self.cmd_code
+    def set_cmd_code(self, value): self.cmd_code = value
+
+class npu_op_depthwise_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("reserved0", c_uint32, 16),
+    ]
+    def valid(self): return self.cmd_code==cmd0.NPU_OP_DEPTHWISE and self.must_be_zero0==0
+    def get_cmd_code(self): return self.cmd_code
+    def set_cmd_code(self, value): self.cmd_code = value
+
+class npu_op_pool_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("mode", c_uint32, 16),
+    ]
+    def valid(self): return self.cmd_code==cmd0.NPU_OP_POOL and self.must_be_zero0==0
+    def get_cmd_code(self): return self.cmd_code
+    def set_cmd_code(self, value): self.cmd_code = value
+    def get_mode(self): return self.mode
+    def set_mode(self, value): self.mode = value
+
+class npu_op_elementwise_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("mode", c_uint32, 16),
+    ]
+    def valid(self): return self.cmd_code==cmd0.NPU_OP_ELEMENTWISE and self.must_be_zero0==0
+    def get_cmd_code(self): return self.cmd_code
+    def set_cmd_code(self, value): self.cmd_code = value
+    def get_mode(self): return self.mode
+    def set_mode(self, value): self.mode = value
+
+class npu_op_dma_start_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("channel_mode", c_uint32, 16),
+    ]
+    def valid(self): return self.cmd_code==cmd0.NPU_OP_DMA_START and self.must_be_zero0==0
+    def get_channel_mode(self): return self.channel_mode
+    def set_channel_mode(self, value): self.channel_mode = value
+    def get_cmd_code(self): return self.cmd_code
+    def set_cmd_code(self, value): self.cmd_code = value
+
+class npu_op_dma_wait_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("reserved0", c_uint32, 16),
+    ]
+    def valid(self): return self.cmd_code==cmd0.NPU_OP_DMA_WAIT and self.must_be_zero0==0
+    def get_cmd_code(self): return self.cmd_code
+    def set_cmd_code(self, value): self.cmd_code = value
+
+class npu_op_kernel_wait_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return self.cmd_code==cmd0.NPU_OP_KERNEL_WAIT and self.must_be_zero0==0
+    def get_cmd_code(self): return self.cmd_code
+    def set_cmd_code(self, value): self.cmd_code = value
+    def get_param(self): return self.param
+    def set_param(self, value): self.param = value
+
+class npu_op_pmu_mask_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return self.cmd_code==cmd0.NPU_OP_PMU_MASK and self.must_be_zero0==0
+    def get_cmd_code(self): return self.cmd_code
+    def set_cmd_code(self, value): self.cmd_code = value
+    def get_param(self): return self.param
+    def set_param(self, value): self.param = value
+
+class npu_set_ifm_pad_top_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return self.cmd_code==cmd0.NPU_SET_IFM_PAD_TOP and self.must_be_zero0==0
+    def get_cmd_code(self): return self.cmd_code
+    def set_cmd_code(self, value): self.cmd_code = value
+    def get_param(self): return self.param
+    def set_param(self, value): self.param = value
+
+class npu_set_ifm_pad_left_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_IFM_PAD_LEFT and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_ifm_pad_right_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_IFM_PAD_RIGHT and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_ifm_pad_bottom_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_IFM_PAD_BOTTOM and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_ifm_depth_m1_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_IFM_DEPTH_M1 and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_ifm_precision_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 4),
+        ("reserved0", c_uint32, 2),
+        ("format", c_uint32, 2),
+        ("scale_mode", c_uint32, 2),
+        ("reserved1", c_uint32, 4),
+        ("round_mode", c_uint32, 2),
+    ]
+    def valid(self): return self.cmd_code==cmd0.NPU_SET_IFM_PRECISION and self.must_be_zero0==0
+    def get_cmd_code(self): return self.cmd_code
+    def set_cmd_code(self, value): self.cmd_code = value
+    def get_format(self): return self.format
+    def set_format(self, value): self.format = value
+    def get_param(self): return self.param
+    def set_param(self, value): self.param = value
+    def get_round_mode(self): return self.round_mode
+    def set_round_mode(self, value): self.round_mode = value
+    def get_scale_mode(self): return self.scale_mode
+    def set_scale_mode(self, value): self.scale_mode = value
+
+class npu_set_ifm_upscale_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("mode", c_uint32, 2),
+        ("reserved0", c_uint32, 14),
+    ]
+    def valid(self): return self.cmd_code==cmd0.NPU_SET_IFM_UPSCALE and self.must_be_zero0==0
+    def get_cmd_code(self): return self.cmd_code
+    def set_cmd_code(self, value): self.cmd_code = value
+    def get_mode(self): return self.mode
+    def set_mode(self, value): self.mode = value
+
+class npu_set_ifm_zero_point_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_IFM_ZERO_POINT and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_ifm_width0_m1_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_IFM_WIDTH0_M1 and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_ifm_height0_m1_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_IFM_HEIGHT0_M1 and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_ifm_height1_m1_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_IFM_HEIGHT1_M1 and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_ifm_ib_end_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_IFM_IB_END and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_ifm_region_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_IFM_REGION and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_ofm_width_m1_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_OFM_WIDTH_M1 and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_ofm_height_m1_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_OFM_HEIGHT_M1 and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_ofm_depth_m1_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_OFM_DEPTH_M1 and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_ofm_precision_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("precision", c_uint32, 3),
+        ("reserved0", c_uint32, 3),
+        ("format", c_uint32, 2),
+        ("scaling", c_uint32, 1),
+        ("reserved1", c_uint32, 5),
+        ("rounding", c_uint32, 2),
+    ]
+    def valid(self): return self.cmd_code==cmd0.NPU_SET_OFM_PRECISION and self.must_be_zero0==0
+    def get_cmd_code(self): return self.cmd_code
+    def set_cmd_code(self, value): self.cmd_code = value
+    def get_format(self): return self.format
+    def set_format(self, value): self.format = value
+    def get_precision(self): return self.precision
+    def set_precision(self, value): self.precision = value
+    def get_rounding(self): return self.rounding
+    def set_rounding(self, value): self.rounding = value
+    def get_scaling(self): return self.scaling
+    def set_scaling(self, value): self.scaling = value
+
+class npu_set_ofm_blk_width_m1_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_OFM_BLK_WIDTH_M1 and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_ofm_blk_height_m1_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_OFM_BLK_HEIGHT_M1 and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_ofm_blk_depth_m1_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_OFM_BLK_DEPTH_M1 and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_ofm_zero_point_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_OFM_ZERO_POINT and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_ofm_width0_m1_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_OFM_WIDTH0_M1 and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_ofm_height0_m1_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_OFM_HEIGHT0_M1 and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_ofm_height1_m1_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_OFM_HEIGHT1_M1 and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_ofm_region_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_OFM_REGION and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_kernel_width_m1_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_KERNEL_WIDTH_M1 and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_kernel_height_m1_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_KERNEL_HEIGHT_M1 and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_kernel_stride_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_KERNEL_STRIDE and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_parallel_mode_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_PARALLEL_MODE and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_acc_format_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_ACC_FORMAT and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_activation_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("type", c_uint32, 12),
+        ("act_clip_range", c_uint32, 4),
+    ]
+    def valid(self): return self.cmd_code==cmd0.NPU_SET_ACTIVATION and self.must_be_zero0==0
+    def get_act_clip_range(self): return self.act_clip_range
+    def set_act_clip_range(self, value): self.act_clip_range = value
+    def get_cmd_code(self): return self.cmd_code
+    def set_cmd_code(self, value): self.cmd_code = value
+    def get_type(self): return self.type
+    def set_type(self, value): self.type = value
+
+class npu_set_activation_min_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_ACTIVATION_MIN and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_activation_max_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_ACTIVATION_MAX and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_weight_region_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_WEIGHT_REGION and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_scale_region_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_SCALE_REGION and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_ab_start_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_AB_START and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_blockdep_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_BLOCKDEP and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_dma0_src_region_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("region", c_uint32, 8),
+        ("internal", c_uint32, 1),
+        ("stride_mode", c_uint32, 2),
+        ("reserved0", c_uint32, 5),
+    ]
+    def valid(self): return self.cmd_code==cmd0.NPU_SET_DMA0_SRC_REGION and self.must_be_zero0==0
+    def get_cmd_code(self): return self.cmd_code
+    def set_cmd_code(self, value): self.cmd_code = value
+    def get_internal(self): return self.internal
+    def set_internal(self, value): self.internal = value
+    def get_region(self): return self.region
+    def set_region(self, value): self.region = value
+    def get_stride_mode(self): return self.stride_mode
+    def set_stride_mode(self, value): self.stride_mode = value
+
+class npu_set_dma0_dst_region_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("region", c_uint32, 8),
+        ("internal", c_uint32, 1),
+        ("stride_mode", c_uint32, 2),
+        ("reserved0", c_uint32, 5),
+    ]
+    def valid(self): return self.cmd_code==cmd0.NPU_SET_DMA0_DST_REGION and self.must_be_zero0==0
+    def get_cmd_code(self): return self.cmd_code
+    def set_cmd_code(self, value): self.cmd_code = value
+    def get_internal(self): return self.internal
+    def set_internal(self, value): self.internal = value
+    def get_region(self): return self.region
+    def set_region(self, value): self.region = value
+    def get_stride_mode(self): return self.stride_mode
+    def set_stride_mode(self, value): self.stride_mode = value
+
+class npu_set_dma0_size0_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_DMA0_SIZE0 and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_dma0_size1_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_DMA0_SIZE1 and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_ifm2_broadcast_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("broadcast_height", c_uint32, 1),
+        ("broadcast_width", c_uint32, 1),
+        ("broadcast_depth", c_uint32, 1),
+        ("reserved0", c_uint32, 3),
+        ("operand_order", c_uint32, 1),
+        ("broadcast_scalar", c_uint32, 1),
+        ("reserved1", c_uint32, 8),
+    ]
+    def valid(self): return self.cmd_code==cmd0.NPU_SET_IFM2_BROADCAST and self.must_be_zero0==0
+    def get_broadcast_depth(self): return self.broadcast_depth
+    def set_broadcast_depth(self, value): self.broadcast_depth = value
+    def get_broadcast_height(self): return self.broadcast_height
+    def set_broadcast_height(self, value): self.broadcast_height = value
+    def get_broadcast_scalar(self): return self.broadcast_scalar
+    def set_broadcast_scalar(self, value): self.broadcast_scalar = value
+    def get_broadcast_width(self): return self.broadcast_width
+    def set_broadcast_width(self, value): self.broadcast_width = value
+    def get_cmd_code(self): return self.cmd_code
+    def set_cmd_code(self, value): self.cmd_code = value
+    def get_operand_order(self): return self.operand_order
+    def set_operand_order(self, value): self.operand_order = value
+
+class npu_set_ifm2_scalar_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_IFM2_SCALAR and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_ifm2_precision_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 4),
+        ("reserved0", c_uint32, 2),
+        ("format", c_uint32, 2),
+        ("reserved1", c_uint32, 8),
+    ]
+    def valid(self): return self.cmd_code==cmd0.NPU_SET_IFM2_PRECISION and self.must_be_zero0==0
+    def get_cmd_code(self): return self.cmd_code
+    def set_cmd_code(self, value): self.cmd_code = value
+    def get_format(self): return self.format
+    def set_format(self, value): self.format = value
+    def get_param(self): return self.param
+    def set_param(self, value): self.param = value
+
+class npu_set_ifm2_zero_point_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_IFM2_ZERO_POINT and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_ifm2_width0_m1_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_IFM2_WIDTH0_M1 and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_ifm2_height0_m1_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_IFM2_HEIGHT0_M1 and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_ifm2_height1_m1_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_IFM2_HEIGHT1_M1 and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_ifm2_ib_start_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_IFM2_IB_START and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_ifm2_region_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero0", c_uint32, 6),
+        ("param", c_uint32, 16),
+    ]
+    def valid(self): return cmd_code==cmd0.NPU_SET_IFM2_REGION and must_be_zero0==0;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+
+class npu_set_ifm_base0_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero", c_uint32, 4),
+        ("payload_size", c_uint32, 2),
+        ("reserved0", c_uint32, 16),
+        ("data", c_uint32, 32),
+    ]
+    def valid(self): return self.cmd_code==cmd1.NPU_SET_IFM_BASE0 and self.must_be_zero==0 and self.payload_size>=1 and self.payload_size<=2
+    def get_cmd_code(self): return self.cmd_code
+    def set_cmd_code(self, value): self.cmd_code = value
+    def get_data(self): return self.data
+    def set_data(self, value): self.data = value
+    def get_payload_size(self): return self.payload_size
+    def set_payload_size(self, value): self.payload_size = value
+
+class npu_set_ifm_base1_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero", c_uint32, 4),
+        ("payload_size", c_uint32, 2),
+        ("reserved0", c_uint32, 16),
+        ("data", c_uint32, 32),
+    ]
+    def valid(self): return cmd_code==cmd1.NPU_SET_IFM_BASE1 and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_data(self): return data
+    def set_data(self, value): data = value
+    def get_payload_size(self): return payload_size
+    def set_payload_size(self, value): payload_size = value
+
+class npu_set_ifm_base2_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero", c_uint32, 4),
+        ("payload_size", c_uint32, 2),
+        ("reserved0", c_uint32, 16),
+        ("data", c_uint32, 32),
+    ]
+    def valid(self): return cmd_code==cmd1.NPU_SET_IFM_BASE2 and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_data(self): return data
+    def set_data(self, value): data = value
+    def get_payload_size(self): return payload_size
+    def set_payload_size(self, value): payload_size = value
+
+class npu_set_ifm_base3_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero", c_uint32, 4),
+        ("payload_size", c_uint32, 2),
+        ("reserved0", c_uint32, 16),
+        ("data", c_uint32, 32),
+    ]
+    def valid(self): return cmd_code==cmd1.NPU_SET_IFM_BASE3 and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_data(self): return data
+    def set_data(self, value): data = value
+    def get_payload_size(self): return payload_size
+    def set_payload_size(self, value): payload_size = value
+
+class npu_set_ifm_stride_x_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero", c_uint32, 4),
+        ("payload_size", c_uint32, 2),
+        ("reserved0", c_uint32, 16),
+        ("data", c_uint32, 32),
+    ]
+    def valid(self): return cmd_code==cmd1.NPU_SET_IFM_STRIDE_X and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_data(self): return data
+    def set_data(self, value): data = value
+    def get_payload_size(self): return payload_size
+    def set_payload_size(self, value): payload_size = value
+
+class npu_set_ifm_stride_y_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero", c_uint32, 4),
+        ("payload_size", c_uint32, 2),
+        ("reserved0", c_uint32, 16),
+        ("data", c_uint32, 32),
+    ]
+    def valid(self): return cmd_code==cmd1.NPU_SET_IFM_STRIDE_Y and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_data(self): return data
+    def set_data(self, value): data = value
+    def get_payload_size(self): return payload_size
+    def set_payload_size(self, value): payload_size = value
+
+class npu_set_ifm_stride_c_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero", c_uint32, 4),
+        ("payload_size", c_uint32, 2),
+        ("reserved0", c_uint32, 16),
+        ("data", c_uint32, 32),
+    ]
+    def valid(self): return cmd_code==cmd1.NPU_SET_IFM_STRIDE_C and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_data(self): return data
+    def set_data(self, value): data = value
+    def get_payload_size(self): return payload_size
+    def set_payload_size(self, value): payload_size = value
+
+class npu_set_ofm_base0_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero", c_uint32, 4),
+        ("payload_size", c_uint32, 2),
+        ("reserved0", c_uint32, 16),
+        ("data", c_uint32, 32),
+    ]
+    def valid(self): return cmd_code==cmd1.NPU_SET_OFM_BASE0 and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_data(self): return data
+    def set_data(self, value): data = value
+    def get_payload_size(self): return payload_size
+    def set_payload_size(self, value): payload_size = value
+
+class npu_set_ofm_base1_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero", c_uint32, 4),
+        ("payload_size", c_uint32, 2),
+        ("reserved0", c_uint32, 16),
+        ("data", c_uint32, 32),
+    ]
+    def valid(self): return cmd_code==cmd1.NPU_SET_OFM_BASE1 and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_data(self): return data
+    def set_data(self, value): data = value
+    def get_payload_size(self): return payload_size
+    def set_payload_size(self, value): payload_size = value
+
+class npu_set_ofm_base2_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero", c_uint32, 4),
+        ("payload_size", c_uint32, 2),
+        ("reserved0", c_uint32, 16),
+        ("data", c_uint32, 32),
+    ]
+    def valid(self): return cmd_code==cmd1.NPU_SET_OFM_BASE2 and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_data(self): return data
+    def set_data(self, value): data = value
+    def get_payload_size(self): return payload_size
+    def set_payload_size(self, value): payload_size = value
+
+class npu_set_ofm_base3_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero", c_uint32, 4),
+        ("payload_size", c_uint32, 2),
+        ("reserved0", c_uint32, 16),
+        ("data", c_uint32, 32),
+    ]
+    def valid(self): return cmd_code==cmd1.NPU_SET_OFM_BASE3 and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_data(self): return data
+    def set_data(self, value): data = value
+    def get_payload_size(self): return payload_size
+    def set_payload_size(self, value): payload_size = value
+
+class npu_set_ofm_stride_x_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero", c_uint32, 4),
+        ("payload_size", c_uint32, 2),
+        ("reserved0", c_uint32, 16),
+        ("data", c_uint32, 32),
+    ]
+    def valid(self): return cmd_code==cmd1.NPU_SET_OFM_STRIDE_X and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_data(self): return data
+    def set_data(self, value): data = value
+    def get_payload_size(self): return payload_size
+    def set_payload_size(self, value): payload_size = value
+
+class npu_set_ofm_stride_y_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero", c_uint32, 4),
+        ("payload_size", c_uint32, 2),
+        ("reserved0", c_uint32, 16),
+        ("data", c_uint32, 32),
+    ]
+    def valid(self): return cmd_code==cmd1.NPU_SET_OFM_STRIDE_Y and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_data(self): return data
+    def set_data(self, value): data = value
+    def get_payload_size(self): return payload_size
+    def set_payload_size(self, value): payload_size = value
+
+class npu_set_ofm_stride_c_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero", c_uint32, 4),
+        ("payload_size", c_uint32, 2),
+        ("reserved0", c_uint32, 16),
+        ("data", c_uint32, 32),
+    ]
+    def valid(self): return cmd_code==cmd1.NPU_SET_OFM_STRIDE_C and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_data(self): return data
+    def set_data(self, value): data = value
+    def get_payload_size(self): return payload_size
+    def set_payload_size(self, value): payload_size = value
+
+class npu_set_weight_base_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero", c_uint32, 4),
+        ("payload_size", c_uint32, 2),
+        ("reserved0", c_uint32, 16),
+        ("data", c_uint32, 32),
+    ]
+    def valid(self): return cmd_code==cmd1.NPU_SET_WEIGHT_BASE and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_data(self): return data
+    def set_data(self, value): data = value
+    def get_payload_size(self): return payload_size
+    def set_payload_size(self, value): payload_size = value
+
+class npu_set_weight_length_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero", c_uint32, 4),
+        ("payload_size", c_uint32, 2),
+        ("reserved0", c_uint32, 16),
+        ("data", c_uint32, 32),
+    ]
+    def valid(self): return cmd_code==cmd1.NPU_SET_WEIGHT_LENGTH and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_data(self): return data
+    def set_data(self, value): data = value
+    def get_payload_size(self): return payload_size
+    def set_payload_size(self, value): payload_size = value
+
+class npu_set_scale_base_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero", c_uint32, 4),
+        ("payload_size", c_uint32, 2),
+        ("reserved0", c_uint32, 16),
+        ("data", c_uint32, 32),
+    ]
+    def valid(self): return cmd_code==cmd1.NPU_SET_SCALE_BASE and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_data(self): return data
+    def set_data(self, value): data = value
+    def get_payload_size(self): return payload_size
+    def set_payload_size(self, value): payload_size = value
+
+class npu_set_scale_length_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero", c_uint32, 4),
+        ("payload_size", c_uint32, 2),
+        ("reserved0", c_uint32, 16),
+        ("data", c_uint32, 32),
+    ]
+    def valid(self): return cmd_code==cmd1.NPU_SET_SCALE_LENGTH and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_data(self): return data
+    def set_data(self, value): data = value
+    def get_payload_size(self): return payload_size
+    def set_payload_size(self, value): payload_size = value
+
+class npu_set_ofm_scale_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero", c_uint32, 4),
+        ("payload_size", c_uint32, 2),
+        ("shift", c_uint32, 16),
+        ("data", c_uint32, 32),
+    ]
+    def valid(self): return self.cmd_code==cmd1.NPU_SET_OFM_SCALE and self.must_be_zero==0 and self.payload_size>=1 and self.payload_size<=2
+    def get_cmd_code(self): return self.cmd_code
+    def set_cmd_code(self, value): self.cmd_code = value
+    def get_data(self): return self.data
+    def set_data(self, value): self.data = value
+    def get_payload_size(self): return self.payload_size
+    def set_payload_size(self, value): self.payload_size = value
+    def get_shift(self): return self.shift
+    def set_shift(self, value): self.shift = value
+
+class npu_set_opa_scale_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero", c_uint32, 4),
+        ("payload_size", c_uint32, 2),
+        ("shift", c_uint32, 16),
+        ("data", c_uint32, 32),
+    ]
+    def valid(self): return cmd_code==cmd1.NPU_SET_OPA_SCALE and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_data(self): return data
+    def set_data(self, value): data = value
+    def get_payload_size(self): return payload_size
+    def set_payload_size(self, value): payload_size = value
+    def get_shift(self): return shift
+    def set_shift(self, value): shift = value
+
+class npu_set_opb_scale_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero", c_uint32, 4),
+        ("payload_size", c_uint32, 2),
+        ("reserved0", c_uint32, 16),
+        ("data", c_uint32, 32),
+    ]
+    def valid(self): return cmd_code==cmd1.NPU_SET_OPB_SCALE and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_data(self): return data
+    def set_data(self, value): data = value
+    def get_payload_size(self): return payload_size
+    def set_payload_size(self, value): payload_size = value
+
+class npu_set_dma0_src_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero", c_uint32, 4),
+        ("payload_size", c_uint32, 2),
+        ("reserved0", c_uint32, 16),
+        ("data", c_uint32, 32),
+    ]
+    def valid(self): return cmd_code==cmd1.NPU_SET_DMA0_SRC and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_data(self): return data
+    def set_data(self, value): data = value
+    def get_payload_size(self): return payload_size
+    def set_payload_size(self, value): payload_size = value
+
+class npu_set_dma0_dst_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero", c_uint32, 4),
+        ("payload_size", c_uint32, 2),
+        ("reserved0", c_uint32, 16),
+        ("data", c_uint32, 32),
+    ]
+    def valid(self): return cmd_code==cmd1.NPU_SET_DMA0_DST and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_data(self): return data
+    def set_data(self, value): data = value
+    def get_payload_size(self): return payload_size
+    def set_payload_size(self, value): payload_size = value
+
+class npu_set_dma0_len_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero", c_uint32, 4),
+        ("payload_size", c_uint32, 2),
+        ("reserved0", c_uint32, 16),
+        ("data", c_uint32, 32),
+    ]
+    def valid(self): return cmd_code==cmd1.NPU_SET_DMA0_LEN and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_data(self): return data
+    def set_data(self, value): data = value
+    def get_payload_size(self): return payload_size
+    def set_payload_size(self, value): payload_size = value
+
+class npu_set_dma0_skip0_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero", c_uint32, 4),
+        ("payload_size", c_uint32, 2),
+        ("param", c_uint32, 16),
+        ("data", c_uint32, 32),
+    ]
+    def valid(self): return cmd_code==cmd1.NPU_SET_DMA0_SKIP0 and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_data(self): return data
+    def set_data(self, value): data = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+    def get_payload_size(self): return payload_size
+    def set_payload_size(self, value): payload_size = value
+
+class npu_set_dma0_skip1_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero", c_uint32, 4),
+        ("payload_size", c_uint32, 2),
+        ("param", c_uint32, 16),
+        ("data", c_uint32, 32),
+    ]
+    def valid(self): return cmd_code==cmd1.NPU_SET_DMA0_SKIP1 and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_data(self): return data
+    def set_data(self, value): data = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+    def get_payload_size(self): return payload_size
+    def set_payload_size(self, value): payload_size = value
+
+class npu_set_ifm2_base0_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero", c_uint32, 4),
+        ("payload_size", c_uint32, 2),
+        ("reserved0", c_uint32, 16),
+        ("data", c_uint32, 32),
+    ]
+    def valid(self): return cmd_code==cmd1.NPU_SET_IFM2_BASE0 and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_data(self): return data
+    def set_data(self, value): data = value
+    def get_payload_size(self): return payload_size
+    def set_payload_size(self, value): payload_size = value
+
+class npu_set_ifm2_base1_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero", c_uint32, 4),
+        ("payload_size", c_uint32, 2),
+        ("reserved0", c_uint32, 16),
+        ("data", c_uint32, 32),
+    ]
+    def valid(self): return cmd_code==cmd1.NPU_SET_IFM2_BASE1 and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_data(self): return data
+    def set_data(self, value): data = value
+    def get_payload_size(self): return payload_size
+    def set_payload_size(self, value): payload_size = value
+
+class npu_set_ifm2_base2_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero", c_uint32, 4),
+        ("payload_size", c_uint32, 2),
+        ("reserved0", c_uint32, 16),
+        ("data", c_uint32, 32),
+    ]
+    def valid(self): return cmd_code==cmd1.NPU_SET_IFM2_BASE2 and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_data(self): return data
+    def set_data(self, value): data = value
+    def get_payload_size(self): return payload_size
+    def set_payload_size(self, value): payload_size = value
+
+class npu_set_ifm2_base3_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero", c_uint32, 4),
+        ("payload_size", c_uint32, 2),
+        ("reserved0", c_uint32, 16),
+        ("data", c_uint32, 32),
+    ]
+    def valid(self): return cmd_code==cmd1.NPU_SET_IFM2_BASE3 and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_data(self): return data
+    def set_data(self, value): data = value
+    def get_payload_size(self): return payload_size
+    def set_payload_size(self, value): payload_size = value
+
+class npu_set_ifm2_stride_x_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero", c_uint32, 4),
+        ("payload_size", c_uint32, 2),
+        ("reserved0", c_uint32, 16),
+        ("data", c_uint32, 32),
+    ]
+    def valid(self): return cmd_code==cmd1.NPU_SET_IFM2_STRIDE_X and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_data(self): return data
+    def set_data(self, value): data = value
+    def get_payload_size(self): return payload_size
+    def set_payload_size(self, value): payload_size = value
+
+class npu_set_ifm2_stride_y_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero", c_uint32, 4),
+        ("payload_size", c_uint32, 2),
+        ("reserved0", c_uint32, 16),
+        ("data", c_uint32, 32),
+    ]
+    def valid(self): return cmd_code==cmd1.NPU_SET_IFM2_STRIDE_Y and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_data(self): return data
+    def set_data(self, value): data = value
+    def get_payload_size(self): return payload_size
+    def set_payload_size(self, value): payload_size = value
+
+class npu_set_ifm2_stride_c_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero", c_uint32, 4),
+        ("payload_size", c_uint32, 2),
+        ("reserved0", c_uint32, 16),
+        ("data", c_uint32, 32),
+    ]
+    def valid(self): return cmd_code==cmd1.NPU_SET_IFM2_STRIDE_C and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_data(self): return data
+    def set_data(self, value): data = value
+    def get_payload_size(self): return payload_size
+    def set_payload_size(self, value): payload_size = value
+
+class npu_set_weight1_base_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero", c_uint32, 4),
+        ("payload_size", c_uint32, 2),
+        ("param", c_uint32, 16),
+        ("data", c_uint32, 32),
+    ]
+    def valid(self): return cmd_code==cmd1.NPU_SET_WEIGHT1_BASE and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+    def get_cmd_code(self): return cmd_code
+    def set_cmd_code(self, value): cmd_code = value
+    def get_data(self): return data
+    def set_data(self, value): data = value
+    def get_param(self): return param
+    def set_param(self, value): param = value
+    def get_payload_size(self): return payload_size
+    def set_payload_size(self, value): payload_size = value
+
+class npu_set_weight1_length_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero", c_uint32, 4),
+        ("payload_size", c_uint32, 2),
+        ("reserved0", c_uint32, 16),
+        ("data", c_uint32, 32),
+    ]
+    def valid(self): return self.cmd_code == cmd1.NPU_SET_WEIGHT1_LENGTH and self.must_be_zero == 0 and self.payload_size >= 1 and self.payload_size <= 2
+    def get_cmd_code(self): return self.cmd_code
+    def set_cmd_code(self, value): self.cmd_code = value
+    def get_data(self): return self.data
+    def set_data(self, value): self.data = value
+    def get_payload_size(self): return self.payload_size
+    def set_payload_size(self, value): self.payload_size = value
+
+class npu_set_scale1_base_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero", c_uint32, 4),
+        ("payload_size", c_uint32, 2),
+        ("param", c_uint32, 16),
+        ("data", c_uint32, 32),
+    ]
+    def valid(self): return self.cmd_code == cmd1.NPU_SET_SCALE1_BASE and self.must_be_zero == 0 and self.payload_size >= 1 and self.payload_size <= 2
+    def get_cmd_code(self): return self.cmd_code
+    def set_cmd_code(self, value): self.cmd_code = value
+    def get_data(self): return self.data
+    def set_data(self, value): self.data = value
+    def get_param(self): return self.param
+    def set_param(self, value): self.param = value
+    def get_payload_size(self): return self.payload_size
+    def set_payload_size(self, value): self.payload_size = value
+
+class npu_set_scale1_length_t(Structure):
+    _fields_ = [
+        ("cmd_code", c_uint32, 10),
+        ("must_be_zero", c_uint32, 4),
+        ("payload_size", c_uint32, 2),
+        ("reserved0", c_uint32, 16),
+        ("data", c_uint32, 32),
+    ]
+    def valid(self): return self.cmd_code == cmd1.NPU_SET_SCALE1_LENGTH and self.must_be_zero == 0 and self.payload_size >= 1 and self.payload_size <= 2
+    def get_cmd_code(self): return self.cmd_code
+    def set_cmd_code(self, value): self.cmd_code = value
+    def get_data(self): return self.data
+    def set_data(self, value): self.data = value
+    def get_payload_size(self): return self.payload_size
+    def set_payload_size(self, value): self.payload_size = value
diff --git a/ethosu/vela/extract_npu_subgraphs.py b/ethosu/vela/extract_npu_subgraphs.py
new file mode 100644
index 0000000..5b9ba8b
--- /dev/null
+++ b/ethosu/vela/extract_npu_subgraphs.py
@@ -0,0 +1,253 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Vela separates CPU operations and NPU operations into separate internal subgraphs. The CPU operations are left
+# untouched in the final output.
+#
+# Vela does this by identifying NPU passes and pulling them out from the main CPU graph into separate subgraphs, invoked
+# by NpuOp operations. Later, Vela generates command streams and compressed weight streams for the NPU subgraphs and
+# attaches them to the NpuOp. This encapsulates everything the NPU subgraph is supposed to do.
+
+from .nn_graph import Pass, PassPlacement, NpuBlockType, Subgraph
+from .operation import Operation
+import numpy as np
+
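+
+# Editorial sketch (illustrative only, not part of the original module): the forward/backward sweep that
+# extract_subgraph() below uses to pull MemoryOnly passes onto the NPU, reduced to plain strings. Any
+# MemoryOnly entry still left after both sweeps is assigned to the CPU. The helper name is hypothetical.
+def _example_memory_only_assignment():
+    places = ["Cpu", "Npu", "MemoryOnly", "Npu", "MemoryOnly", "Cpu"]
+    for seq in (range(len(places)), reversed(range(len(places)))):
+        last = "Cpu"
+        for i in seq:
+            if places[i] == "MemoryOnly" and last == "Npu":
+                places[i] = "Npu"  # squeezed between NPU passes or on an NPU/CPU boundary
+            if places[i] != "MemoryOnly":
+                last = places[i]
+    assert places == ["Cpu", "Npu", "Npu", "Npu", "Npu", "Cpu"]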
+
+def make_npu_call_op_pass(npu_subgraph):
+    op = Operation("NpuOp", "call_" + npu_subgraph.name)
+    op.attrs["subgraph"] = npu_subgraph
+    ps = Pass(op.name, PassPlacement.MemoryOnly, False, NpuBlockType.Default)
+    ps.ops = [op]
+    ps.primary_op = op
+    op.attrs["npu_block_type"] = ps.npu_block_type
+    op.scheduled_pass = ps
+
+    # Inputs and outputs filled in later as we cut the graphs
+    return ps
+
+
+def switch_tensor_for_op(op, orig_tens, new_tens):
+
+    op.inputs = [new_tens if tens == orig_tens else tens for tens in op.inputs]
+    op.outputs = [new_tens if tens == orig_tens else tens for tens in op.outputs]
+
+    ps = op.scheduled_pass
+    if ps is None:
+        return
+
+    ps.inputs = [new_tens if tens == orig_tens else tens for tens in ps.inputs]
+    ps.outputs = [new_tens if tens == orig_tens else tens for tens in ps.outputs]
+
+    if ps.ifm_tensor == orig_tens:
+        ps.ifm_tensor = new_tens
+    if ps.ifm2_tensor == orig_tens:
+        ps.ifm2_tensor = new_tens
+    if ps.ofm_tensor == orig_tens:
+        ps.ofm_tensor = new_tens
+    if ps.weight_tensor == orig_tens:
+        ps.weight_tensor = new_tens
+    if ps.scale_tensor == orig_tens:
+        ps.scale_tensor = new_tens
+
+
+def rewrite_tensor_cpu_producer_npu_consumers(
+    orig_tens, call_ps, startup_init_ps, npu_subgraph, cpu_subgraph, subgraph_for_pass
+):
+    is_const = orig_tens.ops[0].type == "Const"
+
+    new_tens = orig_tens.clone("_npu")
+    orig_tens.npu_tensor = new_tens
+    new_tens.cpu_tensor = orig_tens
+
+    op_type = "SubgraphInput"
+    if is_const:
+        op_type = "Const"
+    op = Operation(op_type, orig_tens.name + "_input")
+    op.attrs["npu_block_type"] = NpuBlockType.Default
+    op.outputs = [new_tens]
+    op.scheduled_pass = startup_init_ps
+    new_tens.ops = [op]
+    startup_init_ps.ops.append(op)
+    startup_init_ps.outputs.append(new_tens)
+
+    if not is_const:
+        call_ps.inputs.append(orig_tens)
+        call_ps.primary_op.inputs.append(orig_tens)
+
+    for op in list(orig_tens.consumers()):
+        if op is None:
+            continue  # Subgraph consumers handled separately.
+        ps = op.scheduled_pass
+        if subgraph_for_pass[ps] == npu_subgraph:
+            switch_tensor_for_op(op, orig_tens, new_tens)
+            orig_tens.consumer_list.remove(op)
+            new_tens.consumer_list.append(op)
+
+    # Deal with output tensors for the NPU graph. These are special.
+    npu_subgraph.output_tensors = [new_tens if tens == orig_tens else tens for tens in npu_subgraph.output_tensors]
+
+
+def rewrite_tensor_npu_producer_cpu_consumers(
+    orig_tens, call_ps, startup_init_ps, npu_subgraph, cpu_subgraph, subgraph_for_pass
+):
+
+    new_tens = orig_tens.clone("_cpu")
+    new_tens.npu_tensor = orig_tens
+    orig_tens.cpu_tensor = new_tens
+
+    npu_subgraph.output_tensors.append(orig_tens)
+
+    call_ps.outputs.append(new_tens)
+    call_ps.primary_op.outputs.append(new_tens)
+    new_tens.ops = [call_ps.primary_op]
+
+    for op in list(orig_tens.consumers()):
+        if op is None:
+            continue  # Subgraph consumers handled separately.
+        ps = op.scheduled_pass
+        if subgraph_for_pass[ps] != npu_subgraph:
+            switch_tensor_for_op(op, orig_tens, new_tens)
+            orig_tens.consumer_list.remove(op)
+            new_tens.consumer_list.append(op)
+
+    # Deal with output tensors for the CPU graph. These are special.
+    cpu_subgraph.output_tensors = [new_tens if tens == orig_tens else tens for tens in cpu_subgraph.output_tensors]
+
+
+def extract_subgraph(nng, orig_sg, arch):
+    assert orig_sg.placement == PassPlacement.Cpu
+
+    passes = list(orig_sg.passes)
+    place_vec = np.array([ps.placement for ps in passes])
+    place_vec[
+        place_vec == PassPlacement.StartupInit
+    ] = PassPlacement.Cpu  # Keep the startup init pass on the CPU, we'll make new ones to move onto NPU.
+
+    # MemoryOnly passes that are either squeezed between NPU passes or on the boundary of NPU and CPU
+    # passes should be assigned to the NPU.
+
+    # Forward, then backwards
+    for is_reversed in range(2):
+        last_place = PassPlacement.Cpu
+        seq = enumerate(place_vec)
+        if is_reversed:
+            seq = reversed(list(seq))
+        for idx, place in seq:
+            if place == PassPlacement.MemoryOnly:
+                if last_place == PassPlacement.Npu:
+                    place = PassPlacement.Npu
+                    place_vec[idx] = place
+
+            if place != PassPlacement.MemoryOnly:
+                last_place = place
+
+    # Anything left, assign to the CPU.
+    place_vec[place_vec == PassPlacement.MemoryOnly] = PassPlacement.Cpu
+
+    if np.all(place_vec == PassPlacement.Cpu):
+        return []  # Nothing to do
+
+    # Create the subgraphs and split passes between them
+
+    new_subgraphs = []
+    split_count = 0
+    subgraph_for_pass = {}
+    orig_sg.passes = []
+    call_pass = {}
+    startup_init_passes = {}
+
+    last_place = PassPlacement.Cpu
+    curr_sg = orig_sg
+
+    for idx, place in enumerate(place_vec):
+        if place != last_place:
+            if place == PassPlacement.Npu:
+                split_count += 1
+                curr_sg = Subgraph("%s_split_%d" % (orig_sg.name, split_count), PassPlacement.Npu)
+                new_subgraphs.append(curr_sg)
+                call_ps = make_npu_call_op_pass(curr_sg)
+                subgraph_for_pass[call_ps] = orig_sg
+                orig_sg.passes.append(call_ps)
+                call_pass[curr_sg] = call_ps
+
+                startup_init_ps = Pass(
+                    curr_sg.name + "_startup_init", PassPlacement.StartupInit, False, NpuBlockType.Default
+                )
+                curr_sg.passes.append(startup_init_ps)
+                startup_init_passes[curr_sg] = startup_init_ps
+                subgraph_for_pass[startup_init_ps] = curr_sg
+
+            else:
+                curr_sg = orig_sg
+            last_place = place
+        ps = passes[idx]
+        subgraph_for_pass[ps] = curr_sg
+        curr_sg.passes.append(ps)
+
+    # Rewrite tensors to fix up graphs.
+
+    for curr_sg in new_subgraphs:
+        for ps in curr_sg.passes:
+            for tens in ps.inputs:
+                source_sgs = [subgraph_for_pass[op.scheduled_pass] for op in tens.ops]
+                assert len(source_sgs) > 0
+                producer_sg = source_sgs[0]
+                for sg in source_sgs:
+                    assert sg == producer_sg  # All need to be the same.
+
+                if producer_sg != curr_sg:
+                    assert (
+                        producer_sg == orig_sg
+                    )  # Because we go in-order, all the producers must be the original graph.
+                    rewrite_tensor_cpu_producer_npu_consumers(
+                        tens, call_pass[curr_sg], startup_init_passes[curr_sg], curr_sg, orig_sg, subgraph_for_pass
+                    )
+
+            for tens in ps.outputs:
+
+                dest_sgs = [subgraph_for_pass[op.scheduled_pass] for op in tens.consumers() if op is not None]
+                need_rewrite = False
+                for sg in dest_sgs:
+                    if sg != curr_sg:
+                        need_rewrite = True
+                        break
+                if tens in orig_sg.output_tensors:
+                    need_rewrite = True
+
+                if need_rewrite:
+                    rewrite_tensor_npu_producer_cpu_consumers(
+                        tens, call_pass[curr_sg], startup_init_passes[curr_sg], curr_sg, orig_sg, subgraph_for_pass
+                    )
+
+    return new_subgraphs
+
+
+def extract_npu_subgraphs(nng, arch):
+
+    nng.refresh_after_modification()
+
+    for sg in list(nng.subgraphs):
+        if sg.placement == PassPlacement.Cpu:
+            new_subgraphs = extract_subgraph(nng, sg, arch)
+            nng.subgraphs += new_subgraphs
+
+    nng.refresh_after_modification()
+    nng.prune_startup_init_pass()
+
+    for sg in nng.subgraphs:
+        sg.build_pass_links()
diff --git a/ethosu/vela/graph_optimiser.py b/ethosu/vela/graph_optimiser.py
new file mode 100644
index 0000000..f0afcf8
--- /dev/null
+++ b/ethosu/vela/graph_optimiser.py
@@ -0,0 +1,485 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Early optimisation of the network graph, using the rewrite_graph module to traverse the graph. The optimisations
+# are split into two parts: optimise_graph_a and optimise_graph_b.
+
+from .nn_graph import Operation, NpuBlockType, Tensor
+from . import rewrite_graph
+from .data_type import BaseType, DataType
+import numpy as np
+import math
+from .numeric_util import round_up_divide
+
+passthrough_nodes = set(("Identity",))
+
+
+def remove_passthrough_tensor(tens, arch):
+    if len(tens.ops) == 1 and tens.ops[0].type in passthrough_nodes:
+        assert len(tens.ops[0].inputs) == 1
+        tens = tens.ops[0].inputs[0]
+    return tens
+
+
+def rewrite_concat(tens, arch):
+    if len(tens.ops) == 1 and tens.ops[0].is_concat_op():
+        concat_op = tens.ops[0]
+        if tens != concat_op.outputs[0]:
+            return tens  # don't attempt to rewrite the min/max outputs of QuantizedConcat
+
+        # Not supported so leave it and run on CPU
+        if not concat_op.run_on_npu:
+            return tens
+
+        inputs, axis = concat_op.get_concat_inputs_axis()
+
+        tens.ops = []
+        offset = 0
+        for idx, inp in enumerate(inputs):
+            new_op = Operation("ConcatSliceWrite", concat_op.name + str(idx))
+            new_op.inputs = [inp]
+            new_op.outputs = [tens]
+            new_op.attrs["concat_axis"] = axis
+            new_op.attrs["concat_start"] = offset
+            offset += inp.shape[axis]
+            new_op.attrs["concat_end"] = offset
+            new_op.run_on_npu = True
+            tens.ops.append(new_op)
+        assert tens.shape[axis] == offset
+
+    return tens
+
+
+def rewrite_split(tens, arch):
+
+    if len(tens.ops) == 1 and tens.ops[0].is_split_op():
+        split_op = tens.ops[0]
+
+        # Not supported so leave it and run on CPU
+        if not split_op.run_on_npu:
+            return tens
+
+        inp, outputs, axis, offset_start, offset_end = split_op.get_split_inputs_axis()
+
+        tens.ops = []
+        new_op = Operation("SplitSliceRead", split_op.name)
+        new_op.inputs = [inp]
+        new_op.outputs = [tens]
+
+        # For Split the offset cannot be extracted from the tensor so it has to
+        # be calculated from the index of the output tensor
+        if axis is not None:
+            # Get the start and end of the split
+            offset_start = [0] * len(tens.shape)
+            offset_end = [0] * len(tens.shape)
+            for out in outputs:
+                if out == tens:
+                    break
+                offset_start[axis] += out.shape[axis]
+
+            offset_end[axis] = offset_start[axis] + tens.shape[axis]
+
+        new_op.attrs["split_start"] = offset_start
+        new_op.attrs["split_end"] = offset_end
+        new_op.run_on_npu = True
+        tens.ops.append(new_op)
+
+    return tens
+
+
+def needed_total_padding(input_size, stride, filter_size):
+    out_size = (input_size + stride - 1) // stride
+    needed_input = (out_size - 1) * stride + filter_size
+    total_padding = max(0, needed_input - input_size)
+    return total_padding
+
+
+def calc_padding_and_skirt(padding_type, kernel_size, stride, input_dims):
+    ypad = needed_total_padding(int(input_dims[1]), int(stride[1]), int(kernel_size[0]))
+    xpad = needed_total_padding(int(input_dims[2]), int(stride[2]), int(kernel_size[1]))
+    if padding_type == b"SAME":
+        left_pad = (xpad + 0) // 2
+        right_pad = (xpad + 1) // 2
+        top_pad = (ypad + 0) // 2
+        bottom_pad = (ypad + 1) // 2
+    elif padding_type == b"VALID":
+        left_pad = 0
+        right_pad = 0
+        top_pad = 0
+        bottom_pad = 0
+    else:
+        assert 0, "Unknown padding"
+    padding = (top_pad, left_pad, bottom_pad, right_pad)
+    skirt = (top_pad, left_pad, ypad - top_pad, xpad - left_pad)
+    return padding, skirt
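+
+
+# Editorial sketch (illustrative only, not part of the original module): a worked example of the SAME-padding
+# maths above, assuming an NHWC input of 1x224x224x3, a 3x3 kernel and stride 2. The helper name is hypothetical.
+def _example_same_padding():
+    padding, skirt = calc_padding_and_skirt(b"SAME", (3, 3), (1, 2, 2, 1), (1, 224, 224, 3))
+    # needed_total_padding(224, 2, 3) == 1 in each spatial dimension, split as 0 before and 1 after
+    assert padding == (0, 0, 1, 1)  # (top, left, bottom, right)
+    assert skirt == (0, 0, 1, 1)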
+
+
+def fixup_conv2d_backprop(op, arch):
+    if op.type == "Conv2DBackpropInput":
+        # flip the inputs
+        op.inputs[0], op.inputs[2] = op.inputs[2], op.inputs[0]
+        op.type = "Conv2DBackpropInputSwitched"
+
+    return op
+
+
+def fixup_fully_connected_input(op, arch):
+    if op.type == "FullyConnectedAct":
+        inp = op.inputs[0]
+        weights = op.inputs[1]
+
+        n_in_elems = weights.shape[-2]
+        elms = inp.elements()
+        batch_size = elms // n_in_elems
+        assert batch_size * n_in_elems == elms
+
+        desired_shape = [batch_size, n_in_elems]
+        if inp.shape != desired_shape:
+            # mismatch, insert a reshape to fix this.
+            reshape_name = op.name + "_reshape"
+            new_shape_tens = Tensor([1], DataType.int32, reshape_name + "_shape")
+            new_shape_tens.values = np.array(desired_shape)
+            new_shape_tens_const = Operation("Const", new_shape_tens.name + "_const")
+            new_shape_tens.ops = [new_shape_tens_const]
+            new_shape_tens_const.outputs = [new_shape_tens]
+
+            reshape_op = Operation("Reshape", reshape_name)
+            reshape_op.inputs = [inp, new_shape_tens]
+            reshape_op.attrs["new_shape"] = desired_shape
+            reshape_out = inp.clone("_reshaped")
+            reshape_out.shape = reshape_out.storage_shape = reshape_out.bandwidth_shape = desired_shape
+            reshape_out.ops = [reshape_op]
+            reshape_op.outputs = [reshape_out]
+
+            op.inputs[0] = reshape_out
+
+    return op
+
+
+def fixup_pack_input(op, arch):
+    if op.type == "Pack":
+        # Pack is also referred to as Stack
+        # Requires the rewrite_concat function to be called on the op afterwards
+        axis = int(op.attrs["axis"])
+        desired_shape = op.inputs[0].shape[:axis] + [1] + op.inputs[0].shape[axis:]
+
+        # Construct 1 shape tensor to be used by all inserted reshape ops
+        new_shape_name = op.name + "_reshape_shape"
+        new_shape_tens = Tensor([1], DataType.int32, new_shape_name)
+        new_shape_tens.values = np.array(desired_shape)
+        new_shape_tens_const = Operation("Const", new_shape_tens.name + "_const")
+        new_shape_tens.ops = [new_shape_tens_const]
+        new_shape_tens_const.outputs = [new_shape_tens]
+
+        for idx, inp in enumerate(op.inputs):
+            reshape_name = op.name + str(idx) + "_reshape"
+            reshape_op = Operation("Reshape", reshape_name)
+            reshape_op.inputs = [inp, new_shape_tens]
+            reshape_op.attrs["new_shape"] = desired_shape
+            reshape_out = inp.clone("_reshaped")
+            reshape_out.shape = reshape_out.storage_shape = reshape_out.bandwidth_shape = desired_shape
+            reshape_out.ops = [reshape_op]
+            reshape_op.outputs = [reshape_out]
+
+            op.inputs[idx] = reshape_out
+
+        op.type = "PackReshaped"
+
+    return op
+
+
+def fixup_unpack_output(tens, arch):
+    op = tens.ops[0]
+    if op.type in set(("Unpack", "StridedSlice")):
+        # Unpack is also referred to as Unstack
+        # Requires the rewrite_split function to be called on the op afterwards
+        if op.type == "StridedSlice":
+            shrink_axis_mask = op.attrs["shrink_axis_mask"]
+            if shrink_axis_mask == 0:
+                # Equal Rank StridedSlice, no need to insert reshape
+                return tens
+
+            # Only allow shrinking 1 axis for now
+            assert shrink_axis_mask & (shrink_axis_mask - 1) == 0
+            assert len(tens.shape) == (len(op.inputs[0].shape) - 1)
+
+            axis = int(math.log2(shrink_axis_mask))
+            op.attrs["shrink_axis_mask"] = 0
+        else:
+            axis = int(op.attrs["axis"])
+            op.type = "UnpackReshaped"
+
+        desired_shape = tens.shape[:axis] + [1] + tens.shape[axis:]
+
+        # Construct 1 shape tensor to be used by all inserted reshape ops
+        new_shape_name = op.name + "_reshape_shape"
+        new_shape_tens = Tensor([1], DataType.int32, new_shape_name)
+        new_shape_tens.values = np.array(tens.shape)
+        new_shape_tens_const = Operation("Const", new_shape_tens.name + "_const")
+        new_shape_tens.ops = [new_shape_tens_const]
+        new_shape_tens_const.outputs = [new_shape_tens]
+
+        for idx, out_tens in enumerate(op.outputs):
+            reshape_name = op.name + str(idx) + "_reshape"
+            reshape_op = Operation("Reshape", reshape_name)
+            reshape_op.outputs = [out_tens]
+            reshape_in = out_tens.clone("_reshaped")
+            reshape_in.shape = reshape_in.storage_shape = reshape_in.bandwidth_shape = desired_shape
+            reshape_in.ops = [op]
+            out_tens.ops = [reshape_op]
+            reshape_op.inputs = [reshape_in, new_shape_tens]
+
+            op.outputs[idx] = reshape_in
+
+    return tens
+
+
+def add_padding_fields(op, arch):
+    if "padding" in op.attrs:
+        if "Conv" in op.type:
+            kernel_size = op.inputs[1].shape[:2]
+            input_shape = op.inputs[0].shape
+        elif "Pool" in op.type:
+            kernel_size = op.attrs["ksize"][1:3]
+            input_shape = op.inputs[0].shape
+        elif op.type == "ExtractImagePatches":
+            kernel_size = op.attrs["ksizes"][1:3]
+            input_shape = op.inputs[0].shape
+        else:
+            assert 0, "Unknown operation that uses padding"
+
+        padding, skirt = calc_padding_and_skirt(op.attrs["padding"], kernel_size, op.attrs["strides"], input_shape)
+        op.attrs["explicit_padding"] = padding
+        op.attrs["skirt"] = skirt
+    return op
+
+
+conv_op = set(("Conv2D", "QuantizedConv2D", "Conv2DBackpropInputSwitched", "Conv2DBiasAct"))
+fc_op = set(
+    (
+        "MatMul",
+        "QuantizedMatMul",
+        "BlockLSTM",
+        "RnnAct",
+        "UnidirectionalSequenceRnnAct",
+        "BidirectionalSequenceRnnAct",
+        "LstmAct",
+        "UnidirectionalSequenceLstmAct",
+        "BidirectionalSequenceLstmAct",
+        "FullyConnectedAct",
+    )
+)
+depthwise_op = set(("DepthwiseConv2dNative", "DepthwiseConv2dBiasAct",))
+pool_op = set(("AvgPool", "MaxPool", "QuantizedAvgPool", "QuantizedMaxPool", "AvgPoolAct", "MaxPoolAct"))
+elementwise_op = set(("AddAct", "MulAct", "SubAct", "Maximum", "Minimum", "LeakyRelu", "Abs"))
+activation_ops = set(("Relu", "Relu6", "ReluN1To1", "Sigmoid", "Tanh"))
+memory_only_ops = set(("Reshape",))
+
+# Check if the op can be reordered
+def get_prepend_op(op):
+    inp = op.inputs[0]
+    # The op should be reordered between prev_op and prep_op
+    prev_op = inp.ops[-1]
+    prep_op = None
+    while prev_op.type in memory_only_ops and len(prev_op.outputs) == 1 and len(prev_op.outputs[0].consumers()) == 1:
+        prep_op = prev_op
+        inp = prev_op.inputs[0]
+        prev_op = inp.ops[-1]
+    if prev_op is not None and len(prev_op.outputs) == 1 and len(prev_op.outputs[0].consumers()) == 1:
+        return prep_op
+
+    return None
+
+
+def mark_npu_block_type(op, arch):
+    npu_block_type = NpuBlockType.Default
+    if op.type in conv_op:
+        npu_block_type = NpuBlockType.ConvolutionMxN
+    elif op.type in fc_op:
+        npu_block_type = NpuBlockType.VectorProduct
+    elif op.type in depthwise_op:
+        npu_block_type = NpuBlockType.ConvolutionDepthWise
+    elif op.type in pool_op:
+        npu_block_type = NpuBlockType.Pooling
+    elif op.type in elementwise_op:
+        npu_block_type = NpuBlockType.ElementWise
+
+    op.attrs["npu_block_type"] = npu_block_type
+    return op
+
+
+def convert_depthwise_to_conv(op, arch):
+    # Depthwise is equivalent to a single conv2d if the ifm depth is 1 and
+    # the ofm depth equals the depth multiplier.
+    # If those conditions are true, then we can perform a simple
+    # switch of the operator type (and weight order)
+
+    if ("DepthwiseConv2d" in op.type) and (op.attrs["depth_multiplier"] != 1):
+        ifm_tensor = op.inputs[0]
+        weight_tensor = op.inputs[1]
+        ofm_tensor = op.outputs[0]
+        if (ifm_tensor.shape[3] == 1) and (ofm_tensor.shape[3] == op.attrs["depth_multiplier"]):
+            # Change op type to Conv2d
+            op.type = op.type.replace("DepthwiseConv2d", "Conv2D")
+            del op.attrs["channel_multiplier"]
+            del op.attrs["depth_multiplier"]
+
+            weight_tensor.quant_values = np.transpose(weight_tensor.quant_values, (0, 1, 3, 2))
+            weight_tensor.shape = weight_tensor.storage_shape = weight_tensor.bandwidth_shape = list(
+                weight_tensor.quant_values.shape
+            )
+        else:
+            print(
+                "Error: Unsupported DepthwiseConv2d with depth_multiplier = {0}, "
+                "ifm channels = {1}, ofm channels = {2}".format(
+                    op.attrs["depth_multiplier"], ifm_tensor.shape[3], ofm_tensor.shape[3]
+                )
+            )
+            assert False
+    return op
+
+
+# Reorder activation op if it's after the memory only operations
+def fixup_act_reorder(op, arch):
+    if op.type in activation_ops:
+        prep_op = get_prepend_op(op)
+        if prep_op is not None:
+            act_op = op.clone("_reordered")
+            act_op.inputs = [prep_op.inputs[0]]
+            act_op_out = act_op.inputs[0].clone("_acted")
+            act_op_out.quantization = op.outputs[0].quantization.clone()
+            act_op_out.ops = [act_op]
+            act_op.outputs = [act_op_out]
+            prep_op.inputs[0] = act_op_out
+            prep_op.outputs[0].quantization = act_op_out.quantization.clone()
+
+            # Mark the op so that it will be removed as passthrough later on
+            op.type = "Identity"
+    return op
+
+
+def convert_mul_max_to_abs_or_lrelu(op, arch):
+    """Whenever there is a subgraph with this topology:
+
+       Input    X   For X = -1 or X > 0
+       |   \   /    This subgraph can be replaced with either
+       |    Mul     an Abs (if X = -1) or a LeakyReLU (if X > 0)
+       |   /
+       Max
+    """
+
+    if op.type == "Maximum":
+        # finds the Mul input(s) to the Max
+        muls = [i for i in op.inputs if i.ops[0].type == "MulAct"]
+        if len(muls) == 1:
+            mul = muls[0].ops[0]
+        elif len(muls) == 2:
+            # In the case both inputs are Muls, find the one with the same input as the Max
+            mul = [m for m in muls if len(set(op.inputs + m.ops[0].inputs)) == 1][0].ops[0]
+        else:
+            # No Mul inputs
+            return op
+
+        # make sure the Mul doesn't have any other consumers
+        if len(mul.outputs[0].consumers()) != 1:
+            return op
+        # make sure the Mul doesn't have a faf
+        if mul.attrs["fused_activation_function"]:
+            return op
+
+        # finds the branched input that goes to both the Max and the Mul
+        shared = set(op.inputs) & set(mul.inputs)
+        if len(shared) == 1:
+            shared_in = shared.pop()
+            # find the constant scalar input to the Mul
+            const_tens = (set(mul.inputs) - {shared_in}).pop()
+            # check that it is a scalar
+            if const_tens.shape != []:
+                return op
+            const = const_tens.ops[0]
+            # check that it is a constant
+            if const.type != "Const":
+                return op
+        else:
+            return op
+
+        val = const.outputs[0].values
+        if val >= 0:
+            new_op = "LeakyRelu"
+            op.attrs["alpha"] = val
+        elif val == -1:
+            new_op = "Abs"
+        else:
+            return op
+
+        op.type = op.type.replace("Maximum", new_op)
+        op.name = op.name.replace("Maximum", new_op)
+        op.outputs[0].name = op.outputs[0].name.replace("Maximum", new_op)
+        op.inputs = [shared_in]
+    return op
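+
+
+# Editorial sketch (illustrative only, not part of the original module): a numeric check of the identity the
+# rewrite above relies on, i.e. max(x, alpha * x) equals LeakyReLU(x) for 0 < alpha < 1. Helper name is hypothetical.
+def _example_mul_max_is_lrelu():
+    x = np.array([-2.0, -0.5, 0.0, 3.0])
+    lrelu = np.where(x > 0, x, 0.1 * x)
+    assert np.allclose(np.maximum(x, 0.1 * x), lrelu)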
+
+
+def supported_operator_check(op, arch):
+    op.run_on_npu = arch.supported_operators.is_operator_supported(op)
+    return op
+
+
+def optimise_graph_a(nng, arch, verbose_graph=False):
+    if verbose_graph:
+        nng.print_graph()
+
+    op_rewrite_list = [
+        # mark block type and check if the operations are supported
+        mark_npu_block_type,
+        supported_operator_check,
+        # then do any rewrites of supported operators
+        convert_depthwise_to_conv,
+        fixup_fully_connected_input,
+        fixup_pack_input,
+        fixup_conv2d_backprop,
+        fixup_act_reorder,
+        add_padding_fields,
+        mark_npu_block_type,
+        # convert_mul_max_to_abs_or_lrelu # TODO: enable optimisation once quantisation issues are resolved
+    ]
+
+    for idx, sg in enumerate(nng.subgraphs):
+        # rewrite graph pass
+        nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
+            sg, arch, [fixup_unpack_output,], op_rewrite_list, rewrite_unsupported=False
+        )
+
+    for idx, sg in enumerate(nng.subgraphs):
+        # remove passthrough tensors
+        nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(sg, arch, [remove_passthrough_tensor,], [])
+
+    if verbose_graph:
+        nng.print_graph()
+    return nng
+
+
+def optimise_graph_b(nng, arch, verbose_graph=False):
+    if verbose_graph:
+        nng.print_graph()
+
+    for idx, sg in enumerate(nng.subgraphs):
+        # combined rewrite graph pass
+        nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(sg, arch, [rewrite_concat, rewrite_split,], [])
+
+    if verbose_graph:
+        nng.print_graph()
+    return nng
diff --git a/ethosu/vela/greedy_allocation.py b/ethosu/vela/greedy_allocation.py
new file mode 100644
index 0000000..6b3d2c1
--- /dev/null
+++ b/ethosu/vela/greedy_allocation.py
@@ -0,0 +1,95 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Allocate tensor addresses using a greedy algorithm.
+
+from . import numeric_util
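+
+
+# Editorial sketch (illustrative only, not part of the original module): the core best-fit idea behind
+# GreedyAllocator.alloc() below, with alignment and live-range handling omitted. Given already-placed
+# (start, size) pairs, a new buffer goes into the smallest gap that fits, otherwise at the current top of
+# memory. The helper name is hypothetical, e.g. _example_best_fit([(0, 10), (30, 5)], 15) returns 10.
+def _example_best_fit(allocs, size):
+    top = max((start + sz for start, sz in allocs), default=0)
+    best, best_fit = top, None
+    offset = 0
+    for start, sz in sorted(allocs):
+        gap = start - offset
+        if size <= gap and (best_fit is None or gap < best_fit):
+            best, best_fit = offset, gap
+        offset = start + sz
+    return best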
+
+
+class GreedyAllocator:
+    def __init__(self, nng, arch, live_ranges, mem_area):
+        self.nng = nng
+        self.arch = arch
+        self.mem_area = mem_area
+
+        self.live_ranges = live_ranges
+        self.memory_required = 0
+
+        self.current_allocs = []
+
+    def alloc(self, new_lr):
+        size = new_lr.size
+        current_top = 0
+        if self.current_allocs:
+            current_top = max(start_addr + lr.size for start_addr, lr in self.current_allocs)
+        best_offset = numeric_util.round_up(current_top, new_lr.get_alignment())
+        best_offset_fit = (1 << 64) - 1
+
+        current_offset = 0
+        for start_addr, lr in self.current_allocs:
+            aligned_current_offset = numeric_util.round_up(current_offset, new_lr.get_alignment())
+            if aligned_current_offset + size <= start_addr and start_addr - current_offset < best_offset_fit:
+                best_offset = current_offset
+                best_offset_fit = start_addr - current_offset
+
+            current_offset = start_addr + lr.size
+
+        self.memory_required = max(self.memory_required, best_offset + size)
+        new_lr.set_address(best_offset)
+        self.current_allocs.append((best_offset, new_lr))
+        self.current_allocs = list(sorted(self.current_allocs))
+
+    def dealloc(self, lr_to_dealloc):
+        self.current_allocs = [(start_addr, lr) for start_addr, lr in self.current_allocs if lr != lr_to_dealloc]
+
+    def allocate_live_ranges(self, verbose_allocation):
+        lrs = set()
+        for lr in self.live_ranges.ranges.values():
+            lrs.add((lr.start_time, lr.end_time, lr))
+
+        lrs = sorted(lrs)
+
+        for curr_time, _, new_lr in lrs:
+            for _, lr in list(self.current_allocs):
+                if lr.end_time < curr_time:
+                    self.dealloc(lr)
+
+            self.alloc(new_lr)
+
+        assert self.verify_allocation()
+        return self.memory_required
+
+    def verify_allocation(self):
+        lrs = list(self.live_ranges.ranges.values())
+        for n in lrs:
+            for m in lrs:
+                if n != m and n.overlaps_ranges(m):
+                    overlap, tens_n, tens_m = n.overlaps_address(m)
+                    if overlap:
+                        print("Solution failed, overlapping buffer!")
+                        print(tens_n.address, tens_n.address + n.size, n.name)
+                        print(tens_m.address, tens_m.address + m.size, m.name)
+                        print()
+                        return False
+
+        return True
+
+
+def allocate_live_ranges(nng, arch, live_ranges, mem_area, verbose_allocation=False):
+    g = GreedyAllocator(nng, arch, live_ranges, mem_area)
+    return g.allocate_live_ranges(verbose_allocation)
diff --git a/ethosu/vela/high_level_command_stream.py b/ethosu/vela/high_level_command_stream.py
new file mode 100644
index 0000000..952e203
--- /dev/null
+++ b/ethosu/vela/high_level_command_stream.py
@@ -0,0 +1,365 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Contains classes that hold commands for the high-level command stream (one command per DMA or NPU stripe).
+
+from enum import Enum, IntEnum
+import numpy as np
+from .operation import NpuBlockType
+from .numeric_util import round_up_divide
+from .range_set import MemoryAccessSet, AccessDirection
+
+
+class Box:
+    def __init__(self, start_coord, end_coord):
+        self.start_coord = list(start_coord)
+        self.end_coord = list(end_coord)
+        assert len(self.start_coord) == len(end_coord)
+        for i in range(len(self.start_coord)):
+            assert self.start_coord[i] <= self.end_coord[i]
+
+    def transform_with_strides_and_skirt(
+        self, strides, skirt, ifm_shape, npu_block_type, concat_axis=0, concat_offset=0, split_offset=None, k_height=1
+    ):
+        new_start_coord = list(self.start_coord)
+        new_end_coord = list(self.end_coord)
+
+        new_start_coord[concat_axis] -= concat_offset
+        new_end_coord[concat_axis] -= concat_offset
+
+        if split_offset is not None:
+            for idx in range(len(split_offset)):
+                new_start_coord[idx] += split_offset[idx]
+                new_end_coord[idx] += split_offset[idx]
+
+        if split_offset is None and npu_block_type in set((NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct)):
+            # these types of operations do a "dot product" over the entire IFM
+            new_start_coord[-1] = 0
+            new_end_coord[-1] = ifm_shape[-1]
+
+        if min(len(new_end_coord), len(ifm_shape)) >= 2:
+            new_end_coord[-2] = min(new_end_coord[-2], ifm_shape[-2])
+        if min(len(new_end_coord), len(ifm_shape)) >= 3:
+            new_end_coord[-3] = min(new_end_coord[-3], ifm_shape[-3])
+
+        pad_top = 0
+        pad_bottom = 0
+        if strides is not None and skirt is not None:
+            if len(new_start_coord) >= 2:
+                stride = strides[2]
+                new_start_coord[-2] = max(new_start_coord[-2] * stride - skirt[1], 0)
+                new_end_coord[-2] = min(new_end_coord[-2] * stride + skirt[3], ifm_shape[-2])
+
+            if len(new_start_coord) >= 3:
+                stride = strides[1]
+
+                total_stride = stride * (new_end_coord[-3] - new_start_coord[-3] - 1)
+                new_start_coord[-3] = new_start_coord[-3] * stride - skirt[0]
+
+                pad_top = max(0, 0 - new_start_coord[-3])
+                new_start_coord[-3] = max(new_start_coord[-3], 0)
+
+                while len(ifm_shape) < 3:
+                    ifm_shape = [1] + ifm_shape
+                if (new_end_coord[-3] * stride + skirt[2]) > ifm_shape[-3]:
+                    # pad_bottom is calculated from the difference between the end position of the weight kernel
+                    # after the last stride and the ifm height.
+                    k_start = new_start_coord[-3] - pad_top
+                    pad_bottom = max(0, k_start + total_stride + k_height - ifm_shape[-3])
+
+                new_end_coord[-3] = min(new_end_coord[-3] * stride + skirt[2], ifm_shape[-3])
+
+        return Box(new_start_coord, new_end_coord), pad_top, pad_bottom
+
+    def make_weight_box(weight_shape, npu_block_type, oc_range_start=None, oc_range_end=None, weights_transposed=False):
+        start = [0] * len(weight_shape)
+        end = list(weight_shape)
+        if oc_range_start is not None and oc_range_end is not None:
+            if npu_block_type == NpuBlockType.ConvolutionDepthWise:
+                # input range is output range divided by channel multiplier
+                if weights_transposed:
+                    start[-1] = oc_range_start // weight_shape[-2]
+                    end[-1] = oc_range_end // weight_shape[-2]
+                else:
+                    start[-2] = oc_range_start // weight_shape[-1]
+                    end[-2] = oc_range_end // weight_shape[-1]
+            else:
+                start[-1] = oc_range_start
+                end[-1] = oc_range_end
+        for i in range(len(end)):
+            assert 0 <= start[i] < weight_shape[i]
+            assert 0 < end[i] <= weight_shape[i]
+
+        return Box(start, end)
+
+    def get_size_shape(self):
+        return [int(self.end_coord[i] - self.start_coord[i]) for i in range(len(self.end_coord))]
+
+    def get_size(self):
+        return int(np.prod(self.get_size_shape()))
+
+    def __str__(self):
+        return "<Box %s - %s>" % (self.start_coord, self.end_coord)
+
+    __repr__ = __str__
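+
+
+# Editorial sketch (illustrative only, not part of the original module): a Box spans [start, end) along each
+# axis, so its element count is the product of the per-axis extents. The helper name is hypothetical.
+def _example_box_size():
+    box = Box([0, 0, 0, 0], [1, 8, 8, 16])
+    assert box.get_size_shape() == [1, 8, 8, 16]
+    assert box.get_size() == 1 * 8 * 8 * 16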
+
+
+class CommandType(IntEnum):
+    NpuStripe = 0
+    DMA = 1
+    Size = 2
+
+
+class Command:
+    def get_ofm_y_range_for_pass(self, ps_requested):
+        return None
+
+    def is_npu_pass_command(self):
+        return False
+
+    def get_memory_accesses(self):
+        return None
+
+    def get_operation_count(self):
+        # returns numpy array of (DPU blocks, dma_ops). Should line up with the CommandType enum
+        return np.array((0, 0))
+
+
+class NpuStripe(Command):
+    def __init__(
+        self,
+        ps,
+        block_config,
+        is_first,
+        is_last,
+        is_first_h_stripe,
+        is_last_h_stripe,
+        ifm_tensor,
+        ifm_box,
+        ofm_tensor,
+        ofm_box,
+        weight_tensor=None,
+        weight_box=None,
+        scale_tensor=None,
+        concat_axis=0,
+        concat_offset=0,
+        ifm2_tensor=None,
+        ifm2_box=None,
+        pad_top=0,
+        pad_bottom=0,
+    ):
+        self.cmdtype = CommandType.NpuStripe
+        self.ps = ps
+        self.block_config = block_config
+        self.is_first = is_first
+        self.is_last = is_last
+        self.is_first_h_stripe = is_first_h_stripe
+        self.is_last_h_stripe = is_last_h_stripe
+        self.ifm_tensor = ifm_tensor
+        self.ifm_box = ifm_box
+        self.ifm2_tensor = ifm2_tensor
+        self.ifm2_box = ifm2_box
+        self.ofm_tensor = ofm_tensor
+        self.ofm_box = ofm_box
+        self.weight_tensor = weight_tensor
+        self.scale_tensor = scale_tensor
+        self.weight_box = weight_box
+        self.concat_axis = concat_axis
+        self.concat_offset = concat_offset
+        self.pad_top = pad_top
+        self.pad_bottom = pad_bottom
+        for i in range(len(self.ofm_box.end_coord)):
+            assert self.ofm_box.end_coord[i] <= self.ofm_tensor.shape[i]
+
+    def get_memory_accesses(self):
+        res = MemoryAccessSet()
+        if self.ifm_tensor is not None and self.ifm_tensor.shape != []:
+            res.add(
+                self.ifm_tensor.get_address_ranges_for_coordinates(self.ifm_box.start_coord, self.ifm_box.end_coord),
+                AccessDirection.Read,
+            )
+        if self.ifm2_tensor is not None and self.ifm2_tensor.shape != []:
+            res.add(
+                self.ifm2_tensor.get_address_ranges_for_coordinates(self.ifm2_box.start_coord, self.ifm2_box.end_coord),
+                AccessDirection.Read,
+            )
+        if self.ofm_tensor is not None:
+            res.add(
+                self.ofm_tensor.get_address_ranges_for_coordinates(self.ofm_box.start_coord, self.ofm_box.end_coord),
+                AccessDirection.Write,
+            )
+        if self.weight_tensor is not None:
+            res.add(
+                self.weight_tensor.get_address_ranges_for_coordinates(
+                    self.weight_box.start_coord, self.weight_box.end_coord
+                ),
+                AccessDirection.Read,
+            )
+        return res
+
+    def is_npu_pass_command(self):
+        return True
+
+    def __str__(self):
+        return "<NPUStripe: ps=%s, ifm_box=%s, ifm2_box=%s, ofm_box=%s, weight_box=%s, block_config=%s>" % (
+            self.ps.name,
+            self.ifm_box,
+            self.ifm2_box,
+            self.ofm_box,
+            self.weight_box,
+            self.block_config,
+        )
+
+    __repr__ = __str__
+
+    def get_ofm_y_range_for_pass(self, ps_requested):
+        if ps_requested != self.ps:
+            return None
+        if len(self.ofm_box.start_coord) >= 3:
+            return (self.ofm_box.start_coord[-3], self.ofm_box.end_coord[-3])
+        return None
+
+    def get_block_dimensions(self):
+        ofm_box = self.ofm_box
+        block_config = self.block_config
+
+        out_height = 1
+        out_width = 1
+        out_depth = ofm_box.end_coord[-1] - ofm_box.start_coord[-1]
+        if len(ofm_box.end_coord) >= 4:
+            out_width = ofm_box.end_coord[-2] - ofm_box.start_coord[-2]
+            out_height = ofm_box.end_coord[-3] - ofm_box.start_coord[-3]
+
+        assert out_height >= 0
+        assert out_width >= 0
+        assert out_depth >= 0
+        return (
+            round_up_divide(out_height, block_config[0]),
+            round_up_divide(out_width, block_config[1]),
+            round_up_divide(out_depth, block_config[3]),
+        )
+
+    def get_operation_count(self):
+        # returns numpy array of (DPU blocks, dma_ops)
+        return np.array((self.get_n_blocks(), 0))
+
+    def get_n_blocks(self):
+        h, w, d = self.get_block_dimensions()
+        res = h * w * d
+        assert res >= 0
+        return res
+
+    def get_single_block_command(self, block_idx):
+        block_cfg = (self.block_config[0], self.block_config[1], self.block_config[3])
+        dims = self.get_block_dimensions()
+        strides = dims[1] * dims[2], dims[2], 1
+        coord = []
+        idx_left = block_idx
+        for s in strides:
+            c = idx_left // s
+            idx_left -= c * s
+            coord.append(c)
+
+        assert idx_left == 0
+
+        # put in dummy height/widths in case we're dealing with FC layers
+        ofm_start = list(self.ofm_box.start_coord)
+        ofm_end = list(self.ofm_box.end_coord)
+
+        # cut out a nice block shape
+        for idx in (-1, -2, -3):
+            if len(ofm_start) >= -idx:
+                ofm_start[idx] += block_cfg[idx] * coord[idx]
+                ofm_end[idx] = min(ofm_end[idx], ofm_start[idx] + block_cfg[idx])
+
+        ps = self.ps
+        strides = None
+        skirt = None
+        if ps.primary_op is not None:
+            strides = ps.primary_op.attrs.get("strides", None)
+            skirt = ps.primary_op.attrs.get("skirt", None)
+        npu_block_type = ps.npu_block_type
+
+        ofm_box = Box(ofm_start, ofm_end)
+        ifm_box, _, _ = ofm_box.transform_with_strides_and_skirt(
+            strides, skirt, self.ifm_tensor.shape, npu_block_type, self.concat_axis, self.concat_offset
+        )
+
+        weight_box = None
+        if self.weight_tensor is not None:
+            weight_oc_start = ofm_start[-1]
+            weight_oc_end = ofm_end[-1]
+            if self.concat_axis - len(self.weight_tensor.shape) == -1:
+                weight_oc_start -= self.concat_offset
+                weight_oc_end -= self.concat_offset
+
+            weight_box = Box.make_weight_box(
+                self.weight_tensor.shape,
+                npu_block_type,
+                weight_oc_start,
+                weight_oc_end,
+                self.weight_tensor.weight_transpose_depthwise,
+            )
+
+        return NpuStripe(
+            self.ps,
+            self.block_config,
+            self.is_first,
+            self.is_last,
+            self.is_first_h_stripe,
+            self.is_last_h_stripe,
+            self.ifm_tensor,
+            ifm_box,
+            self.ofm_tensor,
+            ofm_box,
+            self.weight_tensor,
+            weight_box,
+            self.scale_tensor,
+            self.concat_axis,
+            self.concat_offset,
+        )
+
+
+class DMA(Command):
+    def __init__(self, in_tensor, out_tensor, box):
+        self.cmdtype = CommandType.DMA
+        self.in_tensor = in_tensor
+        self.out_tensor = out_tensor
+        self.box = box
+
+    def __str__(self):
+        return "<DMA: in=%s, out=%s, box=%s>" % (self.in_tensor.name, self.out_tensor.name, self.box)
+
+    __repr__ = __str__
+
+    def get_memory_accesses(self):
+        res = MemoryAccessSet()
+
+        res.add(
+            self.in_tensor.get_address_ranges_for_coordinates(self.box.start_coord, self.box.end_coord),
+            AccessDirection.Read,
+        )
+        res.add(
+            self.out_tensor.get_address_ranges_for_coordinates(self.box.start_coord, self.box.end_coord),
+            AccessDirection.Write,
+        )
+        return res
+
+    def get_operation_count(self):
+        # returns numpy array of (DPU blocks, dma_ops)
+        return np.array((0, 1))
diff --git a/ethosu/vela/high_level_command_stream_generator.py b/ethosu/vela/high_level_command_stream_generator.py
new file mode 100644
index 0000000..364df6f
--- /dev/null
+++ b/ethosu/vela/high_level_command_stream_generator.py
@@ -0,0 +1,315 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Generate a high-level command stream from a scheduled subgraph with CascadedPasses.
+#
+# This module is also used during scheduling to work out the allowable IFM/OFM overlap; that functionality is
+# accessed via calc_allowed_ofm_ifm_overlap_for_cascaded_pass().
+
+from .nn_graph import SchedulingStrategy, PassPlacement
+import numpy as np
+from .operation import NpuBlockType
+from .high_level_command_stream import Box, CommandType, Command, NpuStripe, DMA
+
+
+def need_dma(tens):
+    return len(tens.ops) == 1 and tens.ops[0].type == "DMA"
+
+
+def dma_weights_if_necessary(ps, box, weight_tensor):
+    if need_dma(weight_tensor):
+        dma_op = weight_tensor.ops[0]
+        in_tensor = dma_op.inputs[0]
+        yield DMA(in_tensor, weight_tensor, box)
+
+
+def generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx):
+    is_first = idx == 0
+    is_last = idx == len(passes) - 1
+    ps = passes[idx]
+    block_config = block_configs[idx]
+
+    ifm_tensor = ps.ifm_tensor
+    ifm2_tensor = ps.ifm2_tensor
+    ofm_tensor = ps.ofm_tensor
+    weight_tensor = ps.weight_tensor
+    scale_tensor = ps.scale_tensor
+
+    ofm_start = [0] * len(ofm_tensor.shape)
+    ofm_end = list(ofm_tensor.shape)
+
+    strides = None
+    skirt = None
+    if ps.primary_op is not None:
+        strides = ps.primary_op.attrs.get("strides", None)
+        skirt = ps.primary_op.attrs.get("skirt", None)
+
+    npu_block_type = ps.npu_block_type
+
+    concat_axis = 0
+    concat_offset = 0
+
+    split_offsets = [None, None]  # offset for [ifm, ifm2]
+
+    # Fusable activation functions
+    activation_ops = set(("Sigmoid", "Tanh", "Relu", "Relu6", "ReluN1To1"))
+
+    for op in ps.ops:
+        if op.type == "ConcatSliceWrite":
+            concat_axis = op.attrs["concat_axis"]
+            concat_start = op.attrs["concat_start"]
+            concat_end = op.attrs["concat_end"]
+
+            ofm_start[concat_axis] = concat_start
+            ofm_end[concat_axis] = concat_end
+            concat_offset = concat_start
+            ps.primary_op.attrs["fused_memory_function"] = op.type
+        elif op.type in activation_ops:
+            ps.primary_op.attrs["fused_activation_function"] = op.type
+
+    # The ops list has to be reversed here since the Pass Packing is done in reverse
+    ifm_idx = 0
+    for op in reversed(ps.ops):
+        if op.type == "SplitSliceRead":
+            split_offsets[ifm_idx] = op.attrs["split_start"]
+            ps.primary_op.attrs["fused_memory_function"] = op.type
+            ifm_idx += 1
+
+    if strat == SchedulingStrategy.WeightStream:
+        ofm_step = block_config[-1]
+        ofm_stop = ofm_end[-1]
+        if weight_tensor is None or not need_dma(weight_tensor):
+            ofm_step = ofm_stop
+        for start in range(ofm_start[-1], ofm_stop, ofm_step):
+            end = min(start + ofm_step, ofm_stop)
+            ofm_start[-1] = start
+            ofm_end[-1] = end
+            ofm_box = Box(ofm_start, ofm_end)
+            ifm_box = None
+            ifm2_box = None
+
+            if ifm_tensor.shape != []:
+                ifm_box, _, _ = ofm_box.transform_with_strides_and_skirt(
+                    strides, skirt, ifm_tensor.shape, npu_block_type, concat_axis, concat_offset, split_offsets[0]
+                )
+            else:
+                ifm_box = Box([], [])
+            if ifm2_tensor is not None and ifm2_tensor.shape != []:
+                ifm2_box, _, _ = ofm_box.transform_with_strides_and_skirt(
+                    strides, skirt, ifm2_tensor.shape, npu_block_type, concat_axis, concat_offset, split_offsets[1]
+                )
+            else:
+                ifm2_box = Box([], [])
+
+            weight_box = None
+            if weight_tensor is not None:
+                weight_oc_start = start
+                weight_oc_end = end
+                if concat_axis - len(weight_tensor.shape) == -1:
+                    weight_oc_start -= concat_offset
+                    weight_oc_end -= concat_offset
+
+                weight_box = Box.make_weight_box(
+                    weight_tensor.shape,
+                    npu_block_type,
+                    weight_oc_start,
+                    weight_oc_end,
+                    weight_tensor.weight_transpose_depthwise,
+                )
+                yield from dma_weights_if_necessary(ps, weight_box, weight_tensor)
+
+            yield NpuStripe(
+                ps,
+                block_config,
+                is_first,
+                is_last,
+                True,
+                True,
+                ifm_tensor,
+                ifm_box,
+                ofm_tensor,
+                ofm_box,
+                weight_tensor,
+                weight_box,
+                scale_tensor,
+                concat_axis,
+                concat_offset,
+                ifm2_tensor=ifm2_tensor,
+                ifm2_box=ifm2_box,
+            )
+
+    elif strat == SchedulingStrategy.IfmStream:
+        y_step = block_config[0]
+        y_start = 0
+        y_dim = 1
+        if len(ofm_tensor.shape) >= 3:
+            y_start = ofm_start[-3]
+            y_dim = ofm_end[-3]
+        if idx > 0:
+            ifm_y_present = 0
+            prev_pass = passes[idx - 1]
+            prev_pass_gen = generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx - 1)
+        else:
+            ifm_y_present = 1
+            if len(ifm_tensor.shape) >= 3:
+                ifm_y_present = ifm_tensor.shape[-3]
+            prev_pass_gen = []
+            prev_pass = None
+
+        if len(passes) == 1:
+            # no cascading, can just issue one big stripe
+            # but only if we've done allocation and OFM does not overlap IFM
+            if ifm_tensor.address != -1 and ofm_tensor.address != -1:
+                if (
+                    ifm_tensor.address + ifm_tensor.storage_size() <= ofm_tensor.address
+                    or ofm_tensor.address + ofm_tensor.storage_size() <= ifm_tensor.address
+                ):
+                    y_step = y_dim
+
+        weight_box = None
+
+        for start in range(y_start, y_dim, y_step):
+            end = min(start + y_step, y_dim)
+            if len(ofm_tensor.shape) >= 3:
+                ofm_start[-3] = start
+                ofm_end[-3] = end
+            ofm_box = Box(ofm_start, ofm_end)
+
+            k_height = 1
+            if npu_block_type == NpuBlockType.Pooling:
+                if ps.primary_op is not None:
+                    k_height = ps.primary_op.attrs["ksize"][1]
+            else:
+                if weight_tensor is not None:
+                    k_height = weight_tensor.shape[0]
+
+            ifm_box, pad_top, pad_bottom = ofm_box.transform_with_strides_and_skirt(
+                strides, skirt, ifm_tensor.shape, npu_block_type, concat_axis, concat_offset, split_offsets[0], k_height
+            )
+
+            ifm_y_needed = 1
+            if len(ifm_box.end_coord) >= 3:
+                ifm_y_needed = ifm_box.end_coord[-3]
+            if ifm_y_present < ifm_y_needed:
+                for prev_cmd in prev_pass_gen:
+                    yield prev_cmd
+                    rng = prev_cmd.get_ofm_y_range_for_pass(prev_pass)
+                    if rng is not None:
+                        ifm_y_present = max(ifm_y_present, rng[1])
+                        if ifm_y_present >= ifm_y_needed:
+                            break
+
+            if weight_tensor is not None and weight_box is None:
+                weight_box = Box.make_weight_box(
+                    weight_tensor.shape, npu_block_type, weights_transposed=weight_tensor.weight_transpose_depthwise
+                )
+                yield from dma_weights_if_necessary(ps, weight_box, weight_tensor)
+
+            # Check if first/last stripe in pass
+            is_first_h_stripe = start == y_start
+            is_last_h_stripe = (start + y_step) >= y_dim
+
+            stripe = NpuStripe(
+                ps,
+                block_config,
+                is_first,
+                is_last,
+                is_first_h_stripe,
+                is_last_h_stripe,
+                ifm_tensor,
+                ifm_box,
+                ofm_tensor,
+                ofm_box,
+                weight_tensor,
+                weight_box,
+                scale_tensor,
+                concat_axis,
+                concat_offset,
+                None,
+                None,
+                pad_top,
+                pad_bottom,
+            )
+            yield stripe
+    else:
+        assert 0, "unknown scheduling strategy"
+
+
+def generate_high_level_command_stream_for_pass_list(strat, passes, block_configs):
+    if strat == SchedulingStrategy.WeightStream:
+        for idx in range(len(passes)):
+            yield from generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx)
+    elif strat == SchedulingStrategy.IfmStream:
+        yield from generate_high_level_command_stream_for_pass(strat, passes, block_configs, len(passes) - 1)
+    else:
+        assert 0, "Unknown streaming strategy"
+
+
+def generate_high_level_command_stream_for_cascaded_pass(cps):
+    yield from generate_high_level_command_stream_for_pass_list(
+        cps.strategy, cps.passes, [ps.block_config for ps in cps.passes]
+    )
+
+
+def generate_high_level_command_stream(nng, sg, arch, verbose_high_level_command_stream):
+    res = []
+    for cps in sg.cascaded_passes:
+        if cps.placement == PassPlacement.Npu:
+            res += list(generate_high_level_command_stream_for_cascaded_pass(cps))
+
+    sg.high_level_command_stream = res
+    if verbose_high_level_command_stream:
+        sg.print_high_level_command_stream()
+
+
+def calc_allowed_ofm_ifm_overlap_for_pass_list(strat, passes, block_configs):
+    highest_ofm_write = 0
+    if not passes[0].ifm_tensor or not passes[-1].ofm_tensor:
+        return 0
+
+    ifm_read = passes[0].ifm_tensor.storage_size()
+    min_overlap = 999999999999999999999
+    ofm_size = passes[-1].ofm_tensor.storage_size()
+    if strat == SchedulingStrategy.WeightStream:
+        return 0
+    for cmd in generate_high_level_command_stream_for_pass_list(strat, passes, block_configs):
+        if cmd.is_npu_pass_command():
+            if cmd.is_first:
+                ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(cmd.ifm_box.start_coord, is_top_box=False)
+                if ifm_read is None:
+                    return 0
+            if cmd.is_last:
+                write_offset = cmd.ofm_tensor.address_offset_for_coordinate(cmd.ofm_box.end_coord, is_top_box=True)
+                if write_offset is None:
+                    return 0
+                highest_ofm_write = max(write_offset, highest_ofm_write)
+
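+            # Worked example with assumed numbers (not taken from a real network): if the
+            # OFM buffer is 1000 bytes, the highest OFM write so far ends at byte 600 and
+            # the lowest remaining IFM read starts at byte 400, then
+            #   overlap_required = max(600 - min(400, 1000), 0) = 200
+            #   can_overwrite    = 1000 - 200 = 800
+            # i.e. the OFM allocation may overlap the IFM by at most 800 bytes without
+            # overwriting IFM data that is still needed.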
+            if cmd.is_first or cmd.is_last:
+                overlap_required = max(highest_ofm_write - min(ifm_read, ofm_size), 0)
+                can_overwrite = ofm_size - overlap_required
+                min_overlap = min(min_overlap, can_overwrite)
+
+            if cmd.is_first:
+                ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(cmd.ifm_box.end_coord, is_top_box=True)
+
+    min_overlap = max(min_overlap, 0)
+    return min_overlap
+
+
+def calc_allowed_ofm_ifm_overlap_for_cascaded_pass(cps):
+    return calc_allowed_ofm_ifm_overlap_for_pass_list(cps.strategy, cps.passes, [ps.block_config for ps in cps.passes])
diff --git a/ethosu/vela/insert_dma.py b/ethosu/vela/insert_dma.py
new file mode 100644
index 0000000..b63c1ea
--- /dev/null
+++ b/ethosu/vela/insert_dma.py
@@ -0,0 +1,60 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Insert DMA operations into the graph for transferring weights.
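+#
+# Illustrative effect of the rewrite (tensor and operator names are made up for this sketch):
+#
+#   before:  weights (DRAM) ------------------------------> Conv2DBiasAct
+#   after:   weights (DRAM) --DMA--> weights clone (SRAM) --> Conv2DBiasAct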
+
+from .nn_graph import Operation, MemArea, TensorPurpose, NpuBlockType
+from . import rewrite_graph
+
+
+def insert_dma_cmd(op, arch):
+    if op.type == "DMA":
+        return op  # Already rewritten
+    for idx, tens in enumerate(op.inputs):
+
+        if tens.mem_area in (MemArea.Dram, MemArea.OffChipFlash) and tens.mem_area != arch.fast_storage_mem_area:
+            if tens.purpose == TensorPurpose.Weights:
+                only_vector_product_consumers = True
+                for oper in tens.consumers():
+                    if oper is None or oper.attrs.get("npu_block_type") != NpuBlockType.VectorProduct:
+                        only_vector_product_consumers = False
+                        break
+
+                # Vector products have no need for DMA: their tensors are only read once and can stay in flash.
+                # Other operations re-read their tensors, which is better done from SRAM.
+                if not only_vector_product_consumers:
+                    # Insert a DMA command here, as well as a new tensor situated in SRAM of the same size.
+                    new_tens = tens.clone_into_fast_storage(arch)
+                    dma_cmd = Operation("DMA", tens.ops[0].name + "_dma")
+                    dma_cmd.inputs = [tens]
+                    dma_cmd.outputs = [new_tens]
+                    dma_cmd.attrs["source"] = tens.mem_area
+                    dma_cmd.attrs["destination"] = new_tens.mem_area
+                    dma_cmd.run_on_npu = True
+                    new_tens.ops = [dma_cmd]
+                    op.inputs[idx] = new_tens
+    return op
+
+
+def insert_dma_commands(nng, arch, verbose_graph=False):
+
+    for idx, sg in enumerate(nng.subgraphs):
+        nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(sg, arch, [], [insert_dma_cmd])
+    if verbose_graph:
+        nng.print_graph()
+    return nng
diff --git a/ethosu/vela/live_range.py b/ethosu/vela/live_range.py
new file mode 100644
index 0000000..24f1f64
--- /dev/null
+++ b/ethosu/vela/live_range.py
@@ -0,0 +1,324 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Build a live range graph for tensors in one or more subgraphs. Used for tensor allocation as well as in the scheduler.
+# Can work with either a pass-packed subgraph or a scheduled subgraph.
+
+from .tensor import Tensor, MemArea
+from .nn_graph import TensorPurpose, PassPlacement
+from .high_level_command_stream_generator import calc_allowed_ofm_ifm_overlap_for_cascaded_pass
+
+
+class LiveRange:
+    def __init__(self, tens):
+        self.tensors = []  # Tensors that are assigned to the same LiveRange will be allocated to the same address
+        self.start_time = 99999999999
+        self.end_time = -1
+        self.size = 0
+        self.name = ""
+
+        if tens:
+            self.add_tensor(tens)
+
+    def __str__(self):
+        return "<live_range.LiveRange: '%s' start_time=%s, end_time=%s>" % (self.name, self.start_time, self.end_time)
+
+    __repr__ = __str__
+
+    def add_tensor(self, tens):
+        if self.size == 0:
+            self.size = tens.storage_size()
+            self.name = tens.name  # LiveRange will be named after the first tensor added
+        else:
+            assert (
+                self.size >= tens.storage_size()
+            ), "Tensors assigned to the same LiveRange need to fit the size of the LiveRange."
+
+        self.tensors.append(tens)
+
+    def mark_usage(self, op_time):
+        if op_time == -1:
+            return
+        op_time_start = op_time
+        op_time_end = op_time + 1
+
+        self.start_time = min(self.start_time, op_time_start)
+        self.end_time = max(self.end_time, op_time_end)
+
+    def overlaps_ranges(self, other):
+        return max(self.start_time, other.start_time) < min(self.end_time, other.end_time)
+
+    def overlaps_address(self, other):
+        # Returns the first pair of tensors in this LiveRange and 'other' which have
+        # overlapping addresses
+        for tens in self.tensors:
+            for other_tens in other.tensors:
+                if max(tens.address, other_tens.address) < min(
+                    tens.address + self.size, other_tens.address + other.size
+                ):
+                    return True, tens, other_tens
+
+        return False, None, None
+
+    def __lt__(self, other):
+        if self.start_time != other.start_time:
+            return self.start_time < other.start_time
+        if self.end_time != other.end_time:
+            return self.end_time < other.end_time
+        if self.size != other.size:
+            return self.size < other.size
+        return self.name < other.name
+
+    def set_address(self, address):
+        # Set address of all unaddressed tensors in LiveRange
+        for tens in self.tensors:
+            if tens.address == 0:
+                tens.address = address
+                # Also need to set the address to the tensor's cpu/npu clones
+                if tens.cpu_tensor is not None:
+                    tens.cpu_tensor.address = address
+                if tens.npu_tensor is not None:
+                    tens.npu_tensor.address = address
+
+    def get_alignment(self):
+        # Get max alignment of LiveRange's tensors
+        if self.tensors:
+            alignment = 0
+            for tens in self.tensors:
+                alignment = max(alignment, tens.alignment)
+
+            return alignment
+
+        return Tensor.AllocationQuantum
+
+
+def merge_memory_op_ranges(sg, lr_graph, tensor_should_be_ignored, target_mem_area):
+    for ps in sg.passes:
+        if ps.placement == PassPlacement.MemoryOnly:
+            # For memory-only passes, e.g. Reshape, add the input and output tensors to the same LiveRange
+            input_tensor = ps.inputs[0]
+            output_tensor = ps.outputs[0]
+            # If the input or output tensor is tied to a Cpu tensor, i.e. a subgraph input
+            # or output, fuse the live range with the Cpu tensor's live range instead.
+            input_tensor = input_tensor.cpu_tensor if input_tensor.cpu_tensor is not None else input_tensor
+            output_tensor = output_tensor.cpu_tensor if output_tensor.cpu_tensor is not None else output_tensor
+            if not tensor_should_be_ignored(input_tensor, target_mem_area) and not tensor_should_be_ignored(
+                output_tensor, target_mem_area
+            ):
+                lr_graph.fuse_ranges(input_tensor, output_tensor)
+
+
+class LiveRangeGraph:
+    def __init__(self):
+        self.ranges = {}  # tens -> range
+        self.allowed_overlaps = {}  # (tens,tens) -> overlap_int
+        self.ignore_tensors = set()
+        self.processed_subgraphs = set()
+        self.current_time = 0
+
+    def get_or_create_range(self, tens):
+        for rng in self.ranges.values():
+            # Return the live range of the tensor (or its cpu/npu clone)
+            if any(tensor in rng.tensors for tensor in [tens, tens.npu_tensor, tens.cpu_tensor]):
+                return rng
+
+        # No live range found for the tensor, create a new one
+        rng = LiveRange(tens)
+        self.ranges[tens] = rng
+        return rng
+
+    def fuse_ranges(self, in_tens, out_tens):
+        live_range = self.get_or_create_range(in_tens)
+        assert out_tens not in self.ranges, out_tens
+        live_range.add_tensor(out_tens)
+        self.ranges[out_tens] = live_range
+        return live_range
+
+
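+# Illustrative sketch, not used by the compiler itself: shows how LiveRange time
+# intervals behave. The constructor accepts None, in which case the range starts
+# empty and only tracks usage times.
+def _live_range_interval_example():
+    a = LiveRange(None)
+    a.mark_usage(0)  # live during [0, 1)
+    a.mark_usage(2)  # extended to [0, 3)
+    b = LiveRange(None)
+    b.mark_usage(4)  # live during [4, 5)
+    # [0, 3) and [4, 5) do not overlap, so tensors in 'a' and 'b' could share an address
+    return a.overlaps_ranges(b)  # False
+
+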
+def extract_live_ranges_from_passes(
+    sg,
+    target_mem_area,
+    mark_output_tensors_overlapping_with_input_tensors=False,
+    ignore_subgraph_input_output_tensors=False,
+):
+    lr_graph = LiveRangeGraph()
+
+    if ignore_subgraph_input_output_tensors:
+        lr_graph.ignore_tensors.update(sg.input_tensors)
+        lr_graph.ignore_tensors.update(sg.output_tensors)
+
+    def tensor_should_be_ignored(tens, target_mem_area):
+        if tens.mem_area != target_mem_area:
+            return True
+        if tens in lr_graph.ignore_tensors:
+            return True
+        if tens.name.endswith("reshape_shape_npu"):
+            # Reshape tensor, no need to allocate
+            lr_graph.ignore_tensors.add(tens)
+            return True
+        return False
+
+    # Merge only memory operations in the NPU subgraphs
+    if sg.placement == PassPlacement.Npu:
+        merge_memory_op_ranges(sg, lr_graph, tensor_should_be_ignored, target_mem_area)
+
+    for idx, ps in enumerate(sg.passes):
+        ps.time = 2 * idx
+
+        time_for_pass = ps.time
+
+        for tens in ps.inputs:
+            if tensor_should_be_ignored(tens, target_mem_area):
+                continue
+            rng = lr_graph.get_or_create_range(tens)
+            rng.mark_usage(time_for_pass)
+
+        for tens in ps.intermediates:
+            if tensor_should_be_ignored(tens, target_mem_area):
+                continue
+            rng = lr_graph.get_or_create_range(tens)
+            rng.mark_usage(time_for_pass)
+
+        for tens in ps.outputs:
+            if tensor_should_be_ignored(tens, target_mem_area):
+                continue
+            rng = lr_graph.get_or_create_range(tens)
+            output_time = time_for_pass
+            if not mark_output_tensors_overlapping_with_input_tensors and ps.is_element_wise:
+                output_time += 1
+            rng.mark_usage(output_time)
+
+    end_time = len(sg.passes) * 2
+    for tens in sg.output_tensors:
+        if tensor_should_be_ignored(tens, target_mem_area):
+            continue
+        rng = lr_graph.get_or_create_range(tens)
+        rng.mark_usage(end_time)
+
+    return lr_graph
+
+
+def extract_live_ranges_from_cascaded_passes(
+    sg,
+    target_mem_area,
+    mark_output_tensors_overlapping_with_input_tensors=False,
+    use_ifm_ofm_overlap=True,
+    ignore_subgraph_input_output_tensors=False,
+    lr_graph=None,
+):
+    if lr_graph is None:
+        lr_graph = LiveRangeGraph()
+
+    if sg in lr_graph.processed_subgraphs:
+        # if subgraph has been processed already, return the lr_graph as is
+        return lr_graph
+
+    if ignore_subgraph_input_output_tensors:
+        lr_graph.ignore_tensors.update(sg.input_tensors)
+        lr_graph.ignore_tensors.update(sg.output_tensors)
+
+    def tensor_should_be_ignored(tens, target_mem_area):
+        if tens.mem_area != target_mem_area:
+            return True
+        if tens in lr_graph.ignore_tensors:
+            return True
+        if tens.name.endswith("reshape_shape_npu"):
+            # Reshape tensor, no need to allocate
+            lr_graph.ignore_tensors.add(tens)
+            return True
+        return False
+
+    # Merge only memory operations in the NPU subgraphs
+    if sg.placement == PassPlacement.Npu:
+        merge_memory_op_ranges(sg, lr_graph, tensor_should_be_ignored, target_mem_area)
+
+    for cps in sg.cascaded_passes:
+        cps.time = lr_graph.current_time
+
+        time_for_pass = cps.time
+
+        is_element_wise = cps.is_element_wise
+
+        for tens in cps.inputs:
+            if tensor_should_be_ignored(tens, target_mem_area):
+                continue
+            rng = lr_graph.get_or_create_range(tens)
+            rng.mark_usage(time_for_pass)
+
+        cps_primary_op = cps.passes[0].primary_op
+        if cps_primary_op and cps_primary_op.type == "NpuOp" and target_mem_area in set((MemArea.Sram, MemArea.Dram)):
+            # If the primary-op is an NpuOp that means this is where an Npu subgraph
+            # is called. Go into said subgraph and extract live ranges before continuing.
+            npu_sg = cps_primary_op.attrs["subgraph"]
+            lr_graph = extract_live_ranges_from_cascaded_passes(
+                npu_sg,
+                target_mem_area,
+                mark_output_tensors_overlapping_with_input_tensors,
+                use_ifm_ofm_overlap,
+                False,
+                lr_graph,
+            )
+            # Set the new time after handling the Npu subgraph
+            time_for_pass = lr_graph.current_time
+            cps.time = time_for_pass
+
+        for tens in cps.intermediates:
+            if tensor_should_be_ignored(tens, target_mem_area):
+                continue
+            rng = lr_graph.get_or_create_range(tens)
+            rng.mark_usage(time_for_pass)
+
+        for tens in cps.outputs:
+            if tensor_should_be_ignored(tens, target_mem_area):
+                continue
+            rng = lr_graph.get_or_create_range(tens)
+            output_time = time_for_pass
+            if not mark_output_tensors_overlapping_with_input_tensors and is_element_wise:
+                output_time += 1
+            rng.mark_usage(output_time)
+
+        if use_ifm_ofm_overlap:
+            # fill allowed overlap for ifm and ofm tensor
+            ifm_tensor = cps.passes[0].ifm_tensor
+            ofm_tensor = cps.passes[-1].ofm_tensor
+            if (
+                ifm_tensor is not None
+                and ofm_tensor is not None
+                and not tensor_should_be_ignored(ifm_tensor, target_mem_area)
+                and not tensor_should_be_ignored(ofm_tensor, target_mem_area)
+            ):
+                lr_graph.allowed_overlaps[(ifm_tensor, ofm_tensor)] = calc_allowed_ofm_ifm_overlap_for_cascaded_pass(
+                    cps
+                )
+
+        lr_graph.current_time += 2
+
+    end_time = 0
+    for rng in lr_graph.ranges.values():
+        # Find the maximum end time of all live-ranges in the graph
+        end_time = max(end_time, rng.end_time)
+
+    for tens in sg.output_tensors:
+        if tensor_should_be_ignored(tens, target_mem_area):
+            continue
+        rng = lr_graph.get_or_create_range(tens)
+        rng.mark_usage(end_time)
+
+    # Add subgraph to set of processed subgraphs
+    lr_graph.processed_subgraphs.add(sg)
+    return lr_graph
diff --git a/ethosu/vela/mark_tensors.py b/ethosu/vela/mark_tensors.py
new file mode 100644
index 0000000..9b1824b
--- /dev/null
+++ b/ethosu/vela/mark_tensors.py
@@ -0,0 +1,363 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Mark purpose and select formats for Tensors. Also compresses the weights.
+
+from . import rewrite_graph
+from . import weight_compressor
+from .architecture_features import Block
+from .nn_graph import TensorPurpose, TensorFormat, PassPlacement
+from .operation import NpuBlockType
+
+
+def purpose_from_list(lst):
+    def purpose(op, idx):
+        return lst[idx]
+
+    return purpose
+
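+# For example (illustrative): purpose_from_list([FeatureMap, Weights, FeatureMap])
+# returns a function that maps a Conv2D-style operator's inputs by index:
+# input 0 (ifm) -> FeatureMap, input 1 (weights) -> Weights, input 2 (bias) -> FeatureMap.
+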
+
+def all_fm(op, idx):
+    return TensorPurpose.FeatureMap
+
+
+def all_parameter(op, idx):
+    return TensorPurpose.FeatureMap
+
+
+def input0_from_output_rest_parameter(op, idx):
+    if idx == 0:
+        res = op.outputs[0].purpose
+        if res == TensorPurpose.Unknown:
+            print("Warning: Propagating unknown tensor purpose", op)
+        return res
+    return TensorPurpose.FeatureMap
+
+
+def inputs_from_output(op, idx):
+    res = op.outputs[0].purpose
+    if res == TensorPurpose.Unknown:
+        print("Warning: Propagating unknown tensor purpose", op)
+    return res
+
+
+tensor_purposes = [  # ops, input_purpose
+    (
+        set(
+            (
+                "Relu",
+                "Relu6",
+                "Mul",
+                "Add",
+                "Sub",
+                "Rsqrt",
+                "Abs",
+                "Cast",
+                "Exp",
+                "Floor",
+                "FloorDiv",
+                "FloorMod",
+                "SquaredDifference",
+                "AddN",
+                "BiasAdd",
+                "RealDiv",
+                "Maximum",
+                "Minimum",
+                "Sigmoid",
+                "Tanh",
+                "FusedBatchNorm",
+                "AvgPool",
+                "MaxPool",
+                "Squeeze",
+                "Softmax",
+                "LRN",
+                "Assign",
+                "BatchMatMul",
+                "ZerosLike",
+                "ExtractImagePatches",
+                "MulAct",
+                "AddAct",
+                "SubAct",
+                "DivAct",
+                "AvgPoolAct",
+                "MaxPoolAct",
+                "LeakyRelu",
+            )
+        ),
+        all_fm,
+    ),
+    (
+        set(
+            (
+                "Conv2D",
+                "DepthwiseConv2dNative",
+                "MatMul",
+                "Conv2DBiasAct",
+                "DepthwiseConv2dBiasAct",
+                "FullyConnectedAct",
+            )
+        ),
+        purpose_from_list([TensorPurpose.FeatureMap, TensorPurpose.Weights, TensorPurpose.FeatureMap]),
+    ),
+    (
+        set(("Conv2DBackpropInputSwitched",)),
+        purpose_from_list([TensorPurpose.FeatureMap, TensorPurpose.Weights, TensorPurpose.FeatureMap]),
+    ),
+    (
+        set(("QuantizedConv2D", "QuantizedMatMul")),
+        purpose_from_list(
+            [
+                TensorPurpose.FeatureMap,
+                TensorPurpose.Weights,
+                TensorPurpose.FeatureMap,
+                TensorPurpose.FeatureMap,
+                TensorPurpose.FeatureMap,
+                TensorPurpose.FeatureMap,
+            ]
+        ),
+    ),
+    (
+        set(
+            (
+                "Reshape",
+                "Min",
+                "Max",
+                "Mean",
+                "Pad",
+                "MirrorPad",
+                "ArgMax",
+                "ArgMin",
+                "ExpandDims",
+                "ResizeNearestNeighbor",
+                "ResizeBilinear",
+                "Tile",
+                "Transpose",
+                "Mfcc",
+            )
+        ),
+        purpose_from_list([TensorPurpose.FeatureMap, TensorPurpose.FeatureMap]),
+    ),
+    (
+        set(("QuantizedReshape", "QuantizedResizeBilinear")),
+        purpose_from_list(
+            [TensorPurpose.FeatureMap, TensorPurpose.FeatureMap, TensorPurpose.FeatureMap, TensorPurpose.FeatureMap]
+        ),
+    ),
+    (
+        set(("QuantizedBiasAdd", "QuantizedAdd", "QuantizedMul")),
+        purpose_from_list(
+            [
+                TensorPurpose.FeatureMap,
+                TensorPurpose.FeatureMap,
+                TensorPurpose.FeatureMap,
+                TensorPurpose.FeatureMap,
+                TensorPurpose.FeatureMap,
+                TensorPurpose.FeatureMap,
+            ]
+        ),
+    ),
+    (
+        set(
+            (
+                "Dequantize",
+                "Quantize",
+                "QuantizeV2",
+                "QuantizedRelu",
+                "QuantizedRelu1",
+                "QuantizedRelu6",
+                "QuantizedAvgPool",
+                "QuantizedMaxPool",
+                "Slice",
+                "SplitV",
+            )
+        ),
+        purpose_from_list([TensorPurpose.FeatureMap, TensorPurpose.FeatureMap, TensorPurpose.FeatureMap]),
+    ),
+    (
+        set(("BatchToSpaceND", "SpaceToBatchND", "DepthToSpaceND", "SpaceToDepthND")),
+        purpose_from_list([TensorPurpose.FeatureMap, TensorPurpose.FeatureMap, TensorPurpose.FeatureMap]),
+    ),
+    (
+        set(("BlockLSTM",)),
+        purpose_from_list(
+            [
+                TensorPurpose.FeatureMap,
+                TensorPurpose.FeatureMap,
+                TensorPurpose.FeatureMap,
+                TensorPurpose.FeatureMap,
+                TensorPurpose.Weights,
+                TensorPurpose.FeatureMap,
+                TensorPurpose.FeatureMap,
+                TensorPurpose.FeatureMap,
+                TensorPurpose.FeatureMap,
+            ]
+        ),
+    ),
+    (set(("SplitSliceRead",)), purpose_from_list([TensorPurpose.FeatureMap, TensorPurpose.FeatureMap])),
+    (set(("Shape", "ConcatSliceWrite", "AudioSpectrogram")), purpose_from_list([TensorPurpose.FeatureMap])),
+    (
+        set(("StridedSlice",)),
+        purpose_from_list(
+            [TensorPurpose.FeatureMap, TensorPurpose.FeatureMap, TensorPurpose.FeatureMap, TensorPurpose.FeatureMap]
+        ),
+    ),
+    (set(("Fill", "Pack", "Range")), all_parameter),
+    (
+        set(("Requantize",)),
+        purpose_from_list(
+            [
+                TensorPurpose.FeatureMap,
+                TensorPurpose.FeatureMap,
+                TensorPurpose.FeatureMap,
+                TensorPurpose.FeatureMap,
+                TensorPurpose.FeatureMap,
+            ]
+        ),
+    ),
+    (set(("Placeholder", "SubgraphInput", "Const", "VariableV2")), purpose_from_list([])),
+    (set(("FakeQuantWithMinMaxArgs", "FakeQuantWithMinMaxVars")), input0_from_output_rest_parameter),
+    (
+        set(("Square", "Sqrt", "Log", "Less", "Enter", "Exit", "Identity", "StopGradient", "Merge", "Switch")),
+        inputs_from_output,
+    ),
+    (None, all_fm),
+]
+
+
+for ops, input_purpose in tensor_purposes:
+    if ops is None:
+        continue
+    for op in ops:
+        assert len(op) > 1, "string literal has been decomposed"
+
+
+def mark_tensor_purpose(nng, arch, verbose_tensor_purpose=False):
+    def mark_tensor_helper(tens, purpose):
+
+        if tens.purpose == TensorPurpose.Unknown or tens.purpose == purpose:
+            tens.purpose = purpose
+        else:
+            assert 0, "Cannot resolve tensor purpose %s and %s for tensor %s" % (tens.purpose, purpose, tens)
+        tens.mem_area = arch.tensor_storage_mem_area[tens.purpose]
+
+        if len(tens.ops) == 1 and tens.ops[0].type == "Const":
+            tens.mem_area = (
+                arch.permanent_storage_mem_area
+            )  # special case constants, as they must be in permanent storage
+
+    def rewrite_mark_tensor_purpose(op, arch):
+        # find disconnected outputs and mark as parameters
+        for tens in op.outputs:
+            if not tens.consumers():
+                mark_tensor_helper(tens, TensorPurpose.FeatureMap)
+
+        for ops, input_purpose in tensor_purposes:
+            if ops is None or op.type in ops:
+                if ops is None:
+                    print(
+                        "warning: don't know how to mark up purpose for",
+                        op.type,
+                        op.inputs,
+                        "triggering all feature map fallback",
+                    )
+                for idx, tens in enumerate(op.inputs):
+                    purpose = input_purpose(op, idx)
+                    mark_tensor_helper(tens, purpose)
+                break
+        return op
+
+    for sg in nng.subgraphs:
+        sg = rewrite_graph.rewrite_graph_pre_order(sg, arch, [], [rewrite_mark_tensor_purpose])
+        for tens in sg.output_tensors:
+            mark_tensor_helper(tens, TensorPurpose.FeatureMap)
+
+    if verbose_tensor_purpose:
+        nng.print_graph_with_tensors()
+
+    return nng
+
+
+reshape_operations = set(
+    (
+        "Reshape",
+        "QuantizedReshape",
+        "ExpandDims",
+        "Squeeze",
+        "BatchToSpaceND",
+        "SpaceToBatchND",
+        "DepthToSpaceND",
+        "SpaceToDepthND",
+        "Placeholder",
+    )
+)
+
+
+def mark_tensor_format(nng, arch, verbose_tensor_format=False):
+    formats_for_tensor = {}
+
+    def init_tens(tens):
+        if tens.purpose == TensorPurpose.FeatureMap:
+            fmt = arch.default_feature_map_format
+        elif tens.purpose == TensorPurpose.Weights:
+            fmt = arch.default_weight_format
+        else:
+            assert 0, "unknown tensor purpose %s" % (tens.purpose,)
+        return fmt
+
+    def find_npu_usage_of_tensor(tens):
+        for op in tens.consumers():
+            if op.type == "DMA":
+                return find_npu_usage_of_tensor(op.outputs[0])
+            if "npu_block_type" in op.attrs:
+                return op.attrs["npu_block_type"]
+            return NpuBlockType.Default
+
+    def visit_tens(tens, ps):
+        if tens not in formats_for_tensor:
+            fmt = init_tens(tens)
+        else:
+            fmt = formats_for_tensor[tens]
+
+        formats_for_tensor[tens] = fmt
+
+    for sg in nng.subgraphs:
+        for ps in sg.passes:
+            for tens in ps.outputs:
+                visit_tens(tens, ps)
+            for tens in ps.intermediates:
+                visit_tens(tens, ps)
+            for tens in ps.inputs:
+                visit_tens(tens, ps)
+
+    for tens, fmt in formats_for_tensor.items():
+        tens.set_format(fmt, arch)
+        if fmt == TensorFormat.WeightsCompressed and tens.values is not None:
+            npu_block_type = find_npu_usage_of_tensor(tens)
+            if len(tens.ops) == 1 and tens.ops[0].type == "DMA":
+                weight_compressor.compress_weights(tens, arch, npu_block_type, Block(32, 32, 32), 32)
+                # Alias compressed weights back into source tensor
+                src_tens = tens.ops[0].inputs[0]
+                src_tens.compressed_values = tens.compressed_values
+                src_tens.storage_shape = tens.storage_shape
+                src_tens.brick_size = tens.brick_size
+                src_tens.weight_compression_scales = tens.weight_compression_scales
+                src_tens.weight_compressed_offsets = tens.weight_compressed_offsets
+                src_tens.compression_scale_for_worst_weight_stream = tens.compression_scale_for_worst_weight_stream
+                src_tens.storage_compression_scale = tens.storage_compression_scale
+
+    if verbose_tensor_format:
+        nng.print_passes_with_tensors()
diff --git a/ethosu/vela/model_reader.py b/ethosu/vela/model_reader.py
new file mode 100644
index 0000000..6d7a3a4
--- /dev/null
+++ b/ethosu/vela/model_reader.py
@@ -0,0 +1,45 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Dispatcher for reading a neural network model.
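+#
+# Example use (illustrative file name):
+#   nng = read_model("model.tflite", ModelReaderOptions(batch_size=1))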
+
+
+class ModelReaderOptions:
+    def __init__(self, batch_size=1):
+        self.batch_size = batch_size
+
+    def __str__(self):
+        return type(self).__name__ + ": " + str(self.__dict__)
+
+    __repr__ = __str__
+
+
+def read_model(fname, options, feed_dict={}, output_node_names=[], initialisation_nodes=[]):
+    if fname.endswith(".tflite"):
+        from . import tflite_reader
+
+        nng = tflite_reader.read_tflite(
+            fname,
+            options.batch_size,
+            feed_dict=feed_dict,
+            output_node_names=output_node_names,
+            initialisation_nodes=initialisation_nodes,
+        )
+    else:
+        assert 0, "Unknown model format"
+    return nng
diff --git a/ethosu/vela/nn_graph.py b/ethosu/vela/nn_graph.py
new file mode 100644
index 0000000..8d335bd
--- /dev/null
+++ b/ethosu/vela/nn_graph.py
@@ -0,0 +1,548 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Neural network graph classes and enums.
+# Pass - A packed pass containing one or more Operations.
+# CascadedPass - A scheduled pass containing one or more Passes, as well as a scheduling strategy and block
+#                configurations.
+# Subgraph - Holds a neural network subgraph, pointing at Tensors, Operations, Passes, and CascadedPasses.
+# Graph - A full neural network graph with one or more Subgraphs.
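+#
+# Rough containment hierarchy (illustrative):
+#   Graph
+#     Subgraph(s)
+#       CascadedPass(es)  - produced by the scheduler
+#         Pass(es)        - produced by pass packing
+#           Operation(s)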
+
+import enum
+from .data_type import BaseType, DataType
+from .tensor import MemArea, TensorPurpose, TensorSubPurpose, TensorFormat, Tensor
+from .operation import Operation, NpuBlockType
+
+
+class PassPlacement(enum.Enum):
+    Unknown = 0
+    Cpu = 1
+    Npu = 2
+    MemoryOnly = 3
+    StartupInit = 4
+
+
+class TensorAllocator(enum.Enum):
+    LinearAlloc = 1
+    Greedy = 2
+
+    def __str__(self):
+        return self.name
+
+
+class Pass:
+    def __init__(self, name, placement, is_element_wise, npu_block_type):
+        self.inputs = []
+        self.intermediates = []
+        self.outputs = []
+        self.ops = []
+        self.primary_op = None
+        self.ifm_tensor = None
+        self.ifm2_tensor = None
+        self.ofm_tensor = None
+        self.weight_tensor = None
+        self.scale_tensor = None
+        self.name = name
+        self.cascade = None
+        self.placement = placement
+
+        # TODO: rename is_element_wise because it is not the same as an ElementWise operator. It is used by the tensor
+        # allocation and requires that the OFM and IFM have the exact same address, i.e. complete overlap.
+        self.is_element_wise = is_element_wise
+        self.npu_block_type = npu_block_type
+        self.block_config = None  # will be filled in by scheduler
+        self.shared_buffer = None  # will be filled in by scheduler
+
+        self.predecessors = []
+        self.successors = []
+
+    def __str__(self):
+        return "<nng.Pass '%s', %s, ops=%s>" % (self.name, self.placement, [op.type for op in self.ops])
+
+    __repr__ = __str__
+
+    def get_primary_op_ifm_weights(self):
+        if not self.primary_op:
+            return None, None
+        return self.primary_op.get_ifm_ifm2_weights_ofm()[::2]
+
+    def get_primary_op_ifm_ifm2_weights_ofm(self):
+        if not self.primary_op:
+            return None, None, None, None
+        return self.primary_op.get_ifm_ifm2_weights_ofm()
+
+    def get_primary_op_ifm_weights_biases_ofm(self):
+        if not self.primary_op:
+            return None, None, None, None
+        return self.primary_op.get_ifm_weights_biases_ofm()
+
+
+class SchedulingStrategy(enum.Enum):
+    Unknown = -1
+    IfmStream = 0
+    WeightStream = 1
+
+
+class SchedulerRewrite(enum.Enum):
+    Nop = 0
+    ChangeTensorSubPurpose = 1
+
+
+class CascadedPass:
+    def __init__(self, name, strat, inputs, intermediates, outputs, passes, placement, is_element_wise):
+        self.name = name
+        self.strategy = strat
+        self.inputs = inputs
+        self.intermediates = intermediates
+        self.outputs = outputs
+        self.passes = passes
+        self.placement = placement
+        self.is_element_wise = is_element_wise
+
+        self.predecessors = []
+        self.successors = []
+
+    def __str__(self):
+        return "<nng.CascadedPass strategy=%s x %s '%s',  passes=%s, block_configs=%s>" % (
+            self.strategy,
+            len(self.passes),
+            self.name,
+            [ps.name for ps in self.passes],
+            [ps.block_config for ps in self.passes],
+        )
+
+    __repr__ = __str__
+
+
+class Subgraph:
+    def __init__(self, name="<unnamed>", placement=PassPlacement.Cpu):
+        self.output_tensors = []
+        self.input_tensors = []
+        self.original_inputs = []  # Preserve the original input order
+        self.passes = []
+        self.cascaded_passes = []
+        self.name = name
+        self.high_level_command_stream = []
+        self.placement = placement
+        self.command_stream_tensor = None
+        self.flash_tensor = None
+
+        self.memory_used = {}
+
+    def __str__(self):
+        return "<nng.Subgraph '%s',  n_passes=%d, n_cascaded_passes=%d>" % (
+            self.name,
+            len(self.passes),
+            len(self.cascaded_passes),
+        )
+
+    __repr__ = __str__
+
+    def update_consumers(self):
+        visit_op_set = set()
+        visit_tensor_set = set()
+        self.input_tensors = []
+
+        print_visit = False
+
+        def visit_op(op):
+            if op in visit_op_set:
+                return
+
+            visit_op_set.add(op)
+            for inp in op.inputs:
+                if print_visit:
+                    print(inp, "adding consumer", op)
+                visit_tensor(inp)
+                inp.consumer_list.append(op)
+
+            if op.type in set(("Placeholder", "SubgraphInput")):
+                assert len(op.outputs) == 1
+                self.input_tensors.append(op.outputs[0])
+
+            for out in op.outputs:
+                if out not in visit_tensor_set:
+                    out.consumer_list = []  # reset unvisited output, just in case
+
+        def visit_tensor(tens):
+            if tens in visit_tensor_set:
+                return
+            visit_tensor_set.add(tens)
+            tens.consumer_list = []
+            for op in tens.ops:
+                visit_op(op)
+
+        for ps in self.passes:
+            for tens in ps.outputs + ps.inputs:
+                tens.consumer_list = []  # reset unvisited tensors to start with
+
+        for tens in self.output_tensors:
+            visit_tensor(tens)
+            tens.consumer_list.append(None)  # special op to indicate that the graph consumes the result
+
+        print_visit = True
+        for ps in self.passes:
+            for op in ps.ops:
+                visit_op(op)
+            for tens in ps.inputs:
+                visit_tensor(tens)
+
+    def build_pass_links(self):
+        for idx, ps in enumerate(self.passes):
+            ps.time = 2 * idx
+            ps.predecessors = []
+            ps.successors = []
+
+        for ps in self.passes:
+            for tens in ps.inputs:
+                for op in tens.ops:
+                    pred_pass = op.scheduled_pass
+                    assert pred_pass.time < ps.time
+                    if ps not in pred_pass.successors:
+                        pred_pass.successors.append(ps)
+
+                    if pred_pass not in ps.predecessors:
+                        ps.predecessors.append(pred_pass)
+
+                    assert tens in pred_pass.outputs
+
+    def build_pass_dag_predecessors(self):
+        for ps in self.passes:
+            ps.dag_predecessors = []
+
+        class State(enum.Enum):
+            NotVisited = 0
+            BeingVisited = 1
+            Visited = 2
+
+        pass_visit_dict = {}
+
+        def visit_pass(ps):
+            state = pass_visit_dict.get(ps, State.NotVisited)
+            if state == State.Visited:
+                return True
+            elif state == State.BeingVisited:
+                return False  # this is a loop, need to remove this link
+            elif state == State.NotVisited:
+                pass_visit_dict[ps] = State.BeingVisited
+
+                ps.dag_predecessors = []
+                for pred in ps.predecessors:
+                    if visit_pass(pred):
+                        ps.dag_predecessors.append(pred)
+
+                pass_visit_dict[ps] = State.Visited
+                return True
+
+        for ps in self.passes:
+            if not ps.successors:
+                visit_pass(ps)
+
+    def build_cascaded_pass_links(self):
+        for cps in self.cascaded_passes:
+            cps.predecessors = []
+            cps.successors = []
+
+        for cps in self.cascaded_passes:
+            for tens in cps.inputs:
+                for op in tens.ops:
+                    pred_cpass = op.scheduled_pass.cascade
+                    if cps not in pred_cpass.successors:
+                        pred_cpass.successors.append(cps)
+
+                    if pred_cpass not in cps.predecessors:
+                        cps.predecessors.append(pred_cpass)
+
+                    assert tens in pred_cpass.outputs
+
+    def refresh_after_modification(self):
+        self.update_consumers()
+
+    def prune_startup_init_pass(self):
+        assert len(self.passes) >= 1
+        ps = self.passes[0]
+        assert ps.placement == PassPlacement.StartupInit
+
+        ps.outputs = [out_tens for out_tens in ps.outputs if len(out_tens.consumers()) > 0]
+        ps.ops = [op for op in ps.ops if op.outputs[0] in ps.outputs]
+
+    def get_all_ops(self):
+        all_ops = []
+        visit_op_set = set()
+        visit_tensor_set = set()
+
+        def visit_op(op):
+            if op in visit_op_set:
+                return
+            visit_op_set.add(op)
+            for inp in op.inputs:
+                visit_tensor(inp)
+
+            all_ops.append(op)
+
+        def visit_tensor(tens):
+            if tens in visit_tensor_set:
+                return
+            visit_tensor_set.add(tens)
+            for op in tens.ops:
+                visit_op(op)
+
+        for tens in self.output_tensors:
+            visit_tensor(tens)
+
+        return all_ops
+
+    def print_operators(self):
+        all_ops = self.get_all_ops()
+        unique_ops = []
+        print("print_operators")
+        for op in all_ops:
+            if op.type in set(("Const", "Identity", "Placeholder")):
+                continue
+
+            attrs = op.attrs
+            if (
+                op.type == "Conv2D"
+                or op.type == "DepthwiseConv2dNative"
+                or op.type == "Conv2DBiasAct"
+                or op.type == "DepthwiseConv2dBiasAct"
+            ):
+                kshape = op.inputs[1].shape
+                attrs["kshape"] = [kshape[0], kshape[1]]
+            attrs["type"] = op.type
+            attrs.pop("use_cudnn_on_gpu", None)
+            if attrs not in unique_ops:
+                unique_ops.append(attrs)
+                # print attributes in human readable format
+                a = attrs.copy()
+                s = a.pop("type")
+                data_format = a.pop("data_format", None)
+                if data_format and data_format != b"NHWC":
+                    s += " " + str(data_format)
+                t = a.pop("T", None)
+                if t:
+                    s += " " + str(t)[9:-2]
+                srct = a.pop("SrcT", None)
+                if srct:
+                    s += " " + str(srct)[9:-2]
+                dstt = a.pop("DstT", None)
+                if dstt:
+                    s += "->" + str(dstt)[9:-2]
+                print(s + " " + str(a))
+
+    def print_graph(self):
+        all_ops = self.get_all_ops()
+        for idx, op in enumerate(all_ops):
+            print(idx, op.type, op.name)
+
+    def print_graph_with_tensors(self):
+        all_ops = self.get_all_ops()
+        for idx, op in enumerate(all_ops):
+            print(idx, op.type, op.name)
+            for idx, tens in enumerate(op.inputs):
+                print("    Input  %02d %20s %20s %s" % (idx, tens.purpose.name, tens.mem_area.name, tens))
+            for idx, tens in enumerate(op.outputs):
+                print("    Output %02d %20s %20s %s" % (idx, tens.purpose.name, tens.mem_area.name, tens))
+            print()
+
+    def print_graph_with_tensor_quantization(self):
+        all_ops = self.get_all_ops()
+        for idx, op in enumerate(all_ops):
+            print(idx, op.type, op.name)
+            for idx, tens in enumerate(op.inputs):
+                q = tens.quantization
+                if q is None:
+                    print("    Input  %02d %10s NO QUANTIZATION INFO %s" % (idx, tens.dtype, tens.name))
+                else:
+                    print(
+                        "    Input  %02d %10s min=%s max=%s scale=%s zero_point=%s %s"
+                        % (idx, tens.dtype, q.min, q.max, q.scale_f32, q.zero_point, tens.name)
+                    )
+            for idx, tens in enumerate(op.outputs):
+                q = tens.quantization
+                if q is None:
+                    print("    Output %02d %10s NO QUANTIZATION INFO %s" % (idx, tens.dtype, tens.name))
+                else:
+                    print(
+                        "    Output %02d %10s min=%s max=%s scale=%s zero_point=%s %s"
+                        % (idx, tens.dtype, q.min, q.max, q.scale_f32, q.zero_point, tens.name)
+                    )
+            print()
+
+    def print_passes(self):
+        for idx, ps in enumerate(self.passes):
+            print("%03d %s" % (idx * 2, ps))
+
+    def print_passes_with_tensors(self):
+        for idx, ps in enumerate(self.passes):
+            print("%3d %s" % (idx * 2, ps))
+            for idx, tens in enumerate(ps.inputs):
+                print(
+                    "    Input        %2d %-15s %-15s %-15s     %s"
+                    % (idx, tens.purpose.name, tens.mem_area.name, tens.format.name, tens.name)
+                )
+            for idx, tens in enumerate(ps.intermediates):
+                print(
+                    "    Intermediate %2d %-15s %-15s %-15s     %s"
+                    % (idx, tens.purpose.name, tens.mem_area.name, tens.format.name, tens.name)
+                )
+            for idx, tens in enumerate(ps.outputs):
+                print(
+                    "    Output       %2d %-15s %-15s %-15s     %s"
+                    % (idx, tens.purpose.name, tens.mem_area.name, tens.format.name, tens.name)
+                )
+            print()
+
+    def print_cascaded_passes(self):
+        for idx, ps in enumerate(self.cascaded_passes):
+            print("%3d %s SRAM used %.1f KB" % (idx * 2, ps, ps.sram_used / 1024))
+
+    def print_cascaded_passes_with_tensors(self):
+        for idx, ps in enumerate(self.cascaded_passes):
+            print("%3d %s SRAM used %.1f KB" % (idx * 2, ps, ps.sram_used / 1024))
+            for idx, tens in enumerate(ps.inputs):
+                print(
+                    "    Input        %2d %-15s %-15s %-15s     %s"
+                    % (idx, tens.purpose.name, tens.mem_area.name, tens.format.name, tens.name)
+                )
+            for idx, tens in enumerate(ps.intermediates):
+                print(
+                    "    Intermediate %2d %-15s %-15s %-15s     %s"
+                    % (idx, tens.purpose.name, tens.mem_area.name, tens.format.name, tens.name)
+                )
+            for idx, tens in enumerate(ps.outputs):
+                print(
+                    "    Output       %2d %-15s %-15s %-15s     %s"
+                    % (idx, tens.purpose.name, tens.mem_area.name, tens.format.name, tens.name)
+                )
+            print()
+
+    def print_cascaded_passes_with_tensor_sizes(self):
+        for idx, ps in enumerate(self.cascaded_passes):
+            print("%3d %s SRAM used %.1f KB" % (idx * 2, ps, ps.sram_used / 1024))
+            for idx, tens in enumerate(ps.inputs):
+                print(
+                    "    Input        %2d %7.1f KB %-24s %-15s %-15s %-20s  %s"
+                    % (
+                        idx,
+                        tens.storage_size() / 1024,
+                        tens.storage_shape,
+                        tens.mem_area.name,
+                        tens.purpose.name,
+                        tens.format.name,
+                        tens.name,
+                    )
+                )
+            for idx, tens in enumerate(ps.intermediates):
+                print(
+                    "    Intermediate %2d %7.1f KB %-24s %-15s %-15s %-20s  %s"
+                    % (
+                        idx,
+                        tens.storage_size() / 1024,
+                        tens.storage_shape,
+                        tens.mem_area.name,
+                        tens.purpose.name,
+                        tens.format.name,
+                        tens.name,
+                    )
+                )
+            for idx, tens in enumerate(ps.outputs):
+                print(
+                    "    Output       %2d %7.1f KB %-24s %-15s %-15s %-20s  %s"
+                    % (
+                        idx,
+                        tens.storage_size() / 1024,
+                        tens.storage_shape,
+                        tens.mem_area.name,
+                        tens.purpose.name,
+                        tens.format.name,
+                        tens.name,
+                    )
+                )
+            print()
+
+    def print_high_level_command_stream(self):
+        for idx, cmd in enumerate(self.high_level_command_stream):
+            print("%3d %s" % (idx, cmd))
+
+
+class Graph:
+    def __init__(self, name="<unnamed>", batch_size=1):
+        self.name = name
+        self.batch_size = batch_size
+        self.subgraphs = []
+
+        self.memory_used = {}
+        self.bits_per_element = {}
+        self.total_size = {}
+        self.total_elements = {}
+
+    def get_root_subgraph(self):
+        return self.subgraphs[0]
+
+    def prune_startup_init_pass(self):
+        for sg in self.subgraphs:
+            sg.prune_startup_init_pass()
+
+    def update_consumers(self):
+        for sg in self.subgraphs:
+            sg.update_consumers()
+
+    def refresh_after_modification(self):
+        for sg in self.subgraphs:
+            sg.refresh_after_modification()
+
+    def print_operators(self):
+        for sg in self.subgraphs:
+            sg.print_operators()
+
+    def print_graph(self):
+        for sg in self.subgraphs:
+            sg.print_graph()
+
+    def print_graph_with_tensors(self):
+        for sg in self.subgraphs:
+            sg.print_graph_with_tensors()
+
+    def print_graph_with_tensor_quantization(self):
+        for sg in self.subgraphs:
+            sg.print_graph_with_tensor_quantization()
+
+    def print_passes(self):
+        for sg in self.subgraphs:
+            sg.print_passes()
+
+    def print_passes_with_tensors(self):
+        for sg in self.subgraphs:
+            sg.print_passes_with_tensors()
+
+    def print_cascaded_passes(self):
+        for sg in self.subgraphs:
+            sg.print_cascaded_passes()
+
+    def print_cascaded_passes_with_tensors(self):
+        for sg in self.subgraphs:
+            sg.print_cascaded_passes_with_tensors()
+
+    def print_cascaded_passes_with_tensor_sizes(self):
+        for sg in self.subgraphs:
+            sg.print_cascaded_passes_with_tensor_sizes()
+
+    def print_high_level_command_stream(self):
+        for sg in self.subgraphs:
+            sg.print_high_level_command_stream()
diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py
new file mode 100644
index 0000000..84cc493
--- /dev/null
+++ b/ethosu/vela/npu_performance.py
@@ -0,0 +1,516 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# NPU performance estimation functions to estimate performance of a Pass and CascadedPass. Uses a model that takes the
+# maximum of the 'cycles required for bandwidth' and 'cycles required for computing'.
+#
+# Called during scheduling to evaluate different proposals, as well as post-scheduling to provide a final performance
+# estimate.
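+#
+# Worked example with assumed numbers only: at 500 MHz, a 4 GB/s memory interface
+# moves 8 bytes/cycle, so transferring 1 MB costs ~125,000 cycles; performing
+# 2 M MACs on a 256 MAC/cycle engine costs ~7,813 cycles. The pass estimate is
+# max(125,000, 7,813) = 125,000 cycles, i.e. bandwidth-bound in this example.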
+
+import enum
+from . import numeric_util
+import numpy as np
+from .tensor import TensorPurpose, MemArea, TensorFormat, shape_num_elements, Tensor, TensorBlockTraversal
+from .operation import Operation
+from .data_type import DataType, BaseType
+from .nn_graph import PassPlacement, NpuBlockType, SchedulerRewrite, Pass
+from .architecture_features import Block, Kernel
+
+
+def rolling_buffer_dims_from_passes(arch, ps1, block_config_ps1, ps2, block_config_ps2):
+    ps2_strides = (1, 1, 1, 1)
+    ps2_dilation = (1, 1, 1, 1)
+    for op in ps2.ops:
+        if "strides" in op.attrs:
+            ps2_strides = op.attrs["strides"]
+        if "dilation" in op.attrs:
+            ps2_dilation = op.attrs["dilation"]
+
+    ifm_idx, _, weight_idx, _, _ = op.get_ifm_ifm2_weight_bias_ofm_indices()
+
+    rolling_buffer_sizes = []
+
+    weight_tensor = op.inputs[weight_idx]
+
+    ofm_block = Block(block_config_ps2[-3], block_config_ps2[-4], block_config_ps2[-1])
+    kernel = Kernel(
+        weight_tensor.shape[1], weight_tensor.shape[0], ps2_strides[2], ps2_strides[1], ps2_dilation[2], ps2_dilation[1]
+    )
+    kernel_block = Block(weight_tensor.shape[1], weight_tensor.shape[0], 65536)
+
+    if ps2.npu_block_type in set((NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct)):
+        ifm_block_depth = arch.calc_ifm_block_depth(
+            op.inputs[ifm_idx].shape[-1], op.inputs[ifm_idx].dtype.size_in_bits()
+        )
+    else:
+        ifm_block_depth = block_config_ps2[-1]
+
+    ifm_block = arch.get_ifm_block_size(ifm_block_depth, ofm_block, kernel, kernel_block)
+
+    # The height calculation performed here is for the worst case
+    height = numeric_util.round_up(ifm_block.height + block_config_ps1[0], block_config_ps1[0])
+    width = ifm_block.width
+
+    rolling_buffer_sizes.append(height)
+    rolling_buffer_sizes.append(width)
+
+    return rolling_buffer_sizes
+
+
+class PassCycles(enum.IntEnum):
+    Dpu = 0
+    ElementWise = 1
+    Cpu = 2
+    SramAccess = 3
+    TotalPerPass = 4
+    DramAccess = 5
+    OnChipFlashAccess = 6
+    OffChipFlashAccess = 7
+    Total = 8
+    Size = 9
+
+    def display_name(self):
+        return (
+            "DPU",
+            "Element wise",
+            "CPU",
+            "SRAM Access",
+            "Total per Pass",
+            "DRAM Access",
+            "On-chip Flash Access",
+            "Off-chip Flash Access",
+            "Total",
+            "Size",
+        )[self.value]
+
+    def identifier_name(self):
+        return (
+            "dpu",
+            "element_wise",
+            "cpu",
+            "sram_access",
+            "total_per_pass",
+            "dram_access",
+            "on_chip_flash_access",
+            "off_chip_flash_access",
+            "total",
+            "size",
+        )[self.value]
+
+    @staticmethod
+    def all():
+        return (
+            PassCycles.Dpu,
+            PassCycles.ElementWise,
+            PassCycles.Cpu,
+            PassCycles.SramAccess,
+            PassCycles.DramAccess,
+            PassCycles.OnChipFlashAccess,
+            PassCycles.OffChipFlashAccess,
+            PassCycles.Total,
+        )
+
+
+class MacCount(enum.IntEnum):
+    NeuralNetworkMacs = 0
+    HardwareMacs = 1
+    Size = 2
+
+    def display_name(self):
+        return ("Neural Network Macs", "Hardware Macs", "Size")[self.value]
+
+    def identifier_name(self):
+        return ("nn_macs", "hardware_macs", "size")[self.value]
+
+    @staticmethod
+    def all():
+        return (MacCount.NeuralNetworkMacs, MacCount.HardwareMacs)
+
+
+class BandwidthDirection(enum.IntEnum):
+    Read = 0
+    Write = 1
+    Size = 2
+
+    def display_name(self):
+        return self.name
+
+    def identifier_name(self):
+        return self.name.lower()
+
+    @staticmethod
+    def all():
+        return (BandwidthDirection.Read, BandwidthDirection.Write)
+
+
+def make_bandwidth_array():
+    return np.zeros((MemArea.Size, TensorPurpose.Size, BandwidthDirection.Size))
+
+
+def make_macs_array():
+    return np.zeros(MacCount.Size, np.int64)
+
+
+def make_cycles_array():
+    return np.zeros(PassCycles.Size)
+
+
+def make_metrics_arrays():
+    return (make_bandwidth_array(), make_macs_array(), make_cycles_array())
+
+
+def get_n_blocks_and_area(
+    ifm_brick_size, ifm_height_width, orig_skirt, clamped_skirt, block_config, min_block_size, strides
+):
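+    # Estimates how many IFM blocks a pass will read and the total IFM area covered (including reads
+    # into the padding/skirt); returns (total_blocks, total_area, block_setup).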
+
+    ifm_block_config = (block_config[0] * strides[1], block_config[1] * strides[2])
+
+    n_normal_blocks = []
+    remainder_size = []
+    for i in range(2):
+        non_skirt_dim = ifm_height_width[i] - orig_skirt[i] - orig_skirt[2 + i]
+        n_blocks = non_skirt_dim // ifm_block_config[i]
+        n_normal_blocks.append(n_blocks)
+        remainder_dim = numeric_util.round_up(
+            ((non_skirt_dim - n_blocks * ifm_block_config[i] - 1) // strides[i + 1]) + 1, min_block_size[i]
+        )
+        remainder_size.append(remainder_dim)
+
+    # Note that this deliberately includes reads into the edge padding.
+
+    # There are four cases in total: the full blocks (0), the remainders along the two edges that do not
+    # fill a complete block (1 and 2), and the corner remainder (3), as sketched below.
+
+    # 0000000001
+    # 0000000001
+    # 0000000001
+    # 0000000001
+    # 0000000001
+    # 0000000001
+    # 2222222223
+    total_blocks = 0
+    total_area = 0
+
+    block_setup = (
+        (n_normal_blocks[0] * n_normal_blocks[1], block_config),
+        (1 * n_normal_blocks[1], (remainder_size[0], block_config[1])),
+        (n_normal_blocks[0] * 1, (block_config[0], remainder_size[1])),
+        (1 * 1, remainder_size),
+    )
+
+    for n_blocks, block_size in block_setup:
+        if block_size[0] == 0 or block_size[1] == 0:
+            continue
+        read_dims = [0, 0]
+        for i in range(2):
+            read_dims[i] = (
+                numeric_util.round_up(clamped_skirt[i], ifm_brick_size[i + 1])
+                + block_size[i] * strides[i + 1]
+                + numeric_util.round_up(clamped_skirt[2 + i], ifm_brick_size[i + 1])
+            )
+        assert n_blocks >= 0
+        total_blocks += n_blocks
+        total_area += n_blocks * read_dims[0] * read_dims[1]
+    assert total_blocks >= 1
+    return total_blocks, total_area, block_setup
+
+
+def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=None, force_outputs_to_fast_storage=False):
+    if rewrite_list is None:
+        rewrite_list = []
+    if block_config is None:
+        block_config = ps.block_config
+    bws = make_bandwidth_array()
+    macs = make_macs_array()
+    cycles = make_cycles_array()
+    blocks = 0
+    ifm_read_multiple = 1
+    weight_read_multiple = 0
+
+    if ps.placement in set((PassPlacement.MemoryOnly, PassPlacement.StartupInit)):
+        return bws, macs, cycles, blocks, ifm_read_multiple, weight_read_multiple  # nothing real happening in this pass
+
+    min_block_size = arch.min_block_sizes[ps.npu_block_type]
+
+    skirt = (0, 0, 0, 0)
+    explicit_padding = (0, 0, 0, 0)
+    primary_op = ps.primary_op
+    replacement_read_bws = {}
+    if primary_op:
+        skirt = primary_op.attrs.get("skirt", skirt)
+        explicit_padding = primary_op.attrs.get("explicit_padding", explicit_padding)
+        assert primary_op.attrs["npu_block_type"] == ps.npu_block_type
+        npu_block_type = primary_op.attrs["npu_block_type"]
+
+        ifm_tensor, _, weight_tensor, ofm_tensor = ps.get_primary_op_ifm_ifm2_weights_ofm()
+
+        npu_convolution_ops = set((NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise))
+        if (npu_block_type == NpuBlockType.Pooling and len(ifm_tensor.shape) == 4) or (
+            npu_block_type in npu_convolution_ops
+        ):
+
+            batch_size = ifm_tensor.shape[0]
+            ifm_tensor_shape = list(ifm_tensor.shape)
+            ifm_depth = ifm_tensor.bandwidth_shape[3]
+
+            # add in padding
+            ifm_tensor_shape[1] += explicit_padding[0] + explicit_padding[2]  # height += top and bottom
+            ifm_tensor_shape[2] += explicit_padding[1] + explicit_padding[3]  # width  += left and right
+
+            strides = primary_op.attrs["strides"]
+            if npu_block_type != NpuBlockType.Pooling:
+                weight_tensor_shape = weight_tensor.shape
+                weight_tensor_bandwidth_shape = weight_tensor.bandwidth_shape
+                weight_tensor_element_size = weight_tensor.element_size()
+                weight_tensor_bandwidth_compression_scale = weight_tensor.bandwidth_compression_scale
+                nn_ops = (
+                    int(ofm_tensor.shape[0])
+                    * int(ofm_tensor.shape[1])
+                    * int(ofm_tensor.shape[2])
+                    * int(weight_tensor_shape[0])
+                    * int(weight_tensor_shape[1])
+                    * int(weight_tensor_shape[2])
+                    * int(weight_tensor_shape[3])
+                    / int(strides[1])
+                    / int(strides[2])
+                )
+            else:
+                weight_tensor_shape = [
+                    primary_op.attrs["ksize"][1],
+                    primary_op.attrs["ksize"][2],
+                    1,
+                    ifm_tensor_shape[3],
+                ]
+                weight_tensor_bandwidth_shape = weight_tensor_shape
+                weight_tensor_element_size = 0
+                weight_tensor_bandwidth_compression_scale = 0.0
+                nn_ops = 0  # pooling doesn't count as NN ops
+
+            kernel_dims = weight_tensor_shape[:2]
+
+            sub_kernel_limits = arch.sub_kernel_limits[npu_block_type]
+            # count the sub kernels; the IFM block needs to be refetched for each of them
+            n_sub_kernels_y = numeric_util.round_up_divide(kernel_dims[0], sub_kernel_limits[0])
+            n_sub_kernels_x = numeric_util.round_up_divide(kernel_dims[1], sub_kernel_limits[1])
+            n_sub_kernels = n_sub_kernels_y * n_sub_kernels_x
+
+            clamped_skirt = list(skirt)
+            clamped_skirt[2] = min(clamped_skirt[2], sub_kernel_limits[0] - 1 - clamped_skirt[0])
+            clamped_skirt[3] = min(clamped_skirt[3], sub_kernel_limits[1] - 1 - clamped_skirt[1])
+            n_blocks, area, block_setup = get_n_blocks_and_area(
+                ifm_tensor.brick_size,
+                ifm_tensor_shape[1:3],
+                skirt,
+                clamped_skirt,
+                block_config,
+                min_block_size,
+                strides,
+            )
+
+            blocks = n_blocks * numeric_util.round_up_divide(weight_tensor_shape[3], block_config[3])
+
+            n_weight_stages = numeric_util.round_up_divide(weight_tensor_bandwidth_shape[3], block_config[3])
+            if npu_block_type == NpuBlockType.ConvolutionDepthWise or npu_block_type == NpuBlockType.Pooling:
+                n_weight_stages = 1  # force to no reread
+
+            ifm_tensor_bw = (
+                n_sub_kernels
+                * batch_size
+                * area
+                * ifm_depth
+                * n_weight_stages
+                * ifm_tensor.element_size()
+                * ifm_tensor.bandwidth_compression_scale
+            )
+            replacement_read_bws[ifm_tensor] = ifm_tensor_bw
+            ifm_read_multiple = n_weight_stages
+
+            replacement_read_bws[weight_tensor] = (
+                batch_size
+                * shape_num_elements(weight_tensor_bandwidth_shape)
+                * weight_tensor_element_size
+                * weight_tensor_bandwidth_compression_scale
+                * n_blocks
+            )  # read once per block and batch
+            weight_read_multiple = n_blocks
+
+            n_kernel_xy = kernel_dims[0] * kernel_dims[1]
+            n_input_channels_at_a_time = block_config[2]
+
+            if npu_block_type == NpuBlockType.Pooling or weight_tensor.block_traversal in set(
+                (TensorBlockTraversal.PartKernelFirst, TensorBlockTraversal.DepthWise)
+            ):
+                n_input_channels_at_a_time = numeric_util.round_up_divide(n_input_channels_at_a_time, 4)
+                n_kernel_xy = max(
+                    n_kernel_xy, 4
+                )  # need at least 4, as this is the minimum duty cycle for secondary accumulator writes
+                if weight_tensor is not None:
+                    n_kernel_xy = numeric_util.round_up(
+                        n_kernel_xy, 4
+                    )  # weights need to be read in blocks of 4
+
+            num_mac_ops = 0
+            for n_blocks_for_size, block_size in block_setup:
+                num_mac_ops += (
+                    batch_size
+                    * n_blocks_for_size
+                    * block_size[0]
+                    * block_size[1]
+                    * numeric_util.round_up(weight_tensor_shape[2], n_input_channels_at_a_time)
+                    * numeric_util.round_up(weight_tensor_shape[3], block_config[3])
+                    * n_kernel_xy
+                )
+
+            if npu_block_type == NpuBlockType.Pooling:
+                # TODO: improve pooling estimation
+                cycles[PassCycles.Dpu] = num_mac_ops / arch.num_macs_per_cycle / 2
+            else:
+                cycles[PassCycles.Dpu] = num_mac_ops / arch.num_macs_per_cycle
+            macs[MacCount.NeuralNetworkMacs] += nn_ops
+            macs[MacCount.HardwareMacs] += num_mac_ops
+
+        elif npu_block_type == NpuBlockType.VectorProduct:
+            nn_macs = (
+                ifm_tensor.shape[0]
+                * numeric_util.round_up(weight_tensor.shape[-2], block_config[2])
+                * numeric_util.round_up(weight_tensor.shape[-1], block_config[3])
+            )
+            num_mac_ops = nn_macs
+
+            cycles[PassCycles.Dpu] = num_mac_ops / arch.num_macs_per_cycle
+            macs[MacCount.NeuralNetworkMacs] += nn_macs
+            macs[MacCount.HardwareMacs] += num_mac_ops
+
+            blocks = 1 * numeric_util.round_up_divide(weight_tensor.shape[-1], block_config[3])
+
+            non_zero_fraction = 1.0
+            if ifm_tensor.values is not None:
+                nz_vector = np.amax(ifm_tensor.values != 0, axis=0)  # max across batch axis
+                non_zero_fraction = np.average(nz_vector)
+
+            replacement_read_bws[ifm_tensor] = ifm_tensor.bandwidth()
+            replacement_read_bws[weight_tensor] = weight_tensor.bandwidth() * non_zero_fraction
+            ifm_read_multiple = 1
+            weight_read_multiple = non_zero_fraction
+    else:
+        if ps.placement == PassPlacement.Npu and len(ps.outputs):
+            # Assume element-wise operation going through the element pipelines.
+            # Work out how many elements we have and calculate performance.
+            out = ps.outputs[0]
+            elms = out.elements()
+
+            cycles[PassCycles.ElementWise] = numeric_util.round_up_divide(elms, arch.num_elem_wise_units)
+
+    if ps.placement == PassPlacement.Cpu:
+        cycles[PassCycles.Cpu] = arch.cpu_cycle_estimate(ps.ops[0])
+
+    # apply the desired rewrites
+    for rewrite_op, tens, _, _, _, ps_to_rewrite in rewrite_list:
+        if ps != ps_to_rewrite:
+            continue
+        if rewrite_op == SchedulerRewrite.Nop:
+            pass  # these are fine, no bandwidth changes
+        elif rewrite_op in (SchedulerRewrite.ChangeTensorSubPurpose,):
+            bws[arch.fast_storage_mem_area][tens.purpose][BandwidthDirection.Read] += replacement_read_bws[tens]
+            replacement_read_bws[tens] = 0
+
+    for tens in ps.outputs:
+        if force_outputs_to_fast_storage:
+            bws[arch.fast_storage_mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()
+        else:
+            bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()
+
+    for tens in ps.intermediates:
+        bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()
+
+        if tens in replacement_read_bws:
+            bw = replacement_read_bws[tens]
+        else:
+            bw = tens.bandwidth()
+
+        bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += bw
+
+    for tens in ps.inputs:
+        if tens in replacement_read_bws:
+            bw = replacement_read_bws[tens]
+        else:
+            bw = tens.bandwidth()
+
+        bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += bw
+
+    cycles[PassCycles.SramAccess] = np.sum(bws[MemArea.Sram]) / arch.memory_bandwidths_per_cycle[MemArea.Sram]
+    cycles[PassCycles.TotalPerPass] = np.max(cycles[: PassCycles.TotalPerPass])
+
+    # Build quick access counts for the current pass only; these are not the final numbers
+    update_summary_cycles(arch, bws, macs, cycles)
+
+    return bws, macs, cycles, blocks, ifm_read_multiple, weight_read_multiple
+
+
+def update_summary_cycles(arch, bws, macs, cycles):
+    cycles[PassCycles.DramAccess] = np.sum(bws[MemArea.Dram]) / arch.memory_bandwidths_per_cycle[MemArea.Dram]
+    cycles[PassCycles.OnChipFlashAccess] = (
+        np.sum(bws[MemArea.OnChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OnChipFlash]
+    )
+    cycles[PassCycles.OffChipFlashAccess] = (
+        np.sum(bws[MemArea.OffChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OffChipFlash]
+    )
+
+    cycles[PassCycles.Total] = np.max(cycles[: PassCycles.Total])
+    return cycles
+
+
+def collate_stats_for_cascaded_pass(arch, bws, macs, cycles):
+    return bws, macs, cycles
+
+
+def performance_for_cascaded_pass(arch, cps):
+    total_bws = make_bandwidth_array()
+    total_macs = make_macs_array()
+    total_cycles = make_cycles_array()
+
+    for ps in cps.passes:
+        bws, macs, cycles, blocks, _, _ = performance_metrics_for_pass(arch, ps)
+        ps.bandwidths = bws
+        ps.macs = macs
+        ps.cycles = cycles
+        ps.n_blocks = blocks
+        total_bws += bws
+        total_macs += macs
+        total_cycles += cycles
+
+    bws, macs, cycles = collate_stats_for_cascaded_pass(arch, total_bws, total_macs, total_cycles)
+    cps.bandwidths = bws
+    cps.macs = macs
+    cps.cycles = cycles
+    return bws, macs, cycles
+
+
+def calc_performance_for_network(nng, arch):
+    total_bws = make_bandwidth_array()
+    total_macs = np.zeros(MacCount.Size)
+    total_cycles = np.zeros(PassCycles.Size)
+
+    for sg in nng.subgraphs:
+        for cps in sg.cascaded_passes:
+            bws, macs, cycles = performance_for_cascaded_pass(arch, cps)
+            total_bws += bws
+            total_macs += macs
+            total_cycles += cycles
+            total_cycles += arch.inter_pass_cycle_delay
+
+    nng.bandwidths = total_bws
+    nng.macs = total_macs
+    nng.cycles = total_cycles
diff --git a/ethosu/vela/npu_serialisation.py b/ethosu/vela/npu_serialisation.py
new file mode 100644
index 0000000..4542c25
--- /dev/null
+++ b/ethosu/vela/npu_serialisation.py
@@ -0,0 +1,145 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Serialises and packs an NPU subgraph into tensors.
+
+from .nn_graph import PassPlacement
+from .tensor import MemArea, Tensor, TensorPurpose, TensorFormat
+from .operation import Operation
+from .data_type import DataType
+import numpy as np
+from . import driver_actions
+import struct
+
+
+def make_memory_tensor(name, mem_area, sz, want_values, arch):
+    tens = Tensor([sz], DataType.uint8, name)
+    tens.mem_area = mem_area
+    tens.purpose = TensorPurpose.FeatureMap
+    tens.set_format(TensorFormat.NHWC, arch)
+    if want_values:
+        tens.values = np.zeros(tens.shape, np.uint8)
+    return tens
+
+
+def copy_compressed_values_to_memory_tensor(memory_tensor, src_tensor):
+    start_addr = src_tensor.address
+    for compressed_values in src_tensor.compressed_values:
+        end_addr = start_addr + len(compressed_values)
+        memory_tensor.values[start_addr:end_addr] = compressed_values
+        start_addr = end_addr
+
+
+def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, flash_tens):
+    if sg.placement != PassPlacement.Npu:
+        return scratch_tens, flash_tens
+
+    flash_area = arch.permanent_storage_mem_area
+    scratch_area = MemArea.Sram
+
+    flash_size = sg.memory_used.get(flash_area, 0)
+    scratch_size = sg.memory_used.get(scratch_area, 0)
+
+    # Prepare driver actions for this command tensor
+    da_list = []
+    driver_actions.emit_fourcc(da_list, "COP1")
+    driver_actions.emit_config(da_list, 0, 1, arch)
+    driver_actions.emit_cmd_stream_header(da_list, len(sg.register_command_stream))
+
+    # Append command stream words
+    da_list.extend(sg.register_command_stream)
+
+    # Convert to bytes
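+    # ("<{n}I" packs the words as little-endian unsigned 32-bit integers)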
+    payload_bytes = struct.pack("<{0}I".format(len(da_list)), *da_list)
+
+    command_stream_size_bytes = len(payload_bytes)
+
+    # Adjust the bits per element calculation to exclude metadata generated by Vela
+    nng.total_size[flash_area] = nng.total_size.get(flash_area, 0) - flash_size - command_stream_size_bytes
+    nng.total_elements[flash_area] = nng.total_elements.get(flash_area, 0) - flash_size - command_stream_size_bytes
+    nng.total_size[scratch_area] = nng.total_size.get(scratch_area, 0) - scratch_size
+    nng.total_elements[scratch_area] = nng.total_elements.get(scratch_area, 0) - scratch_size
+
+    if flash_tens is None and scratch_tens is None:
+        # First Npu subgraph, create scratch and flash tensors
+        sg.scratch_tensor = make_memory_tensor(sg.name + "_scratch", scratch_area, scratch_size, False, arch)
+        sg.scratch_tensor.purpose = TensorPurpose.Scratch
+        sg.flash_tensor = make_memory_tensor(sg.name + "_flash", flash_area, flash_size, True, arch)
+    else:
+        sg.scratch_tensor = scratch_tens
+        sg.scratch_tensor.shape[0] += scratch_size
+        sg.flash_tensor = flash_tens
+        sg.flash_tensor.shape[0] += flash_size
+
+    for cps in sg.cascaded_passes:
+        for ps in cps.passes:
+            if ps.placement == PassPlacement.Npu and ps.weight_tensor is not None:
+                # For DMA ops, ps.weight_tensor refers to the SRAM weight tensor, so its address is the destination
+                # address where the weights will be placed in SRAM. Copy from the Flash weight tensor (the DMA
+                # input) instead so that the correct Flash address is used.
+                if ps.weight_tensor.ops[0].type == "DMA":
+                    copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.weight_tensor.ops[0].inputs[0])
+                else:
+                    copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.weight_tensor)
+
+                copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.scale_tensor)
+
+    sg.command_stream_tensor = make_memory_tensor(
+        sg.name + "_command_stream", flash_area, command_stream_size_bytes, True, arch
+    )
+    sg.command_stream_tensor.values = np.frombuffer(payload_bytes, dtype=np.uint8)
+
+    return sg.scratch_tensor, sg.flash_tensor
+
+
+def add_const_tens_to_startup_cascaded_pass(startup_cps, tens):
+    op = Operation("Const", tens.name + "_const")
+    op.outputs = [tens]
+    tens.ops = [op]
+    startup_cps.passes[0].ops.insert(0, op)
+    startup_cps.passes[0].outputs.insert(0, tens)
+    startup_cps.outputs.insert(0, tens)
+
+
+def rewrite_npu_call_ops(nng, sg, arch):
+    if sg.placement != PassPlacement.Cpu:
+        return
+
+    startup_cps = sg.cascaded_passes[0]
+
+    for idx, cps in enumerate(sg.cascaded_passes):
+        for ps in cps.passes:
+            for op in ps.ops:
+                if op.type == "NpuOp":
+                    callee = op.attrs["subgraph"]
+                    op.attrs["custom_options"] = {"type": op.type}
+
+                    sz = 0
+                    for tens in [callee.scratch_tensor, callee.flash_tensor, callee.command_stream_tensor]:
+                        op.inputs.insert(0, tens)
+                        ps.inputs.insert(0, tens)
+                        cps.inputs.insert(0, tens)
+                        if tens != callee.scratch_tensor:
+                            add_const_tens_to_startup_cascaded_pass(startup_cps, tens)
+                        sz += tens.storage_size()
+
+                    for prev_cps in sg.cascaded_passes[: idx + 1]:
+                        prev_cps.sram_used += sz
+
+                    if callee.scratch_tensor is not None:
+                        cps.sram_used += callee.scratch_tensor.storage_size()
diff --git a/ethosu/vela/numeric_util.py b/ethosu/vela/numeric_util.py
new file mode 100644
index 0000000..e5bc88b
--- /dev/null
+++ b/ethosu/vela/numeric_util.py
@@ -0,0 +1,89 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Numerical utilities for various types of rounding etc.
+
+import math
+import numpy as np
+
+
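+# Round a up to the nearest multiple of b, e.g. round_up(10, 4) == 12; round_up_divide is the matching
+# ceiling division, e.g. round_up_divide(10, 4) == 3.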
+def round_up(a, b):
+    return ((a + b - 1) // b) * b
+
+
+def round_up_divide(a, b):
+    return (a + b - 1) // b
+
+
+def round_up_to_int(v):
+    return int(math.ceil(v))
+
+
+def round_down_to_power_of_two(v):
+    assert v > 0
+    while v & (v - 1):
+        v &= v - 1
+
+    return v
+
+
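+# e.g. round_down_to_power_of_two(12) == 8 and round_up_to_power_of_two(12) == 16; exact powers of two are unchanged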
+def round_up_to_power_of_two(v):
+    return round_down_to_power_of_two(2 * v - 1)
+
+
+def round_down_log2(v):
+    return int(math.floor(np.log2(v)))
+
+
+def round_up_log2(v):
+    return int(math.ceil(np.log2(v)))
+
+
+def round_to_int(v):
+    return np.rint(v).astype(np.int64)
+
+
+# Performs rounding away from zero.
+# n.b. This is identical to C++11 std::round()
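+# e.g. round_away_zero(2.5) == 3.0 and round_away_zero(-2.5) == -3.0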
+def round_away_zero(f):
+    r = -0.5 if (f < 0) else 0.5
+    return np.trunc(f + r)
+
+
+def quantise_float32(f, scale=1.0, zero_point=0):
+    return zero_point + int(round_away_zero(np.float32(f) / np.float32(scale)))
+
+
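+# Clamped tanh/sigmoid: outside the clamp range the true functions are within ~7e-4 of their asymptotes
+# (tanh(4) ≈ 0.99933, sigmoid(8) ≈ 0.99966), so exact saturation values are returned instead.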
+def clamp_tanh(x):
+    if x <= -4:
+        y = -1.0
+    elif x >= 4:
+        y = 1.0
+    else:
+        y = math.tanh(x)
+    return y
+
+
+def clamp_sigmoid(x):
+    if x <= -8:
+        y = 0.0
+    elif x >= 8:
+        y = 1.0
+    else:
+        y = 1 / (1 + math.exp(-x))
+    return y
diff --git a/ethosu/vela/operation.py b/ethosu/vela/operation.py
new file mode 100644
index 0000000..d2f2806
--- /dev/null
+++ b/ethosu/vela/operation.py
@@ -0,0 +1,285 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Internal representation of a Neural Network Operation.
+
+import enum
+
+
+class NpuBlockType(enum.Enum):
+    Default = 0
+    ConvolutionMxN = 1
+    VectorProduct = 2
+    Pooling = 3
+    ConvolutionDepthWise = 4
+    ElementWise = 5
+
+
+class Operation:
+    """Class representing a Neural Network operation. Has a name, a type,
+input and output tensors, as well as an attribute dictionary."""
+
+    __slots__ = "type", "name", "attrs", "inputs", "outputs", "flops", "scheduled_pass", "run_on_npu"
+
+    def __init__(self, op_type, name):
+        self.type = op_type
+        self.name = name
+        self.attrs = {}
+        self.inputs = []
+        self.outputs = []
+        self.flops = 0
+        self.run_on_npu = True
+        self.scheduled_pass = None
+
+    def clone(self, suffix="_clone"):
+        res = Operation(self.type, self.name + suffix)
+
+        res.attrs = dict(self.attrs)
+        res.inputs = list(self.inputs)
+        res.outputs = list(self.outputs)
+        res.flops = self.flops
+        res.scheduled_pass = self.scheduled_pass
+
+        return res
+
+    def __str__(self):
+        return "<nng.Operation '%s' type=%s>" % (self.name, self.type)
+
+    __repr__ = __str__
+
+    def get_ifm_ifm2_weight_bias_ofm_indices(self):
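+        # Returns indices into self.inputs/self.outputs as (ifm, ifm2, weight, bias, ofm); -1 means not present.
+        # e.g. a Conv2DBiasAct whose npu_block_type is ConvolutionMxN and that has a bias input gives (0, -1, 1, 2, 0).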
+        ifm_idx = -1
+        ifm2_idx = -1
+        weight_idx = -1
+        bias_idx = -1
+        ofm_idx = -1
+        npu_block_type = self.attrs.get("npu_block_type", NpuBlockType.Default)
+        if npu_block_type in set((NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise)):
+            ifm_idx = 0
+            weight_idx = 1
+            ofm_idx = 0
+
+            if self.type in set(("Conv2DBiasAct", "DepthwiseConv2dBiasAct", "TransposeConvAct")):
+                if len(self.inputs) >= 3:
+                    bias_idx = 2
+
+        elif npu_block_type == NpuBlockType.Pooling:
+            ifm_idx = 0
+            ofm_idx = 0
+        elif npu_block_type == NpuBlockType.VectorProduct:
+            ifm_idx = 0
+            weight_idx = 1
+            ofm_idx = 0
+
+            if self.type in set(("FullyConnectedAct",)):
+                if len(self.inputs) >= 3:
+                    bias_idx = 2
+
+            if self.type == "BlockLSTM":
+                ifm_idx = 3
+                weight_idx = 4
+                ofm_idx = 6
+
+        elif npu_block_type == NpuBlockType.ElementWise:
+            ifm_idx = 0
+            ifm2_idx = 1
+            ofm_idx = 0
+
+            # LeakyRelu and Abs have a single IFM
+            if self.type in set(("LeakyRelu", "Abs")):
+                ifm2_idx = -1
+
+        elif self.type == "Conv2DBackpropInput":
+            ifm_idx = 2
+            weight_idx = 1
+            ofm_idx = 0
+
+        elif self.type in set(("Squeeze", "Reshape", "QuantizedReshape", "ExpandDims")):
+            ifm_idx = 0
+            ofm_idx = 0
+
+        elif self.is_split_op():
+            ifm_idx = 0
+            ofm_idx = 0
+            if self.type == "Split":
+                ifm_idx = 1
+
+        elif self.is_concat_op():
+            ifms, _ = self.get_concat_inputs_axis()
+            ifm_idx = self.inputs.index(ifms[0])
+            if len(ifms) > 1:
+                ifm2_idx = self.inputs.index(ifms[1])
+            ofm_idx = 0
+
+        return ifm_idx, ifm2_idx, weight_idx, bias_idx, ofm_idx
+
+    def get_ifm_ifm2_weights_ofm(self):
+        ifm_tensor = None
+        ifm2_tensor = None
+        weight_tensor = None
+        ofm_tensor = None
+
+        ifm_idx, ifm2_idx, weight_idx, bias_idx, ofm_idx = self.get_ifm_ifm2_weight_bias_ofm_indices()
+        if ifm_idx != -1:
+            ifm_tensor = self.inputs[ifm_idx]
+        if ifm2_idx != -1:
+            ifm2_tensor = self.inputs[ifm2_idx]
+        if weight_idx != -1:
+            weight_tensor = self.inputs[weight_idx]
+        if ofm_idx != -1:
+            ofm_tensor = self.outputs[ofm_idx]
+
+        return ifm_tensor, ifm2_tensor, weight_tensor, ofm_tensor
+
+    def get_ifm_weights_biases_ofm(self):
+        ifm_tensor = None
+        weight_tensor = None
+        bias_tensor = None
+        ofm_tensor = None
+
+        ifm_idx, _, weight_idx, bias_idx, ofm_idx = self.get_ifm_ifm2_weight_bias_ofm_indices()
+        if ifm_idx != -1:
+            ifm_tensor = self.inputs[ifm_idx]
+        if weight_idx != -1:
+            weight_tensor = self.inputs[weight_idx]
+        if bias_idx != -1:
+            bias_tensor = self.inputs[bias_idx]
+        if ofm_idx != -1:
+            ofm_tensor = self.outputs[ofm_idx]
+
+        return ifm_tensor, weight_tensor, bias_tensor, ofm_tensor
+
+    concat_ops = set(("Concat", "ConcatV2", "QuantizedConcat", "ConcatTFLite", "PackReshaped"))
+
+    def is_concat_op(self):
+        return self.type in Operation.concat_ops
+
+    def get_concat_inputs_axis(self):
+        assert self.is_concat_op()
+
+        if self.type == "ConcatV2":
+            axis_tensor = self.inputs[-1]
+            inputs = self.inputs[:-1]
+        elif self.type == "Concat":
+            axis_tensor = self.inputs[0]
+            inputs = self.inputs[1:]
+        elif self.type == "QuantizedConcat":
+            axis_tensor = self.inputs[0]
+            inputs = self.inputs[1:]
+            inputs = inputs[: len(inputs) // 3]  # Skip min/max
+
+        if self.type == "ConcatTFLite":
+            inputs = self.inputs
+            axis = self.attrs["axis"]
+        elif self.type == "PackReshaped":
+            # Requires fixup_pack_input to be called before this point
+            inputs = self.inputs
+            axis = self.attrs["axis"]
+            assert len(self.inputs) == self.attrs["values_count"]
+        else:
+            assert len(axis_tensor.ops) == 1 and axis_tensor.ops[0].type == "Const"
+            axis = int(axis_tensor.values)
+
+        return inputs, axis
+
+    split_ops = set(("Split", "StridedSlice", "Slice", "UnpackReshaped"))
+
+    def is_split_op(self):
+        return self.type in Operation.split_ops
+
+    def get_split_inputs_axis(self):
+        assert self.is_split_op()
+
+        offset_start = None
+        offset_end = None
+        axis = None
+        if self.type == "Split":
+            # TODO: Extend split capabilities
+            # If num_or_size_splits is an integer, then value is split along dimension axis into num_split smaller
+            # tensors. This requires that num_split evenly divides value.shape[axis].
+            # If num_or_size_splits is a 1-D Tensor (or list), we call it size_splits and value is split into
+            # len(size_splits) elements. The shape of the i-th element has the same size as the value except along
+            # dimension axis where the size is size_splits[i].
+            num_splits = self.attrs.get("num_splits")
+            axis_tens = self.inputs[0]
+            assert len(axis_tens.ops) == 1 and axis_tens.ops[0].type == "Const"
+            axis = int(axis_tens.values)
+            input_tens = self.inputs[1]
+            outputs = self.outputs
+            assert num_splits == len(outputs)
+
+        elif self.type == "Slice":
+            input_tens, begin_tens, size_tens = self.inputs
+            outputs = self.outputs
+            offset_start = [0] * len(input_tens.shape)
+            offset_end = [0] * len(input_tens.shape)
+
+            for idx in range(len(begin_tens.values)):
+                # Check if the op should slice in dimension idx
+                if size_tens.values[idx] != input_tens.shape[idx]:
+                    offset_start[idx] = begin_tens.values[idx]
+                    offset_end[idx] = size_tens.values[idx] + offset_start[idx]
+
+        elif self.type == "StridedSlice":
+            input_tens, begin_tens, end_tens, strides_tens = self.inputs
+            outputs = self.outputs
+            out_tens = outputs[0]
+            offset_start = [0] * len(outputs[0].shape)
+            offset_end = [0] * len(outputs[0].shape)
+
+            # Extract masks
+            begin_mask = self.attrs["begin_mask"]
+            ellipsis_mask = self.attrs["ellipsis_mask"]
+            end_mask = self.attrs["end_mask"]
+            new_axis_mask = self.attrs["new_axis_mask"]
+            shrink_axis_mask = self.attrs["shrink_axis_mask"]
+            # TODO: Either extend this to support these different masks or check
+            # for this at an earlier stage and place the op on Cpu if needed
+            assert begin_mask == end_mask
+            assert new_axis_mask == ellipsis_mask == 0
+            # shrink_axis_mask is not supported by the Operation class but the operation
+            # may have the attribute modified and handled in the graph optimization phase.
+            assert shrink_axis_mask == 0
+            assert len(input_tens.shape) == len(out_tens.shape)
+
+            for idx in range(len(input_tens.shape)):
+                # If the i-th bit in begin_mask is set then the value in begin[i] should be ignored
+                if (begin_mask & (1 << idx)) == 0:
+                    # Check if the op should slice in dimension idx
+                    if end_tens.values[idx] != input_tens.shape[idx] or (
+                        end_tens.values[idx] == input_tens.shape[idx] and begin_tens.values[idx] != 0
+                    ):
+                        offset_start[idx] = begin_tens.values[idx]
+                        offset_end[idx] = end_tens.values[idx]
+
+                else:
+                    # Don't slice in this axis, instead use fullest possible range
+                    continue
+
+        elif self.type == "UnpackReshaped":
+            # Requires fixup_unpack_output to be called before this point
+            input_tens = self.inputs[0]
+            outputs = self.outputs
+            axis = self.attrs["axis"]
+            num_splits = self.attrs["num"]
+            # The number of outputs has to equal the size of the dimension being unpacked
+            assert num_splits == len(outputs) == input_tens.shape[axis]
+        else:
+            assert False
+
+        return input_tens, outputs, axis, offset_start, offset_end
diff --git a/ethosu/vela/pass_packing.py b/ethosu/vela/pass_packing.py
new file mode 100644
index 0000000..663520f
--- /dev/null
+++ b/ethosu/vela/pass_packing.py
@@ -0,0 +1,489 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Packs a subgraph with Neural Network Operations into Passes. Each Pass has one or more Operations.
+
+from .nn_graph import Operation, Pass, PassPlacement, TensorPurpose, NpuBlockType, Tensor
+import collections
+import enum
+from .data_type import BaseType, DataType
+
+
+class PassFlags(enum.Flag):
+    Empty = 0
+    Pre = 1
+    Main = 2
+    Post = 4
+    Mac = 8
+    Dma = 32
+    ElementWise = 256
+    Npu = 512
+    Cpu = 1024
+    StartupInit = 2048
+    MemoryOnly = 4096
+    PostFusingLimited = 8192
+
+
+npu_pre_ops = set(("QuantizedResizeBilinear", "SplitSliceRead",))
+
+mac_main_ops = set(
+    (
+        # convolutions
+        "Conv2DBiasAct",
+        "Conv2D",
+        "QuantizedConv2D",
+        "Conv2DBackpropInputSwitched",
+        # depth-wise convolutions
+        "DepthwiseConv2dBiasAct",
+        "DepthwiseConv2dNative",
+        "QuantizedDepthwiseConv2D",
+        # FC layers
+        "QuantizedMatMul",
+        "MatMul",
+        "FullyConnectedAct",
+        # RNN/LSTM/GRU
+        "BlockLSTM",
+        # pooling
+        "QuantizedMaxPool",
+        "QuantizedAvgPool",
+        "AvgPool",
+        "MaxPool",
+        "AvgPoolAct",
+        "MaxPoolAct",
+    )
+)
+
+binary_elem_wise_main_ops = set(
+    (
+        # binary element-wise
+        "AddAct",
+        "MulAct",
+        "SubAct",
+        "QuantizedAdd",
+        "QuantizedSub",
+        "QuantizedMul",
+        "Mul",
+        "Add",
+        "Sub",
+        "Minimum",
+        "Maximum",
+    )
+)
+
+unary_elem_wise_main_ops = set(("LeakyRelu", "Abs"))  # Unary element-wise operations
+
+elem_wise_main_ops = binary_elem_wise_main_ops | unary_elem_wise_main_ops
+
+activation_ops = set(("QuantizedRelu", "QuantizedRelu1", "QuantizedRelu6", "Relu", "Relu6", "ReluN1To1"))
+npu_post_ops = activation_ops | set(
+    # Bias-add operations: Get rid of these once we have rewrites from Conv2D + BiasAdd + Activation to Conv2DBiasAct.
+    ("Mul", "Add", "QuantizedBiasAdd", "Requantize", "QuantizedBatchNorm", "BiasAdd", "FusedBatchNorm")
+)
+
+npu_post_fuse_limited_ops = set(
+    # Set of post operators that should not be fused with main/elementwise ops
+    ("ConcatSliceWrite", "Sigmoid", "Tanh")
+)
+
+elem_wise_ops = elem_wise_main_ops | activation_ops | set(("Sigmoid", "Tanh"))
+
+
+quantization_ops = set(("Dequantize", "QuantizeV2", "Max", "Min"))
+cpu_ops = (
+    set(("Softmax", "QuantizedSoftmax", "LRN", "Shape", "QuantizedPad", "Pad", "AddN"))
+    | quantization_ops
+)
+
+npu_dma_ops = set(("DMA",))
+startup_init_ops = set(("Const", "VariableV2", "Placeholder", "SubgraphInput"))
+memory_only_ops = set(("Squeeze", "Reshape", "QuantizedReshape", "ExpandDims",))
+
+
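+# Each entry below is (ops_set, incompatible_pack_flags, flags_to_set, flags_to_clear). When packing, the entries
+# are tried in order and the first one whose ops_set contains the operation (None matches anything) and whose
+# incompatible_pack_flags do not overlap the flags already set for the pass decides how the operation is handled.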
+test_sequence = [
+    (
+        # ops_set
+        npu_post_ops,
+        # incompatible_pack_flags
+        PassFlags.Cpu | PassFlags.MemoryOnly | PassFlags.Pre | PassFlags.Main,
+        # flags_to_set
+        PassFlags.Npu | PassFlags.Post,
+        # flags_to_clear
+        PassFlags.Empty,
+    ),
+    (
+        # ops_set
+        npu_post_fuse_limited_ops,
+        # incompatible_pack_flags
+        PassFlags.Cpu | PassFlags.MemoryOnly | PassFlags.Pre | PassFlags.Main,
+        # flags_to_set
+        PassFlags.Npu | PassFlags.PostFusingLimited,
+        # flags_to_clear
+        PassFlags.Empty,
+    ),
+    (
+        # ops_set
+        mac_main_ops,
+        # incompatible_pack_flags
+        PassFlags.Cpu
+        | PassFlags.MemoryOnly
+        | PassFlags.ElementWise
+        | PassFlags.Pre
+        | PassFlags.Main
+        | PassFlags.PostFusingLimited,
+        # flags_to_set
+        PassFlags.Npu | PassFlags.Mac | PassFlags.Main,
+        # flags_to_clear
+        PassFlags.Empty,
+    ),
+    (
+        # ops_set
+        elem_wise_main_ops,
+        # incompatible_pack_flags
+        PassFlags.Cpu
+        | PassFlags.MemoryOnly
+        | PassFlags.Mac
+        | PassFlags.Pre
+        | PassFlags.Main
+        | PassFlags.PostFusingLimited,
+        # flags_to_set
+        PassFlags.Npu | PassFlags.ElementWise | PassFlags.Main,
+        # flags_to_clear
+        PassFlags.Empty,
+    ),
+    (
+        # ops_set
+        npu_pre_ops,
+        # incompatible_pack_flags
+        PassFlags.Cpu | PassFlags.MemoryOnly,
+        # flags_to_set
+        PassFlags.Npu | PassFlags.Mac | PassFlags.Pre | PassFlags.ElementWise,
+        # flags_to_clear
+        PassFlags.Empty,
+    ),
+    (
+        # ops_set
+        npu_dma_ops,
+        # incompatible_pack_flags
+        PassFlags.Cpu | PassFlags.MemoryOnly,
+        # flags_to_set
+        PassFlags.Npu | PassFlags.Dma,
+        # flags_to_clear
+        PassFlags.Empty
+    ),
+    (
+        # ops_set
+        startup_init_ops,
+        # incompatible_pack_flags
+        PassFlags.Npu | PassFlags.Cpu | PassFlags.MemoryOnly,
+        # flags_to_set
+        PassFlags.StartupInit | PassFlags.Main,
+        # flags_to_clear
+        PassFlags.Empty,
+    ),
+    (
+        # ops_set
+        memory_only_ops,
+        # incompatible_pack_flags
+        PassFlags.Npu | PassFlags.Cpu,
+        # flags_to_set
+        PassFlags.MemoryOnly | PassFlags.Main,
+        # flags_to_clear
+        PassFlags.Empty
+    ),
+    (
+        # ops_set
+        cpu_ops,
+        # incompatible_pack_flags
+        PassFlags.Npu | PassFlags.MemoryOnly | PassFlags.Main,
+        # flags_to_set
+        PassFlags.Cpu | PassFlags.Main,
+        # flags_to_clear
+        PassFlags.Empty
+    ),
+    (   # This last one is a fallback for unrecognised operations
+        # ops_set
+        None,
+        # incompatible_pack_flags
+        PassFlags.Npu | PassFlags.MemoryOnly | PassFlags.Main,
+        # flags_to_set
+        PassFlags.Cpu | PassFlags.Main,
+        # flags_to_clear
+        PassFlags.Empty
+    ),
+]
+
+# Some sanity checking
+for (operation_set, incompatible_pack_flags, flags_to_set, flags_to_clear) in test_sequence:
+    assert not flags_to_clear & flags_to_set
+
+    if operation_set is not None:
+        for op in operation_set:
+            # Guard against set(("OpName")) missing its trailing comma, which would decompose the string into characters
+            assert len(op) > 1
+
+
+def pack_into_passes(nng, arch, verbose_packing=False):
+    def visit_op(op, ignored):
+        visit_op_refcount[op] += 1
+
+        if visit_op_refcount[op] == 1:  # First-time visit, go and fix up unused output tensors
+            for tens in op.outputs:
+                if len(tens.consumers()) == 0:
+                    visit_op_refcount[op] += 1
+
+        assert visit_op_refcount[op] <= len(op.outputs)
+        if visit_op_refcount[op] == len(op.outputs):
+
+            if op.type in startup_init_ops:
+                startup_list.append(op)
+            else:
+                _, _, _, ofm_tensor = op.get_ifm_ifm2_weights_ofm()
+                if ofm_tensor is None:
+                    ofm_tensor = op.outputs[0]
+                build_pass((op,), ofm_tensor)
+
+    def build_pass(start_ops_to_process, ofm_tensor=None):
+        reverse_ops_list = []
+        curr_flags = PassFlags.Empty
+        npu_block_type = NpuBlockType.Default
+
+        reverse_intermediates = []
+        input_set = set()
+        ifm_tensor = None
+        primary_op = None
+
+        to_process = collections.deque()
+        for start_op in start_ops_to_process:
+            to_process.append((start_op, None))
+
+        while to_process:
+            curr_op, tens = to_process.popleft()
+
+            if curr_op in reverse_ops_list:
+                continue
+
+            for operation_set, incompatible_pack_flags, flags_to_set, flags_to_clear in test_sequence:
+                if operation_set is None or curr_op.type in operation_set:
+                    if not (curr_flags & incompatible_pack_flags):
+                        if flags_to_set & PassFlags.Npu:
+                            if not curr_op.run_on_npu:
+                                continue
+
+                        reverse_ops_list.append(curr_op)
+                        new_block_type = curr_op.attrs.get("npu_block_type", NpuBlockType.Default)
+                        if new_block_type != NpuBlockType.Default:
+                            assert npu_block_type == NpuBlockType.Default
+                            npu_block_type = new_block_type  # Only one major block type per pass
+                            assert primary_op is None
+                            primary_op = curr_op
+
+                        curr_flags &= ~flags_to_clear
+                        curr_flags |= flags_to_set
+
+                        if flags_to_set & PassFlags.Npu:
+                            if flags_to_set & (
+                                PassFlags.Mac | PassFlags.ElementWise | PassFlags.Post | PassFlags.PostFusingLimited
+                            ):
+                                assert len(curr_op.inputs) >= 1
+                                if curr_op.type == "BlockLSTM":
+                                    ifm_tensor = curr_op.inputs[3]
+                                else:
+                                    ifm_tensor = curr_op.inputs[0]
+                                assert ifm_tensor.purpose == TensorPurpose.FeatureMap
+
+                        if flags_to_set & PassFlags.Dma:
+                            # DMAs are special - their output buffers need to be preserved as intermediates
+                            # if the pass consumes the results
+                            if tens is not None:
+                                reverse_intermediates.append(tens)
+
+                        if operation_set is None:
+                            print("Warning:", curr_op.type, "operation is unknown or unsupported, placing on CPU")
+
+                        for inp in curr_op.inputs:
+                            can_pack = True
+                            if len(inp.ops) == 1:
+                                next_op = inp.ops[0]
+                                for outp in next_op.outputs:
+                                    consumers = outp.consumers()
+                                    if len(consumers) > 1 or (len(consumers) == 1 and consumers[0] != curr_op):
+                                        can_pack = False
+                                        break
+                            else:
+                                can_pack = False
+
+                            if can_pack:
+                                to_process.append((next_op, inp))
+                            else:
+                                assert inp is not None
+                                input_set.add(inp)
+
+                        break
+
+            else:
+                # This operation is not compatible with already packed operations, just register the tensor as an input
+                assert tens is not None
+                input_set.add(tens)
+
+        if curr_flags & PassFlags.Npu and not curr_flags & (PassFlags.ElementWise | PassFlags.Mac):
+            # If the pass has no MAC operation, run the ambidextrous operations on the element-wise unit
+            curr_flags |= PassFlags.ElementWise
+
+        is_element_wise = True
+        for op in reverse_ops_list:
+            if op.type not in elem_wise_ops and op.type not in npu_dma_ops:
+                is_element_wise = False
+                break
+
+        placement = PassPlacement.Unknown
+        if curr_flags & PassFlags.Npu:
+            assert placement == PassPlacement.Unknown
+            placement = PassPlacement.Npu
+        if curr_flags & PassFlags.Cpu:
+            assert placement == PassPlacement.Unknown
+            placement = PassPlacement.Cpu
+        if curr_flags & PassFlags.MemoryOnly:
+            assert placement == PassPlacement.Unknown
+            placement = PassPlacement.MemoryOnly
+        if curr_flags & PassFlags.StartupInit:
+            assert placement == PassPlacement.Unknown
+            placement = PassPlacement.StartupInit
+        assert placement != PassPlacement.Unknown
+
+        ops_list = list(reversed(reverse_ops_list))
+        intermediates = list(reversed(reverse_intermediates))
+
+        if primary_op is None:
+            primary_op = create_primary_op(ops_list)
+            if primary_op is not None:
+                visit_tensor_refcount[primary_op.inputs[0]] += 1
+                npu_block_type = primary_op.attrs["npu_block_type"]
+                for input_tens in primary_op.inputs:
+                    if input_tens not in input_set:
+                        input_set.add(input_tens)
+
+        ordered_input_list = []
+        input_refcounts = collections.defaultdict(int)
+        for op in ops_list:
+            for inp in op.inputs:
+                if inp in input_set:
+                    if input_refcounts[inp] == 0:
+                        ordered_input_list.append(inp)
+                    input_refcounts[inp] += 1
+
+        name = ops_list[0].name
+        non_dma_ops = [op for op in ops_list if op.type != "DMA"]
+        if non_dma_ops:
+            name = non_dma_ops[0].name
+        ps = Pass(name, placement, is_element_wise, npu_block_type)
+        ps.ops = ops_list
+        ps.primary_op = primary_op
+        ps.inputs = ordered_input_list
+        ps.intermediates = intermediates
+        ps.outputs = list(ops_list[-1].outputs)
+        ps.ifm_tensor = ifm_tensor
+
+        # ElementWise operation, 2 IFMs
+        if ps.primary_op and ps.primary_op.type in binary_elem_wise_main_ops:
+            ps.ifm_tensor = ps.inputs[0]
+
+            if len(ps.inputs) == 1:
+                # Only 1 input, IFM and IFM2 are the same tensor
+                ps.ifm2_tensor = ps.inputs[0]
+            else:
+                ps.ifm2_tensor = ps.inputs[1]
+        else:
+            ps.ifm_tensor = ifm_tensor
+            ps.ifm2_tensor = None
+
+        ps.ofm_tensor = ofm_tensor
+        assert ps.placement != PassPlacement.Npu or ps.ofm_tensor is not None
+        ps.weight_tensor = ps.get_primary_op_ifm_weights()[1]
+        ps.scale_tensor = ps.get_primary_op_ifm_weights_biases_ofm()[2]
+
+        for op in ps.ops:
+            op.scheduled_pass = ps
+
+        reverse_pass_list.append(ps)
+
+        for inp, refcount in input_refcounts.items():
+            for _ in range(refcount):
+                visit_tensor(inp)
+
+        return ps
+
+    def visit_tensor(tens):
+        visit_tensor_refcount[tens] += 1
+        assert visit_tensor_refcount[tens] <= len(tens.consumers())
+        if visit_tensor_refcount[tens] == len(tens.consumers()):
+            for op in reversed(tens.ops):
+                visit_op(op, tens)
+
+    def create_primary_op(ops_list):
+        if any(op.type in (npu_pre_ops | npu_post_ops | npu_post_fuse_limited_ops) for op in ops_list):
+            # Configure a 1x1 AvgPool and attach the op onto it
+            op = ops_list[0]
+            inp = op.inputs[0]
+            avgpool_name = op.name + "_avgpool"
+            avgpool_op = Operation("AvgPool", avgpool_name)
+            avgpool_op.inputs = [inp]
+            avgpool_op.inputs[0].consumer_list.append(avgpool_op)
+            avgpool_op.attrs["padding"] = b"VALID"
+            avgpool_op.attrs["npu_block_type"] = NpuBlockType.Pooling
+            avgpool_op.attrs["stride_w"] = 1
+            avgpool_op.attrs["stride_h"] = 1
+            avgpool_op.attrs["filter_width"] = 1
+            avgpool_op.attrs["filter_height"] = 1
+            avgpool_op.attrs["strides"] = [1, 1, 1, 1]
+            avgpool_op.attrs["ksize"] = [1, 1, 1, 1]
+            avgpool_op.attrs["skirt"] = [0, 0, 0, 0]
+            avgpool_op.attrs["explicit_padding"] = [0, 0, 0, 0]
+            avgpool_out = inp.clone("_avgpooled")
+            avgpool_out.consumer_list.append(op)
+            avgpool_out.ops = [avgpool_op]
+            avgpool_op.outputs = [avgpool_out]
+
+            op.inputs[0] = avgpool_out
+            ops_list.insert(0, avgpool_op)
+
+            return avgpool_op
+
+        return None
+
+    for sg in nng.subgraphs:
+        reverse_pass_list = []
+        visit_op_refcount = collections.defaultdict(int)
+        visit_tensor_refcount = collections.defaultdict(int)
+
+        startup_list = []
+
+        for tens in sg.output_tensors:
+            visit_tensor(tens)
+
+        if startup_list:
+            startup_ps = build_pass(startup_list)
+            startup_ps.outputs = [op.outputs[0] for op in startup_list]  # Need to fixup the outputs
+            startup_ps.name = "startup_weight_initialisation"
+
+        sg.passes = list(reversed(reverse_pass_list))
+        sg.build_pass_links()
+
+    if verbose_packing:
+        nng.print_passes()
+
+    return nng
diff --git a/ethosu/vela/range_set.py b/ethosu/vela/range_set.py
new file mode 100644
index 0000000..64de970
--- /dev/null
+++ b/ethosu/vela/range_set.py
@@ -0,0 +1,154 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Helper classes to track memory accesses for calculating dependencies between Commands.
+
+from enum import IntEnum
+from collections import defaultdict
+from functools import lru_cache
+
+
+class RangeSet:
+    """A Range set class to track ranges and whether they intersect.
+Intended for e.g. tracking sets of memory ranges and whether two commands use the same memory areas."""
+
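+    # Ranges are half-open [start, end): e.g. RangeSet(0, 10) intersects RangeSet(5, 15) but not RangeSet(10, 20)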
+    def __init__(self, start=None, end=None, ranges=None):
+        if ranges is None:
+            ranges = []
+
+        self.ranges = ranges  # track a list of (start, end) tuples, always in ascending order sorted by start.
+
+        if start is not None and start != end:
+            assert start < end
+            self.ranges.append((start, end))
+
+    def __or__(self, other):
+        combined_ranges = list(sorted(self.ranges + other.ranges))
+        return RangeSet(ranges=combined_ranges)
+
+    def __ior__(self, other):
+        self.ranges = list(sorted(self.ranges + other.ranges))
+        return self
+
+    def intersects(self, other):
+        a_ranges = self.ranges
+        b_ranges = other.ranges
+
+        a_idx = 0
+        b_idx = 0
+
+        while a_idx < len(a_ranges) and b_idx < len(b_ranges):
+            ar = a_ranges[a_idx]
+            br = b_ranges[b_idx]
+            if max(ar[0], br[0]) < min(ar[1], br[1]):
+                return True  # intersection
+
+            # advance one of the two upwards
+            if ar[0] < br[0]:
+                a_idx += 1
+            else:
+                # ar[0] == br[0] cannot happen here; it would already have been reported as an intersection above
+                assert ar[0] != br[0]
+                b_idx += 1
+
+        return False
+
+    def __str__(self):
+        return "<RangeSet %s>" % (["%#x:%#x" % (int(start), int(end)) for start, end in self.ranges],)
+
+    __repr__ = __str__
+
+
+class MemoryRangeSet:
+    """Extended version of the RangeSet class that handles having different memory areas"""
+
+    def __init__(self, mem_area=None, start=None, end=None, regions=None):
+
+        if regions is None:
+            regions = {}
+        self.regions = regions
+
+        if mem_area is not None:
+            self.regions[mem_area] = RangeSet(start, end)
+
+    def __or__(self, other):
+        combined_regions = {
+            mem_area: (self.regions.get(mem_area, RangeSet()) | other.regions.get(mem_area, RangeSet()))
+            for mem_area in (self.regions.keys() | other.regions.keys())
+        }
+        return MemoryRangeSet(regions=combined_regions)
+
+    def __ior__(self, other):
+        self.regions = {
+            mem_area: (self.regions.get(mem_area, RangeSet()) | other.regions.get(mem_area, RangeSet()))
+            for mem_area in (self.regions.keys() | other.regions.keys())
+        }
+        return self
+
+    def intersects(self, other):
+        for mem_area in self.regions.keys() & other.regions.keys():
+            if self.regions[mem_area].intersects(other.regions[mem_area]):
+                return True
+        return False
+
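+                    # Account for the call's input tensors in the SRAM usage of every cascaded pass up to and
+                    # including this one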
+    def __str__(self):
+        s = "<MemoryRangeSet>"
+        for mem_area, rng in self.regions.items():
+            s += "%s: %s\t" % (mem_area, rng)
+        return s
+
+    __repr__ = __str__
+
+
+class AccessDirection(IntEnum):
+    Read = 0
+    Write = 1
+    Size = 2
+
+
+class MemoryAccessSet:
+    """Tracks memory ranges, but also access patterns to know which accesses actually are in conflict"""
+
+    def __init__(self):
+        self.accesses = [MemoryRangeSet() for i in range(AccessDirection.Size)]
+
+    def add(self, memory_range_set, access):
+        self.accesses[access] |= memory_range_set
+
+    @lru_cache(maxsize=None)
+    def conflicts(self, other):
+
+        # True dependencies, or write -> read
+        if self.accesses[AccessDirection.Write].intersects(other.accesses[AccessDirection.Read]):
+            return True
+
+        # Anti-dependencies, or read -> write
+        if self.accesses[AccessDirection.Read].intersects(other.accesses[AccessDirection.Write]):
+            return True
+
+        # Output dependencies, or write -> write
+        if self.accesses[AccessDirection.Write].intersects(other.accesses[AccessDirection.Write]):
+            return True
+
+        # read -> read does not cause a conflict
+        return False
+
+    def __str__(self):
+        return "Read: %s\nWrite: %s\n\n" % (self.accesses[AccessDirection.Read], self.accesses[AccessDirection.Write])
+
+    __repr__ = __str__
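+
+
+# Example for MemoryAccessSet: two access sets conflict unless both only read the
+# shared range. An illustrative sketch, assuming mem_area is a MemArea enum value
+# (e.g. MemArea.Sram):
+#
+#     producer = MemoryAccessSet()
+#     producer.add(MemoryRangeSet(MemArea.Sram, 0x000, 0x100), AccessDirection.Write)
+#     consumer = MemoryAccessSet()
+#     consumer.add(MemoryRangeSet(MemArea.Sram, 0x080, 0x180), AccessDirection.Read)
+#     producer.conflicts(consumer)  # True: write -> read (true dependency)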
diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py
new file mode 100644
index 0000000..5563b96
--- /dev/null
+++ b/ethosu/vela/register_command_stream_generator.py
@@ -0,0 +1,945 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Register level (low-level) command stream generation for Ethos-U55. Takes a high-level command stream and generates
+# all the register settings. Calculates dependencies between commands, inserts wait operations, and generates a bit
+# stream suitable for interpretation by the Ethos-U55 processor.
+
+from collections import defaultdict
+from enum import Enum, IntEnum
+from .high_level_command_stream import CommandType
+from .ethos_u55_regs.ethos_u55_regs import *
+from .tensor import MemArea, TensorBlockTraversal
+from .operation import NpuBlockType
+from .numeric_util import quantise_float32, round_up, round_away_zero, round_up_to_int, clamp_sigmoid, clamp_tanh
+from .data_type import BaseType
+import numpy as np
+from .shared_buffer_allocation import SharedBufferAllocation
+from .architecture_features import SharedBufferArea, SHRAMElements, ArchitectureFeatures
+from .nn_graph import TensorFormat, SchedulingStrategy
+from .range_set import (
+    MemoryAccessSet,
+    AccessDirection,
+)
+from .mark_tensors import (
+    reshape_operations,
+)
+from .architecture_features import Block, Kernel, Rect
+from . import scaling
+
+
+class RegisterMachine:
+    def __init__(self):
+        self.n_banks = 1
+        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
+        self.bank_idx = 0
+
+    def set_register(self, reg, value):
+        is_changed = self.registers[self.bank_idx][reg] != value
+        self.registers[self.bank_idx][reg] = value
+        # is_changed = True # force command
+        return is_changed
+
+    def switch_bank(self):
+        self.bank_idx = (self.bank_idx + 1) % self.n_banks
+
+
+class CmdMode(IntEnum):
+    NoPayload = 0x0000
+    Payload32 = 0x4000
+    Mask = 0xC000
+    CmdOpMask = 0x03FF
+
+
+class BasePointerIndex(IntEnum):
+    ReadOnly = 0  # base address slot index for weights and scaling
+    Scratch = 1  # base address slot index for scratch memory area
+
+
+# TODO: Replace with definitions from ethos_u55_regs
+class IFM2Broadcast(IntEnum):
+    BroadcastHdim = 1 << 0
+    BroadcastWdim = 1 << 1
+    BroadcastCdim = 1 << 2
+    ReverseOperandOrder = 1 << 6
+    UseIFM2Scalar = 1 << 7
+
+
+class CommandStreamEmitter:
+    def __init__(self):
+        self.cmd_stream = []
+        self.reg_machine = [RegisterMachine(), RegisterMachine()]
+        self.last_absolute_wait = defaultdict(int)
+
+    def get_reg_machine(self, cmd):
+        if "DMA" in cmd.name:
+            return self.reg_machine[1]
+        else:
+            return self.reg_machine[0]
+
+    def size_in_bytes(self):
+        sz = 0
+        for cmd in self.cmd_stream:
+            sz += len(cmd) * 4
+        return sz
+
+    def to_list(self):
+        return [elem for cmd in self.cmd_stream for elem in cmd]
+
+    def print_cmds(self):
+        print("Code:    Command:                       Param: Payload:")
+        for words_for_one_command in self.cmd_stream:
+            code = words_for_one_command[0] & 0x0000FFFF  # lower 16 bits
+            param = words_for_one_command[0] >> 16  # higher 16 bits
+
+            payload_mode = CmdMode(code & CmdMode.Mask)
+
+            # code and command
+            s = "  0x%04x " % code
+            if payload_mode == CmdMode.NoPayload:
+                s += str(cmd0(code & CmdMode.CmdOpMask))
+            else:
+                s += str(cmd1(code & CmdMode.CmdOpMask))
+
+            s = s.ljust(40)
+            s += "%5d" % param
+
+            # payload
+            if payload_mode == CmdMode.Payload32:
+                s += "   0x%08x (%d)" % (words_for_one_command[1], words_for_one_command[1])
+            else:
+                s += "   -"
+
+            print(s)
+
+    def cmd0_with_param(self, cmd, param):
+        if isinstance(param, Enum):
+            param = int(param.value)
+        else:
+            param = int(param)
+        param = param & 0xFFFF
+        command = cmd.value | (param << 16)
+        if not self.get_reg_machine(cmd).set_register(cmd, (command, param)):
+            return
+
+        # This is not a redundant command, actually write it
+        self.cmd_stream.append((command,))
+
+    def cmd1_with_offset(self, cmd, offset, param=0x0):
+        offset = int(offset) & 0xFFFFFFFFF
+        command = cmd.value | CmdMode.Payload32.value | (param << 16)
+
+        if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
+            return
+
+        # This is not a redundant command, actually write it
+        self.cmd_stream.append((command, offset))
+
+    def cmd_wait(self, cmd, param, absolute_wait_time):
+        if absolute_wait_time <= self.last_absolute_wait[cmd]:
+            return
+
+        self.last_absolute_wait[cmd] = absolute_wait_time
+        param = int(param)
+        command = ((param & 0xFFFF) << 16) | cmd.value
+        self.cmd_stream.append((command,))
+
+    def cmd_do_operation(self, cmd, param=0):
+        param = int(param)
+        command = ((param & 0xFFFF) << 16) | cmd.value
+
+        self.cmd_stream.append((command,))
+        self.get_reg_machine(cmd).switch_bank()
+
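+# Example for CommandStreamEmitter: every command word is 32 bits with the opcode
+# in the low half and a 16-bit parameter in the high half; cmd1 commands are
+# followed by a 32-bit payload word. An illustrative sketch:
+#
+#     emit = CommandStreamEmitter()
+#     emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, 1)    # one word
+#     emit.cmd1_with_offset(cmd1.NPU_SET_IFM_BASE0, 0x40)  # two words
+#     emit.size_in_bytes()                                  # 12
+#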
+
+def calc_command_dependencies(cmd_stream, arch):
+    cmd_starts = {}
+    cmd_ends = {}
+    memory_accesses = {}
+
+    # Keep track of accumulated number of commands in command stream.
+    # First element kernel ops: (# of blocks, # of commands)
+    # Second element DMA ops: (# of commands)
+    pos = np.array((np.array((0, 0)), np.array([0])))
+
+    dependencies = {}
+
+    for cmd in cmd_stream:
+        cmd_starts[cmd] = pos
+        op_count = cmd.get_operation_count()
+        # Keep track of both num blocks and commands
+        cmd_add = 0 if (op_count[0] == 0) else 1
+        pos = np.array((pos[0] + np.array((op_count[0], cmd_add)), pos[1] + np.array([op_count[1]])))
+        cmd_ends[cmd] = np.array((pos[0], pos[1]))
+        memory_accesses[cmd] = cmd.get_memory_accesses()
+
+    for idx, cmd in enumerate(cmd_stream):
+        curr_accesses = memory_accesses[cmd]
+        # Keep track of command dependency.
+        # First element kernel ops: (# of blocks, # of commands)
+        # Second element DMA ops: (# of commands)
+        dep_offsets = np.array((np.array((-1, -1)), np.array([-1])))
+        dep_cmds = [None] * CommandType.Size.value
+        if idx > 0:
+            # Look at the previous commands in backwards order
+            for prev_cmd in cmd_stream[idx - 1 :: -1]:
+                assert prev_cmd is not cmd
+                if dep_cmds[prev_cmd.cmdtype] is None:
+                    is_dependency = False
+                    if cmd.cmdtype == CommandType.NpuStripe and prev_cmd.cmdtype == CommandType.NpuStripe:
+                        # Special handling here, as dpu -> dpu operations require additional care
+                        if not SharedBufferAllocation.is_compatible(prev_cmd.ps.shared_buffer, cmd.ps.shared_buffer):
+                            is_dependency = True
+                        elif memory_accesses[prev_cmd].conflicts(curr_accesses):
+                            is_dependency = True
+                    else:
+                        if memory_accesses[prev_cmd].conflicts(curr_accesses):
+                            is_dependency = True
+
+                    if is_dependency:
+                        new_offset = cmd_ends[prev_cmd][prev_cmd.cmdtype]
+                        if new_offset[0] > dep_offsets[prev_cmd.cmdtype][0]:
+                            dep_cmds[prev_cmd.cmdtype] = prev_cmd
+                            dep_offsets[prev_cmd.cmdtype] = new_offset
+
+                        # Check if we've got dependencies for all commands, in which case we can early out
+                        for dep in dep_cmds:
+                            if dep is None:
+                                break
+                        else:
+                            break  # all handled
+
+        # Convert absolute to relative dependencies, using None to signal the special case of no
+        # dependency of this kind
+        res = [None] * CommandType.Size.value
+        for i in range(CommandType.Size.value):
+            if dep_cmds[i] is not None:
+                res[i] = cmd_starts[cmd][i] - dep_offsets[i]
+
+        dependencies[cmd] = cmd_starts[cmd], res
+
+    return dependencies
+
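+# Example for calc_command_dependencies (illustrative): if command B reads the OFM
+# written by an earlier NpuStripe command A, then B's relative dependency in the
+# NpuStripe slot is cmd_starts[B] - cmd_ends[A] (in blocks/commands), while a slot
+# stays None when there is no command of that type to wait for.
+#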
+
+def get_op_kernel(ps):
+    if ps.primary_op is None:
+        return None
+
+    strides = ps.primary_op.attrs.get("strides", (1, 1, 1, 1))
+    dilation = ps.primary_op.attrs.get("dilation", (1, 1, 1, 1))
+    if ps.weight_tensor:
+        if ps.npu_block_type in set((NpuBlockType.VectorProduct, NpuBlockType.ElementWise)):
+            k_h = 1
+            k_w = 1
+        else:
+            k_h = ps.weight_tensor.shape[0]
+            k_w = ps.weight_tensor.shape[1]
+    else:
+        k_h = ps.primary_op.attrs.get("filter_height", 1)
+        k_w = ps.primary_op.attrs.get("filter_width", 1)
+
+    return Kernel(k_w, k_h, strides[2], strides[1], dilation[2], dilation[1])
+
+
+def full_shape(shape, fill):
+    return ([fill] * (4 - len(shape))) + shape
+
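+# Example: full_shape([480, 640, 3], 1) == [1, 480, 640, 3]; shapes shorter than
+# four dimensions are padded on the left with the fill value.
+#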
+
+def has_prev_op_dependency(prev_cmd, cmd):
+    if prev_cmd is None:
+        return False
+    if (prev_cmd.cmdtype == cmd.cmdtype == CommandType.NpuStripe) and (prev_cmd.ps != cmd.ps):
+        if prev_cmd.ofm_tensor == cmd.ifm_tensor:
+            return True
+        else:
+            return prev_cmd.ofm_tensor.equivalence_id == cmd.ifm_tensor.equivalence_id
+    return False
+
+
+def get_op_ofm_rect(cmd):
+    start = full_shape(cmd.ofm_box.start_coord, 0)
+    end = full_shape(cmd.ofm_box.end_coord, 1)
+    return Rect(start[-2], start[-3], start[-1], end[-2] - 1, end[-3] - 1, end[-1] - 1)
+
+
+def get_op_ifm_rect(cmd):
+    start = full_shape(cmd.ifm_box.start_coord, 0)
+    end = full_shape(cmd.ifm_box.end_coord, 1)
+    return Rect(start[-2], start[-3], start[-1], end[-2] - 1, end[-3] - 1, end[-1] - 1)
+
+
+def get_op_ifmofm_block_depth(arch, cmd):
+    # Note: NOT equivalent to the normal ifm block depth calculation since
+    # it takes into account 'depthless' block operations by returning full
+    # depth
+    if cmd.ps.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling, NpuBlockType.ElementWise):
+        return cmd.ofm_box.get_size_shape()[-1]
+
+    return arch.calc_ifm_block_depth(cmd.ifm_box.get_size_shape()[-1], cmd.ifm_tensor.dtype.bits)
+
+
+def get_op_padding_lt(cmd):
+    if cmd.ps.npu_block_type not in (
+        NpuBlockType.ConvolutionDepthWise,
+        NpuBlockType.Pooling,
+        NpuBlockType.ConvolutionMxN,
+    ):
+        return (0, 0)
+
+    explicit_padding = list(cmd.ps.primary_op.attrs["explicit_padding"])  # (top, left, bottom, right)
+
+    # Check if this is for horizontal ifm streaming
+    if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe):
+        explicit_padding[0] = cmd.pad_top
+        explicit_padding[2] = cmd.pad_bottom
+
+    return (explicit_padding[1], explicit_padding[0])
+
+
+def generate_register_command_stream(nng, sg, arch, verbose=False):
+    emit = CommandStreamEmitter()
+
+    base_ptr_idx_map = {
+        MemArea.Sram: BasePointerIndex.Scratch,
+        MemArea.OnChipFlash: BasePointerIndex.ReadOnly,
+        MemArea.OffChipFlash: BasePointerIndex.ReadOnly,
+        MemArea.Dram: BasePointerIndex.ReadOnly,
+    }
+
+    # Maps an AccumulatorType enum to the corresponding acc_format value
+    acc_format_map = {
+        SHRAMElements.Acc16: acc_format.FP_S5_10.value,
+        SHRAMElements.Acc32: acc_format.INT_32BIT.value,
+        SHRAMElements.Acc40: acc_format.INT_40BIT.value,
+    }
+
+    # Maps an elementwise op type to an elementwise_mode enum value used by NPU_OP_ELEMENTWISE
+    elementwise_mode_map = {
+        "MulAct": elementwise_mode.MUL.value,
+        "AddAct": elementwise_mode.ADD.value,
+        "SubAct": elementwise_mode.SUB.value,
+        "Minimum": elementwise_mode.MIN.value,
+        "Maximum": elementwise_mode.MAX.value,
+        "LeakyRelu": elementwise_mode.LRELU.value,
+        "Abs": elementwise_mode.ABS.value,
+    }
+
+    cmd_stream = []
+    for cmd in sg.high_level_command_stream:
+        if cmd.cmdtype == CommandType.NpuStripe and cmd.ps.npu_block_type == NpuBlockType.Default:
+            print("Warning: Skipping register command stream generation for", cmd.ps)
+        else:
+            cmd_stream.append(cmd)
+
+    dependencies = calc_command_dependencies(cmd_stream, arch)
+
+    # Initialise operator dependency state
+    prev_ifm_rect = cur_ifm_rect = None
+    prev_ifm_block_depth = cur_ifm_block_depth = None
+    prev_ofm_rect = cur_ofm_rect = None
+    prev_ofm_block = cur_ofm_block = None
+    prev_kernel = cur_kernel = None
+    prev_cmd = None
+
+    def emit_wait_commands(cmd):
+        # The command is fully set up, emit whatever wait commands we need
+        absolute_dep, relative_dep = dependencies[cmd]
+        if relative_dep[CommandType.NpuStripe] is not None:
+            if cmd.cmdtype == CommandType.DMA:
+                param = relative_dep[CommandType.NpuStripe][1]
+                if param <= 3:
+                    emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, param, absolute_dep[CommandType.NpuStripe][1])
+            else:
+                param = relative_dep[CommandType.NpuStripe][0]
+                param = min(param, 0xFFFF)  # Clamp to allowable wait amount
+                emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, param, absolute_dep[CommandType.NpuStripe][0])
+
+        if relative_dep[CommandType.DMA] is not None:
+            param = relative_dep[CommandType.DMA][0]
+            param = min(param, 0xF)  # Clamp to allowable wait amount
+            emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, param, absolute_dep[CommandType.DMA][0])
+            prev_cmd = None  # Clear any dependency
+
+    # Start by issuing REGION commands since they remain the same
+    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, BasePointerIndex.Scratch)
+    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, BasePointerIndex.Scratch)
+    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, BasePointerIndex.Scratch)
+    for cmd in cmd_stream:
+        if cmd.cmdtype == CommandType.DMA:
+            start_coord = cmd.box.start_coord
+
+            src_addr = cmd.in_tensor.address_for_coordinate(start_coord)
+            dst_addr = cmd.out_tensor.address_for_coordinate(start_coord)
+
+            if cmd.in_tensor.compressed_values is not None:
+                stream_index = cmd.in_tensor.compressed_stream_index_from_coord(start_coord)
+                sz = cmd.in_tensor.size_of_compressed_stream(stream_index)
+            else:
+                sz = cmd.in_tensor.address_for_coordinate(cmd.box.end_coord, is_top_box=True) - src_addr
+
+            # TODO: Yoda support needs to use feature_maps_not_in_fast_storage and force_outputs_to_fast_storage
+            emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, base_ptr_idx_map[cmd.in_tensor.mem_area])
+            emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_SRC, src_addr)
+            emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, base_ptr_idx_map[cmd.out_tensor.mem_area])
+            emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_DST, dst_addr)
+            emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_LEN, sz)
+            dma_channel = 0
+            mode = 0  # From external to external
+
+            emit_wait_commands(cmd)
+            emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, dma_channel * 16 + mode)
+
+        elif cmd.cmdtype == CommandType.NpuStripe:
+
+            ps = cmd.ps
+            primary_op = ps.primary_op
+            npu_block_type = ps.npu_block_type
+            # Specifies if global scale from the NPU_SET_OFM_SCALE register should be used instead of per-channel scale
+            use_global_scale = False
+            # Specifies type of rounding to be used.
+            rounding_mode = rounding.TFL
+            fmf = primary_op.attrs.get("fused_memory_function", None)
+            faf = primary_op.attrs.get("fused_activation_function", None)
+
+            # Specifies which operand to apply scaling to in bitexact elementwise ADD/SUB
+            op_to_scale = 0
+
+            # Update state history
+            prev_ifm_rect = cur_ifm_rect
+            prev_ifm_block_depth = cur_ifm_block_depth
+            prev_ofm_rect = cur_ofm_rect
+            prev_ofm_block = cur_ofm_block
+            prev_kernel = cur_kernel
+
+            block_config = ps.block_config
+            emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config[0] - 1)
+            emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config[1] - 1)
+            emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config[3] - 1)
+
+            shared_buffer = ps.shared_buffer
+
+            if npu_block_type == NpuBlockType.ElementWise:
+                ifm2_broadcast = 0
+
+                if cmd.ifm_tensor.shape == []:
+                    # The scalar has to be the ifm2 tensor so switch the ifms
+                    cmd.ifm_tensor, cmd.ifm2_tensor = cmd.ifm2_tensor, cmd.ifm_tensor
+                    cmd.ifm_box, cmd.ifm2_box = cmd.ifm2_box, cmd.ifm_box
+
+                    # Set ReverseOperandOrder bit to IFM2_BROADCAST
+                    ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder
+
+                # Calculate scales needed for arithmetic elementwise operators
+                if primary_op.type in set(("AddAct", "MulAct", "SubAct",)):
+                    input_scale = cmd.ifm_tensor.quantization.scale_f32
+                    input2_scale = cmd.ifm2_tensor.quantization.scale_f32
+                    output_scale = cmd.ofm_tensor.quantization.scale_f32
+                    use_global_scale = True
+
+                    if primary_op.type == "MulAct":
+                        if (faf == "Sigmoid") or (faf == "Tanh"):
+                            output_scale = 1 / 0x3000
+
+                        ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
+                        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
+                    else:  # AddAct/SubAct
+                        if (faf == "Sigmoid") or (faf == "Tanh"):
+                            output_scale = 1 / 0x3000
+
+                        if input_scale == input2_scale:
+                            opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
+                                input_scale, input2_scale, output_scale
+                            )
+                            opa_shift = 0  # Unused for this case
+                        else:
+                            # Use advanced implementation only when input scales differ
+                            bitdepth = cmd.ifm_tensor.dtype.bits
+                            (
+                                opa_scale,
+                                opa_shift,
+                                ofm_scale,
+                                shift,
+                                op_to_scale,
+                            ) = scaling.advanced_elementwise_add_sub_scale(
+                                input_scale, input2_scale, output_scale, bitdepth
+                            )
+                            opb_scale = 0  # Unused for this case
+                            if ifm2_broadcast & IFM2Broadcast.ReverseOperandOrder:
+                                # If the operand order is reversed we also have to swap which operand is scaled
+                                if op_to_scale == scaling.OperandToScale.OPa:
+                                    op_to_scale = scaling.OperandToScale.OPb
+                                else:
+                                    op_to_scale = scaling.OperandToScale.OPa
+
+                        emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
+                        emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
+                        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
+
+                if primary_op.type in set(("LeakyRelu", "Abs",)):
+                    output_scale = cmd.ofm_tensor.quantization.scale_f32
+                    use_global_scale = True
+
+                    if primary_op.type == "LeakyRelu":
+                        output_scale *= primary_op.attrs["alpha"]
+
+                    ofm_scale, shift = scaling.quantise_scale(output_scale)
+                    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
+
+                # For elementwise set the required SHRAM to be equal to the total size of SHRAM
+                shram_required = arch.shram_total_banks
+                emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_required)
+
+                # Acc buffers not needed so set AB_START to size of SHRAM
+                emit.cmd0_with_param(cmd0.NPU_SET_AB_START, arch.shram_total_banks)
+
+                # Is not a unary operator
+                if cmd.ifm2_tensor is not None:
+                    if cmd.ifm2_tensor.shape == []:
+                        # IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST
+                        ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar
+                    else:
+                        ifm_box_shape = cmd.ifm_box.get_size_shape()
+                        ifm2_box_shape = cmd.ifm2_box.get_size_shape()
+
+                        if len(cmd.ifm_tensor.shape) > 1 and ifm_box_shape[1] != ifm2_box_shape[1]:
+                            # Broadcast in 'H' dimension
+                            assert cmd.ifm2_tensor.shape[1] == 1
+                            ifm2_broadcast |= IFM2Broadcast.BroadcastHdim
+
+                        if len(cmd.ifm_tensor.shape) > 2 and ifm_box_shape[2] != ifm2_box_shape[2]:
+                            # Broadcast in 'W' dimension
+                            assert cmd.ifm2_tensor.shape[2] == 1
+                            ifm2_broadcast |= IFM2Broadcast.BroadcastWdim
+
+                        if len(cmd.ifm_tensor.shape) > 3 and ifm_box_shape[3] != ifm2_box_shape[3]:
+                            # Broadcast in 'C' dimension
+                            assert cmd.ifm2_tensor.shape[3] == 1
+                            ifm2_broadcast |= IFM2Broadcast.BroadcastCdim
+
+                        # Set IFM2_IB_START to the latter half of the IB space
+                        ifm_ib_start = shared_buffer.bank_locations[SharedBufferArea.IFM]
+                        emit.cmd0_with_param(
+                            cmd0.NPU_SET_IFM2_IB_START, (shram_required - ifm_ib_start) / 2 + ifm_ib_start
+                        )
+
+                    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast)
+
+            else:
+                emit.cmd0_with_param(
+                    cmd0.NPU_SET_IFM_IB_END,
+                    shared_buffer.bank_locations[SharedBufferArea.IFM]
+                    + shared_buffer.banks_required[SharedBufferArea.IFM],
+                )
+                emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shared_buffer.bank_locations[SharedBufferArea.Accumulators])
+
+            emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])
+
+            emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, 0)
+
+            if npu_block_type in set(
+                (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling)
+            ):
+                # Set up padding
+                explicit_padding = list(primary_op.attrs["explicit_padding"])  # (top, left, bottom, right)
+
+                # Check if this is for horizontal ifm streaming
+                if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe):
+                    explicit_padding[0] = cmd.pad_top
+                    explicit_padding[2] = cmd.pad_bottom
+
+                # Indexing from the end since a 1x1 AvgPool might have been added with non 4-dimensional input/output
+                # because an activation function needed to be fused.
+                if cmd.ifm_box.start_coord[-2] > 0:
+                    explicit_padding[1] = 0
+                if cmd.ifm_box.end_coord[-2] < cmd.ifm_tensor.shape[-2]:
+                    explicit_padding[3] = 0
+
+                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, explicit_padding[0])
+                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, explicit_padding[1])
+                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, explicit_padding[2])
+                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, explicit_padding[3])
+
+                stride = primary_op.attrs["strides"][2] - 1
+                stride |= (primary_op.attrs["strides"][1] - 1) << 1
+
+                if npu_block_type == NpuBlockType.Pooling:
+                    k_height, k_width = primary_op.attrs["ksize"][1:3]
+                    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, k_height - 1)
+                    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, k_width - 1)
+
+                    valid_padding = sum(explicit_padding) == 0
+
+                    if primary_op.type in set(("AvgPool", "AvgPoolAct")) and valid_padding:
+                        # For valid padding vela has to output scaling values
+                        if faf == "Sigmoid" or faf == "Tanh":
+                            rescale = 0x3000 * cmd.ifm_tensor.quantization.scale_f32
+                            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
+
+                            scale, shift = scaling.quantise_pooling_scale(k_height * k_width, rescale_bits)
+                            scale = int(round_away_zero(scale * rescale))
+                        else:
+                            # If the average pool is fused with a concat or other memory operation, rescaling might
+                            # be needed; k_height == k_width == 1 is always true in this case.
+                            # Normally the scale is maximised to get maximum precision, which means that if
+                            # rescale != 1, the scale needs to account for the number of bits needed for rescaling.
+                            rescale = cmd.ifm_tensor.quantization.scale_f32 / cmd.ofm_tensor.quantization.scale_f32
+                            rescale_bits = 0
+                            if k_height == k_width == 1:
+                                if fmf == "ConcatSliceWrite":
+                                    rounding_mode = rounding.NATURAL
+                                if rescale > 1:
+                                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
+                                elif rescale < 1:
+                                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
+                            scale, shift = scaling.quantise_pooling_scale(k_height * k_width, rescale_bits)
+                            scale = int(round_away_zero(scale * rescale))
+
+                        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
+                        # Valid-padded average pool should use the global scale from
+                        # NPU_SET_OFM_SCALE register, which is set above.
+                        use_global_scale = True
+
+                else:  # Convolution
+                    assert cmd.weight_tensor.block_traversal != TensorBlockTraversal.Default
+                    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, cmd.weight_tensor.shape[0] - 1)
+                    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, cmd.weight_tensor.shape[1] - 1)
+                    if cmd.weight_tensor.block_traversal == TensorBlockTraversal.PartKernelFirst:
+                        # Part-kernel-first weight ordering
+                        assert npu_block_type == NpuBlockType.ConvolutionMxN
+                        stride |= 1 << 2
+
+                emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)
+
+            elif npu_block_type in set((NpuBlockType.VectorProduct,)):
+                # Vector product is implemented using a 1x1 convolution so need
+                # to setup the appropriate padding and kernel info
+                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, 0)
+                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, 0)
+                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, 0)
+                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, 0)
+
+                # kernel stride reg = 0 means stride(1,1) + depth first weight
+                # order + dilation(0,0) + kernel_split_size=8
+                emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, 0)
+
+                emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, 0)
+                emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, 0)
+
+            if npu_block_type in set(
+                (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct)
+            ):
+                # Emit weight base address commands; this only maps the area required for
+                # this command's weights from the larger tensor.
+                stream_index = cmd.weight_tensor.compressed_stream_index_from_coord(cmd.weight_box.start_coord)
+                weight_addr = cmd.weight_tensor.address_for_coordinate(cmd.weight_box.start_coord)
+                weight_len = cmd.weight_tensor.size_of_compressed_stream(stream_index)
+                # Select weight/scale region depending on where permanent storage was defined
+                weight_region = base_ptr_idx_map[cmd.weight_tensor.mem_area]
+                if arch.permanent_storage_mem_area == MemArea.Sram:
+                    weight_region = BasePointerIndex.ReadOnly
+                emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weight_region)
+                emit.cmd1_with_offset(cmd1.NPU_SET_WEIGHT_BASE, weight_addr)
+                emit.cmd1_with_offset(cmd1.NPU_SET_WEIGHT_LENGTH, weight_len)
+
+                # Emit Scale & Bias base address commands, with length matching the amount required by
+                # the weight tensors.
+                if cmd.scale_tensor is not None:
+                    # Get address and size of the scale/bias data area
+                    scale_addr = cmd.scale_tensor.address_for_coordinate(cmd.weight_box.start_coord[-1:])
+                    scale_len = (
+                        cmd.scale_tensor.address_for_coordinate(cmd.weight_box.end_coord[-1:], True) - scale_addr
+                    )
+                    # Emit base address for NPU to access scale & bias data
+                    scale_region = base_ptr_idx_map[cmd.scale_tensor.mem_area]
+                    if arch.permanent_storage_mem_area == MemArea.Sram:
+                        scale_region = BasePointerIndex.ReadOnly
+                    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, scale_region)
+                    emit.cmd1_with_offset(cmd1.NPU_SET_SCALE_BASE, scale_addr)
+                    emit.cmd1_with_offset(cmd1.NPU_SET_SCALE_LENGTH, round_up(scale_len, 16))
+
+            ofm_quant = cmd.ofm_tensor.quantization
+            ofm_quant_qmin = cmd.ofm_tensor.quantization.quant_min
+            ofm_quant_qmax = cmd.ofm_tensor.quantization.quant_max
+            ifm_min = cmd.ifm_tensor.quantization.min
+            ifm_max = cmd.ifm_tensor.quantization.max
+
+            # Emit commands for any fused activation function
+            if faf == None:
+                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
+                # Even if no activation function, values need to be set to override previous values
+                faf_min = ofm_quant_qmin
+                faf_max = ofm_quant_qmax
+            elif faf == "Relu":
+                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
+                faf_min = quantise_float32(0.0, ofm_quant.scale_f32, ofm_quant.zero_point)
+                faf_max = ofm_quant_qmax
+            elif faf == "Relu6":
+                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
+                faf_min = quantise_float32(0.0, ofm_quant.scale_f32, ofm_quant.zero_point)
+                faf_max = quantise_float32(6.0, ofm_quant.scale_f32, ofm_quant.zero_point)
+            elif faf == "ReluN1To1":
+                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
+                faf_min = quantise_float32(-1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
+                faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
+            elif faf == "Tanh":
+                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.TANH)
+                faf_min = quantise_float32(clamp_tanh(ifm_min), ofm_quant.scale_f32, ofm_quant.zero_point)
+                faf_max = quantise_float32(clamp_tanh(ifm_max), ofm_quant.scale_f32, ofm_quant.zero_point)
+            elif faf == "Sigmoid":
+                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.SIGMOID)
+                faf_min = quantise_float32(clamp_sigmoid(ifm_min), ofm_quant.scale_f32, ofm_quant.zero_point)
+                faf_max = quantise_float32(clamp_sigmoid(ifm_max), ofm_quant.scale_f32, ofm_quant.zero_point)
+            else:
+                raise Exception("Unsupported fused_activation_function = " + faf)
+
+            # Activation range needs to be set based upon the quantisation range and the fused activation range
+            emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, max(ofm_quant_qmin, faf_min))
+            emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, min(ofm_quant_qmax, faf_max))
+
+            out_shape = cmd.ofm_box.get_size_shape()
+            if len(out_shape) >= 4:
+                emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, out_shape[-3] - 1)
+            else:
+                emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, 0)
+            if len(out_shape) >= 2:
+                emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, out_shape[-2] - 1)
+            else:
+                emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, 0)
+            emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, out_shape[-1] - 1)
+
+            if npu_block_type in set((NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct)):
+                in_shape = cmd.ifm_box.get_size_shape()
+                emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, in_shape[-1] - 1)
+            else:
+                emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, out_shape[-1] - 1)
+
+            for tens, box, ptr_ops, stride_ops, zero_point_op in (
+                (
+                    cmd.ifm_tensor,
+                    cmd.ifm_box,
+                    (cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3),
+                    (cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X),
+                    cmd0.NPU_SET_IFM_ZERO_POINT,
+                ),
+                (
+                    cmd.ifm2_tensor,
+                    cmd.ifm2_box,
+                    (
+                        cmd1.NPU_SET_IFM2_BASE0,
+                        cmd1.NPU_SET_IFM2_BASE1,
+                        cmd1.NPU_SET_IFM2_BASE2,
+                        cmd1.NPU_SET_IFM2_BASE3,
+                    ),
+                    (cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X),
+                    cmd0.NPU_SET_IFM2_ZERO_POINT,
+                ),
+                (
+                    cmd.ofm_tensor,
+                    cmd.ofm_box,
+                    (cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3),
+                    (cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X),
+                    cmd0.NPU_SET_OFM_ZERO_POINT,
+                ),
+            ):
+
+                if tens == None:
+                    continue
+
+                need_zero_point = (faf != None) or (fmf == "ConcatSliceWrite")
+                if (
+                    primary_op.type in set(("AvgPool", "AvgPoolAct")) and not need_zero_point
+                ) or tens.quantization == None:
+                    # Actual integer operation, just set scale to 1 and zero point to 0
+                    emit.cmd0_with_param(zero_point_op, 0)
+                else:
+                    assert tens.quantization.zero_point is not None, "need an actual zero point set"
+                    emit.cmd0_with_param(zero_point_op, int(tens.quantization.zero_point))
+
+                if tens.shape == []:
+                    # Empty shape, elementwise constant
+                    ifm2_scalar = tens.quant_values.astype(np.uint8)
+                    assert ifm2_scalar.size == 1
+                    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, ifm2_scalar.item(0))
+                    continue
+
+                height_0, height_1, width_0, addresses = tens.addresses_for_rolling_buffer(
+                    box.start_coord, box.end_coord
+                )
+                if npu_block_type != NpuBlockType.VectorProduct:
+                    if tens == cmd.ifm_tensor:
+                        emit.cmd0_with_param(cmd0.NPU_SET_IFM_HEIGHT0_M1, height_0 - 1)
+                        emit.cmd0_with_param(cmd0.NPU_SET_IFM_HEIGHT1_M1, height_1 - 1)
+                        emit.cmd0_with_param(cmd0.NPU_SET_IFM_WIDTH0_M1, width_0 - 1)
+                    elif tens == cmd.ofm_tensor:
+                        emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT0_M1, height_0 - 1)
+                        emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT1_M1, height_1 - 1)
+                        emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH0_M1, width_0 - 1)
+                    elif tens == cmd.ifm2_tensor:
+                        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_HEIGHT0_M1, height_0 - 1)
+                        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_HEIGHT1_M1, height_1 - 1)
+                        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_WIDTH0_M1, width_0 - 1)
+                else:
+                    if len(out_shape) == 2:
+                        # TODO: N is put in the W-dimension for now.
+                        # It should be spread over H and W, but then block size selection
+                        # and stride calculation would need to be changed.
+                        if tens == cmd.ifm_tensor:
+                            emit.cmd0_with_param(cmd0.NPU_SET_IFM_WIDTH0_M1, out_shape[-2] - 1)
+                        elif tens == cmd.ofm_tensor:
+                            emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH0_M1, out_shape[-2] - 1)
+                    else:
+                        assert False
+
+                for idx, addr in enumerate(addresses):
+                    if addr is None:
+                        addresses[idx] = 0
+
+                emit.cmd1_with_offset(ptr_ops[0], addresses[0])
+                emit.cmd1_with_offset(ptr_ops[1], addresses[1])
+                emit.cmd1_with_offset(ptr_ops[2], addresses[2])
+                emit.cmd1_with_offset(ptr_ops[3], addresses[3])
+
+                strides = tens.get_strides()
+                emit.cmd1_with_offset(stride_ops[0], strides[1])  # stride between 16-byte channel blocks (C)
+                emit.cmd1_with_offset(stride_ops[2], strides[3])  # stride between horizontal values (W)
+                emit.cmd1_with_offset(stride_ops[1], strides[2])  # stride between vertical values (H)
+
+                if tens.format == TensorFormat.NHCWB16:
+                    # Check that all BasePointer addresses are aligned to 16 bytes
+                    assert (int(addresses[0]) % 16) == 0
+                    assert (int(addresses[1]) % 16) == 0
+                    assert (int(addresses[2]) % 16) == 0
+                    assert (int(addresses[3]) % 16) == 0
+
+            ofm_dtype = cmd.ofm_tensor.dtype
+            assert ofm_dtype.type & BaseType.Int
+            prec = 0
+            if ofm_dtype.size_in_bits() == 8:
+                prec = 0
+            elif ofm_dtype.size_in_bits() == 16:
+                prec = 2
+            else:
+                assert 0
+
+            if ofm_dtype.type & BaseType.Signed:
+                prec += 1
+
+            if use_global_scale:
+                # Set global scale bit, as opposed to using per channel scale
+                prec |= 1 << 8
+
+            if cmd.ofm_tensor.format == TensorFormat.NHCWB16:
+                prec |= 1 << 6
+
+            prec |= rounding_mode.value << 14
+
+            emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)
+
+            prec = None
+            weight_bits = 8
+            if cmd.weight_tensor is not None:
+                weight_bits = cmd.weight_tensor.dtype.size_in_bits()
+
+            ifm_dtype = cmd.ifm_tensor.dtype
+
+            assert weight_bits == 8, "Unsupported weight bit depth"
+            assert ifm_dtype.size_in_bits() in {8, 16}
+
+            if ifm_dtype.size_in_bits() == 8:
+                if ifm_dtype.type & BaseType.Signed:
+                    prec = ifm_precision.W8_S8
+                else:
+                    prec = ifm_precision.W8_U8
+            elif ifm_dtype.size_in_bits() == 16:
+                if ifm_dtype.type & BaseType.Signed:
+                    prec = ifm_precision.W8_S16
+                else:
+                    prec = ifm_precision.W8_U16
+
+            ifm_prec = prec.value
+            ifm2_prec = ifm_prec
+
+            if cmd.ifm_tensor.format == TensorFormat.NHCWB16:
+                ifm_prec |= 1 << 6
+
+            ifm_prec |= op_to_scale << 8
+
+            emit.cmd0_with_param(cmd0.NPU_SET_IFM_PRECISION, ifm_prec)
+
+            if cmd.ifm2_tensor is not None:
+                if cmd.ifm2_tensor.format == TensorFormat.NHCWB16:
+                    ifm2_prec |= 1 << 6
+                emit.cmd0_with_param(cmd0.NPU_SET_IFM2_PRECISION, ifm2_prec)
+
+            emit_wait_commands(cmd)
+
+            # Get op parameters
+            cur_ifm_block_depth = get_op_ifmofm_block_depth(arch, cmd)
+            cur_ofm_block = Block(ps.block_config[1], ps.block_config[0], ps.block_config[3])
+            cur_ofm_rect = get_op_ofm_rect(cmd)
+            cur_ifm_rect = get_op_ifm_rect(cmd)
+            cur_kernel = get_op_kernel(cmd.ps)
+            cur_padLT = get_op_padding_lt(cmd)
+            if (prev_kernel is not None) and (cur_kernel is not None) and has_prev_op_dependency(prev_cmd, cmd):
+                if cmd.ifm_tensor.shape == prev_cmd.ofm_tensor.shape:
+                    blockdep = arch.calc_block_dep(
+                        prev_ifm_rect,
+                        prev_ofm_rect,
+                        prev_ifm_block_depth,
+                        prev_ofm_block,
+                        prev_kernel,
+                        cur_ifm_rect,
+                        cur_ofm_rect,
+                        cur_ifm_block_depth,
+                        cur_ofm_block,
+                        cur_kernel,
+                        cur_padLT,
+                    )
+                else:
+                    blockdep = 0
+            else:
+                blockdep = ArchitectureFeatures.MAX_BLOCKDEP
+
+            # Set between every op (dependent or not)
+            blockdep = min(blockdep, arch.max_blockdep)
+            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
+            prev_cmd = cmd
+
+            if npu_block_type == NpuBlockType.ConvolutionMxN:
+                emit.cmd_do_operation(cmd0.NPU_OP_CONV)
+            elif npu_block_type == NpuBlockType.ConvolutionDepthWise:
+                emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
+            elif npu_block_type == NpuBlockType.VectorProduct:
+                # Vector product is implemented using a 1x1 convolution
+                emit.cmd_do_operation(cmd0.NPU_OP_CONV)
+            elif npu_block_type == NpuBlockType.Pooling:
+                param = "Max" not in primary_op.type
+                emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=param)
+            elif npu_block_type == NpuBlockType.ElementWise:
+                param = elementwise_mode_map[primary_op.type]
+                emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param)
+            else:
+                print("Warning: Skipping register command stream generation for", ps)
+
+    # Fill in final part of command stream:
+    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)
+
+    sg.register_command_stream = emit.to_list()
+    if verbose:
+        emit.print_cmds()
+        print("number of commands", len(emit.cmd_stream))
+        print("command stream length in words", len(sg.register_command_stream))
diff --git a/ethosu/vela/rewrite_graph.py b/ethosu/vela/rewrite_graph.py
new file mode 100644
index 0000000..e6e24e6
--- /dev/null
+++ b/ethosu/vela/rewrite_graph.py
@@ -0,0 +1,171 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Functions for abstracting out the traversal and rewriting of graphs so that the optimisation passes can focus on the
+# correct operation.
+#
+# Requires two lists, one of functions that rewrite Tensors, and one of functions that rewrite Operations.
+#
+# Pre-order traversal supports rewrites; therefore, functions may return something other than the original value.
+#
+# Post-order traversal does not support rewrites; therefore, functions must return the original value.
+
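+# Example: rewrite functions take (value, arch) and return either the original
+# value or a replacement; rewriting repeats until a fixed point is reached.
+# An illustrative sketch with hypothetical rewrite functions:
+#
+#     def fuse_activation(op, arch):
+#         ...  # return a replacement Operation, or op unchanged
+#
+#     def set_tensor_format(tens, arch):
+#         return tens  # tensor rewrites may also leave the value unchanged
+#
+#     sg = rewrite_graph_pre_order(sg, arch, [set_tensor_format], [fuse_activation])
+#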
+
+def rewrite_graph_pre_order(sg, arch, tensor_rewrite_list, op_rewrite_list, rewrite_unsupported=True):
+
+    op_visit_dict = dict()
+    tens_visit_dict = dict()
+
+    def visit_op(op):
+        if op in op_visit_dict:
+            return op_visit_dict[op]
+        res = op
+        prev_res = None
+        while prev_res != res:
+            prev_res = res
+            for rewrite in op_rewrite_list:
+                if res.run_on_npu or rewrite_unsupported:
+                    res = rewrite(res, arch)
+
+        op_visit_dict[op] = res
+        op_visit_dict[res] = res
+
+        inputs = res.inputs
+        res.inputs = []
+        for tens in inputs:
+            res.inputs.append(visit_tens(tens))
+
+        outputs = res.outputs
+        res.outputs = []
+        for tens in outputs:
+            res.outputs.append(visit_tens(tens))
+
+        return res
+
+    def visit_tens(tens):
+        if tens in tens_visit_dict:
+            return tens_visit_dict[tens]
+
+        res = tens
+        prev_res = None
+        while prev_res != res:
+            prev_res = res
+            for rewrite in tensor_rewrite_list:
+                res = rewrite(res, arch)
+
+        tens_visit_dict[tens] = res
+        tens_visit_dict[res] = res
+
+        ops = res.ops
+        res.ops = []
+        for op in ops:
+            res.ops.append(visit_op(op))
+        return res
+
+    sg.output_tensors = [visit_tens(tens) for tens in sg.output_tensors]
+    sg.refresh_after_modification()
+
+    return sg
+
+
+def visit_graph_post_order(sg, arch, tensor_visit_list, op_visit_list):
+
+    op_visit_dict = dict()
+    tens_visit_dict = dict()
+
+    def visit_op(op):
+        if op in op_visit_dict:
+            return op_visit_dict[op]
+        op_visit_dict[op] = op
+
+        for tens in op.inputs:
+            visit_tens(tens)
+
+        for visit in op_visit_list:
+            visit(op, arch)
+
+        for tens in op.outputs:
+            visit_tens(tens)
+
+        return op
+
+    def visit_tens(tens):
+        if tens in tens_visit_dict:
+            return tens_visit_dict[tens]
+
+        tens_visit_dict[tens] = tens
+
+        for op in tens.ops:
+            visit_op(op)
+
+        for visit in tensor_visit_list:
+            visit(tens, arch)
+
+        return tens
+
+    for tens in sg.output_tensors:
+        visit_tens(tens)
+
+    sg.refresh_after_modification()
+
+    return sg
+
+
+def verify_graph_health(nng):
+
+    for sg in nng.subgraphs:
+        verify_subgraph_health(sg)
+
+    return True
+
+
+def verify_subgraph_health(sg):
+    op_visit_dict = dict()
+    tens_visit_dict = dict()
+
+    def visit_op(op):
+        if op in op_visit_dict:
+            return op_visit_dict[op]
+        op_visit_dict[op] = op
+
+        for tens in op.inputs:
+            assert op in tens.consumers()
+            visit_tens(tens)
+
+        for tens in op.outputs:
+            assert op in tens.ops
+            visit_tens(tens)
+
+        return op
+
+    def visit_tens(tens):
+        if tens in tens_visit_dict:
+            return tens_visit_dict[tens]
+
+        tens_visit_dict[tens] = tens
+
+        for op in tens.ops:
+            assert tens in op.outputs
+            visit_op(op)
+
+        return tens
+
+    for tens in sg.output_tensors:
+        visit_tens(tens)
+
+    return True
diff --git a/ethosu/vela/scaling.py b/ethosu/vela/scaling.py
new file mode 100644
index 0000000..b255f93
--- /dev/null
+++ b/ethosu/vela/scaling.py
@@ -0,0 +1,91 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Contains various scaling calculations for weights, elementwise operations, pooling etc.
+
+import math
+from .numeric_util import round_away_zero
+from enum import IntEnum
+
+
+class OperandToScale(IntEnum):
+    OPa = 1
+    OPb = 2
+
+
+# Quantise floating point scale value into 32-bit int scale and 6-bit shift
+def quantise_scale(scale):
+    significand, exponent = math.frexp(scale)
+    significand_q31 = int(round_away_zero(significand * (1 << 31)))
+    exponent_q31 = exponent - 31
+    shift = exponent_q31 * -1
+
+    if shift >= (1 << 6):
+        # Shift outside of valid range, set scale to 0
+        return 0, 16
+
+    return significand_q31, shift
+
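+# Example: quantise_scale(0.5) == (1 << 30, 31), i.e. the scale is applied as
+# (value * (1 << 30)) >> 31; quantise_scale(1.0) == (1 << 30, 30).
+#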
+
+# Calculate global OFM scale for Average Pooling
+def quantise_pooling_scale(nr_kernel_elements, rescale_bits=0):
+    _, k = math.frexp(nr_kernel_elements - 1)
+    N = 31 - rescale_bits
+    scale = ((1 << (N + k)) + (1 << k)) // nr_kernel_elements
+    shift = N + k
+
+    assert shift < (1 << 6)
+
+    return scale, shift
+
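+# Example: a 2x2 average pool has 4 kernel elements, and
+# quantise_pooling_scale(4) == ((1 << 31) + 1, 33), i.e. roughly 1/4 since
+# ((1 << 31) + 1) / 2**33 ~= 0.25.
+#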
+
+# Calculate elementwise Mul OFM scale+shift
+def elementwise_mul_scale(input_scale, input2_scale, output_scale):
+    output_rescale = (input_scale * input2_scale) / output_scale
+    out_scale, out_shift = quantise_scale(output_rescale)
+    return out_scale, out_shift
+
+
+# Simplified version of calculating elementwise Add/Sub scales
+def simplified_elementwise_add_sub_scale(input1_scale, input2_scale, output_scale, input_shift=16):
+    max_input_scale = max(input1_scale, input2_scale)
+
+    input1_rescale = input1_scale * (1 << input_shift) / (2 * max_input_scale)
+    input2_rescale = input2_scale * (1 << input_shift) / (2 * max_input_scale)
+    output_rescale = (2 * max_input_scale) / (output_scale * (1 << input_shift))
+
+    out_scale, out_shift = quantise_scale(output_rescale)
+
+    return input1_rescale, input2_rescale, out_scale, out_shift
+
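+# Example: with equal input and output scales and the default input_shift of 16,
+# both inputs are rescaled by 2**15 and the OFM scale undoes that factor, e.g.
+# simplified_elementwise_add_sub_scale(0.5, 0.5, 0.5) == (32768.0, 32768.0, 1 << 30, 45).
+#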
+
+# Advanced version of calculating elementwise Add/Sub scales
+def advanced_elementwise_add_sub_scale(input1_scale, input2_scale, output_scale, bitdepth):
+    # Always scale the smaller of the input scales
+    max_input_scale = max(input1_scale, input2_scale)
+    min_input_scale = min(input1_scale, input2_scale)
+    input_shift = 20 if bitdepth == 8 else 14
+    op_to_scale = OperandToScale.OPa if input1_scale < input2_scale else OperandToScale.OPb
+
+    input1_rescale, _, out_scale, out_shift = simplified_elementwise_add_sub_scale(
+        min_input_scale, max_input_scale, output_scale, input_shift
+    )
+
+    in_scale, in_shift = quantise_scale(input1_rescale)
+
+    return in_scale, in_shift, out_scale, out_shift, op_to_scale
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
new file mode 100644
index 0000000..c35c156
--- /dev/null
+++ b/ethosu/vela/scheduler.py
@@ -0,0 +1,949 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# The scheduler evaluates the cost of various strategies for scheduling the network in order to select the block
+# configuration.
+
+import enum
+from .nn_graph import (
+    TensorPurpose,
+    TensorSubPurpose,
+    TensorFormat,
+    MemArea,
+    SchedulingStrategy,
+    CascadedPass,
+    PassPlacement,
+    SchedulerRewrite,
+    Operation,
+    NpuBlockType,
+)
+from . import live_range
+import numpy as np
+from . import npu_performance
+from . import stats_writer
+from .npu_performance import make_bandwidth_array, make_macs_array, make_cycles_array, make_metrics_arrays, PassCycles
+import time, copy
+from .high_level_command_stream_generator import calc_allowed_ofm_ifm_overlap_for_pass_list
+from .shared_buffer_allocation import (
+    find_block_configs_suitable_for_pass_and_shared_buffer,
+    shared_buffer_allocation_for_pass_and_block_config,
+)
+from functools import lru_cache
+
+
+class ParetoMetric(enum.Enum):
+    BwCycMem = 1
+    BwCycMemBlkH = 2
+
+    def __str__(self):
+        return self.name
+
+
+class SchedulerOptions:
+    def __init__(
+        self,
+        use_cascading=True,
+        use_ifm_ofm_overlap=True,
+        verbose_schedule=False,
+        verbose_pareto_frontier_schedules=False,
+        use_ifm_streaming=True,
+        pareto_metric=ParetoMetric.BwCycMem,
+    ):
+        self.use_cascading = use_cascading
+        self.use_ifm_ofm_overlap = use_ifm_ofm_overlap
+        self.verbose_schedule = verbose_schedule
+        self.verbose_pareto_frontier_schedules = verbose_pareto_frontier_schedules
+        self.use_ifm_streaming = use_ifm_streaming
+        self.pareto_metric = pareto_metric
+
+    def __str__(self):
+        return type(self).__name__ + ": " + str(self.__dict__)
+
+    __repr__ = __str__
+
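+# Example for SchedulerOptions: the options are plain flags consumed by the
+# scheduler. An illustrative sketch:
+#
+#     options = SchedulerOptions(use_cascading=False, pareto_metric=ParetoMetric.BwCycMemBlkH)
+#     print(options)  # SchedulerOptions: {'use_cascading': False, ...}
+#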
+
+class Strategy:
+    __slots__ = "strat", "param", "passes", "block_configs", "rewrite_list", "bws", "macs", "cycles", "sram_used"
+
+    def __init__(self, strat, param, passes, block_configs, rewrite_list, bws, macs, cycles, sram_used):
+        self.strat = strat
+        self.param = param
+        self.passes = passes
+        self.block_configs = block_configs
+        self.rewrite_list = (
+            rewrite_list  # list of (SchedulerRewrite, Tensor, new sub purpose, purpose param a, purpose param b, pass)
+        )
+        self.bws = bws
+        self.macs = macs
+        self.cycles = cycles
+        self.sram_used = sram_used
+
+    def __eq__(self, other):
+        if self.strat != other.strat:
+            return False
+        if self.param != other.param:
+            return False
+        if self.block_configs != other.block_configs:
+            return False
+        if self.passes != other.passes:
+            return False
+        if (self.bws != other.bws).any():
+            return False
+        if (self.macs != other.macs).any():
+            return False
+        if (self.cycles != other.cycles).any():
+            return False
+        if self.sram_used != other.sram_used:
+            return False
+        return True
+
+    def empty(self):
+        return not self.passes
+
+    def key(self):
+        return self.passes[-1]
+
+    def clone(self):
+        return Strategy(
+            self.strat,
+            self.param,
+            self.passes,
+            self.block_configs,
+            self.rewrite_list,
+            self.bws,
+            self.macs,
+            self.cycles,
+            self.sram_used,
+        )
+
+    def __str__(self):
+        return "<scheduler.Strategy: %s %s %s %s %s %s %s>" % (
+            self.strat,
+            self.passes,
+            self.rewrite_list,
+            self.bws,
+            self.macs,
+            self.cycles,
+            self.sram_used,
+        )
+
+    __repr__ = __str__
+
+
+class StrategySet:
+    __slots__ = "strats", "bws", "macs", "cycles", "max_sram_used", "total_sram_used"
+
+    def __init__(self, strats=None):
+        if strats is None:
+            strats = dict()
+        self.strats = strats  # final pass in packed pass -> Strategy
+        self.bws, self.macs, self.cycles = make_metrics_arrays()
+        self.max_sram_used = 0
+        self.total_sram_used = 0
+
+    def update_statistics(self):
+        self.bws = make_bandwidth_array()
+        self.max_sram_used = 0
+        for ps, strat in self.strats.items():
+            self.bws += strat.bws
+            self.macs += strat.macs
+            self.cycles += strat.cycles
+            self.max_sram_used = max(self.max_sram_used, strat.sram_used)
+            self.total_sram_used += strat.sram_used
+
+    def clone_add_strategy(self, new_strat):
+        key = new_strat.key()
+        if key in self.strats:
+            assert new_strat == self.strats[key]
+            return self
+        else:
+            new_strats = dict(self.strats)
+            new_strats[key] = new_strat
+            new_set = StrategySet(new_strats)
+            new_set.bws = self.bws + new_strat.bws
+            new_set.macs = self.macs + new_strat.macs
+            new_set.cycles = self.cycles + new_strat.cycles
+            new_set.max_sram_used = max(self.max_sram_used, new_strat.sram_used)
+            new_set.total_sram_used = self.total_sram_used + new_strat.sram_used
+            return new_set
+
+    def __eq__(self, other):
+        if (self.bws != other.bws).any():
+            return False
+        if (self.macs != other.macs).any():
+            return False
+        if (self.cycles != other.cycles).any():
+            return False
+        if self.max_sram_used != other.max_sram_used:
+            return False
+        if self.total_sram_used != other.total_sram_used:
+            return False
+        if self.strats != other.strats:
+            return False
+        return True
+
+    def __str__(self):
+        return "<scheduler.StrategySet: max_sram_used=%s passes_covered=%s>" % (
+            self.max_sram_used,
+            list(ps.name for ps in self.strats),
+        )
+
+    __repr__ = __str__
+
+
+empty_strategy = Strategy(
+    SchedulingStrategy.Unknown, None, [], [], [], make_bandwidth_array(), make_macs_array(), make_cycles_array(), 0
+)
+INFINITY = 1e30
+
+ABORT_SEARCH = []
+
+
+def flatten_list_of_lists(lstlst):
+    lst = []
+    for v in lstlst:
+        lst.extend(v)
+    return lst
+
+
+class DynamicProgrammingScheduler:
+    def __init__(self, nng, sg, arch, sram_limit, options: SchedulerOptions):
+        self.nng = nng
+        self.sg = sg
+        self.arch = arch
+        self.sram_limit = sram_limit
+        self.options = copy.copy(options)
+        self.use_cascading = options.use_cascading
+
+        self.use_ifm_ofm_overlap = options.use_ifm_ofm_overlap
+        if self.arch.feature_map_storage_mem_area != MemArea.Sram:
+            self.use_ifm_ofm_overlap = False  # force off IFM/OFM overlap if IFMs and OFMs are not in the SRAM
+
+        self.verbose_schedule = options.verbose_schedule
+        self.verbose_pareto_frontier_schedules = options.verbose_pareto_frontier_schedules
+        self.mem_area = MemArea.Sram
+
+        self.bandwidth_weights = arch.bandwidth_weights
+        self.cycles_weight = arch.cycles_weight
+        self.max_sram_used_weight = arch.max_sram_used_weight
+
+        self.n_combinations_searched = 0
+
+        self.feature_maps_not_in_fast_storage = (
+            arch.tensor_storage_mem_area[TensorPurpose.FeatureMap] != arch.fast_storage_mem_area
+        )
+
+        self.pareto_max_candidates = 16
+
+        self.ifm_stream_npu_blocks = set(
+            (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling,)
+        )
+
+    num_pareto_metrics = 4
+    view_values = ",".join(["d"] * num_pareto_metrics)
+    order_values = ["f%d" % (idx,) for idx in range(num_pareto_metrics)]
+
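+    # The Pareto filter below treats smaller values as better for every entry of this
+    # tuple: weighted bandwidth/cycle cost, SRAM high-water mark of the completed
+    # strategies, SRAM of the strategy being built, and (optionally) last block height.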
+    def pareto_metric(self, candidate):
+        strat, strat_set = candidate
+        total_cycles = strat.cycles[PassCycles.Total] + strat_set.cycles[PassCycles.Total]
+        bws = strat.bws + strat_set.bws
+        last_block_height = 0
+        if self.options.pareto_metric == ParetoMetric.BwCycMemBlkH and len(strat.block_configs) > 0:
+            last_block_height = strat.block_configs[-1][0]
+
+        return (
+            np.tensordot(bws, self.bandwidth_weights, axes=3) + total_cycles * self.cycles_weight,
+            strat_set.max_sram_used,
+            strat.sram_used,
+            last_block_height,
+        )
+
+    def filter_pareto_frontier(self, candidates, remove_equally_good_candidates):
+
+        candidates = [cand for cand in candidates if max(cand[0].sram_used, cand[1].max_sram_used) <= self.sram_limit]
+
+        if len(candidates) <= 1:
+            return candidates
+        assert remove_equally_good_candidates
+        start = time.time()
+        pareto_vals = np.zeros((len(candidates), DynamicProgrammingScheduler.num_pareto_metrics))
+        ids = np.arange(len(candidates), dtype=np.int32)
+        for idx, cand in enumerate(candidates):
+            pareto_vals[idx] = self.pareto_metric(cand)
+
+        sort_order = np.argsort(
+            pareto_vals.view(DynamicProgrammingScheduler.view_values),
+            order=DynamicProgrammingScheduler.order_values,
+            axis=0,
+            kind="stable",
+        ).flatten()
+        pareto_vals = pareto_vals[sort_order]
+        ids = ids[sort_order]
+
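+        # Greedy frontier extraction: keep the best remaining candidate (stable sort
+        # order above), then only candidates that are strictly smaller in at least one
+        # metric survive; everything dominated by, or equal to, the kept one is dropped.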
+        pareto_frontier = []
+        while len(ids) > 0:
+            pareto_frontier.append(candidates[ids[0]])
+            not_dominated_by_first = (pareto_vals < pareto_vals[0]).any(axis=1)
+            ids = ids[not_dominated_by_first]
+            pareto_vals = pareto_vals[not_dominated_by_first]
+
+        if len(pareto_frontier) > self.pareto_max_candidates:
+            pareto_frontier = self.sort_by_candidate_metric(pareto_frontier)
+            pareto_frontier = pareto_frontier[: self.pareto_max_candidates]
+
+        return pareto_frontier
+
+    def candidate_metric(self, candidate):
+        strat, strat_set = candidate
+        max_sram_used = max(strat_set.max_sram_used, strat.sram_used)
+        bws = strat.bws + strat_set.bws
+        total_cycles = strat.cycles[PassCycles.Total] + strat_set.cycles[PassCycles.Total]
+
+        return (
+            max_sram_used * self.max_sram_used_weight
+            + np.tensordot(bws, self.bandwidth_weights, axes=3)
+            + total_cycles * self.cycles_weight
+        )
+
+    def sort_by_candidate_metric(self, candidate_list):
+        sorted_list = list(sorted(candidate_list, key=self.candidate_metric))
+        return sorted_list
+
+    def best_candidate(self, candidate_list):
+        if len(candidate_list) == 0:
+            return ABORT_SEARCH
+        if len(candidate_list) == 1:
+            return candidate_list[0]
+        sorted_list = self.sort_by_candidate_metric(candidate_list)
+        return sorted_list[0]
+
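+    # "Graduating" a strategy finalises it: the scheduling strategy type is fixed, the
+    # output SRAM is accounted for (minus any allowed IFM/OFM overlap), the strategy is
+    # folded into the StrategySet and an empty strategy is started for the next search step.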
+    def graduate_strat(self, strat_type, sram_used, old_strat_data):
+        res = []
+        for old_strat, old_strat_set in old_strat_data:
+            if old_strat.sram_used + sram_used > self.sram_limit:
+                continue  # This strategy is bad, drop it
+            if old_strat_set.max_sram_used > self.sram_limit:
+                continue  # This strategy is bad, drop it
+            assert old_strat.strat == SchedulingStrategy.Unknown
+
+            new_strat = old_strat.clone()
+            new_strat.strat = strat_type
+            new_strat.sram_used = old_strat.sram_used + sram_used
+
+            if self.use_ifm_ofm_overlap:
+                overlap = calc_allowed_ofm_ifm_overlap_for_pass_list(
+                    new_strat.strat, new_strat.passes, new_strat.block_configs
+                )
+                new_strat.sram_used -= overlap
+
+            new_strat_set = old_strat_set.clone_add_strategy(new_strat)
+            res.append((empty_strategy, new_strat_set))
+        return self.filter_pareto_frontier(res, remove_equally_good_candidates=True)
+
+    def append_sram(self, sram_used, old_strat_data):
+        res = []
+        for old_strat, strat_set in old_strat_data:
+            assert old_strat.strat == SchedulingStrategy.Unknown
+            assert old_strat.sram_used == 0
+            new_strat = old_strat.clone()
+            new_strat.sram_used = old_strat.sram_used + sram_used
+
+            res.append((new_strat, strat_set))
+        return res
+
+    def append_sram_block_config_performance_metrics(self, sram_used, block_config, metrics, old_strat_data):
+        res = []
+        for old_strat, strat_set in old_strat_data:
+            assert old_strat.strat == SchedulingStrategy.Unknown
+            new_strat = old_strat.clone()
+            bws, macs, cycles = metrics[:3]
+
+            new_strat.sram_used = old_strat.sram_used + sram_used
+            new_strat.block_configs = old_strat.block_configs + [block_config]
+            new_strat.bws = old_strat.bws + bws
+            new_strat.macs = old_strat.macs + macs
+            new_strat.cycles = old_strat.cycles + cycles
+            new_strat.bws, new_strat.macs, new_strat.cycles = npu_performance.collate_stats_for_cascaded_pass(
+                self.arch, new_strat.bws, new_strat.macs, new_strat.cycles
+            )
+
+            res.append((new_strat, strat_set))
+        return res
+
+    def append_sram_pass_block_config_performance_metrics_rewrite_list(
+        self, sram_used, new_pass, block_config, metrics, rewrite_list, old_strat_data
+    ):
+        res = []
+        for old_strat, strat_set in old_strat_data:
+            assert old_strat.strat == SchedulingStrategy.Unknown
+            new_strat = old_strat.clone()
+            bws, macs, cycles = metrics[:3]
+            new_strat.sram_used = old_strat.sram_used + sram_used
+            new_strat.block_configs = old_strat.block_configs + [block_config]
+            new_strat.bws = old_strat.bws + bws
+            new_strat.macs = old_strat.macs + macs
+            new_strat.cycles = old_strat.cycles + cycles
+            new_strat.passes = old_strat.passes + [new_pass]
+            new_strat.bws, new_strat.macs, new_strat.cycles = npu_performance.collate_stats_for_cascaded_pass(
+                self.arch, new_strat.bws, new_strat.macs, new_strat.cycles
+            )
+            new_strat.rewrite_list = old_strat.rewrite_list + rewrite_list
+            res.append((new_strat, strat_set))
+        return res
+
+    def append_sram_rewrite_list(self, sram_used, rewrite_list, old_strat_data):
+        res = []
+        for old_strat, strat_set in old_strat_data:
+            assert old_strat.strat == SchedulingStrategy.Unknown
+            new_strat = old_strat.clone()
+            new_strat.sram_used = old_strat.sram_used + sram_used
+            new_strat.rewrite_list = old_strat.rewrite_list + rewrite_list
+            res.append((new_strat, strat_set))
+        return res
+
+    def pass_to_strat(self, strat_data):
+        res = {}
+        for strat in strat_data[1].strats.values():
+            for ps in strat.passes:
+                res[ps] = strat
+        return res
+
+    def compatible_strats(self, a, b):
+        intersection = a.keys() & b.keys()
+        for k in intersection:
+            if a[k] != b[k]:
+                return False
+        return True
+
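+    # Combine independently-searched strands of passes into joint StrategySets, keeping
+    # only those combinations whose per-pass strategies agree wherever the strands overlap.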
+    def collate_strats_for_passes(self, all_passes):
+        if len(all_passes) == 0:
+            return [(empty_strategy, StrategySet(dict()))]
+        if len(all_passes) == 1:
+            return all_passes[0]  # save some space in the common case
+        all_strands = [[self.pass_to_strat(strat_data) for strat_data in strand] for strand in all_passes]
+        prev_combos = [dict()]
+        for j, strand in enumerate(all_strands):
+            new_combos = []
+            for i, alt in enumerate(strand):
+                for prev in prev_combos:
+                    if self.compatible_strats(prev, alt):
+                        cmb = dict(prev)
+                        cmb.update(all_passes[j][i][1].strats)
+                        new_combos.append(cmb)
+            prev_combos = new_combos
+
+        res = []
+        for d in prev_combos:
+            s = StrategySet(d)
+            s.update_statistics()
+            res.append((empty_strategy, s))
+        return res
+
+    def search_all_but_one_predecessor(self, ps, pred_pass, pred_pass_data):
+        # get the rest of the predecessors
+        other_predecessors = [pred for pred in ps.dag_predecessors if pred != pred_pass]
+        other_predecessor_data = self.search_pass_list(other_predecessors)
+
+        # pred_pass_data holds an incomplete strategy that we need to keep building on,
+        # whereas the other predecessors have completed strategies.
+        # merge them, but keep the incomplete strategy separate.
+
+        res = []
+        for pred_pass_strat, pred_pass_strat_set in pred_pass_data:
+            all_strats = [
+                [(empty_strategy, pred_pass_strat_set)],  # pred strat data but with a dummy empty strategy
+                other_predecessor_data,  # this one is fine to use as-is
+            ]
+            collated_strat_data = self.collate_strats_for_passes(all_strats)
+            strat_data = [(pred_pass_strat, strat_set) for _, strat_set in collated_strat_data]
+            res.extend(strat_data)
+        return res
+
+    def calc_non_local_mem_usage(self):
+        ignore_subgraph_input_output_tensors = self.sg.placement == PassPlacement.Cpu
+        range_set = live_range.extract_live_ranges_from_passes(
+            self.sg,
+            self.mem_area,
+            mark_output_tensors_overlapping_with_input_tensors=True,
+            ignore_subgraph_input_output_tensors=ignore_subgraph_input_output_tensors,
+        )
+        range_dict = range_set.ranges
+
+        # find which ranges overlap passes but aren't inputs/outputs of those passes.
+        # these won't be counted by the dynamic programming search and must be accounted for manually.
+        end_pos = max(ps.time for ps in self.sg.passes) + 2
+        mem_usage = np.zeros(end_pos) + self.sg.base_sram_used
+        non_local_mem_usage = np.zeros(end_pos, dtype=np.int64)
+
+        for tens, rng in range_dict.items():
+            storage_size = tens.storage_size()
+            assert tens.mem_area == self.mem_area
+            mem_usage[rng.start_time : rng.end_time] += storage_size
+
+        for ps in self.sg.passes:
+            local_mem_usage = 0
+            for tens in ps.inputs + ps.outputs + ps.intermediates:
+                if tens.mem_area != self.mem_area:
+                    continue
+
+                local_mem_usage += tens.storage_size()
+
+            non_local_mem_usage[ps.time] = mem_usage[ps.time] - local_mem_usage
+
+        self.non_local_mem_usage = non_local_mem_usage
+
+    def search(self):
+        self.calc_non_local_mem_usage()
+        starting_passes = [ps for ps in self.sg.passes if not ps.successors]
+        strat_data = self.search_pass_list(starting_passes)
+
+        _, best_set = self.best_candidate(strat_data)
+
+        if self.verbose_pareto_frontier_schedules:
+            print(
+                "Scheduler searched %d combinations and found %d candidate schedules along the pareto frontier"
+                % (self.n_combinations_searched, len(strat_data,))
+            )
+            for idx, (_, strat_set) in enumerate(strat_data):
+                extra = ""
+                if strat_set == best_set:
+                    extra = "(Best candidate)"
+                print("Candidate", idx, extra)
+                memory_used = {MemArea.Sram: strat_set.max_sram_used}
+                stats_writer.print_performance_metrics_for_strat(
+                    self.arch,
+                    "",
+                    strat_set.cycles,
+                    strat_set.macs,
+                    strat_set.bws,
+                    self.nng.batch_size,
+                    memory_used,
+                    len(self.sg.passes),
+                    len(strat_set.strats),
+                )
+
+        return best_set
+
+    def search_pass_list(self, pass_list):
+        all_strats = []
+        for ps in pass_list:
+            strat = self.search_output(ps)
+            all_strats.append(strat)
+        strat_data = self.collate_strats_for_passes(all_strats)
+        for strd in strat_data:
+            for ps in pass_list:
+                assert ps in strd[1].strats  # should have strategies for everything we asked to search
+        return strat_data
+
+    def search_predecessors(self, ps):
+
+        # protect against graphs with loops. collate_strats_for_passes will sort this out later so that
+        # we have strats for all passes
+
+        pass_list = ps.dag_predecessors
+        strat_data = self.search_pass_list(pass_list)
+
+        return strat_data
+
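+    # Memoised per-pass search entry point: cost weight streaming and, optionally, IFM
+    # streaming for this pass, keep only Pareto-optimal candidates, and fall back to the
+    # predecessors' schedules if nothing fits within the SRAM limit.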
+    @lru_cache(maxsize=None)
+    def search_output(self, ps):
+
+        assert ps in self.sg.passes
+        candidate_list = []
+
+        candidate_list.extend(self.search_weight_streaming_output(ps))
+
+        if self.options.use_ifm_streaming:
+            candidate_list.extend(self.search_ifm_streaming_output(ps))
+
+        best = self.filter_pareto_frontier(candidate_list, remove_equally_good_candidates=True)
+
+        if not best:
+            print(
+                "Warning: Dynamic search programming algorithm failed for pass %s, invoking fallback strategy"
+                % (ps.name,)
+            )
+            return self.search_predecessors(ps)
+
+        return best
+
+    def search_ifm_streaming_output(self, ps):
+        if ps.placement != PassPlacement.Npu:
+            return ABORT_SEARCH
+        if ps.npu_block_type not in self.ifm_stream_npu_blocks:
+            return ABORT_SEARCH
+        strat_data = self.search_ifm_streaming_body(ps, False)
+
+        sram_used = self.non_local_mem_usage[ps.time]
+        for tens in ps.outputs:
+            if tens.mem_area == self.mem_area:
+                sram_used += tens.storage_size()
+
+        return self.graduate_strat(SchedulingStrategy.IfmStream, sram_used, strat_data)
+
+    @lru_cache(maxsize=None)
+    def search_ifm_streaming_body(self, ps, force_outputs_to_fast_storage):
+        if ps.placement != PassPlacement.Npu:
+            return ABORT_SEARCH
+        if ps.npu_block_type not in self.ifm_stream_npu_blocks:
+            return ABORT_SEARCH
+        ifm_input_search_results = self.search_ifm_streaming_input(ps)
+        res = []
+
+        base_sram_used = 0
+        for tens in ps.intermediates:
+            if tens.mem_area == self.mem_area:
+                base_sram_used += tens.storage_size()
+
+        all_block_configs = self.get_block_configs(ps)
+        for block_config in all_block_configs:
+            all_strats = []
+
+            if self.use_cascading:
+                all_strats.extend(self.search_ifm_streaming_partial(ps, block_config))
+
+            all_strats.extend(ifm_input_search_results)
+
+            rewrite_list = []
+            sram_used = base_sram_used
+
+            metrics = npu_performance.performance_metrics_for_pass(
+                self.arch,
+                ps,
+                block_config,
+                rewrite_list=rewrite_list,
+                force_outputs_to_fast_storage=force_outputs_to_fast_storage,
+            )
+
+            res.extend(
+                self.append_sram_pass_block_config_performance_metrics_rewrite_list(
+                    sram_used, ps, block_config, metrics, rewrite_list, all_strats
+                )
+            )
+
+        self.n_combinations_searched += len(res)
+        res = self.filter_pareto_frontier(res, remove_equally_good_candidates=True)
+        return res
+
+    def search_ifm_streaming_partial(self, ps, block_config):
+        if ps.placement != PassPlacement.Npu:
+            return ABORT_SEARCH
+
+        if len(ps.inputs) < 1:
+            return ABORT_SEARCH
+
+        ifm_tensor = ps.ifm_tensor
+
+        if ifm_tensor is None:
+            return ABORT_SEARCH
+        if ifm_tensor.purpose != TensorPurpose.FeatureMap:
+            return ABORT_SEARCH
+        if not ifm_tensor.storage_shape or len(ifm_tensor.storage_shape) != 4:
+            return ABORT_SEARCH
+
+        pred_pass_list = []
+        for pred_candidate in ps.dag_predecessors:
+            if len(pred_candidate.outputs) == 1 and pred_candidate.outputs[0] == ifm_tensor:
+                # we found a predecessor that produces this IFM tensor
+                if len(pred_candidate.successors) == 1 and pred_candidate.successors[0] == ps:
+                    # and it only has one successor, namely us
+                    if pred_candidate.placement == PassPlacement.Npu:
+                        if pred_candidate.npu_block_type in self.ifm_stream_npu_blocks:
+                            # and it is on the Npu and fusable - it's a candidate
+                            pred_pass_list.append(pred_candidate)
+
+        if not pred_pass_list:
+            return ABORT_SEARCH
+
+        all_candidates = []
+        for pred_pass in pred_pass_list:
+            # recurse into the next pass
+            ifm_strat_data = self.search_ifm_streaming_body(pred_pass, self.feature_maps_not_in_fast_storage)
+
+            strat_data = self.search_all_but_one_predecessor(ps, pred_pass, ifm_strat_data)
+            for strat_opt in strat_data:
+
+                pred_pass_block_config = strat_opt[0].block_configs[-1]
+                rolling_buffer_dims = npu_performance.rolling_buffer_dims_from_passes(
+                    self.arch, pred_pass, pred_pass_block_config, ps, block_config
+                )
+                if rolling_buffer_dims is None:
+                    continue  # this does not pack properly, skip it.
+
+                sram_used = 0
+                for tens in ps.inputs:
+                    if tens != ifm_tensor:
+                        if tens.mem_area == self.mem_area:
+                            sram_used += tens.storage_size()
+
+                rolling_buffer_y, rolling_buffer_x = rolling_buffer_dims
+
+                rewrite_list = [
+                    (
+                        SchedulerRewrite.ChangeTensorSubPurpose,
+                        ifm_tensor,
+                        TensorSubPurpose.RollingBufferY,
+                        rolling_buffer_y,
+                        None,
+                        ps,
+                    )
+                ]
+                sram_used += ifm_tensor.storage_size_for_sub_purpose(
+                    TensorSubPurpose.RollingBufferY, rolling_buffer_y, None
+                )
+
+                all_candidates.extend(self.append_sram_rewrite_list(sram_used, rewrite_list, [strat_opt]))
+
+        self.n_combinations_searched += len(all_candidates)
+        return all_candidates
+
+    def get_block_configs(self, ps):
+        if ps.placement != PassPlacement.Npu:
+            return [(1, 1, 1, 1)]  # default
+
+        block_configs = find_block_configs_suitable_for_pass_and_shared_buffer(self.arch, ps)
+
+        # Take a limited number of the largest blocks
+        if self.arch.block_config_limit > 0:
+            # Sort by block area, followed by depth
+            block_configs.sort(key=lambda cfg: (cfg[0] * cfg[1]) << 8 | cfg[3], reverse=True)
+            bound = min(len(block_configs), self.arch.block_config_limit)
+            # We take 'n' from the fat end of the list, and 'n' from the thin end of the list.
+            tmp = block_configs[:bound]
+            tmp.extend(block_configs[max(bound, len(block_configs) - bound) :])
+            block_configs = tmp
+
+        return block_configs
+
+    def search_ifm_streaming_input(self, ps):
+        sram_used = 0
+        for tens in ps.inputs:
+            if tens.mem_area == self.mem_area:
+                sram_used += tens.storage_size()
+
+        return self.append_sram(sram_used, self.search_predecessors(ps))
+
+    def search_weight_streaming_output(self, ps):
+        strat_data = self.search_weight_streaming_body(ps)
+
+        sram_used = self.non_local_mem_usage[ps.time]
+        for tens in ps.outputs:
+            if tens.mem_area == self.mem_area:
+                sram_used += tens.storage_size()
+
+        return self.graduate_strat(SchedulingStrategy.WeightStream, sram_used, strat_data)
+
+    @lru_cache(maxsize=None)
+    def search_weight_streaming_body(self, ps):
+
+        strat_data = self.search_weight_streaming_input(ps)
+
+        res = []
+
+        all_block_configs = self.get_block_configs(ps)
+
+        for block_config in all_block_configs:
+
+            sram_used = 0
+            rewrite_list = []
+
+            for tens in ps.intermediates:
+                if tens.mem_area == self.mem_area:
+                    if tens.purpose == TensorPurpose.Weights:
+                        sram_used += tens.storage_size_for_sub_purpose(
+                            TensorSubPurpose.DoubleBuffer, block_config[3]
+                        )
+                        rewrite_list.append(
+                            (
+                                SchedulerRewrite.ChangeTensorSubPurpose,
+                                tens,
+                                TensorSubPurpose.DoubleBuffer,
+                                block_config[3],
+                                None,
+                                ps,
+                            )
+                        )
+                    else:
+                        sram_used += tens.storage_size()
+
+            metrics = npu_performance.performance_metrics_for_pass(
+                self.arch, ps, block_config, rewrite_list=rewrite_list
+            )
+
+            res.extend(
+                self.append_sram_pass_block_config_performance_metrics_rewrite_list(
+                    sram_used, ps, block_config, metrics, rewrite_list, strat_data
+                )
+            )
+
+        self.n_combinations_searched += len(res)
+        res = self.filter_pareto_frontier(res, remove_equally_good_candidates=True)
+        return res
+
+    def search_weight_streaming_input(self, ps):
+        sram_used = 0
+        for tens in ps.inputs:
+            if tens.mem_area == self.mem_area:
+                sram_used += tens.storage_size()
+
+        return self.append_sram(sram_used, self.search_predecessors(ps))
+
+    def apply_result(self, strat_set, arch):
+        pass_to_cascaded_pass = dict()
+        for _, strat in strat_set.strats.items():
+            # rewrite the tensors that need this first. e.g. make rolling buffers
+            inputs = []
+            intermediates = []
+            outputs = []
+
+            for ps in strat.passes:
+                inputs += ps.inputs
+                intermediates += ps.intermediates
+                outputs += ps.outputs
+
+            for tens in set(inputs) & set(outputs):
+                # tensors that are in both sets are intermediates
+
+                # find pass with input/output tensor, and check if they are both placed on NPU
+                input_placement = None
+                output_placement = None
+                for ps in strat.passes:
+                    if tens in ps.inputs:
+                        input_placement = ps.placement
+                    if tens in ps.outputs:
+                        output_placement = ps.placement
+                if input_placement == output_placement == PassPlacement.Npu:
+                    tens.set_format(TensorFormat.NHCWB16, arch)
+
+                intermediates.append(tens)
+                inputs.remove(tens)
+                outputs.remove(tens)
+
+            for rewrite_op, tens, sub_purpose, param_a, param_b, ps in strat.rewrite_list:
+                if rewrite_op == SchedulerRewrite.ChangeTensorSubPurpose:
+                    tens.mem_area = self.arch.fast_storage_mem_area
+                    tens.set_new_sub_purpose(sub_purpose, param_a, param_b)
+                else:
+                    assert 0, "unknown rewrite_op " + str(rewrite_op)
+
+            is_element_wise = True
+            for ps in strat.passes:
+                assert ps.placement == strat.passes[0].placement
+                if not ps.is_element_wise:
+                    is_element_wise = False
+                    break
+
+            cascaded_pass = CascadedPass(
+                strat.passes[0].name,
+                strat.strat,
+                inputs,
+                intermediates,
+                outputs,
+                strat.passes,
+                strat.passes[0].placement,
+                is_element_wise,
+            )
+            assert strat.sram_used >= 0
+            cascaded_pass.sram_used = strat.sram_used
+
+            for idx, ps in enumerate(strat.passes):
+                assert ps not in pass_to_cascaded_pass
+                pass_to_cascaded_pass[ps] = cascaded_pass
+                ps.cascade = cascaded_pass
+                ps.block_config = strat.block_configs[idx]
+
+                if ps.placement == PassPlacement.Npu:
+                    ps.shared_buffer = shared_buffer_allocation_for_pass_and_block_config(
+                        self.arch, ps, ps.block_config
+                    )
+                    assert ps.shared_buffer is not None
+
+                for op in ps.ops:
+                    subgraph = op.attrs.get("subgraph")
+                    if subgraph:
+                        subgraph.base_sram_used = cascaded_pass.sram_used
+
+        # all passes should have a cascaded pass now
+        if len(pass_to_cascaded_pass) != len(self.sg.passes):
+            print(
+                "mismatch: we have %d passes, but only %d have cascaded passes associated"
+                % (len(self.sg.passes), len(pass_to_cascaded_pass))
+            )
+            for ps in self.sg.passes:
+                if ps not in pass_to_cascaded_pass:
+                    print("%3d pass missing cascaded pass %s" % (ps.time, ps))
+
+            assert len(pass_to_cascaded_pass) == len(self.sg.passes)
+        # we have all the passes, but we need to put them in order and build predecessor/successor links.
+
+        visit_pass_set = set()
+        cascaded_passes = []
+
+        def visit_pass(ps):
+            if ps in visit_pass_set:
+                return
+            visit_pass_set.add(ps)
+
+            cps = ps.cascade
+            dont_traverse = set(cps.passes)
+
+            for ps in cps.passes:
+                for pred in ps.predecessors:
+                    if pred in dont_traverse:
+                        continue
+                    visit_pass(pred)
+
+            cascaded_passes.append(cps)
+
+        starting_passes = [ps for ps in self.sg.passes if not ps.successors]
+        for ps in starting_passes:
+            visit_pass(ps)
+
+        # reorder so startup init cascaded passes come first
+        def is_startup_cascaded_pass(cps):
+            if not cps.passes:
+                return False
+            return cps.placement == PassPlacement.StartupInit
+
+        cascaded_passes = [cps for cps in cascaded_passes if is_startup_cascaded_pass(cps)] + [
+            cps for cps in cascaded_passes if not is_startup_cascaded_pass(cps)
+        ]
+
+        self.sg.cascaded_passes = cascaded_passes
+        self.sg.build_cascaded_pass_links()
+
+
+def schedule_passes(nng, arch, options: SchedulerOptions):
+
+    for sg in nng.subgraphs:
+        sg.base_sram_used = 0
+
+    for sg in nng.subgraphs:
+        # re-entering the same nodes from different contexts requires us to
+        # build a simplified directed acyclic graph (DAG) version of it to
+        # use for traversal, rather than using a visit dictionary. this avoids
+        # recursing infinitely due to loops.
+        sg.build_pass_dag_predecessors()
+
+        dps = DynamicProgrammingScheduler(nng, sg, arch, arch.sram_size, options)
+
+        strat_set = dps.search()
+
+        dps.apply_result(strat_set, arch)
+
+        if options.verbose_schedule:
+            sg.print_cascaded_passes()
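+
+
+# Illustrative only: a driver is expected to invoke the scheduler roughly as
+#
+#   options = SchedulerOptions(use_cascading=True, use_ifm_streaming=True)
+#   schedule_passes(nng, arch, options)
+#
+# which leaves each subgraph with cascaded_passes and per-pass block configs filled in
+# (see DynamicProgrammingScheduler.apply_result above).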
diff --git a/ethosu/vela/shared_buffer_allocation.py b/ethosu/vela/shared_buffer_allocation.py
new file mode 100644
index 0000000..b5408d1
--- /dev/null
+++ b/ethosu/vela/shared_buffer_allocation.py
@@ -0,0 +1,199 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Shared buffer allocation works out how to allocate the Ethos-U55 shared buffer for a given pass.
+
+import numpy as np
+from .nn_graph import NpuBlockType
+from .numeric_util import round_up_divide, round_up
+from .architecture_features import Block, Kernel, SHRAMElements, SharedBufferArea, ArchitectureFeatures
+from . import pass_packing
+
+
+class SharedBufferAllocation:
+    def __init__(self, arch, ps):
+        self.arch = arch
+
+        self.bank_locations = np.zeros(SharedBufferArea.Size)
+        self.banks_required = np.zeros(SharedBufferArea.Size)
+
+        ifm_tensor, ifm2_tensor, weight_tensor, ofm_tensor = ps.get_primary_op_ifm_ifm2_weights_ofm()
+
+        strides = (1, 1, 1, 1)
+        dilation = (1, 1, 1, 1)
+        self.kernel = Kernel(1, 1)
+        is_elementwise = ps.npu_block_type == NpuBlockType.ElementWise
+
+        if ps.primary_op:
+            strides = ps.primary_op.attrs.get("strides", strides)
+            dilation = ps.primary_op.attrs.get("dilation", dilation)
+            k_h = 1
+            k_w = 1
+            if weight_tensor:
+                if ps.primary_op.type != "FullyConnectedAct":
+                    k_h = weight_tensor.shape[0]
+                    k_w = weight_tensor.shape[1]
+            else:
+                k_h = ps.primary_op.attrs.get("filter_height", 1)
+                k_w = ps.primary_op.attrs.get("filter_width", 1)
+
+            self.kernel = Kernel(k_w, k_h, strides[2], strides[1], dilation[2], dilation[1])
+
+        self.is_equal_depth_op = is_elementwise or ps.npu_block_type in (
+            NpuBlockType.ConvolutionDepthWise,
+            NpuBlockType.Pooling,
+        )
+        self.strides = strides
+
+        self.use_accumulator_element = SHRAMElements.Acc32
+        if is_elementwise:
+            self.use_ifm_element = SHRAMElements.IFM8_Elementwise
+        else:
+            self.use_ifm_element = SHRAMElements.IFM8
+
+        self.ifm_bits = 0
+        self.ifm_depth = 0
+        if ifm_tensor:
+            self.ifm_bits = ifm_tensor.dtype.size_in_bits()
+            if ifm_tensor.shape == [] and is_elementwise:
+                # Elementwise operator with scalar in ifm, use ifm2 depth
+                self.ifm_depth = ifm2_tensor.shape[-1]
+            else:
+                self.ifm_depth = ifm_tensor.shape[-1]
+            if self.ifm_bits == 16:
+                self.use_accumulator_element = SHRAMElements.Acc40
+                self.use_ifm_element = self.use_ifm_element + 1
+                assert (self.use_ifm_element == SHRAMElements.IFM16) or (
+                    self.use_ifm_element == SHRAMElements.IFM16_Elementwise
+                )
+            else:
+                assert self.ifm_bits == 8, "Unexpected IFM bitdepth"
+
+        self.ifm_block_depth = arch.calc_ifm_block_depth(self.ifm_depth, self.ifm_bits)
+        self.ofm_tensor = ofm_tensor
+
+        self.banks_required[SharedBufferArea.Weights] = arch.shram_reserved_weight_banks
+        self.banks_required[SharedBufferArea.OFM] = arch.shram_reserved_output_banks
+
+    def is_valid(self):
+        # Assign zero-based bank starts (first element remains zero)
+        self.bank_locations[1:] = np.cumsum(self.banks_required)[:-1]
+
+        # Accumulator area is measured from the end of the buffer
+        self.bank_locations[SharedBufferArea.Accumulators] = (
+            self.arch.shram_total_banks - self.banks_required[SharedBufferArea.Accumulators]
+        )
+        ifm_end = self.bank_locations[SharedBufferArea.IFM] + self.banks_required[SharedBufferArea.IFM]
+        return ifm_end <= self.bank_locations[SharedBufferArea.Accumulators]
+
+    def try_block(self, ofm_block: Block):
+        # Get IFM block configuration
+        ifm_block_depth = ofm_block.depth if self.is_equal_depth_op else self.ifm_block_depth
+        ifm_block = self.arch.get_ifm_block_size(ifm_block_depth, ofm_block, self.kernel)
+        ifm_config = self.arch.get_block_config(ifm_block.width, ifm_block.height, ifm_block.depth)
+        if ifm_config is None:
+            return None
+
+        # Get OFM block configuration
+        ofm_config = self.arch.get_block_config(ofm_block.width, ofm_block.height, ofm_block.depth)
+        if ofm_config is None:
+            return None
+
+        # Update bank counts for IFM and Accumulator
+        self.banks_required[SharedBufferArea.IFM] = ifm_config.banks[self.use_ifm_element]
+        self.banks_required[SharedBufferArea.Accumulators] = ofm_config.banks[self.use_accumulator_element]
+
+        # Validating calculates bank layout and returns validity
+        if not self.is_valid():
+            return None
+
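+        # Block config tuple layout: (OFM block height, OFM block width, IFM block depth, OFM block depth)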
+        return (ofm_block.height, ofm_block.width, ifm_block.depth, ofm_block.depth)
+
+    def generate_used_mask(self, active_set):
+        res = np.zeros(self.arch.shram_total_banks, dtype=np.int64)
+        for kind in active_set:
+            start = int(self.bank_locations[kind])
+            end = start + int(self.banks_required[kind])
+            res[start:end] = 1
+        return res
+
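+    # Note: 'first' occupies the usual 'self' position, so this can be called either as
+    # alloc_a.is_compatible(alloc_b) or as SharedBufferAllocation.is_compatible(alloc_a, alloc_b).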
+    def is_compatible(first, second):
+        """See if the bank allocations of two convolutions are compatible,
+        so that they can run back-to-back without a fence in between"""
+
+        first_set = set((SharedBufferArea.OFM, SharedBufferArea.Accumulators))
+        second_set = set((SharedBufferArea.IFM, SharedBufferArea.Weights))
+
+        first_mask = first.generate_used_mask(first_set)
+        second_mask = second.generate_used_mask(second_set)
+
+        if np.sum(first_mask & second_mask):
+            # overlap
+            return False
+
+        return True
+
+
+def shared_buffer_allocation_for_pass_and_block_config(arch, ps, block_config):
+    alloc = SharedBufferAllocation(arch, ps)
+    assert (alloc.ifm_block_depth == block_config[2]) or alloc.is_equal_depth_op
+    if alloc.try_block(Block(block_config[1], block_config[0], block_config[3])):
+        return alloc
+
+    return None
+
+
+def find_block_configs_suitable_for_pass_and_shared_buffer(arch, ps):
+    alloc = SharedBufferAllocation(arch, ps)
+
+    if arch.override_block_config:
+        config = alloc.try_block(arch.override_block_config)
+        assert config, "Block config override cannot be used"
+        return [config]
+
+    # Constrain the search space if the OFM is smaller than the max block size
+    # - Add other block search constraints here if required
+    if len(alloc.ofm_tensor.shape) == 2:
+        max_block_height = max_block_width = alloc.ofm_tensor.shape[0]
+    else:
+        max_block_width = alloc.ofm_tensor.shape[-2]
+        max_block_height = alloc.ofm_tensor.shape[-3]
+
+    # Common block depth
+    max_block_depth = alloc.ofm_tensor.shape[-1]
+
+    # Constrain to valid ranges before search
+    max_block_width = min(arch.ofm_block_max.width, max_block_width)
+    max_block_height = min(arch.ofm_block_max.height, max_block_height)
+    max_block_depth = min(arch.ofm_block_max.depth, max_block_depth)
+
+    valid_block_configs = []
+    # Try a range of block shapes against this pass
+    for w in range(arch.ofm_ublock.width, max_block_width + arch.ofm_ublock.width, arch.ofm_ublock.width):
+        for h in range(arch.ofm_ublock.height, max_block_height + arch.ofm_ublock.height, arch.ofm_ublock.height):
+            # Try valid OFM block depths
+            for c in range(arch.ofm_ublock.depth, max_block_depth + arch.ofm_ublock.depth, arch.ofm_ublock.depth):
+                # OFM block depth has the constraint that if it causes the OFM to be
+                # split, it must be a multiple of the OFM split size
+                if (c >= max_block_depth) or (c < max_block_depth and (c % ArchitectureFeatures.OFMSplitDepth) == 0):
+                    config = alloc.try_block(Block(w, h, c))
+                    if config:
+                        valid_block_configs.append(config)
+
+    assert len(valid_block_configs) > 0
+    return valid_block_configs
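+
+
+# Illustrative only: typical usage is to enumerate candidate block configs for a pass and
+# later re-derive the allocation for the chosen config, e.g.
+#
+#   configs = find_block_configs_suitable_for_pass_and_shared_buffer(arch, ps)
+#   alloc = shared_buffer_allocation_for_pass_and_block_config(arch, ps, configs[0])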
diff --git a/ethosu/vela/stats_writer.py b/ethosu/vela/stats_writer.py
new file mode 100644
index 0000000..c4b4cd9
--- /dev/null
+++ b/ethosu/vela/stats_writer.py
@@ -0,0 +1,367 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Writes out per-pass and summary performance statistics to CSV files.
+
+import numpy as np
+from .nn_graph import MemArea, TensorPurpose, PassPlacement
+from .npu_performance import PassCycles, MacCount, BandwidthDirection
+import csv
+from .numeric_util import round_up_to_int
+import sys
+
+
+def write_summary_metrics_csv(nng, summary_filename, arch):
+    with open(summary_filename, "w") as f:
+        writer = csv.writer(f)
+
+        labels = [
+            "experiment",
+            "network",
+        ]
+
+        labels += (
+            ["accelerator_configuration", "system_config", "npu_clock", "sram_size"]
+            + [area.identifier_name() + "_bandwidth" for area in MemArea.all()]
+            + ["weights_storage_area", "feature_map_storage_area"]
+        )
+
+        labels += [
+            "inferences_per_second",
+            "batch_size",
+            "inference_time",
+            "passes_before_fusing",
+            "passes_after_fusing",
+        ]
+        labels += [area.identifier_name() + "_memory_used" for area in MemArea.all()]
+        labels += ["on_chip_flash_bits_per_element", "off_chip_flash_bits_per_element"]
+
+        for mem_area in MemArea.all():
+            labels += [
+                mem_area.identifier_name() + "_feature_map_read_bytes",
+                mem_area.identifier_name() + "_feature_map_write_bytes",
+                mem_area.identifier_name() + "_weight_read_bytes",
+                mem_area.identifier_name() + "_weight_write_bytes",
+                mem_area.identifier_name() + "_total_bytes",
+            ]
+
+        labels += ["nn_macs", "hardware_macs", "nn_tops", "hardware_tops"]
+
+        labels += ["cycles_" + kind.identifier_name() for kind in PassCycles.all()]
+
+        writer.writerow(labels)
+
+        data_items = [
+            "default",
+            nng.name,
+        ]
+
+        if arch:
+            data_items += (
+                [arch.accelerator_config, arch.system_config, arch.npu_clock, arch.sram_size / 1024]
+                + [arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000 for mem_area in MemArea.all()]
+                + [
+                    arch.tensor_storage_mem_area[TensorPurpose.Weights].display_name(),
+                    arch.tensor_storage_mem_area[TensorPurpose.FeatureMap].display_name(),
+                ]
+            )
+
+        midpoint_inference_time = nng.cycles[PassCycles.Total] / arch.npu_clock
+        midpoint_fps = 1 / midpoint_inference_time
+
+        n_passes = sum(len(sg.passes) for sg in nng.subgraphs)
+        n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs)
+
+        data_items += [midpoint_fps, nng.batch_size, midpoint_inference_time, n_passes, n_cascaded_passes]
+        data_items += [nng.memory_used.get(mem_area, 0) / 1024.0 for mem_area in MemArea.all()]
+
+        data_items += [
+            nng.bits_per_element.get(MemArea.OnChipFlash, 0.0),
+            nng.bits_per_element.get(MemArea.OffChipFlash, 0.0),
+        ]
+
+        for mem_area in MemArea.all():
+            bws = nng.bandwidths[mem_area]
+            total_bw = np.sum(bws)
+            weight_bws = bws[TensorPurpose.Weights]
+            fm_bws = bws[TensorPurpose.FeatureMap]
+            data_items += [
+                fm_bws[BandwidthDirection.Read],
+                fm_bws[BandwidthDirection.Write],
+                weight_bws[BandwidthDirection.Read],
+                weight_bws[BandwidthDirection.Write],
+                total_bw,
+            ]
+
+        data_items += [
+            nng.macs[MacCount.NeuralNetworkMacs],
+            nng.macs[MacCount.HardwareMacs],
+            nng.macs[MacCount.NeuralNetworkMacs] * 2 * midpoint_fps / 1e12,
+            nng.macs[MacCount.HardwareMacs] * 2 * midpoint_fps / 1e12,
+        ]
+
+        data_items += [nng.cycles[kind] for kind in PassCycles.all()]
+
+        writer.writerow(data_items)
+
+
+def write_pass_metrics_csv(nng, pass_filename):
+
+    with open(pass_filename, "w") as f:
+        writer = csv.writer(f)
+
+        purpose_list = (
+            ("total", (TensorPurpose.Weights, TensorPurpose.FeatureMap)),
+            ("weights", (TensorPurpose.Weights,)),
+            ("feature_map", (TensorPurpose.FeatureMap,)),
+        )
+
+        direction_list = (
+            ("total", (BandwidthDirection.Read, BandwidthDirection.Write)),
+            ("read", (BandwidthDirection.Read,)),
+            ("write", (BandwidthDirection.Write,)),
+        )
+        bandwidth_names = []
+        bandwidth_indices = []
+        for mem_area in MemArea.all():
+            for purpose, purpose_candidates in purpose_list:
+                for direction, direction_candidates in direction_list:
+                    label = "bytes_%s_%s_%s" % (mem_area.identifier_name(), purpose, direction)
+                    bandwidth_names.append(label)
+                    bandwidth_indices.append((mem_area, purpose_candidates, direction_candidates))
+
+        all_macs = MacCount.all()
+        all_cycles = (
+            PassCycles.Total,
+            PassCycles.Dpu,
+            PassCycles.ElementWise,
+            PassCycles.Cpu,
+            PassCycles.SramAccess,
+            PassCycles.DramAccess,
+            PassCycles.OnChipFlashAccess,
+            PassCycles.OffChipFlashAccess,
+        )
+        writer.writerow(
+            [
+                "name",
+                "operators",
+                "placement",
+                "streaming_strategy",
+                "block_config_height",
+                "block_config_width",
+                "block_config_input_channels",
+                "block_config_output_channels",
+                "n_blocks_in_pass",
+            ]
+            + ["cycles_" + v.identifier_name() for v in all_cycles]
+            + [v.identifier_name() for v in all_macs]
+            + bandwidth_names
+            + ["sram_used"]
+        )
+
+        def write_subgraph(sg):
+            for cps in sg.cascaded_passes:
+                if cps.placement == PassPlacement.StartupInit:
+                    continue  # skip the dummy init pass
+
+                for ps in cps.passes:
+                    if len(ps.ops) == 1 and ps.ops[0].type == "NpuOp":
+                        # just treat this as a call, unroll it
+                        write_subgraph(ps.ops[0].attrs["subgraph"])
+                        continue
+                    stats = [ps.name, " ".join(op.type for op in ps.ops)]
+                    stats += [ps.placement.name]
+                    stats += [cps.strategy.name]
+                    stats += list(ps.block_config)
+                    stats += [ps.n_blocks]
+                    stats += [round_up_to_int(ps.cycles[v]) for v in all_cycles]
+                    stats += [round_up_to_int(ps.macs[v]) for v in all_macs]
+                    for indices in bandwidth_indices:
+                        res = 0
+                        i = indices[0]
+                        for j in indices[1]:
+                            for k in indices[2]:
+                                res += round_up_to_int(ps.bandwidths[i, j, k])
+                        stats.append(res)
+                    stats += [ps.sram_used]
+
+                    writer.writerow(stats)
+
+        write_subgraph(nng.get_root_subgraph())
+
+
+def print_performance_metrics_for_strat(
+    arch,
+    name,
+    cycles,
+    macs,
+    bandwidths,
+    batch_size,
+    memory_used,
+    num_passes,
+    num_cascaded_passes,
+    n_operations=0,
+    cpu_operations=[],
+    bits_per_element=None,
+    show_cpu_operations=False,
+    f=sys.stdout,
+):
+
+    orig_mem_areas_labels = [(v, v.display_name()) for v in MemArea.all()]
+
+    midpoint_inference_time = cycles[PassCycles.Total] / arch.npu_clock
+    midpoint_fps = 1 / midpoint_inference_time
+
+    mem_area_labels = [
+        (mem_area, label) for mem_area, label in orig_mem_areas_labels if np.sum(bandwidths[mem_area]) > 0
+    ]
+
+    if name:
+        print("", file=f)
+        print("Network summary for", name, file=f)
+    print("Accelerator configuration        %20s" % (arch.accelerator_config,), file=f)
+    print("System configuration             %20s" % (arch.system_config,), file=f)
+    print("Accelerator clock                        %12d MHz" % (arch.npu_clock / 1e6,), file=f)
+    for mem_area, label in mem_area_labels:
+        print(
+            "Design peak %-25s    %12.2f GB/s"
+            % (label + " bandwidth", arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000,),
+            file=f,
+        )
+
+    print(file=f)
+    for mem_area, label in mem_area_labels:
+        if mem_area not in memory_used:
+            continue
+
+        aug_label = label + " used"
+
+        extra = ""
+        if (mem_area == MemArea.OnChipFlash or mem_area == MemArea.OffChipFlash) and bits_per_element is not None:
+            extra = " (%.2f bits per element)" % (bits_per_element[mem_area],)
+
+        print("Total %-25s          %12.2f KiB%s" % (aug_label, memory_used[mem_area] / 1024.0, extra), file=f)
+
+    print(file=f)
+    print("%d passes fused into %d" % (num_passes, num_cascaded_passes), file=f)
+
+    n_cpu_operations = len(cpu_operations)
+    if n_operations > 0:
+        print(
+            "%d/%d (%4.1f %%) operations falling back to the CPU"
+            % (n_cpu_operations, n_operations, n_cpu_operations / n_operations * 100),
+            file=f,
+        )
+
+    if show_cpu_operations:
+        for op in cpu_operations:
+
+            def format_tens_list(lst):
+                return " ".join(str(list(tens.shape)) for tens in lst)
+
+            print(
+                "CPU operation: %s, inputs %s, outputs %s"
+                % (op.type, format_tens_list(op.inputs), format_tens_list(op.outputs)),
+                file=f,
+            )
+
+        print("", file=f)
+
+    for mem_area, label in mem_area_labels:
+        bws = bandwidths[mem_area]
+        total_bw = np.sum(bws)
+        weight_bws = bws[TensorPurpose.Weights]
+        fm_bws = bws[TensorPurpose.FeatureMap]
+        aug_label = label + " bandwidth"
+        print(
+            "Average %-25s        %12.2f GB/s" % (aug_label, total_bw * midpoint_fps / 1000.0 / 1000.0 / 1000.0,),
+            file=f,
+        )
+        print(
+            "Input   %-25s        %12.2f MB/batch"
+            % (aug_label, np.sum(fm_bws[BandwidthDirection.Read]) / 1000.0 / 1000.0,),
+            file=f,
+        )
+        print("Weight  %-25s        %12.2f MB/batch" % (aug_label, np.sum(weight_bws) / 1000.0 / 1000.0,), file=f)
+        print(
+            "Output  %-25s        %12.2f MB/batch"
+            % (aug_label, np.sum(fm_bws[BandwidthDirection.Write]) / 1000.0 / 1000.0,),
+            file=f,
+        )
+        print("Total   %-25s        %12.2f MB/batch" % (aug_label, total_bw / 1000.0 / 1000.0,), file=f)
+        print(
+            "Total   %-25s per input %9.2f MB/inference (batch size %d)"
+            % (aug_label, total_bw / 1000.0 / 1000.0 / batch_size, batch_size),
+            file=f,
+        )
+        print(file=f)
+
+    print("Neural network macs                      %12d MACs/batch" % (macs[MacCount.NeuralNetworkMacs],), file=f)
+    print("Hardware macs                            %12d MACs/batch" % (macs[MacCount.HardwareMacs],), file=f)
+    print(
+        "Network Tops/s                           %12.2f Tops/s"
+        % (macs[MacCount.NeuralNetworkMacs] * 2 * midpoint_fps / 1e12),
+        file=f,
+    )
+    print(
+        "Hardware Tops/s                          %12.2f Tops/s"
+        % (macs[MacCount.HardwareMacs] * 2 * midpoint_fps / 1e12),
+        file=f,
+    )
+    print(file=f)
+
+    for kind in PassCycles.all():
+        aug_label = kind.display_name() + " cycles"
+        cyc = cycles[kind]
+        print("%-30s           %12d cycles/batch" % (aug_label, cyc,), file=f)
+    print(file=f)
+
+    print(
+        "Batch Inference time              %7.2f ms, %7.2f inferences/s (batch size %d)"
+        % (midpoint_inference_time * 1000, midpoint_fps, batch_size),
+        file=f,
+    )
+    print(file=f)
+
+
+def print_performance_metrics(nng, arch, show_cpu_operations=False, f=sys.stdout):
+    n_passes = sum(len(sg.passes) for sg in nng.subgraphs)
+    n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs)
+    n_operations = sum(len(ps.ops) for sg in nng.subgraphs for ps in sg.passes)
+    cpu_operations = sum((ps.ops for sg in nng.subgraphs for ps in sg.passes if ps.placement == PassPlacement.Cpu), [])
+    return print_performance_metrics_for_strat(
+        arch,
+        nng.name,
+        nng.cycles,
+        nng.macs,
+        nng.bandwidths,
+        nng.batch_size,
+        nng.memory_used,
+        n_passes,
+        n_cascaded_passes,
+        n_operations,
+        cpu_operations,
+        nng.bits_per_element,
+        show_cpu_operations,
+        f,
+    )
+
+
+def write_human_friendly_metrics(nng, arch, filename):
+    with open(filename, "w") as f:
+        print_performance_metrics(nng, arch, f=f)
diff --git a/ethosu/vela/supported_operators.py b/ethosu/vela/supported_operators.py
new file mode 100644
index 0000000..23135f8
--- /dev/null
+++ b/ethosu/vela/supported_operators.py
@@ -0,0 +1,243 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# The SupportedOperators class: the operators that can run on the NPU and the parameter checks applied to them.
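+#
+# A minimal, illustrative usage sketch (the calling code is an assumption, not part of this module):
+#
+#     checker = SupportedOperators()
+#     placement = "NPU" if checker.is_operator_supported(op) else "CPU"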
+
+from .data_type import BaseType
+
+
+class SupportedOperators:
+    def __init__(self):
+        # Categorised lists of supported operators
+        self.npu_pre_ops = set(("QuantizedResizeBilinear", "SplitSliceRead"))
+        self.convolution_ops = set(("Conv2DBiasAct", "Conv2D", "QuantizedConv2D", "Conv2DBackpropInputSwitched"))
+        self.depthwise_convolution_ops = set(
+            ("DepthwiseConv2dBiasAct", "DepthwiseConv2dNative", "QuantizedDepthwiseConv2D")
+        )
+        self.max_pooling_ops = set(("QuantizedMaxPool", "MaxPool", "MaxPoolAct"))
+        self.avg_pooling_ops = set(("QuantizedAvgPool", "AvgPool", "AvgPoolAct"))
+        self.pooling_ops = self.max_pooling_ops | self.avg_pooling_ops
+        self.fc_vector_products = set(("QuantizedMatMul", "MatMul", "FullyConnectedAct"))
+        self.mac_main_ops = (
+            # convolutions
+            self.convolution_ops
+            # depth-wise convolutions
+            | self.depthwise_convolution_ops
+            # pooling
+            | self.pooling_ops
+            # FC layers
+            | self.fc_vector_products
+            # RNN/LSTM/GRU
+            | set(("BlockLSTM"))
+        )
+        self.elem_wise_main_ops = set(
+            (
+                # element-wise
+                "AddAct",
+                "MulAct",
+                "SubAct",
+                "QuantizedAdd",
+                "QuantizedSub",
+                "QuantizedMul",
+                "Mul",
+                "Add",
+                "Sub",
+                "Minimum",
+                "Maximum",
+            )
+        )
+        self.activation_ops = set(
+            ("QuantizedRelu", "QuantizedRelu1", "QuantizedRelu6", "Relu", "Relu6", "ReluN1To1", "Sigmoid", "Tanh")
+        )
+        self.npu_post_ops = (
+            # activation functions
+            self.activation_ops
+            # concatenation write direction
+            | set(("ConcatSliceWrite"))
+            # bias add and batch norm
+            | set(("QuantizedBiasAdd", "Requantize", "QuantizedBatchNorm", "BiasAdd", "FusedBatchNorm"))
+        )
+        self.split_ops = set(("Split", "StridedSlice", "Slice", "UnpackReshaped", "Unpack"))
+        self.concat_ops = set(("Concat", "ConcatV2", "QuantizedConcat", "ConcatTFLite", "PackReshaped", "Pack"))
+        self.memory_only_ops = (
+            set(("Squeeze", "Reshape", "QuantizedReshape", "ExpandDims")) | self.concat_ops | self.split_ops
+        )
+        self.supported_fused_activations = set(("Relu", "Relu6", "ReluN1To1", "Tanh", "Sigmoid"))
+        self.supported_operators = (
+            self.npu_pre_ops | self.mac_main_ops | self.elem_wise_main_ops | self.npu_post_ops | self.memory_only_ops
+        )
+        # Setup supported operator restriction checkers
+        self.supported_operator_restrictions = {}
+        self.supported_operator_restrictions.update(
+            {op: self.check_convolution_restrictions for op in self.convolution_ops}
+        )
+        self.supported_operator_restrictions.update(
+            {op: self.check_depthwise_convolution_restrictions for op in self.depthwise_convolution_ops}
+        )
+        self.supported_operator_restrictions.update({op: self.check_pooling_restrictions for op in self.pooling_ops})
+        self.supported_operator_restrictions.update(
+            {op: self.check_vector_product_restrictions for op in self.fc_vector_products}
+        )
+        self.supported_operator_restrictions.update(
+            {op: self.check_element_wise_restrictions for op in self.elem_wise_main_ops}
+        )
+        self.supported_operator_restrictions.update(
+            {op: self.check_memory_only_restrictions for op in self.memory_only_ops}
+        )
+
+    def is_operator_supported(self, op):
+        if op.type not in self.supported_operators:
+            return False
+        if not self.check_generic_restrictions(op):
+            return False
+        if op.type in self.supported_operator_restrictions:
+            return self.supported_operator_restrictions[op.type](op)
+        return True
+
+    def check_generic_restrictions(self, op):
+        # check fully defined shapes
+        for t in op.inputs + op.outputs:
+            if not t.has_fully_defined_shape():
+                print("Warning:", op, "has inputs/outputs of undefined shape, placing on CPU")
+                return False
+
+        # check data type
+        tensors = [t for t in op.get_ifm_ifm2_weights_ofm() if t is not None]
+        if not tensors:
+            tensors = op.inputs
+        for t in tensors:
+            if not (t.dtype.type & BaseType.Int):
+                return False
+            if t.element_size() > 2 and op.type != "Requantize":
+                return False
+            # check size
+            if any(dim > 65536 for dim in t.shape):
+                return False
+
+        # check fused activations
+        if (
+            "fused_activation_function" in op.attrs
+            and op.attrs["fused_activation_function"] is not None
+            and op.attrs["fused_activation_function"] not in self.supported_fused_activations
+        ):
+            return False
+        return True
+
+    def check_convolution_restrictions(self, op):
+        # check stride
+        if op.attrs["stride_w"] > 2 or op.attrs["stride_h"] > 2:
+            return False
+
+        # check dilation
+        dilation_w_factor = op.attrs.get("dilation_w_factor", 1)
+        dilation_h_factor = op.attrs.get("dilation_h_factor", 1)
+        if dilation_w_factor > 2 or dilation_h_factor > 2:
+            return False
+
+        # check data type
+        ifm_tensor, _, weight_tensor, _ = op.get_ifm_ifm2_weights_ofm()
+        if weight_tensor.element_size() > 1:
+            return False
+
+        # check kernel size
+        dilated_weight_w = weight_tensor.shape[0] + (weight_tensor.shape[0] - 1) * (dilation_w_factor - 1)
+        dilated_weight_h = weight_tensor.shape[1] + (weight_tensor.shape[1] - 1) * (dilation_h_factor - 1)
+        if (
+            dilated_weight_w > 64
+            or dilated_weight_h > 64
+            or dilated_weight_w * dilated_weight_h * weight_tensor.shape[2] > 127 * 65536
+        ):
+            return False
+
+        # check batch size
+        if ifm_tensor.shape[0] != 1:
+            return False
+        return True
+
+    def check_depthwise_convolution_restrictions(self, op):
+        # check depth
+        ifm_tensor, _, _, ofm_tensor = op.get_ifm_ifm2_weights_ofm()
+        if op.attrs["depth_multiplier"] > 1 and not (
+            (ifm_tensor.shape[3] == 1) and (ofm_tensor.shape[3] == op.attrs["depth_multiplier"])
+        ):
+            return False
+        return self.check_convolution_restrictions(op)
+
+    def check_pooling_restrictions(self, op):
+        # check stride
+        if op.attrs["stride_w"] > 2 or op.attrs["stride_h"] > 2:
+            return False
+
+        # check data type
+        ifm_tensor, _, _, ofm_tensor = op.get_ifm_ifm2_weights_ofm()
+        if ifm_tensor.dtype != ofm_tensor.dtype:
+            return False
+
+        # check batch size
+        if ifm_tensor.shape[0] != 1:
+            return False
+
+        if op.type in self.avg_pooling_ops:
+            # check kernel size
+            if op.attrs["padding"] == b"SAME" and (op.attrs["filter_width"] > 8 or op.attrs["filter_height"] > 8):
+                return False
+            if op.attrs["padding"] == b"VALID" and (op.attrs["filter_width"] > 256 or op.attrs["filter_height"] > 256):
+                return False
+
+        if op.type in self.max_pooling_ops:
+            # check data type
+            if not ifm_tensor.dtype == ofm_tensor.dtype:
+                return False
+            # check kernel size
+            if op.attrs["filter_width"] > 256 or op.attrs["filter_height"] > 256:  # any padding
+                return False
+        return True
+
+    def check_vector_product_restrictions(self, op):
+        # check data type
+        ifm_tensor, _, weight_tensor, _ = op.get_ifm_ifm2_weights_ofm()
+        if weight_tensor.element_size() > 1:
+            return False
+
+        return True
+
+    def check_element_wise_restrictions(self, op):
+        # check data type
+        ifm_tensor, ifm2_tensor, _, ofm_tensor = op.get_ifm_ifm2_weights_ofm()
+        if op.type in ("Minimum", "Maximum") and ifm_tensor.dtype != ofm_tensor.dtype:
+            return False
+
+        # check batch size
+        if (len(ifm_tensor.shape) > 2 and ifm_tensor.shape[0] != 1) or (
+            len(ifm2_tensor.shape) > 2 and ifm2_tensor.shape[0] != 1
+        ):
+            return False
+
+        # check scalar size
+        if (hasattr(ifm_tensor.values, "__len__") and len(ifm_tensor.values) > 1) or (
+            hasattr(ifm2_tensor.values, "__len__") and len(ifm2_tensor.values) > 1
+        ):
+            return False
+        return True
+
+    def check_memory_only_restrictions(self, op):
+        # check stride size
+        if op.type == "StridedSlice":
+            if len(op.inputs) > 3 and any(stride != 1 for stride in op.inputs[3].values):
+                return False
+        return True
diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py
new file mode 100644
index 0000000..46040a4
--- /dev/null
+++ b/ethosu/vela/tensor.py
@@ -0,0 +1,629 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Internal representation of a Neural Network Tensor.
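+#
+# A minimal, illustrative sketch of typical use (the calling code and the uint8 data type attribute are assumptions):
+#
+#     tens = Tensor([1, 8, 8, 16], data_type.DataType.uint8, "conv1/ofm")
+#     tens.purpose = TensorPurpose.FeatureMap
+#     tens.set_format(TensorFormat.NHWC, arch)  # rounds storage/bandwidth shapes to the format's quantum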
+
+import enum
+from . import numeric_util
+import numpy as np
+from . import data_type
+import uuid
+from .range_set import MemoryRangeSet
+from .numeric_util import round_up_divide
+
+
+class MemArea(enum.IntFlag):
+    Unknown = 0
+    Sram = 1
+    Dram = 2
+    OnChipFlash = 3
+    OffChipFlash = 4
+    Size = OffChipFlash + 1
+
+    def display_name(self):
+        return ("Unknown", "SRAM", "DRAM", "On-chip Flash", "Off-chip Flash", "Size")[self.value]
+
+    def identifier_name(self):
+        return ("unknown", "sram", "dram", "on_chip_flash", "off_chip_flash", "size")[self.value]
+
+    def all():
+        return (MemArea.Sram, MemArea.Dram, MemArea.OnChipFlash, MemArea.OffChipFlash)
+
+    def __str__(self):
+        return self.name
+
+
+class TensorPurpose(enum.IntFlag):
+    Unknown = 0
+    Weights = 1
+    FeatureMap = 2
+    Scratch = 3
+    Size = 4
+
+    def display_name(self):
+        return ("Unknown", "Weights", "FeatureMap", "Scratch", "Size")[self.value]
+
+    def identifier_name(self):
+        return ("unknown", "weights", "feature_map", "scratch", "size")[self.value]
+
+    def all():
+        return (TensorPurpose.Weights, TensorPurpose.FeatureMap)
+
+
+class TensorSubPurpose(enum.Enum):
+    Standard = 0
+    DoubleBuffer = 1
+    RollingBufferX = 2
+    RollingBufferY = 3
+    RollingBufferXY = 4
+
+    def display_name(self):
+        return ("Standard", "Double Buffer", "Rolling Buffer X", "Rolling Buffer Y", "Rolling Buffer XY")[self.value]
+
+    def identifier_name(self):
+        return ("standard", "double_buffer", "rolling_buffer_x", "rolling_buffer_y", "rolling_buffer_xy")[self.value]
+
+    def all():
+        return (
+            TensorSubPurpose.Standard,
+            TensorSubPurpose.DoubleBuffer,
+            TensorSubPurpose.RollingBufferX,
+            TensorSubPurpose.RollingBufferY,
+            TensorSubPurpose.RollingBufferXY,
+        )
+
+
+class TensorFormat(enum.Flag):
+    Unknown = 0
+    WeightsCompressed = 1
+    NHWC = 2
+    NHCWB16 = 3
+
+    def __str__(self):
+        return self.name
+
+
+class TensorBlockTraversal(enum.Enum):
+    Default = 0
+    DepthWise = 1
+    DepthFirst = 2
+    PartKernelFirst = 3
+
+
+def shape_num_elements(shp):
+    elems = 1
+    if shp is None:
+        return None
+    for d in shp:
+        if d is None:
+            return None
+        elems *= d
+    return elems
+
+
+def shape_fully_defined(shp):
+    if shp is None:
+        return False
+    for d in shp:
+        if d is None:
+            return False
+    return True
+
+
+def shape_round_to_quantum(shp, quantum):
+    new_shp = list(shp)
+
+    # Traverse backwards using length of shape since there may be more rounding quantums than shape elements
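+    # e.g. shp = [1, 7, 10] with quantum = (1, 1, 1, 16) rounds only the trailing three dims -> [1, 7, 16]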
+    for i in range(-1, -len(shp) - 1, -1):
+        if new_shp[i] is not None:
+            new_shp[i] = numeric_util.round_up(new_shp[i], quantum[i])
+    return new_shp
+
+
+class QuantizationParameters:
+    __slots__ = "min", "max", "num_bits", "narrow_range", "scale_f32", "zero_point", "quant_min", "quant_max"
+
+    def __init__(self, min=None, max=None, num_bits=None, narrow_range=None):
+        self.min = min
+        self.max = max
+
+        self.num_bits = num_bits
+        self.narrow_range = narrow_range
+
+        self.scale_f32 = None
+        self.zero_point = None
+        self.quant_min = None
+        self.quant_max = None
+
+    def __str__(self):
+        return "<nng.QuantizationParameters min=%s max=%s, num_bits=%s, scale=%s, zero_point=%s>" % (
+            self.min,
+            self.max,
+            self.num_bits,
+            self.scale_f32,
+            self.zero_point,
+        )
+
+    __repr__ = __str__
+
+    def clone(self):
+        res = QuantizationParameters()
+        res.min = self.min
+        res.max = self.max
+
+        res.num_bits = self.num_bits
+        res.narrow_range = self.narrow_range
+
+        res.scale_f32 = self.scale_f32
+        res.zero_point = self.zero_point
+        res.quant_min = self.quant_min
+        res.quant_max = self.quant_max
+        return res
+
+    def dequantize(self, values):
+        if self.zero_point.size == 1 and self.scale_f32.size == 1:
+            # same scale is used for all values
+            res = (values.astype(np.float64) - self.zero_point) * self.scale_f32
+        else:
+            # a different scale is used for different sets of values
+            values_as_float = values.astype(np.float64)
+
+            # per-axis dequantisation is not compatible with the format of depthwise weights,
+            # where the input channel is at index 3 (Output, Kh, Kw, Input), so it is bypassed here:
+            # an uninitialised array of the right shape is returned and the per-channel loop below is never reached
+            return np.ndarray((values_as_float.shape))
+
+            shape = values_as_float.shape[0]
+            assert self.zero_point.size == self.scale_f32.size == shape
+            res = np.ndarray(values_as_float.shape)
+            for i in range(shape):
+                res[i] = (values_as_float[i] - self.zero_point[i]) * self.scale_f32[i]
+
+        return res
+
+
+class Tensor:
+    __slots__ = (
+        "shape",
+        "storage_shape",
+        "bandwidth_shape",
+        "dtype",
+        "name",
+        "ops",
+        "consumer_list",
+        "values",
+        "quant_values",
+        "compressed_values",
+        "mem_area",
+        "format",
+        "purpose",
+        "sub_purpose",
+        "alignment",
+        "weight_transpose_depthwise",
+        "storage_compression_scale",
+        "bandwidth_compression_scale",
+        "compression_scale_for_worst_weight_stream",
+        "weight_compression_scales",
+        "weight_compression_config",
+        "storage_rounding_quantum",
+        "brick_size",
+        "address",
+        "quantization",
+        "weight_compressed_offsets",
+        "element_size_bytes",
+        "reshaped",
+        "block_traversal",
+        "offset",
+        "cpu_tensor",
+        "npu_tensor",
+        "equivalence_id",
+    )
+    AllocationQuantum = 16
+
+    def __init__(self, shape, dtype, name):
+        self.shape = shape
+        self.storage_shape = shape
+        self.bandwidth_shape = shape
+        self.dtype = dtype
+        self.name = name
+        self.equivalence_id = uuid.uuid4()
+
+        self.ops = []
+        self.consumer_list = []
+        # Below attributes are only set if a tensor has been cloned,
+        # either from Cpu -> Npu or vice versa. Needed for offline allocation
+        self.cpu_tensor = None  # reference to the corresponding Cpu tensor
+        self.npu_tensor = None  # reference to the corresponding Npu tensor
+
+        self.values = None
+        self.quant_values = None
+        self.compressed_values = None
+        self.mem_area = MemArea.Unknown
+        self.format = TensorFormat.Unknown
+        self.purpose = TensorPurpose.Unknown
+        self.sub_purpose = TensorSubPurpose.Standard
+        self.alignment = Tensor.AllocationQuantum
+        self.weight_transpose_depthwise = False
+
+        self.storage_compression_scale = 1.0
+        self.bandwidth_compression_scale = 1.0
+        self.compression_scale_for_worst_weight_stream = 1.0
+        self.weight_compression_scales = None
+        self.weight_compression_config = None
+        self.weight_compressed_offsets = []
+        self.storage_rounding_quantum = (1, 1, 1, 1)
+        self.brick_size = (1, 1, 1, 1)
+        self.address = 0  # start address of tensor. will be filled in by tensor allocator
+        self.element_size_bytes = 0
+
+        # quantization parameters
+        self.quantization = None
+
+        self.reshaped = False
+        self.block_traversal = TensorBlockTraversal.Default
+
+    def element_size(self):
+        if self.element_size_bytes == 0:
+            return self.dtype.size_in_bits() / 8
+        return self.element_size_bytes
+
+    def clone(self, suffix="_clone"):
+        res = Tensor(self.shape, self.dtype, self.name + suffix)
+        res.storage_shape = list(self.storage_shape)
+        res.bandwidth_shape = list(self.bandwidth_shape)
+
+        res.ops = []
+        res.consumer_list = []
+        res.equivalence_id = self.equivalence_id
+
+        res.values = self.values
+        res.quant_values = self.quant_values
+        res.compressed_values = self.compressed_values
+        res.mem_area = self.mem_area
+        res.format = self.format
+        res.purpose = self.purpose
+        res.sub_purpose = self.sub_purpose
+        res.alignment = self.alignment
+        res.weight_transpose_depthwise = self.weight_transpose_depthwise
+
+        res.storage_compression_scale = self.storage_compression_scale
+        res.bandwidth_compression_scale = self.bandwidth_compression_scale
+        res.compression_scale_for_worst_weight_stream = self.compression_scale_for_worst_weight_stream
+        res.weight_compression_scales = self.weight_compression_scales
+        res.storage_rounding_quantum = self.storage_rounding_quantum
+        res.brick_size = self.brick_size
+        res.address = 0
+
+        if self.quantization is not None:
+            res.quantization = self.quantization.clone()
+        else:
+            res.quantization = None
+
+        return res
+
+    def clone_into_fast_storage(self, arch):
+        res = self.clone(suffix="_fast_storage")
+        res.mem_area = arch.fast_storage_mem_area
+        return res
+
+    def set_format(self, fmt, arch):
+        self.format = fmt
+        shape_len = 0
+        try:
+            shape_len = len(self.shape)
+        except TypeError:
+            pass
+
+        self.storage_rounding_quantum = arch.storage_rounding_quantums[self.format]
+        self.storage_rounding_quantum = self.storage_rounding_quantum[-shape_len:]
+        if self.format == TensorFormat.NHCWB16:
+            self.storage_rounding_quantum = self.storage_rounding_quantum[:-1] + (
+                int(self.storage_rounding_quantum[-1] / self.dtype.size_in_bytes()),
+            )
+        self.brick_size = arch.brick_sizes[self.format]
+        self.brick_size = self.brick_size[-shape_len:]
+        if self.shape is None:
+            return
+
+        self.bandwidth_shape = shape_round_to_quantum(self.shape, self.brick_size)
+        self.storage_shape = shape_round_to_quantum(self.shape, self.storage_rounding_quantum)
+
+        if fmt == TensorFormat.WeightsCompressed:
+            compression_ratio = 5 / 8
+            self.storage_compression_scale = compression_ratio
+            self.bandwidth_compression_scale = compression_ratio
+            self.compression_scale_for_worst_weight_stream = compression_ratio
+
+    def storage_elements(self):
+        elems = shape_num_elements(self.storage_shape)
+        if elems is None:
+            return 0
+        return elems
+
+    def elements(self):
+        elems = shape_num_elements(self.shape)
+        if elems is None:
+            return 0
+        return elems
+
+    def has_fully_defined_shape(self):
+        return shape_fully_defined(self.shape)
+
+    def storage_size(self):
+        raw_size = self.storage_elements() * self.element_size()
+        if raw_size == 0:
+            raw_size = 1  # force it to take up space
+        rounded_size = numeric_util.round_up(numeric_util.round_up_to_int(raw_size), self.alignment)
+        return rounded_size
+
+    def storage_size_for_sub_purpose(self, sub_purpose, param_a=None, param_b=None):
+        alt_shape = self.storage_shape_for_sub_purpose(sub_purpose, param_a, param_b)
+        elems = shape_num_elements(alt_shape)
+        if elems is None:
+            return 0
+        if sub_purpose == TensorSubPurpose.DoubleBuffer:
+            raw_size = elems * self.element_size() * self.compression_scale_for_worst_weight_stream
+        else:
+            raw_size = elems * self.element_size() * self.storage_compression_scale
+        rounded_size = numeric_util.round_up(numeric_util.round_up_to_int(raw_size), self.alignment)
+        return rounded_size
+
+    def storage_shape_for_sub_purpose(self, sub_purpose, param_a, param_b):
+        shp = list(self.storage_shape)
+        if sub_purpose == TensorSubPurpose.DoubleBuffer:
+            assert len(shp) >= 2
+            shp[-1] = min(shp[-1], param_a * 2)
+        elif sub_purpose == TensorSubPurpose.RollingBufferX:
+            assert len(shp) == 4
+            shp[0] = 1
+            shp[2] = min(shp[2], param_a)
+        elif sub_purpose == TensorSubPurpose.RollingBufferY:
+            assert len(shp) == 4
+            shp[0] = 1
+            shp[1] = min(shp[1], param_a)
+        elif sub_purpose == TensorSubPurpose.RollingBufferXY:
+            assert len(shp) == 4
+            shp[0] = 1
+            shp[2] = min(shp[2], param_a)
+            shp[1] = min(shp[1], param_b)
+        elif sub_purpose == TensorSubPurpose.Standard:
+            pass
+        else:
+            assert 0, "did not expect new sub purpose %s" % (sub_purpose,)
+        return shp
+
+    def set_new_sub_purpose(self, sub_purpose, param_a=None, param_b=None):
+        self.storage_shape = self.storage_shape_for_sub_purpose(sub_purpose, param_a, param_b)
+        self.sub_purpose = sub_purpose
+        if sub_purpose == TensorSubPurpose.DoubleBuffer:
+            self.storage_compression_scale = self.compression_scale_for_worst_weight_stream
+
+    def bandwidth(self):
+        elems = shape_num_elements(self.bandwidth_shape)
+        if elems is None:
+            return 0
+        return elems * self.element_size() * self.bandwidth_compression_scale
+
+    def consumers(self):
+        return self.consumer_list
+
+    def get_address_ranges_for_coordinates(self, start_coord, end_coord):
+        if self.sub_purpose in set(
+            (TensorSubPurpose.RollingBufferX, TensorSubPurpose.RollingBufferY, TensorSubPurpose.RollingBufferXY)
+        ):
+            # build dummy coordinates that cover the entire buffer
+            start_coord = [0] * len(start_coord)
+            end_coord = [min(self.storage_shape[i], self.shape[i]) for i in range(len(end_coord))]
+
+        start = self.address_for_coordinate(start_coord, is_top_box=False)
+        end = self.address_for_coordinate(end_coord, is_top_box=True)
+        return MemoryRangeSet(self.mem_area, start, end)
+
+    def addresses_for_rolling_buffer(self, start_coord, end_coord):
+        # returns ( box_height0, box_height1, box_width, [address_tl, address_tr, address_bl, address_br] )
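+        # the tr/bl/br addresses are only filled in when the requested box wraps past the rolling-buffer
+        # boundary in the x and/or y direction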
+
+        if len(start_coord) < 4:
+            box_height0 = 1
+            box_width = 1
+
+            if len(start_coord) >= 2:
+                box_width = end_coord[-2] - start_coord[-2]
+
+            return box_height0, box_height0, box_width, [self.address_for_coordinate(start_coord), None, None, None]
+
+        crossing_y = numeric_util.round_up(start_coord[1] + 1, self.storage_shape[1])
+        crossing_x = numeric_util.round_up(start_coord[2] + 1, self.storage_shape[2])
+
+        crossing_y = min(crossing_y, end_coord[1])
+        crossing_x = min(crossing_x, end_coord[2])
+
+        box_height0 = crossing_y - start_coord[1]
+        box_width = crossing_x - start_coord[2]
+
+        addresses = [None] * 4
+        addresses[0] = self.address_for_coordinate(start_coord)
+
+        if end_coord[2] > crossing_x:
+            addresses[1] = self.address_for_coordinate([start_coord[0], start_coord[1], crossing_x, start_coord[3]])
+            raise Exception("Striping in vertical direction is not supported")
+        if end_coord[1] > crossing_y:
+            addresses[2] = self.address_for_coordinate([start_coord[0], crossing_y, start_coord[2], start_coord[3]])
+        if end_coord[1] > crossing_y and end_coord[2] > crossing_x:
+            addresses[3] = self.address_for_coordinate([start_coord[0], crossing_y, crossing_x, start_coord[3]])
+
+        return box_height0, box_height0, box_width, addresses
+
+    def address_for_coordinate(self, coord, is_top_box=False):
+        return self.address + self.address_offset_for_coordinate(coord, is_top_box)
+
+    def get_strides_and_coord(self, coord=None):
+        if coord is None:
+            coord = [0] * len(self.storage_shape)
+
+        augmented_coord = coord
+        augmented_shape = self.storage_shape
+        while len(augmented_shape) < 4:
+            augmented_shape = [1] + augmented_shape
+
+        while len(augmented_coord) < 4:
+            augmented_coord = [0] + augmented_coord
+
+        assert len(augmented_coord) == len(augmented_shape)
+
+        if self.format == TensorFormat.NHWC:
+            augmented_shape = [augmented_shape[0], augmented_shape[3]] + augmented_shape[1:3] + [1]
+            augmented_coord = [augmented_coord[0], augmented_coord[3]] + augmented_coord[1:3] + [0]
+            stride_order = [4, 1, 3, 2, 0]
+
+        elif self.format == TensorFormat.NHCWB16:
+            channel_divisor = int(16 / self.element_size())
+            augmented_shape = augmented_shape[0:4] + [1]
+            augmented_coord = (
+                [augmented_coord[0], augmented_coord[3] // channel_divisor]
+                + augmented_coord[1:3]
+                + [augmented_coord[3] % channel_divisor]
+            )
+
+            if augmented_shape[1] == 0:
+                augmented_shape[1] = 1
+
+        else:
+            assert self.format in set((TensorFormat.Unknown, TensorFormat.WeightsCompressed))
+            return None, None
+
+        strides = [0] * len(augmented_shape)
+        stride = self.element_size() * self.storage_compression_scale
+
+        if self.format != TensorFormat.NHCWB16:
+            for i in stride_order:
+                strides[i] = stride
+                stride *= augmented_shape[i]
+        else:
+            assert len(strides) == 5
+            channel_divisor = int(16 / self.element_size())
+            strides[4] = stride
+            strides[3] = channel_divisor  # STRIDE_X
+            strides[1] = strides[3] * augmented_shape[2]  # STRIDE_C
+            strides[2] = augmented_shape[2] * augmented_shape[3]  # STRIDE_Y
+            strides[0] = strides[2] * augmented_shape[1]  # STRIDE_N
+
+        return strides, augmented_coord
+
+    def get_strides(self):
+        strides, _ = self.get_strides_and_coord()
+
+        return strides
+
+    def compressed_stream_index_from_coord(self, coord):
+        assert self.format == TensorFormat.WeightsCompressed
+        assert len(self.compressed_values) > 0
+        assert len(self.compressed_values) + 1 == len(self.weight_compressed_offsets)
+
+        depth = coord[-1]
+        brick_depth = self.brick_size[-1]
+        # Clamp position at final element index
+        if depth > self.shape[-1]:
+            depth = self.shape[-1]
+
+        # Always round up to next boundary
+        index = round_up_divide(depth, brick_depth)
+
+        # Check boundaries on all but last weight set (which may be shorter
+        # than the brick we divided it up into)
+        if index < len(self.weight_compressed_offsets) - 1:
+            # There are no half-way points in the weights
+            if (depth % brick_depth) != 0:
+                raise Exception("Offset into weights must be aligned to a brick")
+
+        return index
+
+    def size_of_compressed_stream(self, index):
+        assert 0 <= index < len(self.compressed_values)
+        return len(self.compressed_values[index])
+
+    def is_last_index_in_compressed_stream(self, index):
+        assert 0 <= index < len(self.compressed_values)
+        return index == len(self.compressed_values) - 1
+
+    def address_offset_for_coordinate(self, orig_coord, is_top_box=False):
+        address_offset = 0
+        coord = orig_coord
+
+        coord = coord[-len(self.storage_shape) :]
+
+        if self.sub_purpose == TensorSubPurpose.Standard:
+            for idx, c in enumerate(coord):
+                if is_top_box:
+                    assert c > 0 and c <= self.shape[idx]
+                else:
+                    assert c >= 0 and c < self.shape[idx]
+
+        if self.format == TensorFormat.WeightsCompressed:
+            if len(self.weight_compressed_offsets) == 0:
+                return 0
+
+            if len(self.ops) == 1 and self.ops[0].type == "DMA" and self.sub_purpose == TensorSubPurpose.DoubleBuffer:
+                depth = orig_coord[-1]
+                brick_depth = self.brick_size[-1]
+                # Clamp position at final element index
+                if depth > self.shape[-1]:
+                    depth = self.shape[-1]
+
+                # Always round up to next boundary
+                index = round_up_divide(depth, brick_depth)
+                index = index % 2
+
+                if len(self.compressed_values) <= 2:
+                    if is_top_box and index == 0:
+                        for cv in self.compressed_values:
+                            address_offset += len(cv)
+                    else:
+                        address_offset = index * len(self.compressed_values[0])
+                else:
+                    if is_top_box and index == 0:
+                        address_offset = self.storage_shape[-1]
+                    else:
+                        address_offset = index * (self.storage_shape[-1] // 2)
+            else:
+                index = self.compressed_stream_index_from_coord(orig_coord)
+                assert index < len(self.weight_compressed_offsets)
+                address_offset = self.weight_compressed_offsets[index]
+        else:
+            if is_top_box:
+                coord = [c - 1 for c in coord]
+
+            # handle wraparound for partial buffers. make sure to do this after subtracting top box:
+            coord = [c % self.storage_shape[idx] for idx, c in enumerate(coord)]
+
+            strides, augmented_coord = self.get_strides_and_coord(coord)
+            if strides is None:
+                return None
+
+            if is_top_box:
+                address_offset += 1 * strides[-1]  # one element
+
+            address_offset += np.dot(augmented_coord, strides)
+
+        assert address_offset >= 0
+        assert address_offset <= self.storage_size()
+        return address_offset
+
+    def __str__(self):
+        return "<nng.Tensor '%s' shape=%s dtype=%s>" % (self.name, self.shape, self.dtype)
+
+    __repr__ = __str__
diff --git a/ethosu/vela/tensor_allocation.py b/ethosu/vela/tensor_allocation.py
new file mode 100644
index 0000000..94aa608
--- /dev/null
+++ b/ethosu/vela/tensor_allocation.py
@@ -0,0 +1,139 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Wrapper functions for tensor address allocation, i.e. assigning addresses to tensors based on the
+# allowable overlaps worked out by the live range analysis.
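+#
+# A rough sketch of the flow implemented by allocate_tensors() below (names are from this module): live
+# ranges are extracted for the chosen memory area and then handed to either the greedy or the linear
+# allocator, e.g.
+#
+#     lrs = live_range.extract_live_ranges_from_cascaded_passes(sg, mem_area, ...)
+#     total_sz = linear_allocate_live_ranges(lrs)  # or greedy_allocate_live_ranges(sg, arch, lrs, mem_area, ...)
+#     sg.memory_used[mem_area] = total_sz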
+
+from . import live_range
+from .tensor import MemArea
+import math
+from . import numeric_util
+import numpy as np
+from .nn_graph import TensorAllocator, PassPlacement
+
+from .greedy_allocation import allocate_live_ranges as greedy_allocate_live_ranges
+
+
+def linear_allocate_live_ranges(live_ranges, alloc_granularity=256):
+    total_sz = 0
+    allocated_tensors = []
+
+    # just assign increasing addresses
+    for tens, lr in live_ranges.ranges.items():
+        if tens in allocated_tensors:
+            continue
+
+        lr.set_address(total_sz)
+        allocated_tensors += lr.tensors
+        total_sz += numeric_util.round_up(int(math.ceil(lr.size)), alloc_granularity)
+
+    return total_sz
+
+
+def mark_sram_used_for_cascaded_passes(sg, lrs):
+    end_pos = max(ps.time for ps in sg.cascaded_passes) + 2
+    mem_usage = np.zeros(end_pos, dtype=np.int64)
+
+    for tens, rng in lrs.ranges.items():
+        storage_size = tens.storage_size()
+        mem_usage[rng.start_time : rng.end_time] += storage_size
+
+    for cps in sg.cascaded_passes:
+        sram_used = max(mem_usage[cps.time], mem_usage[cps.time + 1])
+        cps.sram_used = sram_used
+        for ps in cps.passes:
+            ps.sram_used = sram_used
+
+
+def print_allocation(lrs, mem_area, sg, verbose_allocation, show_minimum_possible_allocation):
+    if verbose_allocation:
+        if mem_area == MemArea.Sram:
+            print("allocation for", mem_area, "- non-constant tensors in Cpu and Npu subgraphs")
+        else:
+            print("allocation for", mem_area, "- constant tensors in", sg.placement.name, "subgraph(s)")
+        for start_time, start, end, name, end_time in sorted(
+            (
+                lr.start_time,
+                tens.address,
+                tens.address + int(math.ceil(tens.storage_size())),
+                tens.name + " " + str(tens.purpose),
+                lr.end_time,
+            )
+            for tens, lr in lrs.ranges.items()
+        ):
+            name = name.replace("\x00", "")
+            print("%9d: %#12x - %#12x: %3d - %3d %s" % ((end - start), start, end, start_time, end_time, name))
+        print()
+
+    if show_minimum_possible_allocation and mem_area == MemArea.Sram:
+        min_possible_allocation = max(cps.sram_used for cps in sg.cascaded_passes)
+        print(
+            "Min possible allocation %d bytes / %.1f KB / %.1f MB"
+            % (min_possible_allocation, min_possible_allocation / 1024, min_possible_allocation / 1024 / 1024)
+        )
+
+
+def allocate_tensors(
+    nng,
+    sg,
+    arch,
+    mem_area,
+    use_ifm_ofm_overlap=True,
+    tensor_allocator=TensorAllocator.Greedy,
+    verbose_allocation=False,
+    show_minimum_possible_allocation=False,
+    lr_graph=None,
+):
+    ignore_subgraph_input_output_tensors = False
+    lrs = live_range.extract_live_ranges_from_cascaded_passes(
+        sg,
+        mem_area,
+        mark_output_tensors_overlapping_with_input_tensors=False,
+        use_ifm_ofm_overlap=use_ifm_ofm_overlap,
+        ignore_subgraph_input_output_tensors=ignore_subgraph_input_output_tensors,
+        lr_graph=lr_graph,
+    )
+
+    if lrs.ranges:
+        tens_alloc = tensor_allocator
+        if tens_alloc == TensorAllocator.Greedy:
+            total_sz = greedy_allocate_live_ranges(sg, arch, lrs, mem_area, verbose_allocation)
+        elif tens_alloc == TensorAllocator.LinearAlloc:
+            total_sz = linear_allocate_live_ranges(lrs)
+        else:
+            assert 0
+
+        sg.memory_used[mem_area] = total_sz
+
+        nng.total_size[mem_area] = nng.total_size.get(mem_area, 0) + sum(tens.storage_size() for tens in lrs.ranges)
+        nng.total_elements[mem_area] = nng.total_elements.get(mem_area, 0) + sum(tens.elements() for tens in lrs.ranges)
+
+        print_allocation(lrs, mem_area, sg, verbose_allocation, show_minimum_possible_allocation)
+
+        if mem_area == MemArea.Sram:
+            # Mark Sram usage for all subgraphs
+            for sg_ in nng.subgraphs:
+                mark_sram_used_for_cascaded_passes(sg_, lrs)
+
+    if sg == nng.get_root_subgraph():
+        nng.memory_used = sg.memory_used
+        for mem_area in nng.total_elements.keys():
+            try:
+                nng.bits_per_element[mem_area] = nng.total_size[mem_area] * 8 / nng.total_elements[mem_area]
+            except ZeroDivisionError:
+                nng.bits_per_element[mem_area] = 0.0
diff --git a/ethosu/vela/tflite/AbsOptions.py b/ethosu/vela/tflite/AbsOptions.py
new file mode 100644
index 0000000..0cbfb8c
--- /dev/null
+++ b/ethosu/vela/tflite/AbsOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class AbsOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsAbsOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = AbsOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # AbsOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def AbsOptionsStart(builder): builder.StartObject(0)
+def AbsOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/ActivationFunctionType.py b/ethosu/vela/tflite/ActivationFunctionType.py
new file mode 100644
index 0000000..6d8ec95
--- /dev/null
+++ b/ethosu/vela/tflite/ActivationFunctionType.py
@@ -0,0 +1,11 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+class ActivationFunctionType(object):
+    NONE = 0
+    RELU = 1
+    RELU_N1_TO_1 = 2
+    RELU6 = 3
+    TANH = 4
+    SIGN_BIT = 5
diff --git a/ethosu/vela/tflite/AddNOptions.py b/ethosu/vela/tflite/AddNOptions.py
new file mode 100644
index 0000000..b5c2ddb
--- /dev/null
+++ b/ethosu/vela/tflite/AddNOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class AddNOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsAddNOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = AddNOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # AddNOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def AddNOptionsStart(builder): builder.StartObject(0)
+def AddNOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/AddOptions.py b/ethosu/vela/tflite/AddOptions.py
new file mode 100644
index 0000000..d6cbfcf
--- /dev/null
+++ b/ethosu/vela/tflite/AddOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class AddOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsAddOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = AddOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # AddOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # AddOptions
+    def FusedActivationFunction(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0
+
+def AddOptionsStart(builder): builder.StartObject(1)
+def AddOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(0, fusedActivationFunction, 0)
+def AddOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/ArgMaxOptions.py b/ethosu/vela/tflite/ArgMaxOptions.py
new file mode 100644
index 0000000..fbf1415
--- /dev/null
+++ b/ethosu/vela/tflite/ArgMaxOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class ArgMaxOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsArgMaxOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = ArgMaxOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # ArgMaxOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # ArgMaxOptions
+    def OutputType(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0
+
+def ArgMaxOptionsStart(builder): builder.StartObject(1)
+def ArgMaxOptionsAddOutputType(builder, outputType): builder.PrependInt8Slot(0, outputType, 0)
+def ArgMaxOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/ArgMinOptions.py b/ethosu/vela/tflite/ArgMinOptions.py
new file mode 100644
index 0000000..120fdca
--- /dev/null
+++ b/ethosu/vela/tflite/ArgMinOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class ArgMinOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsArgMinOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = ArgMinOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # ArgMinOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # ArgMinOptions
+    def OutputType(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0
+
+def ArgMinOptionsStart(builder): builder.StartObject(1)
+def ArgMinOptionsAddOutputType(builder, outputType): builder.PrependInt8Slot(0, outputType, 0)
+def ArgMinOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/BatchToSpaceNDOptions.py b/ethosu/vela/tflite/BatchToSpaceNDOptions.py
new file mode 100644
index 0000000..3ddcfd3
--- /dev/null
+++ b/ethosu/vela/tflite/BatchToSpaceNDOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class BatchToSpaceNDOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsBatchToSpaceNDOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = BatchToSpaceNDOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # BatchToSpaceNDOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def BatchToSpaceNDOptionsStart(builder): builder.StartObject(0)
+def BatchToSpaceNDOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/BidirectionalSequenceLSTMOptions.py b/ethosu/vela/tflite/BidirectionalSequenceLSTMOptions.py
new file mode 100644
index 0000000..8d8b7be
--- /dev/null
+++ b/ethosu/vela/tflite/BidirectionalSequenceLSTMOptions.py
@@ -0,0 +1,62 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class BidirectionalSequenceLSTMOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsBidirectionalSequenceLSTMOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = BidirectionalSequenceLSTMOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # BidirectionalSequenceLSTMOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # BidirectionalSequenceLSTMOptions
+    def FusedActivationFunction(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0
+
+    # BidirectionalSequenceLSTMOptions
+    def CellClip(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos)
+        return 0.0
+
+    # BidirectionalSequenceLSTMOptions
+    def ProjClip(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos)
+        return 0.0
+
+    # BidirectionalSequenceLSTMOptions
+    def MergeOutputs(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
+        if o != 0:
+            return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos))
+        return False
+
+    # BidirectionalSequenceLSTMOptions
+    def TimeMajor(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12))
+        if o != 0:
+            return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos))
+        return True
+
+def BidirectionalSequenceLSTMOptionsStart(builder): builder.StartObject(5)
+def BidirectionalSequenceLSTMOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(0, fusedActivationFunction, 0)
+def BidirectionalSequenceLSTMOptionsAddCellClip(builder, cellClip): builder.PrependFloat32Slot(1, cellClip, 0.0)
+def BidirectionalSequenceLSTMOptionsAddProjClip(builder, projClip): builder.PrependFloat32Slot(2, projClip, 0.0)
+def BidirectionalSequenceLSTMOptionsAddMergeOutputs(builder, mergeOutputs): builder.PrependBoolSlot(3, mergeOutputs, 0)
+def BidirectionalSequenceLSTMOptionsAddTimeMajor(builder, timeMajor): builder.PrependBoolSlot(4, timeMajor, 1)
+def BidirectionalSequenceLSTMOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/BidirectionalSequenceRNNOptions.py b/ethosu/vela/tflite/BidirectionalSequenceRNNOptions.py
new file mode 100644
index 0000000..673af6b
--- /dev/null
+++ b/ethosu/vela/tflite/BidirectionalSequenceRNNOptions.py
@@ -0,0 +1,46 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class BidirectionalSequenceRNNOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsBidirectionalSequenceRNNOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = BidirectionalSequenceRNNOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # BidirectionalSequenceRNNOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # BidirectionalSequenceRNNOptions
+    def TimeMajor(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos))
+        return False
+
+    # BidirectionalSequenceRNNOptions
+    def FusedActivationFunction(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0
+
+    # BidirectionalSequenceRNNOptions
+    def MergeOutputs(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+        if o != 0:
+            return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos))
+        return False
+
+def BidirectionalSequenceRNNOptionsStart(builder): builder.StartObject(3)
+def BidirectionalSequenceRNNOptionsAddTimeMajor(builder, timeMajor): builder.PrependBoolSlot(0, timeMajor, 0)
+def BidirectionalSequenceRNNOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(1, fusedActivationFunction, 0)
+def BidirectionalSequenceRNNOptionsAddMergeOutputs(builder, mergeOutputs): builder.PrependBoolSlot(2, mergeOutputs, 0)
+def BidirectionalSequenceRNNOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/Buffer.py b/ethosu/vela/tflite/Buffer.py
new file mode 100644
index 0000000..754dee3
--- /dev/null
+++ b/ethosu/vela/tflite/Buffer.py
@@ -0,0 +1,46 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class Buffer(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsBuffer(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = Buffer()
+        x.Init(buf, n + offset)
+        return x
+
+    # Buffer
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # Buffer
+    def Data(self, j):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            a = self._tab.Vector(o)
+            return self._tab.Get(flatbuffers.number_types.Uint8Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 1))
+        return 0
+
+    # Buffer
+    def DataAsNumpy(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Uint8Flags, o)
+        return 0
+
+    # Buffer
+    def DataLength(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.VectorLen(o)
+        return 0
+
+def BufferStart(builder): builder.StartObject(1)
+def BufferAddData(builder, data): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(data), 0)
+def BufferStartDataVector(builder, numElems): return builder.StartVector(1, numElems, 1)
+def BufferEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/BuiltinOperator.py b/ethosu/vela/tflite/BuiltinOperator.py
new file mode 100644
index 0000000..2713653
--- /dev/null
+++ b/ethosu/vela/tflite/BuiltinOperator.py
@@ -0,0 +1,131 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+class BuiltinOperator(object):
+    ADD = 0
+    AVERAGE_POOL_2D = 1
+    CONCATENATION = 2
+    CONV_2D = 3
+    DEPTHWISE_CONV_2D = 4
+    DEPTH_TO_SPACE = 5
+    DEQUANTIZE = 6
+    EMBEDDING_LOOKUP = 7
+    FLOOR = 8
+    FULLY_CONNECTED = 9
+    HASHTABLE_LOOKUP = 10
+    L2_NORMALIZATION = 11
+    L2_POOL_2D = 12
+    LOCAL_RESPONSE_NORMALIZATION = 13
+    LOGISTIC = 14
+    LSH_PROJECTION = 15
+    LSTM = 16
+    MAX_POOL_2D = 17
+    MUL = 18
+    RELU = 19
+    RELU_N1_TO_1 = 20
+    RELU6 = 21
+    RESHAPE = 22
+    RESIZE_BILINEAR = 23
+    RNN = 24
+    SOFTMAX = 25
+    SPACE_TO_DEPTH = 26
+    SVDF = 27
+    TANH = 28
+    CONCAT_EMBEDDINGS = 29
+    SKIP_GRAM = 30
+    CALL = 31
+    CUSTOM = 32
+    EMBEDDING_LOOKUP_SPARSE = 33
+    PAD = 34
+    UNIDIRECTIONAL_SEQUENCE_RNN = 35
+    GATHER = 36
+    BATCH_TO_SPACE_ND = 37
+    SPACE_TO_BATCH_ND = 38
+    TRANSPOSE = 39
+    MEAN = 40
+    SUB = 41
+    DIV = 42
+    SQUEEZE = 43
+    UNIDIRECTIONAL_SEQUENCE_LSTM = 44
+    STRIDED_SLICE = 45
+    BIDIRECTIONAL_SEQUENCE_RNN = 46
+    EXP = 47
+    TOPK_V2 = 48
+    SPLIT = 49
+    LOG_SOFTMAX = 50
+    DELEGATE = 51
+    BIDIRECTIONAL_SEQUENCE_LSTM = 52
+    CAST = 53
+    PRELU = 54
+    MAXIMUM = 55
+    ARG_MAX = 56
+    MINIMUM = 57
+    LESS = 58
+    NEG = 59
+    PADV2 = 60
+    GREATER = 61
+    GREATER_EQUAL = 62
+    LESS_EQUAL = 63
+    SELECT = 64
+    SLICE = 65
+    SIN = 66
+    TRANSPOSE_CONV = 67
+    SPARSE_TO_DENSE = 68
+    TILE = 69
+    EXPAND_DIMS = 70
+    EQUAL = 71
+    NOT_EQUAL = 72
+    LOG = 73
+    SUM = 74
+    SQRT = 75
+    RSQRT = 76
+    SHAPE = 77
+    POW = 78
+    ARG_MIN = 79
+    FAKE_QUANT = 80
+    REDUCE_PROD = 81
+    REDUCE_MAX = 82
+    PACK = 83
+    LOGICAL_OR = 84
+    ONE_HOT = 85
+    LOGICAL_AND = 86
+    LOGICAL_NOT = 87
+    UNPACK = 88
+    REDUCE_MIN = 89
+    FLOOR_DIV = 90
+    REDUCE_ANY = 91
+    SQUARE = 92
+    ZEROS_LIKE = 93
+    FILL = 94
+    FLOOR_MOD = 95
+    RANGE = 96
+    RESIZE_NEAREST_NEIGHBOR = 97
+    LEAKY_RELU = 98
+    SQUARED_DIFFERENCE = 99
+    MIRROR_PAD = 100
+    ABS = 101
+    SPLIT_V = 102
+    UNIQUE = 103
+    CEIL = 104
+    REVERSE_V2 = 105
+    ADD_N = 106
+    GATHER_ND = 107
+    COS = 108
+    WHERE = 109
+    RANK = 110
+    ELU = 111
+    REVERSE_SEQUENCE = 112
+    MATRIX_DIAG = 113
+    QUANTIZE = 114
+    MATRIX_SET_DIAG = 115
+    ROUND = 116
+    HARD_SWISH = 117
+    IF = 118
+    WHILE = 119
+    NON_MAX_SUPPRESSION_V4 = 120
+    NON_MAX_SUPPRESSION_V5 = 121
+    SCATTER_ND = 122
+    SELECT_V2 = 123
+    DENSIFY = 124
+    SEGMENT_SUM = 125
diff --git a/ethosu/vela/tflite/BuiltinOptions.py b/ethosu/vela/tflite/BuiltinOptions.py
new file mode 100644
index 0000000..babbcb1
--- /dev/null
+++ b/ethosu/vela/tflite/BuiltinOptions.py
@@ -0,0 +1,106 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+class BuiltinOptions(object):
+    NONE = 0
+    Conv2DOptions = 1
+    DepthwiseConv2DOptions = 2
+    ConcatEmbeddingsOptions = 3
+    LSHProjectionOptions = 4
+    Pool2DOptions = 5
+    SVDFOptions = 6
+    RNNOptions = 7
+    FullyConnectedOptions = 8
+    SoftmaxOptions = 9
+    ConcatenationOptions = 10
+    AddOptions = 11
+    L2NormOptions = 12
+    LocalResponseNormalizationOptions = 13
+    LSTMOptions = 14
+    ResizeBilinearOptions = 15
+    CallOptions = 16
+    ReshapeOptions = 17
+    SkipGramOptions = 18
+    SpaceToDepthOptions = 19
+    EmbeddingLookupSparseOptions = 20
+    MulOptions = 21
+    PadOptions = 22
+    GatherOptions = 23
+    BatchToSpaceNDOptions = 24
+    SpaceToBatchNDOptions = 25
+    TransposeOptions = 26
+    ReducerOptions = 27
+    SubOptions = 28
+    DivOptions = 29
+    SqueezeOptions = 30
+    SequenceRNNOptions = 31
+    StridedSliceOptions = 32
+    ExpOptions = 33
+    TopKV2Options = 34
+    SplitOptions = 35
+    LogSoftmaxOptions = 36
+    CastOptions = 37
+    DequantizeOptions = 38
+    MaximumMinimumOptions = 39
+    ArgMaxOptions = 40
+    LessOptions = 41
+    NegOptions = 42
+    PadV2Options = 43
+    GreaterOptions = 44
+    GreaterEqualOptions = 45
+    LessEqualOptions = 46
+    SelectOptions = 47
+    SliceOptions = 48
+    TransposeConvOptions = 49
+    SparseToDenseOptions = 50
+    TileOptions = 51
+    ExpandDimsOptions = 52
+    EqualOptions = 53
+    NotEqualOptions = 54
+    ShapeOptions = 55
+    PowOptions = 56
+    ArgMinOptions = 57
+    FakeQuantOptions = 58
+    PackOptions = 59
+    LogicalOrOptions = 60
+    OneHotOptions = 61
+    LogicalAndOptions = 62
+    LogicalNotOptions = 63
+    UnpackOptions = 64
+    FloorDivOptions = 65
+    SquareOptions = 66
+    ZerosLikeOptions = 67
+    FillOptions = 68
+    BidirectionalSequenceLSTMOptions = 69
+    BidirectionalSequenceRNNOptions = 70
+    UnidirectionalSequenceLSTMOptions = 71
+    FloorModOptions = 72
+    RangeOptions = 73
+    ResizeNearestNeighborOptions = 74
+    LeakyReluOptions = 75
+    SquaredDifferenceOptions = 76
+    MirrorPadOptions = 77
+    AbsOptions = 78
+    SplitVOptions = 79
+    UniqueOptions = 80
+    ReverseV2Options = 81
+    AddNOptions = 82
+    GatherNdOptions = 83
+    CosOptions = 84
+    WhereOptions = 85
+    RankOptions = 86
+    ReverseSequenceOptions = 87
+    MatrixDiagOptions = 88
+    QuantizeOptions = 89
+    MatrixSetDiagOptions = 90
+    HardSwishOptions = 91
+    IfOptions = 92
+    WhileOptions = 93
+    DepthToSpaceOptions = 94
+    NonMaxSuppressionV4Options = 95
+    NonMaxSuppressionV5Options = 96
+    ScatterNdOptions = 97
+    SelectV2Options = 98
+    DensifyOptions = 99
+    SegmentSumOptions = 100
diff --git a/ethosu/vela/tflite/CallOptions.py b/ethosu/vela/tflite/CallOptions.py
new file mode 100644
index 0000000..5ae2eea
--- /dev/null
+++ b/ethosu/vela/tflite/CallOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class CallOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsCallOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = CallOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # CallOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # CallOptions
+    def Subgraph(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Uint32Flags, o + self._tab.Pos)
+        return 0
+
+def CallOptionsStart(builder): builder.StartObject(1)
+def CallOptionsAddSubgraph(builder, subgraph): builder.PrependUint32Slot(0, subgraph, 0)
+def CallOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/CastOptions.py b/ethosu/vela/tflite/CastOptions.py
new file mode 100644
index 0000000..70ae2e3
--- /dev/null
+++ b/ethosu/vela/tflite/CastOptions.py
@@ -0,0 +1,38 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class CastOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsCastOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = CastOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # CastOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # CastOptions
+    def InDataType(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0
+
+    # CastOptions
+    def OutDataType(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0
+
+def CastOptionsStart(builder): builder.StartObject(2)
+def CastOptionsAddInDataType(builder, inDataType): builder.PrependInt8Slot(0, inDataType, 0)
+def CastOptionsAddOutDataType(builder, outDataType): builder.PrependInt8Slot(1, outDataType, 0)
+def CastOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/CombinerType.py b/ethosu/vela/tflite/CombinerType.py
new file mode 100644
index 0000000..1e3a61f
--- /dev/null
+++ b/ethosu/vela/tflite/CombinerType.py
@@ -0,0 +1,8 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+class CombinerType(object):
+    SUM = 0
+    MEAN = 1
+    SQRTN = 2
diff --git a/ethosu/vela/tflite/ConcatEmbeddingsOptions.py b/ethosu/vela/tflite/ConcatEmbeddingsOptions.py
new file mode 100644
index 0000000..9d26c51
--- /dev/null
+++ b/ethosu/vela/tflite/ConcatEmbeddingsOptions.py
@@ -0,0 +1,78 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class ConcatEmbeddingsOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsConcatEmbeddingsOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = ConcatEmbeddingsOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # ConcatEmbeddingsOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # ConcatEmbeddingsOptions
+    def NumChannels(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+    # ConcatEmbeddingsOptions
+    def NumColumnsPerChannel(self, j):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            a = self._tab.Vector(o)
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4))
+        return 0
+
+    # ConcatEmbeddingsOptions
+    def NumColumnsPerChannelAsNumpy(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o)
+        return 0
+
+    # ConcatEmbeddingsOptions
+    def NumColumnsPerChannelLength(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.VectorLen(o)
+        return 0
+
+    # ConcatEmbeddingsOptions
+    def EmbeddingDimPerChannel(self, j):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+        if o != 0:
+            a = self._tab.Vector(o)
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4))
+        return 0
+
+    # ConcatEmbeddingsOptions
+    def EmbeddingDimPerChannelAsNumpy(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+        if o != 0:
+            return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o)
+        return 0
+
+    # ConcatEmbeddingsOptions
+    def EmbeddingDimPerChannelLength(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+        if o != 0:
+            return self._tab.VectorLen(o)
+        return 0
+
+def ConcatEmbeddingsOptionsStart(builder): builder.StartObject(3)
+def ConcatEmbeddingsOptionsAddNumChannels(builder, numChannels): builder.PrependInt32Slot(0, numChannels, 0)
+def ConcatEmbeddingsOptionsAddNumColumnsPerChannel(builder, numColumnsPerChannel): builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(numColumnsPerChannel), 0)
+def ConcatEmbeddingsOptionsStartNumColumnsPerChannelVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def ConcatEmbeddingsOptionsAddEmbeddingDimPerChannel(builder, embeddingDimPerChannel): builder.PrependUOffsetTRelativeSlot(2, flatbuffers.number_types.UOffsetTFlags.py_type(embeddingDimPerChannel), 0)
+def ConcatEmbeddingsOptionsStartEmbeddingDimPerChannelVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def ConcatEmbeddingsOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/ConcatenationOptions.py b/ethosu/vela/tflite/ConcatenationOptions.py
new file mode 100644
index 0000000..c8e0b6a
--- /dev/null
+++ b/ethosu/vela/tflite/ConcatenationOptions.py
@@ -0,0 +1,38 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class ConcatenationOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsConcatenationOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = ConcatenationOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # ConcatenationOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # ConcatenationOptions
+    def Axis(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+    # ConcatenationOptions
+    def FusedActivationFunction(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0
+
+def ConcatenationOptionsStart(builder): builder.StartObject(2)
+def ConcatenationOptionsAddAxis(builder, axis): builder.PrependInt32Slot(0, axis, 0)
+def ConcatenationOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(1, fusedActivationFunction, 0)
+def ConcatenationOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/Conv2DOptions.py b/ethosu/vela/tflite/Conv2DOptions.py
new file mode 100644
index 0000000..ef49f75
--- /dev/null
+++ b/ethosu/vela/tflite/Conv2DOptions.py
@@ -0,0 +1,70 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class Conv2DOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsConv2DOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = Conv2DOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # Conv2DOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # Conv2DOptions
+    def Padding(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0
+
+    # Conv2DOptions
+    def StrideW(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+    # Conv2DOptions
+    def StrideH(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+    # Conv2DOptions
+    def FusedActivationFunction(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0
+
+    # Conv2DOptions
+    def DilationWFactor(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 1
+
+    # Conv2DOptions
+    def DilationHFactor(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(14))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 1
+
+def Conv2DOptionsStart(builder): builder.StartObject(6)
+def Conv2DOptionsAddPadding(builder, padding): builder.PrependInt8Slot(0, padding, 0)
+def Conv2DOptionsAddStrideW(builder, strideW): builder.PrependInt32Slot(1, strideW, 0)
+def Conv2DOptionsAddStrideH(builder, strideH): builder.PrependInt32Slot(2, strideH, 0)
+def Conv2DOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(3, fusedActivationFunction, 0)
+def Conv2DOptionsAddDilationWFactor(builder, dilationWFactor): builder.PrependInt32Slot(4, dilationWFactor, 1)
+def Conv2DOptionsAddDilationHFactor(builder, dilationHFactor): builder.PrependInt32Slot(5, dilationHFactor, 1)
+def Conv2DOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/CosOptions.py b/ethosu/vela/tflite/CosOptions.py
new file mode 100644
index 0000000..7fbf848
--- /dev/null
+++ b/ethosu/vela/tflite/CosOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class CosOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsCosOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = CosOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # CosOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def CosOptionsStart(builder): builder.StartObject(0)
+def CosOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/CustomOptionsFormat.py b/ethosu/vela/tflite/CustomOptionsFormat.py
new file mode 100644
index 0000000..c2fc07c
--- /dev/null
+++ b/ethosu/vela/tflite/CustomOptionsFormat.py
@@ -0,0 +1,6 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+class CustomOptionsFormat(object):
+    FLEXBUFFERS = 0
diff --git a/ethosu/vela/tflite/CustomQuantization.py b/ethosu/vela/tflite/CustomQuantization.py
new file mode 100644
index 0000000..21ec0da
--- /dev/null
+++ b/ethosu/vela/tflite/CustomQuantization.py
@@ -0,0 +1,46 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class CustomQuantization(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsCustomQuantization(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = CustomQuantization()
+        x.Init(buf, n + offset)
+        return x
+
+    # CustomQuantization
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # CustomQuantization
+    def Custom(self, j):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            a = self._tab.Vector(o)
+            return self._tab.Get(flatbuffers.number_types.Uint8Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 1))
+        return 0
+
+    # CustomQuantization
+    def CustomAsNumpy(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Uint8Flags, o)
+        return 0
+
+    # CustomQuantization
+    def CustomLength(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.VectorLen(o)
+        return 0
+
+def CustomQuantizationStart(builder): builder.StartObject(1)
+def CustomQuantizationAddCustom(builder, custom): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(custom), 0)
+def CustomQuantizationStartCustomVector(builder, numElems): return builder.StartVector(1, numElems, 1)
+def CustomQuantizationEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/DensifyOptions.py b/ethosu/vela/tflite/DensifyOptions.py
new file mode 100644
index 0000000..12cbfb2
--- /dev/null
+++ b/ethosu/vela/tflite/DensifyOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class DensifyOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsDensifyOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = DensifyOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # DensifyOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def DensifyOptionsStart(builder): builder.StartObject(0)
+def DensifyOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/DepthToSpaceOptions.py b/ethosu/vela/tflite/DepthToSpaceOptions.py
new file mode 100644
index 0000000..97b93aa
--- /dev/null
+++ b/ethosu/vela/tflite/DepthToSpaceOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class DepthToSpaceOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsDepthToSpaceOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = DepthToSpaceOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # DepthToSpaceOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # DepthToSpaceOptions
+    def BlockSize(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+def DepthToSpaceOptionsStart(builder): builder.StartObject(1)
+def DepthToSpaceOptionsAddBlockSize(builder, blockSize): builder.PrependInt32Slot(0, blockSize, 0)
+def DepthToSpaceOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/DepthwiseConv2DOptions.py b/ethosu/vela/tflite/DepthwiseConv2DOptions.py
new file mode 100644
index 0000000..9689383
--- /dev/null
+++ b/ethosu/vela/tflite/DepthwiseConv2DOptions.py
@@ -0,0 +1,78 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class DepthwiseConv2DOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsDepthwiseConv2DOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = DepthwiseConv2DOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # DepthwiseConv2DOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # DepthwiseConv2DOptions
+    def Padding(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0
+
+    # DepthwiseConv2DOptions
+    def StrideW(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+    # DepthwiseConv2DOptions
+    def StrideH(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+    # DepthwiseConv2DOptions
+    def DepthMultiplier(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+    # DepthwiseConv2DOptions
+    def FusedActivationFunction(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0
+
+    # DepthwiseConv2DOptions
+    def DilationWFactor(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(14))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 1
+
+    # DepthwiseConv2DOptions
+    def DilationHFactor(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(16))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 1
+
+def DepthwiseConv2DOptionsStart(builder): builder.StartObject(7)
+def DepthwiseConv2DOptionsAddPadding(builder, padding): builder.PrependInt8Slot(0, padding, 0)
+def DepthwiseConv2DOptionsAddStrideW(builder, strideW): builder.PrependInt32Slot(1, strideW, 0)
+def DepthwiseConv2DOptionsAddStrideH(builder, strideH): builder.PrependInt32Slot(2, strideH, 0)
+def DepthwiseConv2DOptionsAddDepthMultiplier(builder, depthMultiplier): builder.PrependInt32Slot(3, depthMultiplier, 0)
+def DepthwiseConv2DOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(4, fusedActivationFunction, 0)
+def DepthwiseConv2DOptionsAddDilationWFactor(builder, dilationWFactor): builder.PrependInt32Slot(5, dilationWFactor, 1)
+def DepthwiseConv2DOptionsAddDilationHFactor(builder, dilationHFactor): builder.PrependInt32Slot(6, dilationHFactor, 1)
+def DepthwiseConv2DOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/DequantizeOptions.py b/ethosu/vela/tflite/DequantizeOptions.py
new file mode 100644
index 0000000..5ef8b8d
--- /dev/null
+++ b/ethosu/vela/tflite/DequantizeOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class DequantizeOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsDequantizeOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = DequantizeOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # DequantizeOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def DequantizeOptionsStart(builder): builder.StartObject(0)
+def DequantizeOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/DimensionMetadata.py b/ethosu/vela/tflite/DimensionMetadata.py
new file mode 100644
index 0000000..c9fe7cd
--- /dev/null
+++ b/ethosu/vela/tflite/DimensionMetadata.py
@@ -0,0 +1,76 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class DimensionMetadata(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsDimensionMetadata(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = DimensionMetadata()
+        x.Init(buf, n + offset)
+        return x
+
+    # DimensionMetadata
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # DimensionMetadata
+    def Format(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0
+
+    # DimensionMetadata
+    def DenseSize(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+    # DimensionMetadata
+    def ArraySegmentsType(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Uint8Flags, o + self._tab.Pos)
+        return 0
+
+    # DimensionMetadata
+    def ArraySegments(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
+        if o != 0:
+            from flatbuffers.table import Table
+            obj = Table(bytearray(), 0)
+            self._tab.Union(obj, o)
+            return obj
+        return None
+
+    # DimensionMetadata
+    def ArrayIndicesType(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Uint8Flags, o + self._tab.Pos)
+        return 0
+
+    # DimensionMetadata
+    def ArrayIndices(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(14))
+        if o != 0:
+            from flatbuffers.table import Table
+            obj = Table(bytearray(), 0)
+            self._tab.Union(obj, o)
+            return obj
+        return None
+
+def DimensionMetadataStart(builder): builder.StartObject(6)
+def DimensionMetadataAddFormat(builder, format): builder.PrependInt8Slot(0, format, 0)
+def DimensionMetadataAddDenseSize(builder, denseSize): builder.PrependInt32Slot(1, denseSize, 0)
+def DimensionMetadataAddArraySegmentsType(builder, arraySegmentsType): builder.PrependUint8Slot(2, arraySegmentsType, 0)
+def DimensionMetadataAddArraySegments(builder, arraySegments): builder.PrependUOffsetTRelativeSlot(3, flatbuffers.number_types.UOffsetTFlags.py_type(arraySegments), 0)
+def DimensionMetadataAddArrayIndicesType(builder, arrayIndicesType): builder.PrependUint8Slot(4, arrayIndicesType, 0)
+def DimensionMetadataAddArrayIndices(builder, arrayIndices): builder.PrependUOffsetTRelativeSlot(5, flatbuffers.number_types.UOffsetTFlags.py_type(arrayIndices), 0)
+def DimensionMetadataEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/DimensionType.py b/ethosu/vela/tflite/DimensionType.py
new file mode 100644
index 0000000..310d8ee
--- /dev/null
+++ b/ethosu/vela/tflite/DimensionType.py
@@ -0,0 +1,7 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+class DimensionType(object):
+    DENSE = 0
+    SPARSE_CSR = 1
diff --git a/ethosu/vela/tflite/DivOptions.py b/ethosu/vela/tflite/DivOptions.py
new file mode 100644
index 0000000..905a3be
--- /dev/null
+++ b/ethosu/vela/tflite/DivOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class DivOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsDivOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = DivOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # DivOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # DivOptions
+    def FusedActivationFunction(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0
+
+def DivOptionsStart(builder): builder.StartObject(1)
+def DivOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(0, fusedActivationFunction, 0)
+def DivOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/EmbeddingLookupSparseOptions.py b/ethosu/vela/tflite/EmbeddingLookupSparseOptions.py
new file mode 100644
index 0000000..7d9c144
--- /dev/null
+++ b/ethosu/vela/tflite/EmbeddingLookupSparseOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class EmbeddingLookupSparseOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsEmbeddingLookupSparseOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = EmbeddingLookupSparseOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # EmbeddingLookupSparseOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # EmbeddingLookupSparseOptions
+    def Combiner(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0
+
+def EmbeddingLookupSparseOptionsStart(builder): builder.StartObject(1)
+def EmbeddingLookupSparseOptionsAddCombiner(builder, combiner): builder.PrependInt8Slot(0, combiner, 0)
+def EmbeddingLookupSparseOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/EqualOptions.py b/ethosu/vela/tflite/EqualOptions.py
new file mode 100644
index 0000000..f787ef8
--- /dev/null
+++ b/ethosu/vela/tflite/EqualOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class EqualOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsEqualOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = EqualOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # EqualOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def EqualOptionsStart(builder): builder.StartObject(0)
+def EqualOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/ExpOptions.py b/ethosu/vela/tflite/ExpOptions.py
new file mode 100644
index 0000000..eac1456
--- /dev/null
+++ b/ethosu/vela/tflite/ExpOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class ExpOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsExpOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = ExpOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # ExpOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def ExpOptionsStart(builder): builder.StartObject(0)
+def ExpOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/ExpandDimsOptions.py b/ethosu/vela/tflite/ExpandDimsOptions.py
new file mode 100644
index 0000000..69d6366
--- /dev/null
+++ b/ethosu/vela/tflite/ExpandDimsOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class ExpandDimsOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsExpandDimsOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = ExpandDimsOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # ExpandDimsOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def ExpandDimsOptionsStart(builder): builder.StartObject(0)
+def ExpandDimsOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/FakeQuantOptions.py b/ethosu/vela/tflite/FakeQuantOptions.py
new file mode 100644
index 0000000..46c371c
--- /dev/null
+++ b/ethosu/vela/tflite/FakeQuantOptions.py
@@ -0,0 +1,54 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class FakeQuantOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsFakeQuantOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = FakeQuantOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # FakeQuantOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # FakeQuantOptions
+    def Min(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos)
+        return 0.0
+
+    # FakeQuantOptions
+    def Max(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos)
+        return 0.0
+
+    # FakeQuantOptions
+    def NumBits(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+    # FakeQuantOptions
+    def NarrowRange(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
+        if o != 0:
+            return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos))
+        return False
+
+def FakeQuantOptionsStart(builder): builder.StartObject(4)
+def FakeQuantOptionsAddMin(builder, min): builder.PrependFloat32Slot(0, min, 0.0)
+def FakeQuantOptionsAddMax(builder, max): builder.PrependFloat32Slot(1, max, 0.0)
+def FakeQuantOptionsAddNumBits(builder, numBits): builder.PrependInt32Slot(2, numBits, 0)
+def FakeQuantOptionsAddNarrowRange(builder, narrowRange): builder.PrependBoolSlot(3, narrowRange, 0)
+def FakeQuantOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/FillOptions.py b/ethosu/vela/tflite/FillOptions.py
new file mode 100644
index 0000000..5a1e651
--- /dev/null
+++ b/ethosu/vela/tflite/FillOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class FillOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsFillOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = FillOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # FillOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def FillOptionsStart(builder): builder.StartObject(0)
+def FillOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/FloorDivOptions.py b/ethosu/vela/tflite/FloorDivOptions.py
new file mode 100644
index 0000000..64b474f
--- /dev/null
+++ b/ethosu/vela/tflite/FloorDivOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class FloorDivOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsFloorDivOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = FloorDivOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # FloorDivOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def FloorDivOptionsStart(builder): builder.StartObject(0)
+def FloorDivOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/FloorModOptions.py b/ethosu/vela/tflite/FloorModOptions.py
new file mode 100644
index 0000000..37c8e5a
--- /dev/null
+++ b/ethosu/vela/tflite/FloorModOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class FloorModOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsFloorModOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = FloorModOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # FloorModOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def FloorModOptionsStart(builder): builder.StartObject(0)
+def FloorModOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/FullyConnectedOptions.py b/ethosu/vela/tflite/FullyConnectedOptions.py
new file mode 100644
index 0000000..a6b4e40
--- /dev/null
+++ b/ethosu/vela/tflite/FullyConnectedOptions.py
@@ -0,0 +1,46 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class FullyConnectedOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsFullyConnectedOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = FullyConnectedOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # FullyConnectedOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # FullyConnectedOptions
+    def FusedActivationFunction(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0
+
+    # FullyConnectedOptions
+    def WeightsFormat(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0
+
+    # FullyConnectedOptions
+    def KeepNumDims(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+        if o != 0:
+            return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos))
+        return False
+
+def FullyConnectedOptionsStart(builder): builder.StartObject(3)
+def FullyConnectedOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(0, fusedActivationFunction, 0)
+def FullyConnectedOptionsAddWeightsFormat(builder, weightsFormat): builder.PrependInt8Slot(1, weightsFormat, 0)
+def FullyConnectedOptionsAddKeepNumDims(builder, keepNumDims): builder.PrependBoolSlot(2, keepNumDims, 0)
+def FullyConnectedOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/FullyConnectedOptionsWeightsFormat.py b/ethosu/vela/tflite/FullyConnectedOptionsWeightsFormat.py
new file mode 100644
index 0000000..d9a5388
--- /dev/null
+++ b/ethosu/vela/tflite/FullyConnectedOptionsWeightsFormat.py
@@ -0,0 +1,7 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+class FullyConnectedOptionsWeightsFormat(object):
+    DEFAULT = 0
+    SHUFFLED4x16INT8 = 1
diff --git a/ethosu/vela/tflite/GatherNdOptions.py b/ethosu/vela/tflite/GatherNdOptions.py
new file mode 100644
index 0000000..f515eb5
--- /dev/null
+++ b/ethosu/vela/tflite/GatherNdOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class GatherNdOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsGatherNdOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = GatherNdOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # GatherNdOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def GatherNdOptionsStart(builder): builder.StartObject(0)
+def GatherNdOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/GatherOptions.py b/ethosu/vela/tflite/GatherOptions.py
new file mode 100644
index 0000000..9fbc3e4
--- /dev/null
+++ b/ethosu/vela/tflite/GatherOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class GatherOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsGatherOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = GatherOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # GatherOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # GatherOptions
+    def Axis(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+def GatherOptionsStart(builder): builder.StartObject(1)
+def GatherOptionsAddAxis(builder, axis): builder.PrependInt32Slot(0, axis, 0)
+def GatherOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/GreaterEqualOptions.py b/ethosu/vela/tflite/GreaterEqualOptions.py
new file mode 100644
index 0000000..a29e200
--- /dev/null
+++ b/ethosu/vela/tflite/GreaterEqualOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class GreaterEqualOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsGreaterEqualOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = GreaterEqualOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # GreaterEqualOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def GreaterEqualOptionsStart(builder): builder.StartObject(0)
+def GreaterEqualOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/GreaterOptions.py b/ethosu/vela/tflite/GreaterOptions.py
new file mode 100644
index 0000000..59d6350
--- /dev/null
+++ b/ethosu/vela/tflite/GreaterOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class GreaterOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsGreaterOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = GreaterOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # GreaterOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def GreaterOptionsStart(builder): builder.StartObject(0)
+def GreaterOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/HardSwishOptions.py b/ethosu/vela/tflite/HardSwishOptions.py
new file mode 100644
index 0000000..4f6a520
--- /dev/null
+++ b/ethosu/vela/tflite/HardSwishOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class HardSwishOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsHardSwishOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = HardSwishOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # HardSwishOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def HardSwishOptionsStart(builder): builder.StartObject(0)
+def HardSwishOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/IfOptions.py b/ethosu/vela/tflite/IfOptions.py
new file mode 100644
index 0000000..13f4e69
--- /dev/null
+++ b/ethosu/vela/tflite/IfOptions.py
@@ -0,0 +1,38 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class IfOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsIfOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = IfOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # IfOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # IfOptions
+    def ThenSubgraphIndex(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+    # IfOptions
+    def ElseSubgraphIndex(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+def IfOptionsStart(builder): builder.StartObject(2)
+def IfOptionsAddThenSubgraphIndex(builder, thenSubgraphIndex): builder.PrependInt32Slot(0, thenSubgraphIndex, 0)
+def IfOptionsAddElseSubgraphIndex(builder, elseSubgraphIndex): builder.PrependInt32Slot(1, elseSubgraphIndex, 0)
+def IfOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/Int32Vector.py b/ethosu/vela/tflite/Int32Vector.py
new file mode 100644
index 0000000..e70851b
--- /dev/null
+++ b/ethosu/vela/tflite/Int32Vector.py
@@ -0,0 +1,46 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class Int32Vector(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsInt32Vector(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = Int32Vector()
+        x.Init(buf, n + offset)
+        return x
+
+    # Int32Vector
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # Int32Vector
+    def Values(self, j):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            a = self._tab.Vector(o)
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4))
+        return 0
+
+    # Int32Vector
+    def ValuesAsNumpy(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o)
+        return 0
+
+    # Int32Vector
+    def ValuesLength(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.VectorLen(o)
+        return 0
+
+def Int32VectorStart(builder): builder.StartObject(1)
+def Int32VectorAddValues(builder, values): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(values), 0)
+def Int32VectorStartValuesVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def Int32VectorEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/L2NormOptions.py b/ethosu/vela/tflite/L2NormOptions.py
new file mode 100644
index 0000000..38bdf57
--- /dev/null
+++ b/ethosu/vela/tflite/L2NormOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class L2NormOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsL2NormOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = L2NormOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # L2NormOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # L2NormOptions
+    def FusedActivationFunction(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0
+
+def L2NormOptionsStart(builder): builder.StartObject(1)
+def L2NormOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(0, fusedActivationFunction, 0)
+def L2NormOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/LSHProjectionOptions.py b/ethosu/vela/tflite/LSHProjectionOptions.py
new file mode 100644
index 0000000..ad550be
--- /dev/null
+++ b/ethosu/vela/tflite/LSHProjectionOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class LSHProjectionOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsLSHProjectionOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = LSHProjectionOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # LSHProjectionOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # LSHProjectionOptions
+    def Type(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0
+
+def LSHProjectionOptionsStart(builder): builder.StartObject(1)
+def LSHProjectionOptionsAddType(builder, type): builder.PrependInt8Slot(0, type, 0)
+def LSHProjectionOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/LSHProjectionType.py b/ethosu/vela/tflite/LSHProjectionType.py
new file mode 100644
index 0000000..a7d6a31
--- /dev/null
+++ b/ethosu/vela/tflite/LSHProjectionType.py
@@ -0,0 +1,8 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+class LSHProjectionType(object):
+    UNKNOWN = 0
+    SPARSE = 1
+    DENSE = 2
diff --git a/ethosu/vela/tflite/LSTMKernelType.py b/ethosu/vela/tflite/LSTMKernelType.py
new file mode 100644
index 0000000..fd65799
--- /dev/null
+++ b/ethosu/vela/tflite/LSTMKernelType.py
@@ -0,0 +1,7 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+class LSTMKernelType(object):
+    FULL = 0
+    BASIC = 1
diff --git a/ethosu/vela/tflite/LSTMOptions.py b/ethosu/vela/tflite/LSTMOptions.py
new file mode 100644
index 0000000..93a8309
--- /dev/null
+++ b/ethosu/vela/tflite/LSTMOptions.py
@@ -0,0 +1,54 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class LSTMOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsLSTMOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = LSTMOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # LSTMOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # LSTMOptions
+    def FusedActivationFunction(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0
+
+    # LSTMOptions
+    def CellClip(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos)
+        return 0.0
+
+    # LSTMOptions
+    def ProjClip(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos)
+        return 0.0
+
+    # LSTMOptions
+    def KernelType(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0
+
+def LSTMOptionsStart(builder): builder.StartObject(4)
+def LSTMOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(0, fusedActivationFunction, 0)
+def LSTMOptionsAddCellClip(builder, cellClip): builder.PrependFloat32Slot(1, cellClip, 0.0)
+def LSTMOptionsAddProjClip(builder, projClip): builder.PrependFloat32Slot(2, projClip, 0.0)
+def LSTMOptionsAddKernelType(builder, kernelType): builder.PrependInt8Slot(3, kernelType, 0)
+def LSTMOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/LeakyReluOptions.py b/ethosu/vela/tflite/LeakyReluOptions.py
new file mode 100644
index 0000000..b61b21d
--- /dev/null
+++ b/ethosu/vela/tflite/LeakyReluOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class LeakyReluOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsLeakyReluOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = LeakyReluOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # LeakyReluOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # LeakyReluOptions
+    def Alpha(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos)
+        return 0.0
+
+def LeakyReluOptionsStart(builder): builder.StartObject(1)
+def LeakyReluOptionsAddAlpha(builder, alpha): builder.PrependFloat32Slot(0, alpha, 0.0)
+def LeakyReluOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/LessEqualOptions.py b/ethosu/vela/tflite/LessEqualOptions.py
new file mode 100644
index 0000000..d49b728
--- /dev/null
+++ b/ethosu/vela/tflite/LessEqualOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class LessEqualOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsLessEqualOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = LessEqualOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # LessEqualOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def LessEqualOptionsStart(builder): builder.StartObject(0)
+def LessEqualOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/LessOptions.py b/ethosu/vela/tflite/LessOptions.py
new file mode 100644
index 0000000..469cb0b
--- /dev/null
+++ b/ethosu/vela/tflite/LessOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class LessOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsLessOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = LessOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # LessOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def LessOptionsStart(builder): builder.StartObject(0)
+def LessOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/LocalResponseNormalizationOptions.py b/ethosu/vela/tflite/LocalResponseNormalizationOptions.py
new file mode 100644
index 0000000..db87560
--- /dev/null
+++ b/ethosu/vela/tflite/LocalResponseNormalizationOptions.py
@@ -0,0 +1,54 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class LocalResponseNormalizationOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsLocalResponseNormalizationOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = LocalResponseNormalizationOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # LocalResponseNormalizationOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # LocalResponseNormalizationOptions
+    def Radius(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+    # LocalResponseNormalizationOptions
+    def Bias(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos)
+        return 0.0
+
+    # LocalResponseNormalizationOptions
+    def Alpha(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos)
+        return 0.0
+
+    # LocalResponseNormalizationOptions
+    def Beta(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos)
+        return 0.0
+
+def LocalResponseNormalizationOptionsStart(builder): builder.StartObject(4)
+def LocalResponseNormalizationOptionsAddRadius(builder, radius): builder.PrependInt32Slot(0, radius, 0)
+def LocalResponseNormalizationOptionsAddBias(builder, bias): builder.PrependFloat32Slot(1, bias, 0.0)
+def LocalResponseNormalizationOptionsAddAlpha(builder, alpha): builder.PrependFloat32Slot(2, alpha, 0.0)
+def LocalResponseNormalizationOptionsAddBeta(builder, beta): builder.PrependFloat32Slot(3, beta, 0.0)
+def LocalResponseNormalizationOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/LogSoftmaxOptions.py b/ethosu/vela/tflite/LogSoftmaxOptions.py
new file mode 100644
index 0000000..4789385
--- /dev/null
+++ b/ethosu/vela/tflite/LogSoftmaxOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class LogSoftmaxOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsLogSoftmaxOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = LogSoftmaxOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # LogSoftmaxOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def LogSoftmaxOptionsStart(builder): builder.StartObject(0)
+def LogSoftmaxOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/LogicalAndOptions.py b/ethosu/vela/tflite/LogicalAndOptions.py
new file mode 100644
index 0000000..cee1cdb
--- /dev/null
+++ b/ethosu/vela/tflite/LogicalAndOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class LogicalAndOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsLogicalAndOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = LogicalAndOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # LogicalAndOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def LogicalAndOptionsStart(builder): builder.StartObject(0)
+def LogicalAndOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/LogicalNotOptions.py b/ethosu/vela/tflite/LogicalNotOptions.py
new file mode 100644
index 0000000..9971450
--- /dev/null
+++ b/ethosu/vela/tflite/LogicalNotOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class LogicalNotOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsLogicalNotOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = LogicalNotOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # LogicalNotOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def LogicalNotOptionsStart(builder): builder.StartObject(0)
+def LogicalNotOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/LogicalOrOptions.py b/ethosu/vela/tflite/LogicalOrOptions.py
new file mode 100644
index 0000000..e94a5de
--- /dev/null
+++ b/ethosu/vela/tflite/LogicalOrOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class LogicalOrOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsLogicalOrOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = LogicalOrOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # LogicalOrOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def LogicalOrOptionsStart(builder): builder.StartObject(0)
+def LogicalOrOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/MatrixDiagOptions.py b/ethosu/vela/tflite/MatrixDiagOptions.py
new file mode 100644
index 0000000..0f64e65
--- /dev/null
+++ b/ethosu/vela/tflite/MatrixDiagOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class MatrixDiagOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsMatrixDiagOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = MatrixDiagOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # MatrixDiagOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def MatrixDiagOptionsStart(builder): builder.StartObject(0)
+def MatrixDiagOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/MatrixSetDiagOptions.py b/ethosu/vela/tflite/MatrixSetDiagOptions.py
new file mode 100644
index 0000000..14178cf
--- /dev/null
+++ b/ethosu/vela/tflite/MatrixSetDiagOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class MatrixSetDiagOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsMatrixSetDiagOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = MatrixSetDiagOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # MatrixSetDiagOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def MatrixSetDiagOptionsStart(builder): builder.StartObject(0)
+def MatrixSetDiagOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/MaximumMinimumOptions.py b/ethosu/vela/tflite/MaximumMinimumOptions.py
new file mode 100644
index 0000000..f0806e2
--- /dev/null
+++ b/ethosu/vela/tflite/MaximumMinimumOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class MaximumMinimumOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsMaximumMinimumOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = MaximumMinimumOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # MaximumMinimumOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def MaximumMinimumOptionsStart(builder): builder.StartObject(0)
+def MaximumMinimumOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/Metadata.py b/ethosu/vela/tflite/Metadata.py
new file mode 100644
index 0000000..273e51e
--- /dev/null
+++ b/ethosu/vela/tflite/Metadata.py
@@ -0,0 +1,38 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class Metadata(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsMetadata(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = Metadata()
+        x.Init(buf, n + offset)
+        return x
+
+    # Metadata
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # Metadata
+    def Name(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.String(o + self._tab.Pos)
+        return None
+
+    # Metadata
+    def Buffer(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Uint32Flags, o + self._tab.Pos)
+        return 0
+
+def MetadataStart(builder): builder.StartObject(2)
+def MetadataAddName(builder, name): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(name), 0)
+def MetadataAddBuffer(builder, buffer): builder.PrependUint32Slot(1, buffer, 0)
+def MetadataEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/MirrorPadMode.py b/ethosu/vela/tflite/MirrorPadMode.py
new file mode 100644
index 0000000..8fb6396
--- /dev/null
+++ b/ethosu/vela/tflite/MirrorPadMode.py
@@ -0,0 +1,7 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+class MirrorPadMode(object):
+    REFLECT = 0
+    SYMMETRIC = 1
diff --git a/ethosu/vela/tflite/MirrorPadOptions.py b/ethosu/vela/tflite/MirrorPadOptions.py
new file mode 100644
index 0000000..254ae21
--- /dev/null
+++ b/ethosu/vela/tflite/MirrorPadOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class MirrorPadOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsMirrorPadOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = MirrorPadOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # MirrorPadOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # MirrorPadOptions
+    def Mode(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0
+
+def MirrorPadOptionsStart(builder): builder.StartObject(1)
+def MirrorPadOptionsAddMode(builder, mode): builder.PrependInt8Slot(0, mode, 0)
+def MirrorPadOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/Model.py b/ethosu/vela/tflite/Model.py
new file mode 100644
index 0000000..cc9991b
--- /dev/null
+++ b/ethosu/vela/tflite/Model.py
@@ -0,0 +1,150 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class Model(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsModel(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = Model()
+        x.Init(buf, n + offset)
+        return x
+
+    # Model
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # Model
+    def Version(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Uint32Flags, o + self._tab.Pos)
+        return 0
+
+    # Model
+    def OperatorCodes(self, j):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            x = self._tab.Vector(o)
+            x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 4
+            x = self._tab.Indirect(x)
+            from .OperatorCode import OperatorCode
+            obj = OperatorCode()
+            obj.Init(self._tab.Bytes, x)
+            return obj
+        return None
+
+    # Model
+    def OperatorCodesLength(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.VectorLen(o)
+        return 0
+
+    # Model
+    def Subgraphs(self, j):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+        if o != 0:
+            x = self._tab.Vector(o)
+            x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 4
+            x = self._tab.Indirect(x)
+            from .SubGraph import SubGraph
+            obj = SubGraph()
+            obj.Init(self._tab.Bytes, x)
+            return obj
+        return None
+
+    # Model
+    def SubgraphsLength(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+        if o != 0:
+            return self._tab.VectorLen(o)
+        return 0
+
+    # Model
+    def Description(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
+        if o != 0:
+            return self._tab.String(o + self._tab.Pos)
+        return None
+
+    # Model
+    def Buffers(self, j):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12))
+        if o != 0:
+            x = self._tab.Vector(o)
+            x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 4
+            x = self._tab.Indirect(x)
+            from .Buffer import Buffer
+            obj = Buffer()
+            obj.Init(self._tab.Bytes, x)
+            return obj
+        return None
+
+    # Model
+    def BuffersLength(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12))
+        if o != 0:
+            return self._tab.VectorLen(o)
+        return 0
+
+    # Model
+    def MetadataBuffer(self, j):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(14))
+        if o != 0:
+            a = self._tab.Vector(o)
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4))
+        return 0
+
+    # Model
+    def MetadataBufferAsNumpy(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(14))
+        if o != 0:
+            return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o)
+        return 0
+
+    # Model
+    def MetadataBufferLength(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(14))
+        if o != 0:
+            return self._tab.VectorLen(o)
+        return 0
+
+    # Model
+    def Metadata(self, j):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(16))
+        if o != 0:
+            x = self._tab.Vector(o)
+            x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 4
+            x = self._tab.Indirect(x)
+            from .Metadata import Metadata
+            obj = Metadata()
+            obj.Init(self._tab.Bytes, x)
+            return obj
+        return None
+
+    # Model
+    def MetadataLength(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(16))
+        if o != 0:
+            return self._tab.VectorLen(o)
+        return 0
+
+def ModelStart(builder): builder.StartObject(7)
+def ModelAddVersion(builder, version): builder.PrependUint32Slot(0, version, 0)
+def ModelAddOperatorCodes(builder, operatorCodes): builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(operatorCodes), 0)
+def ModelStartOperatorCodesVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def ModelAddSubgraphs(builder, subgraphs): builder.PrependUOffsetTRelativeSlot(2, flatbuffers.number_types.UOffsetTFlags.py_type(subgraphs), 0)
+def ModelStartSubgraphsVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def ModelAddDescription(builder, description): builder.PrependUOffsetTRelativeSlot(3, flatbuffers.number_types.UOffsetTFlags.py_type(description), 0)
+def ModelAddBuffers(builder, buffers): builder.PrependUOffsetTRelativeSlot(4, flatbuffers.number_types.UOffsetTFlags.py_type(buffers), 0)
+def ModelStartBuffersVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def ModelAddMetadataBuffer(builder, metadataBuffer): builder.PrependUOffsetTRelativeSlot(5, flatbuffers.number_types.UOffsetTFlags.py_type(metadataBuffer), 0)
+def ModelStartMetadataBufferVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def ModelAddMetadata(builder, metadata): builder.PrependUOffsetTRelativeSlot(6, flatbuffers.number_types.UOffsetTFlags.py_type(metadata), 0)
+def ModelStartMetadataVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def ModelEnd(builder): return builder.EndObject()
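
The generated Model reader above is the usual entry point for inspecting a serialized `.tflite` flatbuffer. A minimal sketch of how it can be used, assuming an on-disk model file (the path and variable names below are illustrative, not part of this patch):

```python
# Illustrative only: read a serialized TFLite model with the generated readers.
from ethosu.vela.tflite.Model import Model

with open("network.tflite", "rb") as f:          # placeholder path
    buf = bytearray(f.read())

model = Model.GetRootAsModel(buf, 0)
print("schema version:", model.Version())
for i in range(model.OperatorCodesLength()):
    print("builtin code:", model.OperatorCodes(i).BuiltinCode())
subgraph = model.Subgraphs(0)                    # SubGraph reader is generated elsewhere in this patch
```
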
diff --git a/ethosu/vela/tflite/MulOptions.py b/ethosu/vela/tflite/MulOptions.py
new file mode 100644
index 0000000..55b9506
--- /dev/null
+++ b/ethosu/vela/tflite/MulOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class MulOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsMulOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = MulOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # MulOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # MulOptions
+    def FusedActivationFunction(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0
+
+def MulOptionsStart(builder): builder.StartObject(1)
+def MulOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(0, fusedActivationFunction, 0)
+def MulOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/NegOptions.py b/ethosu/vela/tflite/NegOptions.py
new file mode 100644
index 0000000..05d55c2
--- /dev/null
+++ b/ethosu/vela/tflite/NegOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class NegOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsNegOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = NegOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # NegOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def NegOptionsStart(builder): builder.StartObject(0)
+def NegOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/NonMaxSuppressionV4Options.py b/ethosu/vela/tflite/NonMaxSuppressionV4Options.py
new file mode 100644
index 0000000..6ad10a2
--- /dev/null
+++ b/ethosu/vela/tflite/NonMaxSuppressionV4Options.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class NonMaxSuppressionV4Options(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsNonMaxSuppressionV4Options(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = NonMaxSuppressionV4Options()
+        x.Init(buf, n + offset)
+        return x
+
+    # NonMaxSuppressionV4Options
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def NonMaxSuppressionV4OptionsStart(builder): builder.StartObject(0)
+def NonMaxSuppressionV4OptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/NonMaxSuppressionV5Options.py b/ethosu/vela/tflite/NonMaxSuppressionV5Options.py
new file mode 100644
index 0000000..99cbdbb
--- /dev/null
+++ b/ethosu/vela/tflite/NonMaxSuppressionV5Options.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class NonMaxSuppressionV5Options(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsNonMaxSuppressionV5Options(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = NonMaxSuppressionV5Options()
+        x.Init(buf, n + offset)
+        return x
+
+    # NonMaxSuppressionV5Options
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def NonMaxSuppressionV5OptionsStart(builder): builder.StartObject(0)
+def NonMaxSuppressionV5OptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/NotEqualOptions.py b/ethosu/vela/tflite/NotEqualOptions.py
new file mode 100644
index 0000000..4c511e9
--- /dev/null
+++ b/ethosu/vela/tflite/NotEqualOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class NotEqualOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsNotEqualOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = NotEqualOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # NotEqualOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def NotEqualOptionsStart(builder): builder.StartObject(0)
+def NotEqualOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/OneHotOptions.py b/ethosu/vela/tflite/OneHotOptions.py
new file mode 100644
index 0000000..793a3e7
--- /dev/null
+++ b/ethosu/vela/tflite/OneHotOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class OneHotOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsOneHotOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = OneHotOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # OneHotOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # OneHotOptions
+    def Axis(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+def OneHotOptionsStart(builder): builder.StartObject(1)
+def OneHotOptionsAddAxis(builder, axis): builder.PrependInt32Slot(0, axis, 0)
+def OneHotOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/Operator.py b/ethosu/vela/tflite/Operator.py
new file mode 100644
index 0000000..cbae3da
--- /dev/null
+++ b/ethosu/vela/tflite/Operator.py
@@ -0,0 +1,177 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class Operator(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsOperator(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = Operator()
+        x.Init(buf, n + offset)
+        return x
+
+    # Operator
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # Operator
+    def OpcodeIndex(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Uint32Flags, o + self._tab.Pos)
+        return 0
+
+    # Operator
+    def Inputs(self, j):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            a = self._tab.Vector(o)
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4))
+        return 0
+
+    # Operator
+    def InputsAsNumpy(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o)
+        return 0
+
+    # Operator
+    def InputsLength(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.VectorLen(o)
+        return 0
+
+    # Operator
+    def Outputs(self, j):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+        if o != 0:
+            a = self._tab.Vector(o)
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4))
+        return 0
+
+    # Operator
+    def OutputsAsNumpy(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+        if o != 0:
+            return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o)
+        return 0
+
+    # Operator
+    def OutputsLength(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+        if o != 0:
+            return self._tab.VectorLen(o)
+        return 0
+
+    # Operator
+    def BuiltinOptionsType(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Uint8Flags, o + self._tab.Pos)
+        return 0
+
+    # Operator
+    def BuiltinOptions(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12))
+        if o != 0:
+            from flatbuffers.table import Table
+            obj = Table(bytearray(), 0)
+            self._tab.Union(obj, o)
+            return obj
+        return None
+
+    # Operator
+    def CustomOptions(self, j):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(14))
+        if o != 0:
+            a = self._tab.Vector(o)
+            return self._tab.Get(flatbuffers.number_types.Uint8Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 1))
+        return 0
+
+    # Operator
+    def CustomOptionsAsNumpy(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(14))
+        if o != 0:
+            return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Uint8Flags, o)
+        return 0
+
+    # Operator
+    def CustomOptionsLength(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(14))
+        if o != 0:
+            return self._tab.VectorLen(o)
+        return 0
+
+    # Operator
+    def CustomOptionsFormat(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(16))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0
+
+    # Operator
+    def MutatingVariableInputs(self, j):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(18))
+        if o != 0:
+            a = self._tab.Vector(o)
+            return self._tab.Get(flatbuffers.number_types.BoolFlags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 1))
+        return 0
+
+    # Operator
+    def MutatingVariableInputsAsNumpy(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(18))
+        if o != 0:
+            return self._tab.GetVectorAsNumpy(flatbuffers.number_types.BoolFlags, o)
+        return 0
+
+    # Operator
+    def MutatingVariableInputsLength(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(18))
+        if o != 0:
+            return self._tab.VectorLen(o)
+        return 0
+
+    # Operator
+    def Intermediates(self, j):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(20))
+        if o != 0:
+            a = self._tab.Vector(o)
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4))
+        return 0
+
+    # Operator
+    def IntermediatesAsNumpy(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(20))
+        if o != 0:
+            return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o)
+        return 0
+
+    # Operator
+    def IntermediatesLength(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(20))
+        if o != 0:
+            return self._tab.VectorLen(o)
+        return 0
+
+def OperatorStart(builder): builder.StartObject(9)
+def OperatorAddOpcodeIndex(builder, opcodeIndex): builder.PrependUint32Slot(0, opcodeIndex, 0)
+def OperatorAddInputs(builder, inputs): builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(inputs), 0)
+def OperatorStartInputsVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def OperatorAddOutputs(builder, outputs): builder.PrependUOffsetTRelativeSlot(2, flatbuffers.number_types.UOffsetTFlags.py_type(outputs), 0)
+def OperatorStartOutputsVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def OperatorAddBuiltinOptionsType(builder, builtinOptionsType): builder.PrependUint8Slot(3, builtinOptionsType, 0)
+def OperatorAddBuiltinOptions(builder, builtinOptions): builder.PrependUOffsetTRelativeSlot(4, flatbuffers.number_types.UOffsetTFlags.py_type(builtinOptions), 0)
+def OperatorAddCustomOptions(builder, customOptions): builder.PrependUOffsetTRelativeSlot(5, flatbuffers.number_types.UOffsetTFlags.py_type(customOptions), 0)
+def OperatorStartCustomOptionsVector(builder, numElems): return builder.StartVector(1, numElems, 1)
+def OperatorAddCustomOptionsFormat(builder, customOptionsFormat): builder.PrependInt8Slot(6, customOptionsFormat, 0)
+def OperatorAddMutatingVariableInputs(builder, mutatingVariableInputs): builder.PrependUOffsetTRelativeSlot(7, flatbuffers.number_types.UOffsetTFlags.py_type(mutatingVariableInputs), 0)
+def OperatorStartMutatingVariableInputsVector(builder, numElems): return builder.StartVector(1, numElems, 1)
+def OperatorAddIntermediates(builder, intermediates): builder.PrependUOffsetTRelativeSlot(8, flatbuffers.number_types.UOffsetTFlags.py_type(intermediates), 0)
+def OperatorStartIntermediatesVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def OperatorEnd(builder): return builder.EndObject()
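
Operator stores its per-operator options as a FlatBuffers union: BuiltinOptionsType() returns the discriminant and BuiltinOptions() returns a raw Table positioned at the payload. A hedged sketch of the standard dispatch pattern, assuming the BuiltinOptions enum module generated elsewhere in this patch:

```python
# Sketch of the usual FlatBuffers union dispatch for Operator options.
# BuiltinOptions (the enum) and MulOptions are sibling generated modules.
from ethosu.vela.tflite.BuiltinOptions import BuiltinOptions
from ethosu.vela.tflite.MulOptions import MulOptions

def read_mul_activation(op):
    # 'op' is an Operator reader obtained from a SubGraph (assumed available)
    if op.BuiltinOptionsType() == BuiltinOptions.MulOptions:
        union_table = op.BuiltinOptions()        # raw flatbuffers.table.Table
        opts = MulOptions()
        opts.Init(union_table.Bytes, union_table.Pos)
        return opts.FusedActivationFunction()
    return None
```
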
diff --git a/ethosu/vela/tflite/OperatorCode.py b/ethosu/vela/tflite/OperatorCode.py
new file mode 100644
index 0000000..dd525f5
--- /dev/null
+++ b/ethosu/vela/tflite/OperatorCode.py
@@ -0,0 +1,46 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class OperatorCode(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsOperatorCode(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = OperatorCode()
+        x.Init(buf, n + offset)
+        return x
+
+    # OperatorCode
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # OperatorCode
+    def BuiltinCode(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0
+
+    # OperatorCode
+    def CustomCode(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.String(o + self._tab.Pos)
+        return None
+
+    # OperatorCode
+    def Version(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 1
+
+def OperatorCodeStart(builder): builder.StartObject(3)
+def OperatorCodeAddBuiltinCode(builder, builtinCode): builder.PrependInt8Slot(0, builtinCode, 0)
+def OperatorCodeAddCustomCode(builder, customCode): builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(customCode), 0)
+def OperatorCodeAddVersion(builder, version): builder.PrependInt32Slot(2, version, 1)
+def OperatorCodeEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/PackOptions.py b/ethosu/vela/tflite/PackOptions.py
new file mode 100644
index 0000000..6a8ee2b
--- /dev/null
+++ b/ethosu/vela/tflite/PackOptions.py
@@ -0,0 +1,38 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class PackOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsPackOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = PackOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # PackOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # PackOptions
+    def ValuesCount(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+    # PackOptions
+    def Axis(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+def PackOptionsStart(builder): builder.StartObject(2)
+def PackOptionsAddValuesCount(builder, valuesCount): builder.PrependInt32Slot(0, valuesCount, 0)
+def PackOptionsAddAxis(builder, axis): builder.PrependInt32Slot(1, axis, 0)
+def PackOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/PadOptions.py b/ethosu/vela/tflite/PadOptions.py
new file mode 100644
index 0000000..d0833c6
--- /dev/null
+++ b/ethosu/vela/tflite/PadOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class PadOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsPadOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = PadOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # PadOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def PadOptionsStart(builder): builder.StartObject(0)
+def PadOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/PadV2Options.py b/ethosu/vela/tflite/PadV2Options.py
new file mode 100644
index 0000000..5ea0d70
--- /dev/null
+++ b/ethosu/vela/tflite/PadV2Options.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class PadV2Options(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsPadV2Options(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = PadV2Options()
+        x.Init(buf, n + offset)
+        return x
+
+    # PadV2Options
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def PadV2OptionsStart(builder): builder.StartObject(0)
+def PadV2OptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/Padding.py b/ethosu/vela/tflite/Padding.py
new file mode 100644
index 0000000..168bf74
--- /dev/null
+++ b/ethosu/vela/tflite/Padding.py
@@ -0,0 +1,7 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+class Padding(object):
+    SAME = 0
+    VALID = 1
diff --git a/ethosu/vela/tflite/Pool2DOptions.py b/ethosu/vela/tflite/Pool2DOptions.py
new file mode 100644
index 0000000..b8b9f17
--- /dev/null
+++ b/ethosu/vela/tflite/Pool2DOptions.py
@@ -0,0 +1,70 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class Pool2DOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsPool2DOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = Pool2DOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # Pool2DOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # Pool2DOptions
+    def Padding(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0
+
+    # Pool2DOptions
+    def StrideW(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+    # Pool2DOptions
+    def StrideH(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+    # Pool2DOptions
+    def FilterWidth(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+    # Pool2DOptions
+    def FilterHeight(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+    # Pool2DOptions
+    def FusedActivationFunction(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(14))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0
+
+def Pool2DOptionsStart(builder): builder.StartObject(6)
+def Pool2DOptionsAddPadding(builder, padding): builder.PrependInt8Slot(0, padding, 0)
+def Pool2DOptionsAddStrideW(builder, strideW): builder.PrependInt32Slot(1, strideW, 0)
+def Pool2DOptionsAddStrideH(builder, strideH): builder.PrependInt32Slot(2, strideH, 0)
+def Pool2DOptionsAddFilterWidth(builder, filterWidth): builder.PrependInt32Slot(3, filterWidth, 0)
+def Pool2DOptionsAddFilterHeight(builder, filterHeight): builder.PrependInt32Slot(4, filterHeight, 0)
+def Pool2DOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(5, fusedActivationFunction, 0)
+def Pool2DOptionsEnd(builder): return builder.EndObject()
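
Besides readers, each module emits builder helpers. A minimal sketch of serializing a standalone Pool2DOptions table with them (the field values are illustrative):

```python
# Illustrative: build a Pool2DOptions table with the generated helper functions.
import flatbuffers
from ethosu.vela.tflite import Pool2DOptions

builder = flatbuffers.Builder(0)
Pool2DOptions.Pool2DOptionsStart(builder)
Pool2DOptions.Pool2DOptionsAddPadding(builder, 0)        # Padding.SAME
Pool2DOptions.Pool2DOptionsAddStrideW(builder, 2)
Pool2DOptions.Pool2DOptionsAddStrideH(builder, 2)
Pool2DOptions.Pool2DOptionsAddFilterWidth(builder, 2)
Pool2DOptions.Pool2DOptionsAddFilterHeight(builder, 2)
opts = Pool2DOptions.Pool2DOptionsEnd(builder)
builder.Finish(opts)
buf = builder.Output()                                   # bytes of the finished buffer
```
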
diff --git a/ethosu/vela/tflite/PowOptions.py b/ethosu/vela/tflite/PowOptions.py
new file mode 100644
index 0000000..666ca48
--- /dev/null
+++ b/ethosu/vela/tflite/PowOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class PowOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsPowOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = PowOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # PowOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def PowOptionsStart(builder): builder.StartObject(0)
+def PowOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/QuantizationDetails.py b/ethosu/vela/tflite/QuantizationDetails.py
new file mode 100644
index 0000000..8d53af9
--- /dev/null
+++ b/ethosu/vela/tflite/QuantizationDetails.py
@@ -0,0 +1,7 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+class QuantizationDetails(object):
+    NONE = 0
+    CustomQuantization = 1
diff --git a/ethosu/vela/tflite/QuantizationParameters.py b/ethosu/vela/tflite/QuantizationParameters.py
new file mode 100644
index 0000000..fcd686c
--- /dev/null
+++ b/ethosu/vela/tflite/QuantizationParameters.py
@@ -0,0 +1,145 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class QuantizationParameters(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsQuantizationParameters(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = QuantizationParameters()
+        x.Init(buf, n + offset)
+        return x
+
+    # QuantizationParameters
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # QuantizationParameters
+    def Min(self, j):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            a = self._tab.Vector(o)
+            return self._tab.Get(flatbuffers.number_types.Float32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4))
+        return 0
+
+    # QuantizationParameters
+    def MinAsNumpy(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Float32Flags, o)
+        return 0
+
+    # QuantizationParameters
+    def MinLength(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.VectorLen(o)
+        return 0
+
+    # QuantizationParameters
+    def Max(self, j):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            a = self._tab.Vector(o)
+            return self._tab.Get(flatbuffers.number_types.Float32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4))
+        return 0
+
+    # QuantizationParameters
+    def MaxAsNumpy(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Float32Flags, o)
+        return 0
+
+    # QuantizationParameters
+    def MaxLength(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.VectorLen(o)
+        return 0
+
+    # QuantizationParameters
+    def Scale(self, j):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+        if o != 0:
+            a = self._tab.Vector(o)
+            return self._tab.Get(flatbuffers.number_types.Float32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4))
+        return 0
+
+    # QuantizationParameters
+    def ScaleAsNumpy(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+        if o != 0:
+            return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Float32Flags, o)
+        return 0
+
+    # QuantizationParameters
+    def ScaleLength(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+        if o != 0:
+            return self._tab.VectorLen(o)
+        return 0
+
+    # QuantizationParameters
+    def ZeroPoint(self, j):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
+        if o != 0:
+            a = self._tab.Vector(o)
+            return self._tab.Get(flatbuffers.number_types.Int64Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 8))
+        return 0
+
+    # QuantizationParameters
+    def ZeroPointAsNumpy(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
+        if o != 0:
+            return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int64Flags, o)
+        return 0
+
+    # QuantizationParameters
+    def ZeroPointLength(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
+        if o != 0:
+            return self._tab.VectorLen(o)
+        return 0
+
+    # QuantizationParameters
+    def DetailsType(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Uint8Flags, o + self._tab.Pos)
+        return 0
+
+    # QuantizationParameters
+    def Details(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(14))
+        if o != 0:
+            from flatbuffers.table import Table
+            obj = Table(bytearray(), 0)
+            self._tab.Union(obj, o)
+            return obj
+        return None
+
+    # QuantizationParameters
+    def QuantizedDimension(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(16))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+def QuantizationParametersStart(builder): builder.StartObject(7)
+def QuantizationParametersAddMin(builder, min): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(min), 0)
+def QuantizationParametersStartMinVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def QuantizationParametersAddMax(builder, max): builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(max), 0)
+def QuantizationParametersStartMaxVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def QuantizationParametersAddScale(builder, scale): builder.PrependUOffsetTRelativeSlot(2, flatbuffers.number_types.UOffsetTFlags.py_type(scale), 0)
+def QuantizationParametersStartScaleVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def QuantizationParametersAddZeroPoint(builder, zeroPoint): builder.PrependUOffsetTRelativeSlot(3, flatbuffers.number_types.UOffsetTFlags.py_type(zeroPoint), 0)
+def QuantizationParametersStartZeroPointVector(builder, numElems): return builder.StartVector(8, numElems, 8)
+def QuantizationParametersAddDetailsType(builder, detailsType): builder.PrependUint8Slot(4, detailsType, 0)
+def QuantizationParametersAddDetails(builder, details): builder.PrependUOffsetTRelativeSlot(5, flatbuffers.number_types.UOffsetTFlags.py_type(details), 0)
+def QuantizationParametersAddQuantizedDimension(builder, quantizedDimension): builder.PrependInt32Slot(6, quantizedDimension, 0)
+def QuantizationParametersEnd(builder): return builder.EndObject()
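
QuantizationParameters stores min/max/scale/zero-point as vectors, with both per-element accessors and numpy views. A small sketch of extracting per-tensor quantization, assuming a Tensor reader (generated elsewhere in this patch) that exposes Quantization():

```python
# Illustrative: pull scale/zero-point out of a tensor's quantization parameters.
def tensor_quantization(tensor):
    q = tensor.Quantization()            # QuantizationParameters reader or None
    if q is None or q.ScaleLength() == 0:
        return None
    scale = q.ScaleAsNumpy()             # numpy array of float32 scales
    zero_point = q.ZeroPointAsNumpy()    # numpy array of int64 zero points
    axis = q.QuantizedDimension()        # axis used for per-channel quantization
    return scale, zero_point, axis
```
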
diff --git a/ethosu/vela/tflite/QuantizeOptions.py b/ethosu/vela/tflite/QuantizeOptions.py
new file mode 100644
index 0000000..28af8cc
--- /dev/null
+++ b/ethosu/vela/tflite/QuantizeOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class QuantizeOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsQuantizeOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = QuantizeOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # QuantizeOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def QuantizeOptionsStart(builder): builder.StartObject(0)
+def QuantizeOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/RNNOptions.py b/ethosu/vela/tflite/RNNOptions.py
new file mode 100644
index 0000000..3cfdb6a
--- /dev/null
+++ b/ethosu/vela/tflite/RNNOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class RNNOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsRNNOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = RNNOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # RNNOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # RNNOptions
+    def FusedActivationFunction(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0
+
+def RNNOptionsStart(builder): builder.StartObject(1)
+def RNNOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(0, fusedActivationFunction, 0)
+def RNNOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/RangeOptions.py b/ethosu/vela/tflite/RangeOptions.py
new file mode 100644
index 0000000..cb705b5
--- /dev/null
+++ b/ethosu/vela/tflite/RangeOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class RangeOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsRangeOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = RangeOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # RangeOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def RangeOptionsStart(builder): builder.StartObject(0)
+def RangeOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/RankOptions.py b/ethosu/vela/tflite/RankOptions.py
new file mode 100644
index 0000000..4e4a5ec
--- /dev/null
+++ b/ethosu/vela/tflite/RankOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class RankOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsRankOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = RankOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # RankOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def RankOptionsStart(builder): builder.StartObject(0)
+def RankOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/ReducerOptions.py b/ethosu/vela/tflite/ReducerOptions.py
new file mode 100644
index 0000000..93bbde1
--- /dev/null
+++ b/ethosu/vela/tflite/ReducerOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class ReducerOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsReducerOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = ReducerOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # ReducerOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # ReducerOptions
+    def KeepDims(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos))
+        return False
+
+def ReducerOptionsStart(builder): builder.StartObject(1)
+def ReducerOptionsAddKeepDims(builder, keepDims): builder.PrependBoolSlot(0, keepDims, 0)
+def ReducerOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/ReshapeOptions.py b/ethosu/vela/tflite/ReshapeOptions.py
new file mode 100644
index 0000000..157d45d
--- /dev/null
+++ b/ethosu/vela/tflite/ReshapeOptions.py
@@ -0,0 +1,46 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class ReshapeOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsReshapeOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = ReshapeOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # ReshapeOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # ReshapeOptions
+    def NewShape(self, j):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            a = self._tab.Vector(o)
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4))
+        return 0
+
+    # ReshapeOptions
+    def NewShapeAsNumpy(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o)
+        return 0
+
+    # ReshapeOptions
+    def NewShapeLength(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.VectorLen(o)
+        return 0
+
+def ReshapeOptionsStart(builder): builder.StartObject(1)
+def ReshapeOptionsAddNewShape(builder, newShape): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(newShape), 0)
+def ReshapeOptionsStartNewShapeVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def ReshapeOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/ResizeBilinearOptions.py b/ethosu/vela/tflite/ResizeBilinearOptions.py
new file mode 100644
index 0000000..fb05ca4
--- /dev/null
+++ b/ethosu/vela/tflite/ResizeBilinearOptions.py
@@ -0,0 +1,38 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class ResizeBilinearOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsResizeBilinearOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = ResizeBilinearOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # ResizeBilinearOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # ResizeBilinearOptions
+    def AlignCorners(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+        if o != 0:
+            return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos))
+        return False
+
+    # ResizeBilinearOptions
+    def HalfPixelCenters(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
+        if o != 0:
+            return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos))
+        return False
+
+def ResizeBilinearOptionsStart(builder): builder.StartObject(4)
+def ResizeBilinearOptionsAddAlignCorners(builder, alignCorners): builder.PrependBoolSlot(2, alignCorners, 0)
+def ResizeBilinearOptionsAddHalfPixelCenters(builder, halfPixelCenters): builder.PrependBoolSlot(3, halfPixelCenters, 0)
+def ResizeBilinearOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/ResizeNearestNeighborOptions.py b/ethosu/vela/tflite/ResizeNearestNeighborOptions.py
new file mode 100644
index 0000000..4b166e9
--- /dev/null
+++ b/ethosu/vela/tflite/ResizeNearestNeighborOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class ResizeNearestNeighborOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsResizeNearestNeighborOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = ResizeNearestNeighborOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # ResizeNearestNeighborOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # ResizeNearestNeighborOptions
+    def AlignCorners(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos))
+        return False
+
+def ResizeNearestNeighborOptionsStart(builder): builder.StartObject(1)
+def ResizeNearestNeighborOptionsAddAlignCorners(builder, alignCorners): builder.PrependBoolSlot(0, alignCorners, 0)
+def ResizeNearestNeighborOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/ReverseSequenceOptions.py b/ethosu/vela/tflite/ReverseSequenceOptions.py
new file mode 100644
index 0000000..cbaf96d
--- /dev/null
+++ b/ethosu/vela/tflite/ReverseSequenceOptions.py
@@ -0,0 +1,38 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class ReverseSequenceOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsReverseSequenceOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = ReverseSequenceOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # ReverseSequenceOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # ReverseSequenceOptions
+    def SeqDim(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+    # ReverseSequenceOptions
+    def BatchDim(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+def ReverseSequenceOptionsStart(builder): builder.StartObject(2)
+def ReverseSequenceOptionsAddSeqDim(builder, seqDim): builder.PrependInt32Slot(0, seqDim, 0)
+def ReverseSequenceOptionsAddBatchDim(builder, batchDim): builder.PrependInt32Slot(1, batchDim, 0)
+def ReverseSequenceOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/ReverseV2Options.py b/ethosu/vela/tflite/ReverseV2Options.py
new file mode 100644
index 0000000..dbac936
--- /dev/null
+++ b/ethosu/vela/tflite/ReverseV2Options.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class ReverseV2Options(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsReverseV2Options(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = ReverseV2Options()
+        x.Init(buf, n + offset)
+        return x
+
+    # ReverseV2Options
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def ReverseV2OptionsStart(builder): builder.StartObject(0)
+def ReverseV2OptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/SVDFOptions.py b/ethosu/vela/tflite/SVDFOptions.py
new file mode 100644
index 0000000..6f391db
--- /dev/null
+++ b/ethosu/vela/tflite/SVDFOptions.py
@@ -0,0 +1,38 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class SVDFOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsSVDFOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = SVDFOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # SVDFOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # SVDFOptions
+    def Rank(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+    # SVDFOptions
+    def FusedActivationFunction(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0
+
+def SVDFOptionsStart(builder): builder.StartObject(2)
+def SVDFOptionsAddRank(builder, rank): builder.PrependInt32Slot(0, rank, 0)
+def SVDFOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(1, fusedActivationFunction, 0)
+def SVDFOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/ScatterNdOptions.py b/ethosu/vela/tflite/ScatterNdOptions.py
new file mode 100644
index 0000000..e6bf3a1
--- /dev/null
+++ b/ethosu/vela/tflite/ScatterNdOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class ScatterNdOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsScatterNdOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = ScatterNdOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # ScatterNdOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def ScatterNdOptionsStart(builder): builder.StartObject(0)
+def ScatterNdOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/SegmentSumOptions.py b/ethosu/vela/tflite/SegmentSumOptions.py
new file mode 100644
index 0000000..d1c3213
--- /dev/null
+++ b/ethosu/vela/tflite/SegmentSumOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class SegmentSumOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsSegmentSumOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = SegmentSumOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # SegmentSumOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def SegmentSumOptionsStart(builder): builder.StartObject(0)
+def SegmentSumOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/SelectOptions.py b/ethosu/vela/tflite/SelectOptions.py
new file mode 100644
index 0000000..d67daf3
--- /dev/null
+++ b/ethosu/vela/tflite/SelectOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class SelectOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsSelectOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = SelectOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # SelectOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def SelectOptionsStart(builder): builder.StartObject(0)
+def SelectOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/SelectV2Options.py b/ethosu/vela/tflite/SelectV2Options.py
new file mode 100644
index 0000000..5d03fc2
--- /dev/null
+++ b/ethosu/vela/tflite/SelectV2Options.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class SelectV2Options(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsSelectV2Options(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = SelectV2Options()
+        x.Init(buf, n + offset)
+        return x
+
+    # SelectV2Options
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def SelectV2OptionsStart(builder): builder.StartObject(0)
+def SelectV2OptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/SequenceRNNOptions.py b/ethosu/vela/tflite/SequenceRNNOptions.py
new file mode 100644
index 0000000..74a4954
--- /dev/null
+++ b/ethosu/vela/tflite/SequenceRNNOptions.py
@@ -0,0 +1,38 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class SequenceRNNOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsSequenceRNNOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = SequenceRNNOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # SequenceRNNOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # SequenceRNNOptions
+    def TimeMajor(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos))
+        return False
+
+    # SequenceRNNOptions
+    def FusedActivationFunction(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0
+
+def SequenceRNNOptionsStart(builder): builder.StartObject(2)
+def SequenceRNNOptionsAddTimeMajor(builder, timeMajor): builder.PrependBoolSlot(0, timeMajor, 0)
+def SequenceRNNOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(1, fusedActivationFunction, 0)
+def SequenceRNNOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/ShapeOptions.py b/ethosu/vela/tflite/ShapeOptions.py
new file mode 100644
index 0000000..2d24c05
--- /dev/null
+++ b/ethosu/vela/tflite/ShapeOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class ShapeOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsShapeOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = ShapeOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # ShapeOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # ShapeOptions
+    def OutType(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0
+
+def ShapeOptionsStart(builder): builder.StartObject(1)
+def ShapeOptionsAddOutType(builder, outType): builder.PrependInt8Slot(0, outType, 0)
+def ShapeOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/SkipGramOptions.py b/ethosu/vela/tflite/SkipGramOptions.py
new file mode 100644
index 0000000..0e8bdc1
--- /dev/null
+++ b/ethosu/vela/tflite/SkipGramOptions.py
@@ -0,0 +1,46 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class SkipGramOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsSkipGramOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = SkipGramOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # SkipGramOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # SkipGramOptions
+    def NgramSize(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+    # SkipGramOptions
+    def MaxSkipSize(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+    # SkipGramOptions
+    def IncludeAllNgrams(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+        if o != 0:
+            return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos))
+        return False
+
+def SkipGramOptionsStart(builder): builder.StartObject(3)
+def SkipGramOptionsAddNgramSize(builder, ngramSize): builder.PrependInt32Slot(0, ngramSize, 0)
+def SkipGramOptionsAddMaxSkipSize(builder, maxSkipSize): builder.PrependInt32Slot(1, maxSkipSize, 0)
+def SkipGramOptionsAddIncludeAllNgrams(builder, includeAllNgrams): builder.PrependBoolSlot(2, includeAllNgrams, 0)
+def SkipGramOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/SliceOptions.py b/ethosu/vela/tflite/SliceOptions.py
new file mode 100644
index 0000000..4b41568
--- /dev/null
+++ b/ethosu/vela/tflite/SliceOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class SliceOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsSliceOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = SliceOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # SliceOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def SliceOptionsStart(builder): builder.StartObject(0)
+def SliceOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/SoftmaxOptions.py b/ethosu/vela/tflite/SoftmaxOptions.py
new file mode 100644
index 0000000..a716853
--- /dev/null
+++ b/ethosu/vela/tflite/SoftmaxOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class SoftmaxOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsSoftmaxOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = SoftmaxOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # SoftmaxOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # SoftmaxOptions
+    def Beta(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos)
+        return 0.0
+
+def SoftmaxOptionsStart(builder): builder.StartObject(1)
+def SoftmaxOptionsAddBeta(builder, beta): builder.PrependFloat32Slot(0, beta, 0.0)
+def SoftmaxOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/SpaceToBatchNDOptions.py b/ethosu/vela/tflite/SpaceToBatchNDOptions.py
new file mode 100644
index 0000000..b61ef96
--- /dev/null
+++ b/ethosu/vela/tflite/SpaceToBatchNDOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class SpaceToBatchNDOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsSpaceToBatchNDOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = SpaceToBatchNDOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # SpaceToBatchNDOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def SpaceToBatchNDOptionsStart(builder): builder.StartObject(0)
+def SpaceToBatchNDOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/SpaceToDepthOptions.py b/ethosu/vela/tflite/SpaceToDepthOptions.py
new file mode 100644
index 0000000..d571174
--- /dev/null
+++ b/ethosu/vela/tflite/SpaceToDepthOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class SpaceToDepthOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsSpaceToDepthOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = SpaceToDepthOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # SpaceToDepthOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # SpaceToDepthOptions
+    def BlockSize(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+def SpaceToDepthOptionsStart(builder): builder.StartObject(1)
+def SpaceToDepthOptionsAddBlockSize(builder, blockSize): builder.PrependInt32Slot(0, blockSize, 0)
+def SpaceToDepthOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/SparseIndexVector.py b/ethosu/vela/tflite/SparseIndexVector.py
new file mode 100644
index 0000000..e2c9db7
--- /dev/null
+++ b/ethosu/vela/tflite/SparseIndexVector.py
@@ -0,0 +1,9 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+class SparseIndexVector(object):
+    NONE = 0
+    Int32Vector = 1
+    Uint16Vector = 2
+    Uint8Vector = 3
diff --git a/ethosu/vela/tflite/SparseToDenseOptions.py b/ethosu/vela/tflite/SparseToDenseOptions.py
new file mode 100644
index 0000000..826eee0
--- /dev/null
+++ b/ethosu/vela/tflite/SparseToDenseOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class SparseToDenseOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsSparseToDenseOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = SparseToDenseOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # SparseToDenseOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # SparseToDenseOptions
+    def ValidateIndices(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos))
+        return False
+
+def SparseToDenseOptionsStart(builder): builder.StartObject(1)
+def SparseToDenseOptionsAddValidateIndices(builder, validateIndices): builder.PrependBoolSlot(0, validateIndices, 0)
+def SparseToDenseOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/SparsityParameters.py b/ethosu/vela/tflite/SparsityParameters.py
new file mode 100644
index 0000000..de550a6
--- /dev/null
+++ b/ethosu/vela/tflite/SparsityParameters.py
@@ -0,0 +1,92 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class SparsityParameters(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsSparsityParameters(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = SparsityParameters()
+        x.Init(buf, n + offset)
+        return x
+
+    # SparsityParameters
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # SparsityParameters
+    def TraversalOrder(self, j):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            a = self._tab.Vector(o)
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4))
+        return 0
+
+    # SparsityParameters
+    def TraversalOrderAsNumpy(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o)
+        return 0
+
+    # SparsityParameters
+    def TraversalOrderLength(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.VectorLen(o)
+        return 0
+
+    # SparsityParameters
+    def BlockMap(self, j):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            a = self._tab.Vector(o)
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4))
+        return 0
+
+    # SparsityParameters
+    def BlockMapAsNumpy(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o)
+        return 0
+
+    # SparsityParameters
+    def BlockMapLength(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.VectorLen(o)
+        return 0
+
+    # SparsityParameters
+    def DimMetadata(self, j):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+        if o != 0:
+            x = self._tab.Vector(o)
+            x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 4
+            x = self._tab.Indirect(x)
+            from .DimensionMetadata import DimensionMetadata
+            obj = DimensionMetadata()
+            obj.Init(self._tab.Bytes, x)
+            return obj
+        return None
+
+    # SparsityParameters
+    def DimMetadataLength(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+        if o != 0:
+            return self._tab.VectorLen(o)
+        return 0
+
+def SparsityParametersStart(builder): builder.StartObject(3)
+def SparsityParametersAddTraversalOrder(builder, traversalOrder): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(traversalOrder), 0)
+def SparsityParametersStartTraversalOrderVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def SparsityParametersAddBlockMap(builder, blockMap): builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(blockMap), 0)
+def SparsityParametersStartBlockMapVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def SparsityParametersAddDimMetadata(builder, dimMetadata): builder.PrependUOffsetTRelativeSlot(2, flatbuffers.number_types.UOffsetTFlags.py_type(dimMetadata), 0)
+def SparsityParametersStartDimMetadataVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def SparsityParametersEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/SplitOptions.py b/ethosu/vela/tflite/SplitOptions.py
new file mode 100644
index 0000000..3207525
--- /dev/null
+++ b/ethosu/vela/tflite/SplitOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class SplitOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsSplitOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = SplitOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # SplitOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # SplitOptions
+    def NumSplits(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+def SplitOptionsStart(builder): builder.StartObject(1)
+def SplitOptionsAddNumSplits(builder, numSplits): builder.PrependInt32Slot(0, numSplits, 0)
+def SplitOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/SplitVOptions.py b/ethosu/vela/tflite/SplitVOptions.py
new file mode 100644
index 0000000..418959d
--- /dev/null
+++ b/ethosu/vela/tflite/SplitVOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class SplitVOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsSplitVOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = SplitVOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # SplitVOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # SplitVOptions
+    def NumSplits(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+def SplitVOptionsStart(builder): builder.StartObject(1)
+def SplitVOptionsAddNumSplits(builder, numSplits): builder.PrependInt32Slot(0, numSplits, 0)
+def SplitVOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/SquareOptions.py b/ethosu/vela/tflite/SquareOptions.py
new file mode 100644
index 0000000..56633f6
--- /dev/null
+++ b/ethosu/vela/tflite/SquareOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class SquareOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsSquareOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = SquareOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # SquareOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def SquareOptionsStart(builder): builder.StartObject(0)
+def SquareOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/SquaredDifferenceOptions.py b/ethosu/vela/tflite/SquaredDifferenceOptions.py
new file mode 100644
index 0000000..906855d
--- /dev/null
+++ b/ethosu/vela/tflite/SquaredDifferenceOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class SquaredDifferenceOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsSquaredDifferenceOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = SquaredDifferenceOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # SquaredDifferenceOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def SquaredDifferenceOptionsStart(builder): builder.StartObject(0)
+def SquaredDifferenceOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/SqueezeOptions.py b/ethosu/vela/tflite/SqueezeOptions.py
new file mode 100644
index 0000000..25b294d
--- /dev/null
+++ b/ethosu/vela/tflite/SqueezeOptions.py
@@ -0,0 +1,46 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class SqueezeOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsSqueezeOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = SqueezeOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # SqueezeOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # SqueezeOptions
+    def SqueezeDims(self, j):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            a = self._tab.Vector(o)
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4))
+        return 0
+
+    # SqueezeOptions
+    def SqueezeDimsAsNumpy(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o)
+        return 0
+
+    # SqueezeOptions
+    def SqueezeDimsLength(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.VectorLen(o)
+        return 0
+
+def SqueezeOptionsStart(builder): builder.StartObject(1)
+def SqueezeOptionsAddSqueezeDims(builder, squeezeDims): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(squeezeDims), 0)
+def SqueezeOptionsStartSqueezeDimsVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def SqueezeOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/StridedSliceOptions.py b/ethosu/vela/tflite/StridedSliceOptions.py
new file mode 100644
index 0000000..3bbb36b
--- /dev/null
+++ b/ethosu/vela/tflite/StridedSliceOptions.py
@@ -0,0 +1,62 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class StridedSliceOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsStridedSliceOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = StridedSliceOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # StridedSliceOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # StridedSliceOptions
+    def BeginMask(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+    # StridedSliceOptions
+    def EndMask(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+    # StridedSliceOptions
+    def EllipsisMask(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+    # StridedSliceOptions
+    def NewAxisMask(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+    # StridedSliceOptions
+    def ShrinkAxisMask(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+def StridedSliceOptionsStart(builder): builder.StartObject(5)
+def StridedSliceOptionsAddBeginMask(builder, beginMask): builder.PrependInt32Slot(0, beginMask, 0)
+def StridedSliceOptionsAddEndMask(builder, endMask): builder.PrependInt32Slot(1, endMask, 0)
+def StridedSliceOptionsAddEllipsisMask(builder, ellipsisMask): builder.PrependInt32Slot(2, ellipsisMask, 0)
+def StridedSliceOptionsAddNewAxisMask(builder, newAxisMask): builder.PrependInt32Slot(3, newAxisMask, 0)
+def StridedSliceOptionsAddShrinkAxisMask(builder, shrinkAxisMask): builder.PrependInt32Slot(4, shrinkAxisMask, 0)
+def StridedSliceOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/SubGraph.py b/ethosu/vela/tflite/SubGraph.py
new file mode 100644
index 0000000..eaa42fa
--- /dev/null
+++ b/ethosu/vela/tflite/SubGraph.py
@@ -0,0 +1,122 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class SubGraph(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsSubGraph(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = SubGraph()
+        x.Init(buf, n + offset)
+        return x
+
+    # SubGraph
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # SubGraph
+    def Tensors(self, j):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            x = self._tab.Vector(o)
+            x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 4
+            x = self._tab.Indirect(x)
+            from .Tensor import Tensor
+            obj = Tensor()
+            obj.Init(self._tab.Bytes, x)
+            return obj
+        return None
+
+    # SubGraph
+    def TensorsLength(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.VectorLen(o)
+        return 0
+
+    # SubGraph
+    def Inputs(self, j):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            a = self._tab.Vector(o)
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4))
+        return 0
+
+    # SubGraph
+    def InputsAsNumpy(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o)
+        return 0
+
+    # SubGraph
+    def InputsLength(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.VectorLen(o)
+        return 0
+
+    # SubGraph
+    def Outputs(self, j):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+        if o != 0:
+            a = self._tab.Vector(o)
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4))
+        return 0
+
+    # SubGraph
+    def OutputsAsNumpy(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+        if o != 0:
+            return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o)
+        return 0
+
+    # SubGraph
+    def OutputsLength(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+        if o != 0:
+            return self._tab.VectorLen(o)
+        return 0
+
+    # SubGraph
+    def Operators(self, j):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
+        if o != 0:
+            x = self._tab.Vector(o)
+            x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 4
+            x = self._tab.Indirect(x)
+            from .Operator import Operator
+            obj = Operator()
+            obj.Init(self._tab.Bytes, x)
+            return obj
+        return None
+
+    # SubGraph
+    def OperatorsLength(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
+        if o != 0:
+            return self._tab.VectorLen(o)
+        return 0
+
+    # SubGraph
+    def Name(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12))
+        if o != 0:
+            return self._tab.String(o + self._tab.Pos)
+        return None
+
+def SubGraphStart(builder): builder.StartObject(5)
+def SubGraphAddTensors(builder, tensors): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(tensors), 0)
+def SubGraphStartTensorsVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def SubGraphAddInputs(builder, inputs): builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(inputs), 0)
+def SubGraphStartInputsVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def SubGraphAddOutputs(builder, outputs): builder.PrependUOffsetTRelativeSlot(2, flatbuffers.number_types.UOffsetTFlags.py_type(outputs), 0)
+def SubGraphStartOutputsVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def SubGraphAddOperators(builder, operators): builder.PrependUOffsetTRelativeSlot(3, flatbuffers.number_types.UOffsetTFlags.py_type(operators), 0)
+def SubGraphStartOperatorsVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def SubGraphAddName(builder, name): builder.PrependUOffsetTRelativeSlot(4, flatbuffers.number_types.UOffsetTFlags.py_type(name), 0)
+def SubGraphEnd(builder): return builder.EndObject()
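On the read side these accessors are reached from the model's root table. A minimal, purely illustrative traversal is sketched below; it assumes the generated Model module from the same schema (added elsewhere in this change) and uses "network.tflite" only as a placeholder path:

from ethosu.vela.tflite.Model import Model   # generated from the same schema

with open("network.tflite", "rb") as f:      # placeholder path, illustration only
    buf = bytearray(f.read())

model = Model.GetRootAsModel(buf, 0)
for i in range(model.SubgraphsLength()):
    sg = model.Subgraphs(i)
    print(sg.Name(), sg.InputsAsNumpy(), sg.OutputsAsNumpy())
    for j in range(sg.TensorsLength()):
        t = sg.Tensors(j)
        print("  tensor:", t.Name(), t.ShapeAsNumpy(), t.Type(), "buffer", t.Buffer())
    for k in range(sg.OperatorsLength()):
        op = sg.Operators(k)                 # Operator table, defined elsewhere in this change
        print("  operator: opcode index", op.OpcodeIndex())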
diff --git a/ethosu/vela/tflite/SubOptions.py b/ethosu/vela/tflite/SubOptions.py
new file mode 100644
index 0000000..eccd7ab
--- /dev/null
+++ b/ethosu/vela/tflite/SubOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class SubOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsSubOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = SubOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # SubOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # SubOptions
+    def FusedActivationFunction(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0
+
+def SubOptionsStart(builder): builder.StartObject(1)
+def SubOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(0, fusedActivationFunction, 0)
+def SubOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/Tensor.py b/ethosu/vela/tflite/Tensor.py
new file mode 100644
index 0000000..4c39b7c
--- /dev/null
+++ b/ethosu/vela/tflite/Tensor.py
@@ -0,0 +1,126 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class Tensor(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsTensor(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = Tensor()
+        x.Init(buf, n + offset)
+        return x
+
+    # Tensor
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # Tensor
+    def Shape(self, j):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            a = self._tab.Vector(o)
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4))
+        return 0
+
+    # Tensor
+    def ShapeAsNumpy(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o)
+        return 0
+
+    # Tensor
+    def ShapeLength(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.VectorLen(o)
+        return 0
+
+    # Tensor
+    def Type(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0
+
+    # Tensor
+    def Buffer(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Uint32Flags, o + self._tab.Pos)
+        return 0
+
+    # Tensor
+    def Name(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
+        if o != 0:
+            return self._tab.String(o + self._tab.Pos)
+        return None
+
+    # Tensor
+    def Quantization(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12))
+        if o != 0:
+            x = self._tab.Indirect(o + self._tab.Pos)
+            from .QuantizationParameters import QuantizationParameters
+            obj = QuantizationParameters()
+            obj.Init(self._tab.Bytes, x)
+            return obj
+        return None
+
+    # Tensor
+    def IsVariable(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(14))
+        if o != 0:
+            return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos))
+        return False
+
+    # Tensor
+    def Sparsity(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(16))
+        if o != 0:
+            x = self._tab.Indirect(o + self._tab.Pos)
+            from .SparsityParameters import SparsityParameters
+            obj = SparsityParameters()
+            obj.Init(self._tab.Bytes, x)
+            return obj
+        return None
+
+    # Tensor
+    def ShapeSignature(self, j):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(18))
+        if o != 0:
+            a = self._tab.Vector(o)
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4))
+        return 0
+
+    # Tensor
+    def ShapeSignatureAsNumpy(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(18))
+        if o != 0:
+            return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o)
+        return 0
+
+    # Tensor
+    def ShapeSignatureLength(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(18))
+        if o != 0:
+            return self._tab.VectorLen(o)
+        return 0
+
+def TensorStart(builder): builder.StartObject(8)
+def TensorAddShape(builder, shape): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(shape), 0)
+def TensorStartShapeVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def TensorAddType(builder, type): builder.PrependInt8Slot(1, type, 0)
+def TensorAddBuffer(builder, buffer): builder.PrependUint32Slot(2, buffer, 0)
+def TensorAddName(builder, name): builder.PrependUOffsetTRelativeSlot(3, flatbuffers.number_types.UOffsetTFlags.py_type(name), 0)
+def TensorAddQuantization(builder, quantization): builder.PrependUOffsetTRelativeSlot(4, flatbuffers.number_types.UOffsetTFlags.py_type(quantization), 0)
+def TensorAddIsVariable(builder, isVariable): builder.PrependBoolSlot(5, isVariable, 0)
+def TensorAddSparsity(builder, sparsity): builder.PrependUOffsetTRelativeSlot(6, flatbuffers.number_types.UOffsetTFlags.py_type(sparsity), 0)
+def TensorAddShapeSignature(builder, shapeSignature): builder.PrependUOffsetTRelativeSlot(7, flatbuffers.number_types.UOffsetTFlags.py_type(shapeSignature), 0)
+def TensorStartShapeSignatureVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def TensorEnd(builder): return builder.EndObject()
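Quantization metadata hangs off each Tensor as a nested QuantizationParameters table (generated elsewhere in this change). A small illustrative helper for the common per-tensor case is sketched below, where `tensor` is any Tensor such as those yielded by the traversal sketch after SubGraph.py:

def read_per_tensor_quantization(tensor):
    # Illustrative sketch only: returns (scale, zero_point) or None.
    q = tensor.Quantization()
    if q is None or q.ScaleLength() == 0:
        return None
    # Per-tensor quantization carries a single scale / zero-point pair.
    return float(q.Scale(0)), int(q.ZeroPoint(0))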
diff --git a/ethosu/vela/tflite/TensorType.py b/ethosu/vela/tflite/TensorType.py
new file mode 100644
index 0000000..53c011b
--- /dev/null
+++ b/ethosu/vela/tflite/TensorType.py
@@ -0,0 +1,15 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+class TensorType(object):
+    FLOAT32 = 0
+    FLOAT16 = 1
+    INT32 = 2
+    UINT8 = 3
+    INT64 = 4
+    STRING = 5
+    BOOL = 6
+    INT16 = 7
+    COMPLEX64 = 8
+    INT8 = 9
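These type codes are what tflite_mapping.py (added below) translates into Vela's internal data types. As an illustration of the shape that translation takes, a plain numpy lookup could look like the following; the dictionary name is illustrative and the real tables live in tflite_mapping.py:

import numpy as np
from ethosu.vela.tflite.TensorType import TensorType

tensor_type_to_numpy = {
    TensorType.FLOAT32: np.float32,
    TensorType.FLOAT16: np.float16,
    TensorType.INT32: np.int32,
    TensorType.UINT8: np.uint8,
    TensorType.INT64: np.int64,
    TensorType.BOOL: np.bool_,
    TensorType.INT16: np.int16,
    TensorType.INT8: np.int8,
}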
diff --git a/ethosu/vela/tflite/TileOptions.py b/ethosu/vela/tflite/TileOptions.py
new file mode 100644
index 0000000..ec8396d
--- /dev/null
+++ b/ethosu/vela/tflite/TileOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class TileOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsTileOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = TileOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # TileOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def TileOptionsStart(builder): builder.StartObject(0)
+def TileOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/TopKV2Options.py b/ethosu/vela/tflite/TopKV2Options.py
new file mode 100644
index 0000000..ccd5103
--- /dev/null
+++ b/ethosu/vela/tflite/TopKV2Options.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class TopKV2Options(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsTopKV2Options(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = TopKV2Options()
+        x.Init(buf, n + offset)
+        return x
+
+    # TopKV2Options
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def TopKV2OptionsStart(builder): builder.StartObject(0)
+def TopKV2OptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/TransposeConvOptions.py b/ethosu/vela/tflite/TransposeConvOptions.py
new file mode 100644
index 0000000..423571c
--- /dev/null
+++ b/ethosu/vela/tflite/TransposeConvOptions.py
@@ -0,0 +1,46 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class TransposeConvOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsTransposeConvOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = TransposeConvOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # TransposeConvOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # TransposeConvOptions
+    def Padding(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0
+
+    # TransposeConvOptions
+    def StrideW(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+    # TransposeConvOptions
+    def StrideH(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+def TransposeConvOptionsStart(builder): builder.StartObject(3)
+def TransposeConvOptionsAddPadding(builder, padding): builder.PrependInt8Slot(0, padding, 0)
+def TransposeConvOptionsAddStrideW(builder, strideW): builder.PrependInt32Slot(1, strideW, 0)
+def TransposeConvOptionsAddStrideH(builder, strideH): builder.PrependInt32Slot(2, strideH, 0)
+def TransposeConvOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/TransposeOptions.py b/ethosu/vela/tflite/TransposeOptions.py
new file mode 100644
index 0000000..42c596d
--- /dev/null
+++ b/ethosu/vela/tflite/TransposeOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class TransposeOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsTransposeOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = TransposeOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # TransposeOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def TransposeOptionsStart(builder): builder.StartObject(0)
+def TransposeOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/Uint16Vector.py b/ethosu/vela/tflite/Uint16Vector.py
new file mode 100644
index 0000000..750e52a
--- /dev/null
+++ b/ethosu/vela/tflite/Uint16Vector.py
@@ -0,0 +1,46 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class Uint16Vector(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsUint16Vector(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = Uint16Vector()
+        x.Init(buf, n + offset)
+        return x
+
+    # Uint16Vector
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # Uint16Vector
+    def Values(self, j):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            a = self._tab.Vector(o)
+            return self._tab.Get(flatbuffers.number_types.Uint16Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 2))
+        return 0
+
+    # Uint16Vector
+    def ValuesAsNumpy(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Uint16Flags, o)
+        return 0
+
+    # Uint16Vector
+    def ValuesLength(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.VectorLen(o)
+        return 0
+
+def Uint16VectorStart(builder): builder.StartObject(1)
+def Uint16VectorAddValues(builder, values): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(values), 0)
+def Uint16VectorStartValuesVector(builder, numElems): return builder.StartVector(2, numElems, 2)
+def Uint16VectorEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/Uint8Vector.py b/ethosu/vela/tflite/Uint8Vector.py
new file mode 100644
index 0000000..dc475f9
--- /dev/null
+++ b/ethosu/vela/tflite/Uint8Vector.py
@@ -0,0 +1,46 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class Uint8Vector(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsUint8Vector(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = Uint8Vector()
+        x.Init(buf, n + offset)
+        return x
+
+    # Uint8Vector
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # Uint8Vector
+    def Values(self, j):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            a = self._tab.Vector(o)
+            return self._tab.Get(flatbuffers.number_types.Uint8Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 1))
+        return 0
+
+    # Uint8Vector
+    def ValuesAsNumpy(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Uint8Flags, o)
+        return 0
+
+    # Uint8Vector
+    def ValuesLength(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.VectorLen(o)
+        return 0
+
+def Uint8VectorStart(builder): builder.StartObject(1)
+def Uint8VectorAddValues(builder, values): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(values), 0)
+def Uint8VectorStartValuesVector(builder, numElems): return builder.StartVector(1, numElems, 1)
+def Uint8VectorEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/UnidirectionalSequenceLSTMOptions.py b/ethosu/vela/tflite/UnidirectionalSequenceLSTMOptions.py
new file mode 100644
index 0000000..1b0c112
--- /dev/null
+++ b/ethosu/vela/tflite/UnidirectionalSequenceLSTMOptions.py
@@ -0,0 +1,54 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class UnidirectionalSequenceLSTMOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsUnidirectionalSequenceLSTMOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = UnidirectionalSequenceLSTMOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # UnidirectionalSequenceLSTMOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # UnidirectionalSequenceLSTMOptions
+    def FusedActivationFunction(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0
+
+    # UnidirectionalSequenceLSTMOptions
+    def CellClip(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos)
+        return 0.0
+
+    # UnidirectionalSequenceLSTMOptions
+    def ProjClip(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos)
+        return 0.0
+
+    # UnidirectionalSequenceLSTMOptions
+    def TimeMajor(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
+        if o != 0:
+            return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos))
+        return False
+
+def UnidirectionalSequenceLSTMOptionsStart(builder): builder.StartObject(4)
+def UnidirectionalSequenceLSTMOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(0, fusedActivationFunction, 0)
+def UnidirectionalSequenceLSTMOptionsAddCellClip(builder, cellClip): builder.PrependFloat32Slot(1, cellClip, 0.0)
+def UnidirectionalSequenceLSTMOptionsAddProjClip(builder, projClip): builder.PrependFloat32Slot(2, projClip, 0.0)
+def UnidirectionalSequenceLSTMOptionsAddTimeMajor(builder, timeMajor): builder.PrependBoolSlot(3, timeMajor, 0)
+def UnidirectionalSequenceLSTMOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/UniqueOptions.py b/ethosu/vela/tflite/UniqueOptions.py
new file mode 100644
index 0000000..841c697
--- /dev/null
+++ b/ethosu/vela/tflite/UniqueOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class UniqueOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsUniqueOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = UniqueOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # UniqueOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # UniqueOptions
+    def IdxOutType(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 2
+
+def UniqueOptionsStart(builder): builder.StartObject(1)
+def UniqueOptionsAddIdxOutType(builder, idxOutType): builder.PrependInt8Slot(0, idxOutType, 2)
+def UniqueOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/UnpackOptions.py b/ethosu/vela/tflite/UnpackOptions.py
new file mode 100644
index 0000000..eed4019
--- /dev/null
+++ b/ethosu/vela/tflite/UnpackOptions.py
@@ -0,0 +1,38 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class UnpackOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsUnpackOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = UnpackOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # UnpackOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # UnpackOptions
+    def Num(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+    # UnpackOptions
+    def Axis(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+def UnpackOptionsStart(builder): builder.StartObject(2)
+def UnpackOptionsAddNum(builder, num): builder.PrependInt32Slot(0, num, 0)
+def UnpackOptionsAddAxis(builder, axis): builder.PrependInt32Slot(1, axis, 0)
+def UnpackOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/WhereOptions.py b/ethosu/vela/tflite/WhereOptions.py
new file mode 100644
index 0000000..ab69f6a
--- /dev/null
+++ b/ethosu/vela/tflite/WhereOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class WhereOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsWhereOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = WhereOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # WhereOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def WhereOptionsStart(builder): builder.StartObject(0)
+def WhereOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/WhileOptions.py b/ethosu/vela/tflite/WhileOptions.py
new file mode 100644
index 0000000..7d5a6df
--- /dev/null
+++ b/ethosu/vela/tflite/WhileOptions.py
@@ -0,0 +1,38 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class WhileOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsWhileOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = WhileOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # WhileOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # WhileOptions
+    def CondSubgraphIndex(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+    # WhileOptions
+    def BodySubgraphIndex(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+def WhileOptionsStart(builder): builder.StartObject(2)
+def WhileOptionsAddCondSubgraphIndex(builder, condSubgraphIndex): builder.PrependInt32Slot(0, condSubgraphIndex, 0)
+def WhileOptionsAddBodySubgraphIndex(builder, bodySubgraphIndex): builder.PrependInt32Slot(1, bodySubgraphIndex, 0)
+def WhileOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/ZerosLikeOptions.py b/ethosu/vela/tflite/ZerosLikeOptions.py
new file mode 100644
index 0000000..e6aa963
--- /dev/null
+++ b/ethosu/vela/tflite/ZerosLikeOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class ZerosLikeOptions(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAsZerosLikeOptions(cls, buf, offset):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = ZerosLikeOptions()
+        x.Init(buf, n + offset)
+        return x
+
+    # ZerosLikeOptions
+    def Init(self, buf, pos):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+def ZerosLikeOptionsStart(builder): builder.StartObject(0)
+def ZerosLikeOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/__init__.py b/ethosu/vela/tflite/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/ethosu/vela/tflite/__init__.py
diff --git a/ethosu/vela/tflite_mapping.py b/ethosu/vela/tflite_mapping.py
new file mode 100644
index 0000000..8e46ef2
--- /dev/null
+++ b/ethosu/vela/tflite_mapping.py
@@ -0,0 +1,644 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# TensorFlow Lite mapping functions used by both reader and writer.
+# Contains a mapping from the various TensorFlow Lite enums and options structs, generated by the FlatBuffer code
+# generator, to Vela's internal format.
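+#
+# Illustrative use of the tables below (a sketch, not code from this module):
+#     op_name, serializer = builtin_operator_map[BuiltinOperator.CONV_2D]
+# yields ("Conv2DBiasAct", <OptionsSerializer for Conv2DOptions>), whose deserialize()/serialize()
+# methods convert between the flatbuffer options table and Vela's attribute dict.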
+
+import numpy as np
+import struct
+
+from .data_type import DataType
+
+from .tflite.TensorType import TensorType
+from .tflite.BuiltinOperator import BuiltinOperator
+from .tflite.BuiltinOptions import BuiltinOptions
+
+
+from .tflite.Padding import Padding
+from .tflite.ActivationFunctionType import ActivationFunctionType
+
+from .tflite import Conv2DOptions
+from .tflite import DepthwiseConv2DOptions
+from .tflite import ConcatEmbeddingsOptions
+from .tflite import LSHProjectionOptions
+from .tflite import Pool2DOptions
+from .tflite import SVDFOptions
+from .tflite import RNNOptions
+from .tflite import FullyConnectedOptions
+from .tflite import SoftmaxOptions
+from .tflite import ConcatenationOptions
+from .tflite import AddOptions
+from .tflite import L2NormOptions
+from .tflite import LocalResponseNormalizationOptions
+from .tflite import LSTMOptions
+from .tflite import ResizeBilinearOptions
+from .tflite import CallOptions
+from .tflite import ReshapeOptions
+from .tflite import SkipGramOptions
+from .tflite import SpaceToDepthOptions
+from .tflite import EmbeddingLookupSparseOptions
+from .tflite import MulOptions
+from .tflite import PadOptions
+from .tflite import GatherOptions
+from .tflite import BatchToSpaceNDOptions
+from .tflite import SpaceToBatchNDOptions
+from .tflite import TransposeOptions
+from .tflite import ReducerOptions
+from .tflite import SubOptions
+from .tflite import DivOptions
+from .tflite import SqueezeOptions
+from .tflite import SequenceRNNOptions
+from .tflite import StridedSliceOptions
+from .tflite import ExpOptions
+from .tflite import TopKV2Options
+from .tflite import SplitOptions
+from .tflite import LogSoftmaxOptions
+from .tflite import CastOptions
+from .tflite import DequantizeOptions
+from .tflite import MaximumMinimumOptions
+from .tflite import ArgMaxOptions
+from .tflite import LessOptions
+from .tflite import NegOptions
+from .tflite import PadV2Options
+from .tflite import GreaterOptions
+from .tflite import GreaterEqualOptions
+from .tflite import LessEqualOptions
+from .tflite import SelectOptions
+from .tflite import SliceOptions
+from .tflite import TransposeConvOptions
+from .tflite import SparseToDenseOptions
+from .tflite import TileOptions
+from .tflite import ExpandDimsOptions
+from .tflite import EqualOptions
+from .tflite import NotEqualOptions
+from .tflite import ShapeOptions
+from .tflite import PowOptions
+from .tflite import ArgMinOptions
+from .tflite import FakeQuantOptions
+from .tflite import PackOptions
+from .tflite import LogicalOrOptions
+from .tflite import OneHotOptions
+from .tflite import LogicalAndOptions
+from .tflite import LogicalNotOptions
+from .tflite import UnpackOptions
+from .tflite import FloorDivOptions
+from .tflite import SquareOptions
+from .tflite import ZerosLikeOptions
+from .tflite import FillOptions
+from .tflite import BidirectionalSequenceLSTMOptions
+from .tflite import BidirectionalSequenceRNNOptions
+from .tflite import UnidirectionalSequenceLSTMOptions
+from .tflite import FloorModOptions
+from .tflite import RangeOptions
+from .tflite import ResizeNearestNeighborOptions
+from .tflite import LeakyReluOptions
+from .tflite import SquaredDifferenceOptions
+from .tflite import MirrorPadOptions
+from .tflite import AbsOptions
+from .tflite import SplitVOptions
+from .tflite import UniqueOptions
+from .tflite import ReverseV2Options
+from .tflite import AddNOptions
+from .tflite import GatherNdOptions
+from .tflite import CosOptions
+from .tflite import WhereOptions
+from .tflite import RankOptions
+from .tflite import ReverseSequenceOptions
+from .tflite import MatrixDiagOptions
+from .tflite import QuantizeOptions
+from .tflite import MatrixSetDiagOptions
+from .tflite import DensifyOptions
+from .tflite import DepthToSpaceOptions
+from .tflite import IfOptions
+from .tflite import NonMaxSuppressionV4Options
+from .tflite import NonMaxSuppressionV5Options
+from .tflite import ScatterNdOptions
+from .tflite import SegmentSumOptions
+from .tflite import SelectV2Options
+from .tflite import WhileOptions
+
+
+def inverse_map(map):
+    return {v: k for k, v in map.items()}
+
+
+datatype_map = {
+    TensorType.UINT8: DataType.uint8,
+    TensorType.INT8: DataType.int8,
+    TensorType.INT16: DataType.int16,
+    TensorType.INT32: DataType.int32,
+    TensorType.INT64: DataType.int64,
+    TensorType.FLOAT16: DataType.float16,
+    TensorType.FLOAT32: DataType.float32,
+    TensorType.STRING: DataType.string,
+    TensorType.BOOL: DataType.bool,
+    # no TensorType.COMPLEX64 for now
+}
+
+datatype_inv_map = inverse_map(datatype_map)
+datatype_inv_map[DataType.quint8] = TensorType.UINT8
+
+datatype_inv_map[DataType.qint8] = TensorType.INT8
+datatype_inv_map[DataType.qint16] = TensorType.INT16
+datatype_inv_map[DataType.qint32] = TensorType.INT32
+
+
+datatype_map_numpy = {
+    TensorType.UINT8: np.uint8,
+    TensorType.INT8: np.int8,
+    TensorType.INT16: np.int16,
+    TensorType.INT32: np.int32,
+    TensorType.INT64: np.int64,
+    TensorType.FLOAT16: np.float16,
+    TensorType.FLOAT32: np.float32,
+    TensorType.BOOL: np.bool,
+}
+
+
+builtin_options_map = {
+    BuiltinOptions.Conv2DOptions: Conv2DOptions.Conv2DOptions,
+    BuiltinOptions.DepthwiseConv2DOptions: DepthwiseConv2DOptions.DepthwiseConv2DOptions,
+    BuiltinOptions.ConcatEmbeddingsOptions: ConcatEmbeddingsOptions.ConcatEmbeddingsOptions,
+    BuiltinOptions.LSHProjectionOptions: LSHProjectionOptions.LSHProjectionOptions,
+    BuiltinOptions.Pool2DOptions: Pool2DOptions.Pool2DOptions,
+    BuiltinOptions.SVDFOptions: SVDFOptions.SVDFOptions,
+    BuiltinOptions.RNNOptions: RNNOptions.RNNOptions,
+    BuiltinOptions.FullyConnectedOptions: FullyConnectedOptions.FullyConnectedOptions,
+    BuiltinOptions.SoftmaxOptions: SoftmaxOptions.SoftmaxOptions,
+    BuiltinOptions.ConcatenationOptions: ConcatenationOptions.ConcatenationOptions,
+    BuiltinOptions.AddOptions: AddOptions.AddOptions,
+    BuiltinOptions.L2NormOptions: L2NormOptions.L2NormOptions,
+    BuiltinOptions.LocalResponseNormalizationOptions: LocalResponseNormalizationOptions.LocalResponseNormalizationOptions,  # noqa: E501
+    BuiltinOptions.LSTMOptions: LSTMOptions.LSTMOptions,
+    BuiltinOptions.ResizeBilinearOptions: ResizeBilinearOptions.ResizeBilinearOptions,
+    BuiltinOptions.CallOptions: CallOptions.CallOptions,
+    BuiltinOptions.ReshapeOptions: ReshapeOptions.ReshapeOptions,
+    BuiltinOptions.SkipGramOptions: SkipGramOptions.SkipGramOptions,
+    BuiltinOptions.SpaceToDepthOptions: SpaceToDepthOptions.SpaceToDepthOptions,
+    BuiltinOptions.EmbeddingLookupSparseOptions: EmbeddingLookupSparseOptions.EmbeddingLookupSparseOptions,
+    BuiltinOptions.MulOptions: MulOptions.MulOptions,
+    BuiltinOptions.PadOptions: PadOptions.PadOptions,
+    BuiltinOptions.GatherOptions: GatherOptions.GatherOptions,
+    BuiltinOptions.BatchToSpaceNDOptions: BatchToSpaceNDOptions.BatchToSpaceNDOptions,
+    BuiltinOptions.SpaceToBatchNDOptions: SpaceToBatchNDOptions.SpaceToBatchNDOptions,
+    BuiltinOptions.TransposeOptions: TransposeOptions.TransposeOptions,
+    BuiltinOptions.ReducerOptions: ReducerOptions.ReducerOptions,
+    BuiltinOptions.SubOptions: SubOptions.SubOptions,
+    BuiltinOptions.DivOptions: DivOptions.DivOptions,
+    BuiltinOptions.SqueezeOptions: SqueezeOptions.SqueezeOptions,
+    BuiltinOptions.SequenceRNNOptions: SequenceRNNOptions.SequenceRNNOptions,
+    BuiltinOptions.StridedSliceOptions: StridedSliceOptions.StridedSliceOptions,
+    BuiltinOptions.ExpOptions: ExpOptions.ExpOptions,
+    BuiltinOptions.TopKV2Options: TopKV2Options.TopKV2Options,
+    BuiltinOptions.SplitOptions: SplitOptions.SplitOptions,
+    BuiltinOptions.LogSoftmaxOptions: LogSoftmaxOptions.LogSoftmaxOptions,
+    BuiltinOptions.CastOptions: CastOptions.CastOptions,
+    BuiltinOptions.DequantizeOptions: DequantizeOptions.DequantizeOptions,
+    BuiltinOptions.MaximumMinimumOptions: MaximumMinimumOptions.MaximumMinimumOptions,
+    BuiltinOptions.ArgMaxOptions: ArgMaxOptions.ArgMaxOptions,
+    BuiltinOptions.LessOptions: LessOptions.LessOptions,
+    BuiltinOptions.NegOptions: NegOptions.NegOptions,
+    BuiltinOptions.PadV2Options: PadV2Options.PadV2Options,
+    BuiltinOptions.GreaterOptions: GreaterOptions.GreaterOptions,
+    BuiltinOptions.GreaterEqualOptions: GreaterEqualOptions.GreaterEqualOptions,
+    BuiltinOptions.LessEqualOptions: LessEqualOptions.LessEqualOptions,
+    BuiltinOptions.SelectOptions: SelectOptions.SelectOptions,
+    BuiltinOptions.SliceOptions: SliceOptions.SliceOptions,
+    BuiltinOptions.TransposeConvOptions: TransposeConvOptions.TransposeConvOptions,
+    BuiltinOptions.SparseToDenseOptions: SparseToDenseOptions.SparseToDenseOptions,
+    BuiltinOptions.TileOptions: TileOptions.TileOptions,
+    BuiltinOptions.ExpandDimsOptions: ExpandDimsOptions.ExpandDimsOptions,
+    BuiltinOptions.EqualOptions: EqualOptions.EqualOptions,
+    BuiltinOptions.NotEqualOptions: NotEqualOptions.NotEqualOptions,
+    BuiltinOptions.ShapeOptions: ShapeOptions.ShapeOptions,
+    BuiltinOptions.PowOptions: PowOptions.PowOptions,
+    BuiltinOptions.ArgMinOptions: ArgMinOptions.ArgMinOptions,
+    BuiltinOptions.FakeQuantOptions: FakeQuantOptions.FakeQuantOptions,
+    BuiltinOptions.PackOptions: PackOptions.PackOptions,
+    BuiltinOptions.LogicalOrOptions: LogicalOrOptions.LogicalOrOptions,
+    BuiltinOptions.OneHotOptions: OneHotOptions.OneHotOptions,
+    BuiltinOptions.LogicalAndOptions: LogicalAndOptions.LogicalAndOptions,
+    BuiltinOptions.LogicalNotOptions: LogicalNotOptions.LogicalNotOptions,
+    BuiltinOptions.UnpackOptions: UnpackOptions.UnpackOptions,
+    BuiltinOptions.FloorDivOptions: FloorDivOptions.FloorDivOptions,
+    BuiltinOptions.SquareOptions: SquareOptions.SquareOptions,
+    BuiltinOptions.ZerosLikeOptions: ZerosLikeOptions.ZerosLikeOptions,
+    BuiltinOptions.FillOptions: FillOptions.FillOptions,
+    BuiltinOptions.BidirectionalSequenceLSTMOptions: BidirectionalSequenceLSTMOptions.BidirectionalSequenceLSTMOptions,
+    BuiltinOptions.BidirectionalSequenceRNNOptions: BidirectionalSequenceRNNOptions.BidirectionalSequenceRNNOptions,
+    BuiltinOptions.UnidirectionalSequenceLSTMOptions: UnidirectionalSequenceLSTMOptions.UnidirectionalSequenceLSTMOptions,  # noqa: E501
+    BuiltinOptions.FloorModOptions: FloorModOptions.FloorModOptions,
+    BuiltinOptions.RangeOptions: RangeOptions.RangeOptions,
+    BuiltinOptions.ResizeNearestNeighborOptions: ResizeNearestNeighborOptions.ResizeNearestNeighborOptions,
+    BuiltinOptions.LeakyReluOptions: LeakyReluOptions.LeakyReluOptions,
+    BuiltinOptions.SquaredDifferenceOptions: SquaredDifferenceOptions.SquaredDifferenceOptions,
+    BuiltinOptions.MirrorPadOptions: MirrorPadOptions.MirrorPadOptions,
+    BuiltinOptions.AbsOptions: AbsOptions.AbsOptions,
+    BuiltinOptions.SplitVOptions: SplitVOptions.SplitVOptions,
+    BuiltinOptions.UniqueOptions: UniqueOptions.UniqueOptions,
+    BuiltinOptions.ReverseV2Options: ReverseV2Options.ReverseV2Options,
+    BuiltinOptions.AddNOptions: AddNOptions.AddNOptions,
+    BuiltinOptions.GatherNdOptions: GatherNdOptions.GatherNdOptions,
+    BuiltinOptions.CosOptions: CosOptions.CosOptions,
+    BuiltinOptions.WhereOptions: WhereOptions.WhereOptions,
+    BuiltinOptions.RankOptions: RankOptions.RankOptions,
+    BuiltinOptions.ReverseSequenceOptions: ReverseSequenceOptions.ReverseSequenceOptions,
+    BuiltinOptions.MatrixDiagOptions: MatrixDiagOptions.MatrixDiagOptions,
+    BuiltinOptions.QuantizeOptions: QuantizeOptions.QuantizeOptions,
+    BuiltinOptions.MatrixSetDiagOptions: MatrixSetDiagOptions.MatrixSetDiagOptions,
+    BuiltinOptions.DensifyOptions: DensifyOptions.DensifyOptions,
+    BuiltinOptions.DepthToSpaceOptions: DepthToSpaceOptions.DepthToSpaceOptions,
+    BuiltinOptions.IfOptions: IfOptions.IfOptions,
+    BuiltinOptions.NonMaxSuppressionV4Options: NonMaxSuppressionV4Options.NonMaxSuppressionV4Options,
+    BuiltinOptions.NonMaxSuppressionV5Options: NonMaxSuppressionV5Options.NonMaxSuppressionV5Options,
+    BuiltinOptions.ScatterNdOptions: ScatterNdOptions.ScatterNdOptions,
+    BuiltinOptions.SegmentSumOptions: SegmentSumOptions.SegmentSumOptions,
+    BuiltinOptions.SelectV2Options: SelectV2Options.SelectV2Options,
+    BuiltinOptions.WhileOptions: WhileOptions.WhileOptions,
+}
+
+builtin_options_inv_map = inverse_map(builtin_options_map)
+
+
+def underscore_to_camel_case(s):
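+    # e.g. "fused_activation_function" -> "FusedActivationFunction"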
+    return "".join(x.title() for x in s.split("_"))
+
+
+def padding_deserialize(x):
+    return padding_map[x]
+
+
+def padding_serialize(builder, x):
+    return padding_inv_map[x]
+
+
+def activation_deserialize(x):
+    return activation_function_map[x]
+
+
+def activation_serialize(builder, x):
+    return activation_function_inv_map[x]
+
+
+def datatype_deserialize(x):
+    return datatype_map[x]
+
+
+def datatype_serialize(builder, x):
+    return datatype_inv_map[x]
+
+
+def identity(x):
+    return x
+
+
+def identity_serialize(builder, x):
+    return x
+
+
+def write_byte_vector(builder, v):
+    builder.StartVector(1, len(v), 1)
+    for e in v[::-1]:
+        builder.PrependByte(e)
+    return builder.EndVector(len(v))
+
+
+def write_int_vector(builder, v):
+    builder.StartVector(4, len(v), 4)
+    for e in v[::-1]:
+        builder.PrependInt32(e)
+    return builder.EndVector(len(v))
+
+
+class OptionsSerializer:
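+    # Converts one TensorFlow Lite builtin options table to and from a Vela attribute dict.
+    # Each entry in `members` is a plain attribute name, a (name, deserialize, serialize) tuple,
+    # or a (name, is_vector) tuple for int-vector attributes (see the serializer instances below).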
+    def __init__(self, name, members=[]):
+        self.name = name
+        self.module = globals()[self.name]
+        self.cls = getattr(self.module, self.name)
+        self.builtin_opt_type = builtin_options_inv_map[self.cls]
+        self.custom_opt_format = 0
+        self.members = []
+        for mem in members:
+            deserialize = identity
+            serialize = identity_serialize
+            is_vector = False
+            if isinstance(mem, tuple):
+                if len(mem) == 3:
+                    mem, deserialize, serialize = mem
+                elif len(mem) == 2:
+                    mem, is_vector = mem
+                    deserialize = tuple
+                    serialize = write_int_vector
+                else:
+                    assert 0
+            underscore_mem = mem
+            camelcase_mem = underscore_to_camel_case(mem)
+            self.members.append((underscore_mem, camelcase_mem, deserialize, serialize, is_vector))
+
+    def deserialize(self, builtin_data, custom_data):
+        attrs = {}
+        if builtin_data:
+            tfattrs = self.cls()
+            tfattrs.Init(builtin_data.Bytes, builtin_data.Pos)
+            for underscore_mem, camelcase_mem, deserialize, serialize, is_vector in self.members:
+                fun = camelcase_mem
+                if is_vector:
+                    fun += "AsNumpy"
+
+                a = deserialize(getattr(tfattrs, fun)())
+                attrs[underscore_mem] = a
+        return attrs
+
+    def serialize(self, builder, attrs):
+        ser_attrs = []
+        for underscore_mem, camelcase_mem, deserialize, serialize, is_vector in self.members:
+            a = serialize(builder, attrs[underscore_mem])
+            ser_attrs.append((camelcase_mem, a))
+
+        getattr(self.module, self.name + "Start")(builder)
+
+        for camelcase_mem, a in ser_attrs:
+            getattr(self.module, self.name + "Add" + camelcase_mem)(builder, a)
+
+        return getattr(self.module, self.name + "End")(builder), None
+
+
+class CustomOptionsSerializer:
+    def __init__(self):
+        self.builtin_opt_type = 0
+        self.custom_opt_format = 0
+
+    def deserialize(self, builtin_data, custom_data):
+        attrs = {}
+        attrs["custom_options"] = custom_data
+        return attrs
+
+    def serialize(self, builder, attrs):
+
+        custom_opts = attrs.get("custom_options", [])
+        custom_data = []
+
+        # Set NPU op custom options for the TensorFlow Lite custom operator
+        if custom_opts["type"] == "NpuOp":
+            custom_data = [0x01, 0x04, 0x01]  # NpuOp=1, FlexbufferFormat.UINT8=4, byte length=1
+
+        custom_data_bytes = struct.pack("<{0}B".format(len(custom_data)), *custom_data)
+        custom_offset = write_byte_vector(builder, custom_data_bytes)
+
+        return None, custom_offset
+
+
+padding_map = {
+    Padding.SAME: b"SAME",
+    Padding.VALID: b"VALID",
+}
+
+padding_inv_map = inverse_map(padding_map)
+
+
+activation_function_map = {
+    ActivationFunctionType.NONE: None,
+    ActivationFunctionType.RELU: "Relu",
+    ActivationFunctionType.RELU_N1_TO_1: "ReluN1To1",
+    ActivationFunctionType.RELU6: "Relu6",
+    ActivationFunctionType.TANH: "Tanh",
+    ActivationFunctionType.SIGN_BIT: "SignBit",
+}
+
+activation_function_inv_map = inverse_map(activation_function_map)
+
+fused_act = ("fused_activation_function", activation_deserialize, activation_serialize)
+padding = ("padding", padding_deserialize, padding_serialize)
+
+pool2d_opts = OptionsSerializer(
+    "Pool2DOptions", (padding, "stride_w", "stride_h", "filter_width", "filter_height", fused_act,)
+)
+
+depthwise_opts = OptionsSerializer(
+    "DepthwiseConv2DOptions",
+    (padding, "stride_w", "stride_h", "depth_multiplier", fused_act, "dilation_w_factor", "dilation_h_factor",),
+)
+
+conv2d_opts = OptionsSerializer(
+    "Conv2DOptions", (padding, "stride_w", "stride_h", fused_act, "dilation_w_factor", "dilation_h_factor",)
+)
+
+lstm_opts = OptionsSerializer("LSTMOptions", (fused_act, "cell_clip", "proj_clip", "kernel_type"))
+
+unidir_seq_lstm_opts = OptionsSerializer(
+    "UnidirectionalSequenceLSTMOptions", (fused_act, "cell_clip", "proj_clip", "time_major")
+)
+
+bidir_seq_lstm_opts = OptionsSerializer(
+    "BidirectionalSequenceLSTMOptions", (fused_act, "cell_clip", "proj_clip", "merge_outputs", "time_major")
+)
+
+rnn_opts = OptionsSerializer("RNNOptions", (fused_act,))
+
+seq_rnn_opts = OptionsSerializer("SequenceRNNOptions", ("time_major", fused_act,))
+
+bidir_seq_rnn_opts = OptionsSerializer("BidirectionalSequenceRNNOptions", ("time_major", fused_act, "merge_outputs",))
+
+
+reducer_opts = OptionsSerializer("ReducerOptions", ("keep_dims",))
+
+is_int_vec = True
+
+custom_prefix = "Custom_"
+
+builtin_operator_map = {
+    BuiltinOperator.ADD: ("AddAct", OptionsSerializer("AddOptions", (fused_act,))),
+    BuiltinOperator.AVERAGE_POOL_2D: ("AvgPoolAct", pool2d_opts),
+    BuiltinOperator.CONCATENATION: ("ConcatTFLite", OptionsSerializer("ConcatenationOptions", ("axis", fused_act))),
+    BuiltinOperator.CONV_2D: ("Conv2DBiasAct", conv2d_opts),
+    BuiltinOperator.DEPTHWISE_CONV_2D: ("DepthwiseConv2dBiasAct", depthwise_opts),
+    BuiltinOperator.DEPTH_TO_SPACE: ("DepthToSpace", OptionsSerializer("DepthToSpaceOptions", ("block_size",))),
+    BuiltinOperator.DEQUANTIZE: ("Dequantize", OptionsSerializer("DequantizeOptions")),
+    BuiltinOperator.EMBEDDING_LOOKUP: (None, None),
+    BuiltinOperator.FLOOR: ("Floor", None),
+    BuiltinOperator.FULLY_CONNECTED: (
+        "FullyConnectedAct",
+        OptionsSerializer("FullyConnectedOptions", (fused_act, "weights_format")),
+    ),
+    BuiltinOperator.HASHTABLE_LOOKUP: (None, None),
+    # BuiltinOperator.L2_NORMALIZATION : "L2NormAct",
+    BuiltinOperator.L2_POOL_2D: (None, pool2d_opts),
+    BuiltinOperator.LOCAL_RESPONSE_NORMALIZATION: (
+        "LRN",
+        OptionsSerializer("LocalResponseNormalizationOptions", ("radius", "bias", "alpha", "beta")),
+    ),
+    BuiltinOperator.LOGISTIC: ("Sigmoid", None),
+    # BuiltinOperator.LSH_PROJECTION : "",
+    BuiltinOperator.LSTM: ("LstmAct", lstm_opts),
+    BuiltinOperator.MAX_POOL_2D: ("MaxPool", pool2d_opts),
+    BuiltinOperator.MUL: ("MulAct", OptionsSerializer("MulOptions", (fused_act,))),
+    BuiltinOperator.RELU: ("Relu", None),
+    BuiltinOperator.RELU_N1_TO_1: (None, None),
+    BuiltinOperator.RELU6: ("Relu6", None),
+    BuiltinOperator.RESHAPE: ("Reshape", OptionsSerializer("ReshapeOptions", (("new_shape", is_int_vec),))),
+    BuiltinOperator.RESIZE_BILINEAR: (
+        "ResizeBilinear",
+        OptionsSerializer("ResizeBilinearOptions", ("align_corners", "half_pixel_centers")),
+    ),
+    BuiltinOperator.RNN: ("RnnAct", rnn_opts),
+    BuiltinOperator.SOFTMAX: ("Softmax", OptionsSerializer("SoftmaxOptions", ("beta",))),
+    BuiltinOperator.SPACE_TO_DEPTH: ("SpaceToDepth", OptionsSerializer("SpaceToDepthOptions", ("block_size",))),
+    BuiltinOperator.SVDF: ("SvdfAct", OptionsSerializer("SVDFOptions", ("rank", fused_act))),
+    BuiltinOperator.TANH: ("Tanh", None),
+    # BuiltinOperator.CONCAT_EMBEDDINGS : "",
+    # BuiltinOperator.SKIP_GRAM : "",
+    # BuiltinOperator.CALL : "",
+    BuiltinOperator.EMBEDDING_LOOKUP_SPARSE: (None, OptionsSerializer("EmbeddingLookupSparseOptions", ("combiner",))),
+    BuiltinOperator.PAD: ("Pad", OptionsSerializer("PadOptions")),
+    BuiltinOperator.UNIDIRECTIONAL_SEQUENCE_RNN: ("UnidirectionalSequenceRnnAct", seq_rnn_opts),
+    BuiltinOperator.GATHER: ("GatherV2", OptionsSerializer("GatherOptions", ("axis",))),
+    BuiltinOperator.BATCH_TO_SPACE_ND: ("BatchToSpaceND", OptionsSerializer("BatchToSpaceNDOptions")),
+    BuiltinOperator.SPACE_TO_BATCH_ND: ("SpaceToBatchND", OptionsSerializer("SpaceToBatchNDOptions")),
+    BuiltinOperator.TRANSPOSE: ("Transpose", OptionsSerializer("TransposeOptions")),
+    BuiltinOperator.MEAN: ("Mean", None),
+    BuiltinOperator.SUB: ("SubAct", OptionsSerializer("SubOptions", (fused_act,))),
+    BuiltinOperator.DIV: ("DivAct", OptionsSerializer("DivOptions", (fused_act,))),
+    BuiltinOperator.SQUEEZE: ("Squeeze", OptionsSerializer("SqueezeOptions", (("squeeze_dims", is_int_vec),))),
+    BuiltinOperator.UNIDIRECTIONAL_SEQUENCE_LSTM: ("UnidirectionalSequenceLstmAct", unidir_seq_lstm_opts),
+    BuiltinOperator.STRIDED_SLICE: (
+        "StridedSlice",
+        OptionsSerializer(
+            "StridedSliceOptions", ("begin_mask", "end_mask", "ellipsis_mask", "new_axis_mask", "shrink_axis_mask")
+        ),
+    ),
+    BuiltinOperator.BIDIRECTIONAL_SEQUENCE_RNN: ("BidirectionalSequenceRnnAct", bidir_seq_rnn_opts),
+    BuiltinOperator.EXP: ("Exp", OptionsSerializer("ExpOptions")),
+    BuiltinOperator.TOPK_V2: ("TopKV2", OptionsSerializer("TopKV2Options")),
+    BuiltinOperator.SPLIT: ("Split", OptionsSerializer("SplitOptions", ("num_splits",))),
+    BuiltinOperator.LOG_SOFTMAX: ("LogSoftmax", OptionsSerializer("LogSoftmaxOptions")),
+    # BuiltinOperator.DELEGATE : "",
+    BuiltinOperator.BIDIRECTIONAL_SEQUENCE_LSTM: ("BidirectionalSequenceLstmAct", bidir_seq_lstm_opts),
+    BuiltinOperator.CAST: (
+        "Cast",
+        OptionsSerializer(
+            "CastOptions",
+            (
+                ("in_data_type", datatype_deserialize, datatype_serialize),
+                ("out_data_type", datatype_deserialize, datatype_serialize),
+            ),
+        ),
+    ),
+    # BuiltinOperator.PRELU : "",
+    BuiltinOperator.MAXIMUM: ("Maximum", OptionsSerializer("MaximumMinimumOptions")),
+    BuiltinOperator.ARG_MAX: (
+        "ArgMax",
+        OptionsSerializer("ArgMaxOptions", (("output_type", datatype_deserialize, datatype_serialize),)),
+    ),
+    BuiltinOperator.MINIMUM: ("Minimum", OptionsSerializer("MaximumMinimumOptions")),
+    BuiltinOperator.LESS: ("Less", None),
+    BuiltinOperator.NEG: ("Neg", None),
+    BuiltinOperator.PADV2: ("PadV2", None),
+    BuiltinOperator.GREATER: ("Greater", None),
+    BuiltinOperator.GREATER_EQUAL: ("GreaterEqual", None),
+    BuiltinOperator.LESS_EQUAL: ("LessEqual", None),
+    BuiltinOperator.SELECT: ("Select", None),
+    BuiltinOperator.SLICE: ("Slice", None),
+    BuiltinOperator.SIN: ("Sin", None),
+    BuiltinOperator.TRANSPOSE_CONV: (
+        "Conv2DBackpropInput",
+        OptionsSerializer("TransposeConvOptions", (padding, "stride_w", "stride_h")),
+    ),
+    BuiltinOperator.SPARSE_TO_DENSE: (
+        "SparseToDense",
+        OptionsSerializer("SparseToDenseOptions", ("validate_indices",)),
+    ),
+    BuiltinOperator.TILE: ("Tile", OptionsSerializer("TileOptions")),
+    BuiltinOperator.EXPAND_DIMS: ("ExpandDims", None),
+    BuiltinOperator.EQUAL: ("Equal", None),
+    BuiltinOperator.NOT_EQUAL: ("NotEqual", None),
+    BuiltinOperator.LOG: ("Log", None),
+    BuiltinOperator.SUM: ("Sum", None),
+    BuiltinOperator.SQRT: ("Sqrt", None),
+    BuiltinOperator.RSQRT: ("Rsqrt", None),
+    BuiltinOperator.SHAPE: (
+        "Shape",
+        OptionsSerializer("ShapeOptions", (("out_type", datatype_deserialize, datatype_serialize),)),
+    ),
+    BuiltinOperator.POW: ("Pow", None),
+    BuiltinOperator.ARG_MIN: (
+        "ArgMin",
+        OptionsSerializer("ArgMinOptions", (("output_type", datatype_deserialize, datatype_serialize),)),
+    ),
+    BuiltinOperator.FAKE_QUANT: (
+        "FakeQuantWithMinMaxArgs",
+        OptionsSerializer("FakeQuantOptions", ("min", "max", "num_bits", "narrow_range")),
+    ),
+    BuiltinOperator.REDUCE_PROD: ("Prod", reducer_opts),
+    BuiltinOperator.REDUCE_MAX: ("Max", reducer_opts),
+    BuiltinOperator.PACK: ("Pack", OptionsSerializer("PackOptions", ("values_count", "axis"))),
+    BuiltinOperator.LOGICAL_OR: ("LogicalOr", None),
+    BuiltinOperator.ONE_HOT: ("OneHot", OptionsSerializer("OneHotOptions", ("axis",))),
+    BuiltinOperator.LOGICAL_AND: ("LogicalAnd", None),
+    BuiltinOperator.LOGICAL_NOT: ("LogicalNot", None),
+    BuiltinOperator.UNPACK: ("Unpack", OptionsSerializer("UnpackOptions", ("num", "axis"))),
+    BuiltinOperator.REDUCE_MIN: ("Min", reducer_opts),
+    BuiltinOperator.FLOOR_DIV: ("FloorDiv", None),
+    BuiltinOperator.REDUCE_ANY: ("Any", reducer_opts),
+    BuiltinOperator.SQUARE: ("Square", None),
+    BuiltinOperator.ZEROS_LIKE: ("ZerosLike", None),
+    BuiltinOperator.FILL: ("Fill", None),
+    BuiltinOperator.FLOOR_MOD: ("FloorMod", None),
+    BuiltinOperator.RANGE: ("Range", None),
+    BuiltinOperator.RESIZE_NEAREST_NEIGHBOR: (
+        "ResizeNearestNeighbor",
+        OptionsSerializer("ResizeNearestNeighborOptions", ("align_corners",)),
+    ),
+    BuiltinOperator.LEAKY_RELU: ("LeakyRelu", OptionsSerializer("LeakyReluOptions", ("alpha",))),
+    BuiltinOperator.SQUARED_DIFFERENCE: ("SquaredDifference", None),
+    BuiltinOperator.MIRROR_PAD: ("MirrorPad", OptionsSerializer("MirrorPadOptions", ("mode",))),
+    BuiltinOperator.ABS: ("Abs", None),
+    BuiltinOperator.SPLIT_V: ("SplitV", OptionsSerializer("SplitVOptions", ("num_splits",))),
+    BuiltinOperator.UNIQUE: (
+        "Unique",
+        OptionsSerializer("UniqueOptions", (("idx_out_type", datatype_deserialize, datatype_serialize),)),
+    ),
+    BuiltinOperator.CEIL: ("Ceil", None),
+    BuiltinOperator.REVERSE_V2: ("ReverseV2", None),
+    BuiltinOperator.ADD_N: ("AddN", None),
+    BuiltinOperator.GATHER_ND: ("GatherNd", None),
+    BuiltinOperator.COS: ("Cos", None),
+    BuiltinOperator.WHERE: ("Where", None),
+    BuiltinOperator.RANK: ("Rank", None),
+    BuiltinOperator.ELU: ("Elu", None),
+    BuiltinOperator.REVERSE_SEQUENCE: (
+        "ReverseSequence",
+        OptionsSerializer("ReverseSequenceOptions", ("seq_dim", "batch_dim")),
+    ),
+    BuiltinOperator.MATRIX_DIAG: ("MatrixDiag", None),
+    BuiltinOperator.QUANTIZE: ("Quantize", None),
+    BuiltinOperator.MATRIX_SET_DIAG: ("MatrixSetDiag", None),
+    BuiltinOperator.IF: ("If", OptionsSerializer("IfOptions", ("then_subgraph_index", "else_subgraph_index"))),
+    BuiltinOperator.WHILE: ("While", OptionsSerializer("WhileOptions", ("cond_subgraph_index", "body_subgraph_index"))),
+    BuiltinOperator.NON_MAX_SUPPRESSION_V4: ("NonMaxSuppressionV4", OptionsSerializer("NonMaxSuppressionV4Options")),
+    BuiltinOperator.NON_MAX_SUPPRESSION_V5: ("NonMaxSuppressionV5", OptionsSerializer("NonMaxSuppressionV5Options")),
+    BuiltinOperator.SCATTER_ND: ("ScatterNd", OptionsSerializer("ScatterNdOptions")),
+    BuiltinOperator.SELECT_V2: ("SelectV2", OptionsSerializer("SelectV2Options")),
+    BuiltinOperator.DENSIFY: ("Densify", OptionsSerializer("DensifyOptions")),
+    BuiltinOperator.SEGMENT_SUM: ("SegmentSum", OptionsSerializer("SegmentSumOptions")),
+    BuiltinOperator.CUSTOM: (custom_prefix, CustomOptionsSerializer()),
+}
+
+builtin_operator_inv_map = {v[0]: (k, v[1]) for k, v in builtin_operator_map.items()}
+
+builtin_operator_inv_map["NpuOp"] = (BuiltinOperator.CUSTOM, CustomOptionsSerializer())
diff --git a/ethosu/vela/tflite_reader.py b/ethosu/vela/tflite_reader.py
new file mode 100644
index 0000000..535847d
--- /dev/null
+++ b/ethosu/vela/tflite_reader.py
@@ -0,0 +1,252 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Functions used to read from a TensorFlow Lite format file.
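+#
+# Typical use (an illustrative sketch; the file name is a placeholder):
+#     nng = read_tflite("network.tflite")
+# returns a Vela Graph whose subgraphs mirror those of the input flatbuffer.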
+
+from .tflite.Model import Model
+from .tflite.BuiltinOperator import BuiltinOperator
+
+import numpy as np
+import os.path
+from .nn_graph import Graph, Operation, Subgraph
+from .tensor import Tensor, QuantizationParameters
+
+from .tflite_mapping import builtin_operator_map, datatype_map, datatype_map_numpy, DataType
+
+
+def decode_str(s):
+    if s is None:
+        return ""
+    return s.decode("utf-8")
+
+
+def reshape_tensor_add_const_op(tens, reorder):
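+    # Permutes a constant tensor's shape and values according to `reorder` (used below to put
+    # convolution and fully connected weights into the layout Vela expects) and attaches a Const
+    # producer op. The `reshaped` flag ensures a tensor shared by several operators is only permuted once.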
+    if not tens.reshaped:
+        original_shape = tens.shape
+        tens.name = tens.name + "_reshape"
+        tens.shape = [original_shape[idx] for idx in reorder]
+        tens.bandwidth_shape = tens.shape
+        tens.storage_shape = tens.shape
+
+        if tens.values is not None:
+            tens.values = tens.values.transpose(reorder)
+
+        if tens.quant_values is not None:
+            tens.quant_values = tens.quant_values.transpose(reorder)
+
+        op = Operation("Const", tens.name)
+        op.outputs = [tens]
+        tens.ops = [op]
+        tens.reshaped = True
+
+
+class TFLiteSubgraph:
+    def __init__(self, graph, subgraph):
+        self.graph = graph
+        self.name = decode_str(subgraph.Name())
+
+        self.tensors = []
+        for idx in range(subgraph.TensorsLength()):
+            self.tensors.append(self.parse_tensor(subgraph.Tensors(idx)))
+
+        for idx in range(subgraph.OperatorsLength()):
+            self.parse_operator(subgraph.Operators(idx))
+
+        self.outputs = [self.tensors[idx] for idx in subgraph.OutputsAsNumpy()]
+        self.inputs = [self.tensors[idx] for idx in subgraph.InputsAsNumpy()]
+
+        # Fix up tensors without operations. Generate either Placeholder or Constant ops
+        for tens in self.inputs:
+            assert not tens.ops
+            op = Operation("Placeholder", tens.name)
+            op.outputs = [tens]
+            tens.ops = [op]
+
+        for tens in self.tensors:
+            if not tens.ops:
+                op = Operation("Const", tens.name)
+                op.outputs = [tens]
+                tens.ops = [op]
+
+    def parse_tensor(self, tens_data):
+        np_shape = tens_data.ShapeAsNumpy()
+        shape = list(np_shape) if type(np_shape) is np.ndarray else []
+        name = decode_str(tens_data.Name())
+        dtype = datatype_map[tens_data.Type()]
+
+        tens = Tensor(shape, dtype, name)
+
+        quant = tens_data.Quantization()
+
+        def len1_array_to_scalar(arr):
+            # The following flatbuffer quantisation fields all return a scalar value of 0 if they are not defined in
+            # the input buffer. This is represented in Vela by using None.
+            # Otherwise, the fields return a single- or multi-element array, in which case single-element arrays
+            # are converted to scalars.
+            if isinstance(arr, int) and arr == 0:
+                return None
+            if len(arr) == 1:
+                return arr[0]
+            return arr
+
+        tens.quantization = QuantizationParameters()
+        tens.quantization.min = len1_array_to_scalar(quant.MinAsNumpy())
+        tens.quantization.max = len1_array_to_scalar(quant.MaxAsNumpy())
+        tens.quantization.scale_f32 = len1_array_to_scalar(quant.ScaleAsNumpy())
+        tens.quantization.zero_point = len1_array_to_scalar(quant.ZeroPointAsNumpy())
+
+        if dtype == DataType.uint8:
+            tens.quantization.quant_min = 0
+            tens.quantization.quant_max = (1 << dtype.bits) - 1
+        elif dtype in set((DataType.int8, DataType.int16, DataType.int32, DataType.int64)):
+            tens.quantization.quant_min = -(1 << (dtype.bits - 1))
+            tens.quantization.quant_max = (1 << (dtype.bits - 1)) - 1
+        else:
+            raise Exception("DataType '" + str(dtype) + "' is not supported for quantization.")
+
+        if tens.quantization.scale_f32 is None and tens.quantization.zero_point is None:
+            tens.quantization = None
+
+        tens.values = None
+        buf = self.graph.buffers[tens_data.Buffer()]
+        if buf is not None:
+            tens.values = np.array(buf.view(datatype_map_numpy[tens_data.Type()]).reshape(shape))
+            if tens.quantization is not None:
+                tens.quant_values = tens.values
+                tens.values = tens.quantization.dequantize(tens.quant_values)
+        return tens
+
+    def parse_operator(self, op_data):
+        op_type, opt_serializer = self.graph.operator_codes[op_data.OpcodeIndex()]
+        inputs = [self.tensors[idx] for idx in op_data.InputsAsNumpy()]
+        outputs = [self.tensors[idx] for idx in op_data.OutputsAsNumpy()]
+        name = "unknown_op_name"
+        if len(outputs):
+            name = outputs[0].name
+        op = Operation(op_type, name)
+        op.inputs = inputs
+        op.outputs = outputs
+        for out in op.outputs:
+            out.ops = [op]
+
+        activation_function_to_split_out = None
+
+        if op_type.startswith("DepthwiseConv2d") or op_type.startswith("Conv2D"):
+            reshape_tensor_add_const_op(inputs[1], (1, 2, 3, 0))
+
+        if op_type.startswith("FullyConnected"):
+            reshape_tensor_add_const_op(inputs[1], (1, 0))
+
+        if opt_serializer is not None:
+            op.attrs = opt_serializer.deserialize(op_data.BuiltinOptions(), op_data.CustomOptionsAsNumpy())
+
+            if "stride_w" in op.attrs:
+                op.attrs["strides"] = (1, op.attrs["stride_h"], op.attrs["stride_w"], 1)
+            if "filter_width" in op.attrs:
+                op.attrs["ksize"] = (1, op.attrs["filter_height"], op.attrs["filter_width"], 1)
+            if "dilation_w_factor" in op.attrs:
+                op.attrs["dilation"] = (1, op.attrs["dilation_h_factor"], op.attrs["dilation_w_factor"], 1)
+            if "depth_multiplier" in op.attrs:
+                op.attrs["channel_multiplier"] = op.attrs["depth_multiplier"]
+
+            if "fused_activation_function" in op.attrs:
+                if op_type in set(("ConcatTFLite",)):
+                    act = op.attrs["fused_activation_function"]
+                    del op.attrs["fused_activation_function"]
+                    if act is not None:
+                        activation_function_to_split_out = act
+
+        if activation_function_to_split_out is not None:
+            act_op = Operation(activation_function_to_split_out, name + activation_function_to_split_out)
+            out_tens = op.outputs[0]
+            intermediate_tens = out_tens.clone("_act_intermediate")
+            out_tens.ops = [act_op]
+            act_op.outputs = [out_tens]
+            intermediate_tens.ops = [op]
+            op.outputs[0] = intermediate_tens
+            act_op.inputs = [intermediate_tens]
+
+
+class TFLiteGraph:
+    def __init__(
+        self,
+        filename,
+        batch_size=1,
+        feed_dict={},
+        output_node_names=[],
+        initialisation_nodes=[],
+    ):
+
+        self.op_times = {}
+        if batch_size is None:
+            batch_size = 1
+        self.batch_size = batch_size
+        self.name = os.path.splitext(os.path.basename(filename))[0]
+        self.initialisation_nodes = initialisation_nodes
+
+        with open(filename, "rb") as f:
+            buf = bytearray(f.read())
+
+        model = Model.GetRootAsModel(buf, 0)
+
+        self.buffers = []
+        for idx in range(model.BuffersLength()):
+            self.buffers.append(self.parse_buffer(model.Buffers(idx)))
+
+        self.operator_codes = []
+        for idx in range(model.OperatorCodesLength()):
+            self.operator_codes.append(self.parse_operator_code(model.OperatorCodes(idx)))
+
+        self.subgraphs = []
+        for idx in range(model.SubgraphsLength()):
+            self.subgraphs.append(TFLiteSubgraph(self, model.Subgraphs(idx)))
+
+        self.nng = Graph(self.name, self.batch_size)
+        for tflite_sg in self.subgraphs:
+            sg = Subgraph(tflite_sg.name)
+            sg.original_inputs = tflite_sg.inputs  # Preserve the original input order
+            sg.output_tensors = tflite_sg.outputs
+            self.nng.subgraphs.append(sg)
+
+    def parse_buffer(self, buf_data):
+        if buf_data.DataLength() == 0:
+            return None
+        data = buf_data.DataAsNumpy()
+        return data
+
+    def parse_operator_code(self, code):
+        c = code.BuiltinCode()
+        op_type, ser = builtin_operator_map[c]
+        if c == BuiltinOperator.CUSTOM:
+            op_type += decode_str(code.CustomCode())
+        return op_type, ser
+
+
+def read_tflite(
+    filename,
+    batch_size=1,
+    feed_dict={},
+    output_node_names=[],
+    initialisation_nodes=[],
+):
+    tflite_graph = TFLiteGraph(
+        filename, batch_size, feed_dict, output_node_names, initialisation_nodes
+    )
+    nng = tflite_graph.nng
+    nng.refresh_after_modification()
+    return nng
diff --git a/ethosu/vela/tflite_writer.py b/ethosu/vela/tflite_writer.py
new file mode 100644
index 0000000..f55d1ce
--- /dev/null
+++ b/ethosu/vela/tflite_writer.py
@@ -0,0 +1,424 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Functions used to write to a TensorFlow Lite format file. Supports adding a file identifier to the output file.
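+#
+# Typical use (an illustrative sketch; the file name is a placeholder):
+#     write_tflite(nng, "network_vela.tflite")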
+
+import flatbuffers
+
+from .tflite import Tensor
+from .tflite import QuantizationParameters
+from .tflite import Model
+from .tflite import SubGraph
+from .tflite import OperatorCode
+from .tflite import Operator
+from .tflite import Buffer
+from .tflite import Metadata
+
+import numpy as np
+
+from .tflite_mapping import datatype_inv_map, builtin_operator_inv_map, custom_prefix, BuiltinOperator
+from .nn_graph import PassPlacement
+from .tensor import TensorPurpose, MemArea
+from flatbuffers.builder import UOffsetTFlags
+
+tflite_version = 3
+tflite_file_identifier = "TFL" + str(tflite_version)
+
+
+import flatbuffers.number_types as N
+from flatbuffers import encode
+
+
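+# This helper is monkey-patched onto flatbuffers.Builder below, presumably because the pinned
+# flatbuffers Python package does not itself expose file-identifier support; it writes the
+# 4-character identifier (e.g. "TFL3") directly after the root table offset before finishing.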
+def FinishWithFileIdentifier(self, rootTable, fid):
+    if fid is None or len(fid) != 4:
+        raise Exception("fid must be 4 chars")
+
+    flags = N.Uint8Flags
+    prepSize = 4
+    self.Prep(self.minalign, prepSize + len(fid))
+    for i in range(3, -1, -1):
+        self.head = self.head - flags.bytewidth
+        encode.Write(flags.packer_type, self.Bytes, self.Head(), ord(fid[i]))
+
+    return self.Finish(rootTable)
+
+
+flatbuffers.Builder.FinishWithFileIdentifier = FinishWithFileIdentifier
+
+
+def make_vector(v):
+    try:
+        len(v)
+        return v
+    except TypeError:
+        return [v]
+
+
+class TFLiteSerialiser:
+    def __init__(self, nng):
+        self.builder = flatbuffers.Builder(0)
+        self.nng = nng
+
+        self.scratch_buf_id = 0  # Always assign scratch to buffer 0
+        self.buffer_offsets_map = {}
+        self.buffers_to_write = []  # replaced with a correctly sized list in assign_buffers_to_tensors()
+
+        self.input_tensors = []
+        self.ops_to_ignore = set(("Const", "Placeholder", "SubgraphInput"))
+
+        self.tensors_to_reshape = {}
+
+        self.subgraphs_to_write = [sg for sg in self.nng.subgraphs if sg.placement == PassPlacement.Cpu]
+
+        all_ops = []
+        for sg in self.subgraphs_to_write:
+            for ps in sg.passes:
+                for op in ps.ops:
+                    if op.type not in self.ops_to_ignore:
+                        all_ops.append(op)
+                    if op.type.startswith("Conv2D") or op.type.startswith("DepthwiseConv2d"):
+                        self.tensors_to_reshape[op.inputs[1]] = (3, 0, 1, 2)
+                    if op.type.startswith("FullyConnected"):
+                        self.tensors_to_reshape[op.inputs[1]] = (1, 0)
+
+        self.operator_codes = list(sorted(set(op.type for op in all_ops)))
+        self.operator_code_map = {}
+
+    def write_byte_vector(self, v, alignment=1):
+        builder = self.builder
+        builder.StartVector(1, len(v), alignment)
+        for e in v[::-1]:
+            builder.PrependByte(e)
+        return builder.EndVector(len(v))
+
+    def write_int_vector(self, v):
+        builder = self.builder
+        builder.StartVector(4, len(v), 4)
+        for e in v[::-1]:
+            builder.PrependInt32(e)
+        return builder.EndVector(len(v))
+
+    def write_long_vector(self, v):
+        builder = self.builder
+        builder.StartVector(8, len(v), 8)
+        for e in v[::-1]:
+            builder.PrependInt64(e)
+        return builder.EndVector(len(v))
+
+    def write_float_vector(self, v):
+        builder = self.builder
+        builder.StartVector(4, len(v), 4)
+        for e in v[::-1]:
+            builder.PrependFloat32(e)
+        return builder.EndVector(len(v))
+
+    def write_offset_vector(self, v):
+        builder = self.builder
+        builder.StartVector(4, len(v), 4)
+        for e in v[::-1]:
+            builder.PrependUOffsetTRelative(e)
+        return builder.EndVector(len(v))
+
+    def assign_buffers_to_tensors(self, tensors):
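+        # All tensors in the scratch memory area share buffer 0; every other tensor gets its own buffer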
+        buffer_map = {}
+        scratch_tensor = [tens for tens in tensors if tens.purpose == TensorPurpose.Scratch][0]
+        buf_idx = 1
+
+        for tens in tensors:
+            if tens.mem_area == scratch_tensor.mem_area:
+                buffer_map[tens] = self.scratch_buf_id
+            else:
+                buffer_map[tens] = buf_idx
+                buf_idx += 1
+
+        # Initialize buffers_to_write to a length equal to the number of buffers so
+        # they can be assigned at the correct index during tensor serialization
+        self.buffers_to_write = [None] * (buf_idx)
+
+        return buffer_map
+
+    def serialise_operator_code(self, idx, code):
+        builder = self.builder
+        custom_code_offset = None
+        if code.startswith(custom_prefix):
+            tf_code, opt_serializer = builtin_operator_inv_map[custom_prefix]
+            custom_code_offset = builder.CreateString(code[len(custom_prefix) :])
+        else:
+            try:
+                tf_code, opt_serializer = builtin_operator_inv_map[code]
+            except KeyError:
+                print(
+                    "Warning: Writing operation %s, which does not have a direct TensorFlow Lite mapping, as a custom operation"
+                    % (code,)
+                )
+                tf_code, opt_serializer = builtin_operator_inv_map[custom_prefix]
+
+            if tf_code == BuiltinOperator.CUSTOM:
+                assert code == "NpuOp"  # Currently only support serialising NPU operators as a custom op
+                custom_code_offset = builder.CreateString("ethos-u")
+
+        # Record the mapping for both built-in and custom operator codes
+        self.operator_code_map[code] = (idx, tf_code, opt_serializer)
+
+        OperatorCode.OperatorCodeStart(builder)
+        OperatorCode.OperatorCodeAddBuiltinCode(builder, tf_code)
+        if custom_code_offset is not None:
+            OperatorCode.OperatorCodeAddCustomCode(builder, custom_code_offset)
+
+        return OperatorCode.OperatorCodeEnd(builder)
+
+    def serialise_quantization_parameters(self, quant):
+        builder = self.builder
+
+        min = None
+        max = None
+        scale = None
+        zero_point = None
+        if quant is not None:
+            if quant.min is not None:
+                min = self.write_float_vector(make_vector(quant.min))
+            if quant.max is not None:
+                max = self.write_float_vector(make_vector(quant.max))
+            if quant.scale_f32 is not None:
+                scale = self.write_float_vector(make_vector(quant.scale_f32))
+            if quant.zero_point is not None:
+                zero_point = self.write_long_vector(make_vector(quant.zero_point))
+
+        QuantizationParameters.QuantizationParametersStart(builder)
+        if min is not None:
+            QuantizationParameters.QuantizationParametersAddMin(builder, min)
+        if max is not None:
+            QuantizationParameters.QuantizationParametersAddMax(builder, max)
+        if scale is not None:
+            QuantizationParameters.QuantizationParametersAddScale(builder, scale)
+        if zero_point is not None:
+            QuantizationParameters.QuantizationParametersAddZeroPoint(builder, zero_point)
+        return QuantizationParameters.QuantizationParametersEnd(builder)
+
+    def serialise_tensor(self, tens):
+        builder = self.builder
+        tens_shape = tens.shape
+        values = tens.quant_values
+        if values is None:
+            values = tens.values
+
+        if values is None:
+            values = np.empty(shape=(0), dtype=np.uint8)
+
+        if tens in self.tensors_to_reshape:
+            reorder = self.tensors_to_reshape[tens]
+            tens_shape = [tens_shape[idx] for idx in reorder]
+            values = values.transpose(reorder)
+
+        if tens.purpose == TensorPurpose.Scratch:
+            tens_shape = [0]
+            self.buffers_to_write[self.scratch_buf_id] = values.flatten().view(np.uint8)
+
+        buf_id = self.buffer_map[tens]
+        if buf_id != self.scratch_buf_id:
+            self.buffers_to_write[buf_id] = values.flatten().view(np.uint8)
+
+        shape = self.write_int_vector(tens_shape)
+
+        name = builder.CreateString(tens.name)
+        quant = self.serialise_quantization_parameters(tens.quantization)
+
+        Tensor.TensorStart(builder)
+        Tensor.TensorAddShape(builder, shape)
+        Tensor.TensorAddType(builder, datatype_inv_map[tens.dtype])
+        # All tensors must have a valid backing buffer, even if it is empty.
+        # Empty buffers should be kept unique for TensorFlow Lite Micro
+        Tensor.TensorAddBuffer(builder, buf_id)
+        Tensor.TensorAddName(builder, name)
+        Tensor.TensorAddQuantization(builder, quant)
+
+        res = Tensor.TensorEnd(builder)
+        return res
+
+    def serialise_operator(self, op):
+        builder = self.builder
+
+        inputs_offset = self.write_int_vector([self.tensor_map[tens] for tens in op.inputs])
+        outputs_offset = self.write_int_vector([self.tensor_map[tens] for tens in op.outputs])
+
+        op_idx, tflop, opt_serializer = self.operator_code_map[op.type]
+
+        builtin_opt_offset = None
+        custom_opt_offset = None
+        if opt_serializer is not None:
+            attrs = dict(op.attrs)
+            if "strides" in attrs:
+                attrs["stride_h"] = attrs["strides"][1]
+                attrs["stride_w"] = attrs["strides"][2]
+            if "ksize" in attrs:
+                attrs["filter_height"] = attrs["ksize"][1]
+                attrs["filter_width"] = attrs["ksize"][2]
+            if "dilation" in attrs:
+                attrs["dilation_h_factor"] = attrs["dilation"][1]
+                attrs["dilation_w_factor"] = attrs["dilation"][2]
+            if "channel_multiplier" in attrs:
+                attrs["depth_multiplier"] = attrs["channel_multiplier"]
+
+            builtin_opt_offset, custom_opt_offset = opt_serializer.serialize(builder, attrs)
+
+        mutating_variable_inputs_offset = self.write_byte_vector([])
+        Operator.OperatorStart(builder)
+        Operator.OperatorAddOpcodeIndex(builder, op_idx)
+        Operator.OperatorAddInputs(builder, inputs_offset)
+        Operator.OperatorAddOutputs(builder, outputs_offset)
+
+        if builtin_opt_offset is not None:
+            Operator.OperatorAddBuiltinOptionsType(builder, opt_serializer.builtin_opt_type)
+            Operator.OperatorAddBuiltinOptions(builder, builtin_opt_offset)
+        if custom_opt_offset is not None:
+            Operator.OperatorAddCustomOptions(builder, custom_opt_offset)
+            Operator.OperatorAddCustomOptionsFormat(builder, opt_serializer.custom_opt_format)
+
+        Operator.OperatorAddMutatingVariableInputs(builder, mutating_variable_inputs_offset)
+        return Operator.OperatorEnd(builder)
+
+    def serialise_subgraph(self, sg):
+        builder = self.builder
+        tensor_set = set()
+
+        all_ops = []
+        for ps in sg.passes:
+            for op in ps.ops:
+                if op.type not in self.ops_to_ignore:
+                    all_ops.append(op)
+
+        for op in all_ops:
+            for tens in op.inputs + op.outputs:
+                tensor_set.add(tens)
+
+        all_tensors = [tens for nm, idx, tens in sorted((tens.name, idx, tens) for idx, tens in enumerate(tensor_set))]
+
+        self.tensor_map = {tens: idx for idx, tens in enumerate(all_tensors)}
+        self.buffer_map = self.assign_buffers_to_tensors(all_tensors)
+
+        tensors_offset = self.write_offset_vector([self.serialise_tensor(tens) for tens in all_tensors])
+
+        # Add the Scratch Tensor as input to the NPU subgraph to get it allocated by TensorFlow Lite Micro
+        scratch_tensor_idx = [v for k, v in self.tensor_map.items() if k.name.endswith("scratch")]
+
+        # Make sure the input_tensors haven't been modified
+        assert all(inp in sg.original_inputs for inp in sg.input_tensors)
+        inputs_offset = self.write_int_vector(
+            [self.tensor_map[tens] for tens in sg.original_inputs] + scratch_tensor_idx
+        )
+        outputs_offset = self.write_int_vector([self.tensor_map[tens] for tens in sg.output_tensors])
+
+        operators_offset = self.write_offset_vector([self.serialise_operator(op) for op in all_ops])
+
+        SubGraph.SubGraphStart(builder)
+        SubGraph.SubGraphAddTensors(builder, tensors_offset)
+        SubGraph.SubGraphAddInputs(builder, inputs_offset)
+        SubGraph.SubGraphAddOutputs(builder, outputs_offset)
+
+        SubGraph.SubGraphAddOperators(builder, operators_offset)
+
+        return SubGraph.SubGraphEnd(builder)
+
+    def write_aligned_bytes(self, buf):
+        builder = self.builder
+        builder.nested = True
+        data = bytes(buf)
+        length_bytes = UOffsetTFlags.py_type(len(data))
+        builder.Prep(16, length_bytes)  # Reserve aligned storage
+        builder.head = UOffsetTFlags.py_type(builder.Head() - length_bytes)  # Update FlatBuffer internal pointer
+        builder.Bytes[builder.Head() : builder.Head() + length_bytes] = data  # Assign bytes to aligned area
+        return builder.EndVector(length_bytes)
+
+    def serialise_buffer(self, buf):
+        builder = self.builder
+        data = None
+        if buf is not None:
+            data = self.write_aligned_bytes(buf)
+        Buffer.BufferStart(builder)
+        if data is not None:
+            Buffer.BufferAddData(builder, data)
+        return Buffer.BufferEnd(builder)
+
+    def serialise_metadata(self, metadata):
+        builder = self.builder
+        name = builder.CreateString(metadata[0])
+
+        Metadata.MetadataStart(builder)
+        Metadata.MetadataAddName(builder, name)
+        Metadata.MetadataAddBuffer(builder, metadata[1])
+
+        return Metadata.MetadataEnd(builder)
+
+    def serialise_model(self):
+        builder = self.builder
+        operator_code_offset = self.write_offset_vector(
+            [self.serialise_operator_code(idx, code) for idx, code in enumerate(self.operator_codes)]
+        )
+
+        description = builder.CreateString("Vela Optimised")
+
+        subgraph_offset = self.write_offset_vector([self.serialise_subgraph(sg) for sg in self.subgraphs_to_write])
+
+        # Fill the metadata buffer
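+        # The buffer referenced by the "OfflineMemoryAllocation" metadata entry below is laid out as
+        # [version, number of subgraphs, number of tensors, one int32 address offset per tensor]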
+        version = np.int32(0)
+        subgraph_idx = np.int32(len(self.subgraphs_to_write))  # Only 1 supported currently
+        nbr_tensors = np.int32(len(self.tensor_map))
+
+        # An offset of -1 indicates that the tensor will be allocated online by TensorFlow Lite Micro
+        offsets = [np.int32(-1)] * nbr_tensors
+
+        # Ensure that the order of the offsets match the order of the tensors
+        for tens, idx in self.tensor_map.items():
+            if tens.mem_area == MemArea.Sram:
+                offsets[idx] = np.int32(tens.address)
+
+        metadata_buffer = np.array([version, subgraph_idx, nbr_tensors] + offsets)
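+        # Illustrative layout (assumed example: 3 tensors, two placed in SRAM at
+        # addresses 0x40 and 0x80, one allocated online by TensorFlow Lite Micro):
+        #   metadata_buffer = [0, 1, 3, 0x40, 0x80, -1]
+        #   i.e. version, number of subgraphs, number of tensors, then one offset per tensor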
+        self.buffers_to_write.append(metadata_buffer)
+
+        buffers_offset = self.write_offset_vector([self.serialise_buffer(buf) for buf in self.buffers_to_write])
+
+        metadata_list = [("OfflineMemoryAllocation", len(self.buffers_to_write) - 1)]
+        metadata_offset = self.write_offset_vector([self.serialise_metadata(metadata) for metadata in metadata_list])
+
+        Model.ModelStart(builder)
+        Model.ModelAddVersion(builder, tflite_version)
+        Model.ModelAddOperatorCodes(builder, operator_code_offset)
+        Model.ModelAddSubgraphs(builder, subgraph_offset)
+        Model.ModelAddDescription(builder, description)
+        Model.ModelAddBuffers(builder, buffers_offset)
+        Model.ModelAddMetadata(builder, metadata_offset)
+        return Model.ModelEnd(builder)
+
+    def serialise(self):
+
+        model = self.serialise_model()
+
+        self.builder.FinishWithFileIdentifier(model, tflite_file_identifier)
+
+        return self.builder.Output()
+
+    def write(self, filename):
+        with open(filename, "wb") as f:
+            f.write(self.serialise())
+
+
+def write_tflite(nng, filename):
+    writer = TFLiteSerialiser(nng)
+    buf = writer.serialise()
+
+    with open(filename, "wb") as f:
+        f.write(buf)
diff --git a/ethosu/vela/vela.py b/ethosu/vela/vela.py
new file mode 100644
index 0000000..f07aec8
--- /dev/null
+++ b/ethosu/vela/vela.py
@@ -0,0 +1,334 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Main entry point for the Vela compiler.
+#
+# Provides a command line interface, options parsing, and network loading before calling the compiler driver.
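+#
+# Example invocation (illustrative file name):
+#   vela my_network.tflite --output-dir ./output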
+
+import sys
+import os
+import time
+import configparser
+import argparse
+import ast
+
+from . import architecture_features
+from . import stats_writer
+from . import tflite_writer
+from . import model_reader
+from . import compiler_driver
+from . import scheduler
+from ._version import __version__
+from .scheduler import ParetoMetric
+from .nn_graph import MemArea, TensorFormat, TensorAllocator, PassPlacement
+
+
+def process(fname, arch, model_reader_options, compiler_options, scheduler_options):
+    if compiler_options.timing:
+        start = time.time()
+
+    nng = model_reader.read_model(fname, model_reader_options)
+
+    if not nng:
+        print("reading of", fname, "failed")
+        assert False
+
+    if compiler_options.verbose_operators:
+        nng.print_operators()
+
+    if compiler_options.timing:
+        stop = time.time()
+        print("Model reading took %f s" % (stop - start))
+        start = time.time()
+
+    compiler_driver.compiler_driver(nng, arch, compiler_options, scheduler_options)
+
+    passes_csv_file = "%s/%s_pass-breakdown_%s.csv" % (compiler_options.output_dir, nng.name, arch.system_config)
+    stats_writer.write_pass_metrics_csv(nng, passes_csv_file)
+
+    summary_csv_file = "%s/%s_summary_%s.csv" % (compiler_options.output_dir, nng.name, arch.system_config)
+    stats_writer.write_summary_metrics_csv(nng, summary_csv_file, arch)
+
+    stats_writer.print_performance_metrics(nng, show_cpu_operations=compiler_options.show_cpu_operations, arch=arch)
+
+    if fname.endswith(".tflite"):
+        tflite_writer.write_tflite(nng, "%s/%s_vela.tflite" % (compiler_options.output_dir, nng.name))
+
+    if compiler_options.timing:
+        stop = time.time()
+        print("Compiler driver took %f s" % (stop - start))
+
+    return nng
+
+
+def print_subgraph_io_summary(nng):
+    """Print a summary of all the input and output tensor sizes for all subgraphs.
+    Also displays the total tensor size and the SRAM memory used.
+    """
+
+    print("Subgraph IO Summary")
+    print("-------------------")
+    print("NNG: {0}".format(nng.name))
+    max_sg_size = 0
+    for sg in reversed(nng.subgraphs):
+        print("   Subgraph: {0} = {1}".format(sg.name, sg.placement))
+        sg_size = 0
+
+        if sg.placement == PassPlacement.Npu:
+            for tens in sg.input_tensors + [sg.scratch_tensor] + sg.output_tensors:
+                if tens in sg.input_tensors:
+                    tens_dir = "In"
+                elif tens in sg.output_tensors:
+                    tens_dir = "Out"
+                else:
+                    tens_dir = "In/Out"
+
+                size = tens.elements() * tens.element_size() / 1024.0
+                sg_size = sg_size + size
+                print("         Tensor [{0}]: {1} = {2} KiB".format(tens_dir, tens.name, size))
+
+        print("      Total Size = {0} KiB".format(sg_size))
+        print("      SRAM Memory Used = {0} KiB".format(sg.memory_used.get(MemArea.Sram, 0) / 1024.0))
+        max_sg_size = max(sg_size, max_sg_size)
+
+    print("   Maximum Subgraph Size = {0} KiB".format(max_sg_size))
+
+
+def main(args=None):
+    if args is None:
+        args = sys.argv[1:]
+
+    parser = argparse.ArgumentParser(prog="vela", description="Neural network model compiler for Ethos-U55")
+
+    parser.add_argument(
+        "network", metavar="NETWORK", type=str, default=None, nargs=None, help="Filename of network to process"
+    )
+
+    parser.add_argument("--version", action="version", version=__version__)
+    parser.add_argument(
+        "--output-dir", type=str, default="output", help="Output directory to write files to (default: %(default)s)"
+    )
+    parser.add_argument("--config", type=str, help="Location of vela configuration file")
+    parser.add_argument("--batch-size", type=int, default=1, help="Batch size (default: %(default)s)")
+
+    parser.add_argument("--verbose-graph", action="store_true", help="Verbose graph rewriter")
+    parser.add_argument("--verbose-quantization", action="store_true", help="Verbose quantization")
+    parser.add_argument("--verbose-packing", action="store_true", help="Verbose pass packing")
+    parser.add_argument("--verbose-tensor-purpose", action="store_true", help="Verbose tensor purpose")
+    parser.add_argument("--verbose-tensor-format", action="store_true", help="Verbose tensor format")
+    parser.add_argument("--verbose-schedule", action="store_true", help="Verbose schedule")
+    parser.add_argument(
+        "--verbose-pareto-frontier-schedules",
+        action="store_true",
+        help="Show all schedules along the pareto frontier of optimisation criteria",
+    )
+    parser.add_argument("--verbose-allocation", action="store_true", help="Verbose tensor allocation")
+    parser.add_argument(
+        "--verbose-high-level-command-stream", action="store_true", help="Verbose high level command stream"
+    )
+    parser.add_argument(
+        "--verbose-register-command-stream", action="store_true", help="Verbose register command stream"
+    )
+    parser.add_argument("--verbose-operators", action="store_true", help="Verbose operator list")
+
+    parser.add_argument(
+        "--show-minimum-possible-allocation", action="store_true", help="Show the minimum possible allocation"
+    )
+    parser.add_argument(
+        "--show-cpu-operations", action="store_true", help="Show the operations that fall back to the CPU"
+    )
+    parser.add_argument(
+        "--cascading",
+        type=ast.literal_eval,
+        default=True,
+        choices=[True, False],
+        help="Controls the packing of multiple passes into a cascade (default: %(default)s)",
+    )
+    parser.add_argument(
+        "--ifm-ofm-overlap",
+        type=ast.literal_eval,
+        default=True,
+        choices=[True, False],
+        help="Controls the overlapping of IFM and OFM buffers (default: %(default)s)",
+    )
+    parser.add_argument("--force-block-config", type=str, default="", help="Force a specific block configuration HxWxC")
+    parser.add_argument(
+        "--inter-pass-cycle-delay",
+        type=int,
+        default=0,
+        help="Artificial delay between passes, measured in NPU cycles (default: %(default)s)",
+    )
+    parser.add_argument("--timing", action="store_true", help="Time the compiler doing operations")
+    parser.add_argument(
+        "--accelerator-config",
+        type=str,
+        default="ethos-u55-256",
+        choices=list(architecture_features.ArchitectureFeatures.accelerator_configs.keys()),
+        help="Accelerator configuration to use (default: %(default)s)",
+    )
+    parser.add_argument(
+        "--system-config",
+        type=str,
+        default="internal-default",
+        help="System configuration to use (default: %(default)s)",
+    )
+    parser.add_argument(
+        "--dram-bandwidth",
+        type=float,
+        default=0.0,
+        help="DRAM memory bandwidth in GB/s, use zero to select the value from system config (default: %(default)s)",
+    )
+    parser.add_argument(
+        "--permanent-storage",
+        default=MemArea.OffChipFlash,
+        type=lambda s: MemArea[s],
+        choices=list(MemArea)[3:-1],
+        help=(
+            "Memory area for permanent storage. To store the weights and other constant data in SRAM select "
+            "'OnChipFlash' (default: %(default)s)"
+        ),
+    )
+    parser.add_argument(
+        "--tensor-allocator",
+        default=TensorAllocator.Greedy,
+        type=lambda s: TensorAllocator[s],
+        choices=list(TensorAllocator),
+        help="Tensor Allocator algorithm (default: %(default)s)",
+    )
+    parser.add_argument(
+        "--show-subgraph-io-summary",
+        action="store_true",
+        help="Shows a summary of all the subgraphs and their inputs and outputs",
+    )
+    parser.add_argument(
+        "--ifm-streaming",
+        type=ast.literal_eval,
+        default=True,
+        choices=[True, False],
+        help="Controls scheduler IFM streaming search (default: %(default)s)",
+    )
+    parser.add_argument(
+        "--block-config-limit",
+        type=int,
+        default=16,
+        help="Limit block config search space, use zero for unlimited (default: %(default)s)",
+    )
+    parser.add_argument(
+        "--global-memory-clock-scale",
+        type=float,
+        default=1.0,
+        help=(
+            "Performs an additional scaling of the individual memory clock scales specified by the system config "
+            "(default: %(default)s)"
+        ),
+    )
+    parser.add_argument(
+        "--pareto-metric",
+        default=ParetoMetric.BwCycMem,
+        type=lambda s: ParetoMetric[s],
+        choices=list(ParetoMetric),
+        help="Controls the calculation of the pareto metric (default: %(default)s)",
+    )
+    parser.add_argument(
+        "--recursion-limit",
+        type=int,
+        default=10000,
+        help="Set the recursion depth limit, may result in RecursionError if too low (default: %(default)s)",
+    )
+    parser.add_argument(
+        "--max-block-dependency",
+        type=int,
+        default=architecture_features.ArchitectureFeatures.MAX_BLOCKDEP,
+        choices=range(0, architecture_features.ArchitectureFeatures.MAX_BLOCKDEP + 1),
+        help=(
+            "Set the maximum value that can be used for the block dependency between npu kernel operations "
+            "(default: %(default)s)"
+        ),
+    )
+
+    args = parser.parse_args(args=args)
+
+    # Read configuration file
+    config_file = args.config
+    config = None
+    if config_file is not None:
+        with open(config_file) as f:
+            config = configparser.ConfigParser()
+            config.read_file(f)
+
+    if args.network is None:
+        parser.error("the following argument is required: NETWORK")
+
+    sys.setrecursionlimit(args.recursion_limit)
+
+    if args.force_block_config:
+        force_block_config = architecture_features.Block.from_string(args.force_block_config)
+    else:
+        force_block_config = None
+
+    arch = architecture_features.ArchitectureFeatures(
+        vela_config=config,
+        system_config=args.system_config,
+        accelerator_config=args.accelerator_config,
+        permanent_storage=args.permanent_storage,
+        inter_pass_cycle_delay=args.inter_pass_cycle_delay,
+        dram_bandwidth=args.dram_bandwidth,
+        override_block_config=force_block_config,
+        block_config_limit=args.block_config_limit,
+        global_memory_clock_scale=args.global_memory_clock_scale,
+        max_blockdep=args.max_block_dependency,
+    )
+
+    compiler_options = compiler_driver.CompilerOptions(
+        verbose_graph=args.verbose_graph,
+        verbose_quantization=args.verbose_quantization,
+        verbose_packing=args.verbose_packing,
+        verbose_tensor_purpose=args.verbose_tensor_purpose,
+        verbose_tensor_format=args.verbose_tensor_format,
+        verbose_allocation=args.verbose_allocation,
+        verbose_high_level_command_stream=args.verbose_high_level_command_stream,
+        verbose_register_command_stream=args.verbose_register_command_stream,
+        verbose_operators=args.verbose_operators,
+        show_minimum_possible_allocation=args.show_minimum_possible_allocation,
+        show_cpu_operations=args.show_cpu_operations,
+        tensor_allocator=args.tensor_allocator,
+        timing=args.timing,
+        output_dir=args.output_dir,
+    )
+
+    scheduler_options = scheduler.SchedulerOptions(
+        use_cascading=args.cascading,
+        use_ifm_ofm_overlap=args.ifm_ofm_overlap,
+        verbose_schedule=args.verbose_schedule,
+        verbose_pareto_frontier_schedules=args.verbose_pareto_frontier_schedules,
+        use_ifm_streaming=args.ifm_streaming,
+        pareto_metric=args.pareto_metric,
+    )
+
+    model_reader_options = model_reader.ModelReaderOptions(batch_size=args.batch_size)
+
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    nng = process(args.network, arch, model_reader_options, compiler_options, scheduler_options)
+
+    if args.show_subgraph_io_summary:
+        print_subgraph_io_summary(nng)
+
+    return 0
diff --git a/ethosu/vela/weight_compressor.py b/ethosu/vela/weight_compressor.py
new file mode 100644
index 0000000..0b4ac69
--- /dev/null
+++ b/ethosu/vela/weight_compressor.py
@@ -0,0 +1,387 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Compresses and pads the weights. It also calculates the scales and packs them with the biases.
+
+import math
+import numpy as np
+from collections import namedtuple
+from .numeric_util import round_up
+from .scaling import quantise_scale
+from .tensor import TensorPurpose, TensorSubPurpose, TensorFormat, TensorBlockTraversal
+from .operation import NpuBlockType
+from .architecture_features import Block
+from .nn_graph import SchedulingStrategy
+from .data_type import DataType
+
+from ethosu import mlw_codec
+
+
+def encode(weight_stream):
+    assert np.amin(weight_stream) >= -255
+    assert np.amax(weight_stream) <= 255
+
+    # Encode flattened signed weight stream
+    compressed = mlw_codec.encode(weight_stream)
+
+    # Pad with 0xFF as needed so the length of the weight stream
+    # is a multiple of 16
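+    # For example (illustrative): a 37-byte encoded stream is padded with eleven
+    # 0xFF bytes to reach 48 bytes, the next multiple of 16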
+    while (len(compressed) % 16) != 0:
+        compressed.append(0xFF)
+
+    return compressed
+
+
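+# Reorders an HWIO weight brick into the linear stream expected by the hardware:
+# OFM blocks over depth, then IFM blocks, then subkernels (height, then width),
+# then the IFM/OFM micro-block interleaving, padding with zeros where the kernel
+# or depth runs out of elements (a descriptive summary of the loops below).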
+def generate_brick(arch, brick_weights, ofm_block, block_traversal, ifm_bitdepth):
+    is_depthwise = block_traversal == TensorBlockTraversal.DepthWise
+    is_partkernel = block_traversal == TensorBlockTraversal.PartKernelFirst
+    subkernel_max = arch.subkernel_max
+    ofm_ublock = arch.ofm_ublock
+    ifm_ublock = arch.ifm_ublock
+    # Expect weights formatted HWIO
+    ofm_depth = brick_weights.shape[-1]
+    ifm_depth = brick_weights.shape[-2]
+    kernel_width = brick_weights.shape[-3]
+    kernel_height = brick_weights.shape[-4]
+    # IFM block depth
+    if is_partkernel or (ifm_bitdepth == 16):
+        # IFM block depth is always 16 for part-kernel-first
+        ifm_block_depth = 16
+    elif ifm_bitdepth == 8:
+        ifm_block_depth = 32
+    else:
+        assert False
+
+    stream = []
+
+    # Top level striping - OFM blocks in the entire brick's depth
+    for ofm_block_z in range(0, ofm_depth, ofm_block.depth):
+        clipped_ofm_block_depth = min(ofm_block.depth, ofm_depth - ofm_block_z)
+        # IFM blocks required for the brick
+        for ifm_block_z in range(0, (1 if is_depthwise else ifm_depth), ifm_block_depth):
+            if is_depthwise:
+                clipped_ifm_block_depth = ifm_ublock.depth
+            else:
+                clipped_ifm_block_depth = (
+                    min(ifm_block_depth, ifm_depth - ifm_block_z) if is_partkernel else ifm_block_depth
+                )
+            # Weight decomposition
+            # Subkernel Splitting  (H)
+            for subkernel_y in range(0, kernel_height, subkernel_max.height):
+                sub_height = min(kernel_height - subkernel_y, subkernel_max.height)
+                # Subkernel splitting (W)
+                for subkernel_x in range(0, kernel_width, subkernel_max.width):
+                    sub_width = min(kernel_width - subkernel_x, subkernel_max.width)
+                    subkernel_elements = sub_width * sub_height
+                    # Part kernel first works across the kernel H/W and needs padding
+                    if is_partkernel:
+                        if ifm_bitdepth == 16 and subkernel_elements % 2 != 0:
+                            subkernel_elements = int(math.ceil(subkernel_elements / 2) * 2)
+                        elif ifm_bitdepth == 8 and subkernel_elements % 4 != 0:
+                            subkernel_elements = int(math.ceil(subkernel_elements / 4) * 4)
+
+                    # Depthwise Conv requires multiple of 4 kernel elements in its weight block
+                    # this is different from normal conv which is considered "weights depth-first"
+                    elif is_depthwise:
+                        subkernel_elements = int(math.ceil(subkernel_elements / 4.0) * 4)
+
+                    ifm_block_depth_outer = clipped_ifm_block_depth if is_partkernel else 1
+                    ifm_block_depth_inner = 1 if is_partkernel else clipped_ifm_block_depth
+                    # IFM Ublocks in IFM-block over depth for part-kernel-first mode
+                    # For depth-first, IFM Ublocks are traversed after the subkernel elements, so this loop is ignored.
+                    for ifm_ublk_outer in range(0, ifm_block_depth_outer, ifm_ublock.depth):
+                        # OFM Ublocks in OFM-block over depth
+                        for ofm_ublk in range(0, clipped_ofm_block_depth, ofm_ublock.depth):
+                            # HW Kernel element traversal - cannot be a H/W loop due to element
+                            # padding requirement on depthwise/part-kernel configurations
+                            for element in range(subkernel_elements):
+                                kx = element % sub_width
+                                ky = element // sub_width
+                                # IFM Ublocks in IFM-block over depth (only 1 ublock if depthwise)
+                                # In the part-kernel-first case, IFM Ublock traversal has already been handled
+                                # and this loop is ignored.
+                                for ifm_ublk_inner in range(0, ifm_block_depth_inner, ifm_ublock.depth):
+                                    # Feed OFM ublock elements
+                                    for ofm_ublock_z in range(ofm_ublock.depth):
+                                        # Source IFM ublock elements (only 1 element deep if depthwise)
+                                        for ifm_ublock_z in range(1 if is_depthwise else ifm_ublock.depth):
+                                            # Source position within the current subkernel
+                                            wx = subkernel_x + kx
+                                            wy = subkernel_y + ky
+                                            # Source IFM/OFM slices
+                                            ifm_ublk = ifm_ublk_inner + ifm_ublk_outer
+                                            ifm_z = ifm_block_z + ifm_ublk + ifm_ublock_z
+                                            ofm_z = ofm_block_z + ofm_ublk + ofm_ublock_z
+                                            if (ifm_z >= ifm_depth) or (ofm_z >= ofm_depth) or (ky >= sub_height):
+                                                stream.append(0)
+                                            else:
+                                                stream.append(brick_weights[wy][wx][ifm_z][ofm_z])
+    return stream
+
+
+# Compress the weights
+def compress_weights(tens, arch, npu_block_type, ofm_block, ofm_depth_step, min_val=None, max_val=None):
+    assert tens.purpose == TensorPurpose.Weights
+    assert tens.format == TensorFormat.WeightsCompressed
+
+    WeightCompressionConfig = namedtuple("WeightCompressionConfig", ["npu_block_type", "ofm_block", "ofm_depth_step"])
+
+    # check if weights have already been compressed
+    wcc = tens.weight_compression_config
+    if wcc is not None:
+        assert wcc.npu_block_type == npu_block_type, "Weights not used by the same operator type"
+
+        if wcc.ofm_block == ofm_block and wcc.ofm_depth_step == ofm_depth_step:
+            return
+
+    assert tens.quantization is not None
+    assert tens.quantization.scale_f32 is not None
+    assert tens.quantization.zero_point is not None
+
+    zero_point = tens.quantization.zero_point
+    quant_buf = tens.quant_values.astype(np.int64)
+
+    # Early zero-point correction
+    weights = quant_buf - zero_point
+
+    if len(weights.shape) == 2:
+        weights = np.expand_dims(np.expand_dims(weights, axis=0), axis=0)
+        weights_shape = (weights.shape[0], 1, 1, weights.shape[1])
+    else:
+        weights_shape = weights.shape
+
+    compression_scales = []
+    compressed_offsets = []
+    encoded_streams = []
+    offset = 0
+    max_single_buffer_len = 0
+
+    ifm_bitdepth = tens.consumer_list[0].inputs[0].dtype.size_in_bits()
+    ifm_depth = weights.shape[-2]
+    if npu_block_type == NpuBlockType.ConvolutionDepthWise:
+        tens.block_traversal = TensorBlockTraversal.DepthWise
+    if npu_block_type == NpuBlockType.ConvolutionMxN:
+        # Determine which block traversal strategy has better DPU utilization
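+        # Illustrative example (assumed values): a 3x3 kernel with IFM depth 3 and an
+        # 8-bit IFM gives depth_utilization = 3/32 ~= 0.09 and
+        # part_kernel_utilization = (3/8) * (9/12) ~= 0.28, so part-kernel-first wins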
+        kernel_size = weights_shape[0] * weights_shape[1]
+        depth_utilization = weights_shape[2] / round_up(weights_shape[2], 32 if ifm_bitdepth == 8 else 16)
+        part_kernel_utilization = (weights_shape[2] / round_up(weights_shape[2], 8)) * (
+            kernel_size / round_up(kernel_size, 4 if ifm_bitdepth == 8 else 2)
+        )
+        if part_kernel_utilization >= depth_utilization or ifm_depth <= 8:
+            # Part-kernel first is always better for ifm depths <= 8
+            tens.block_traversal = TensorBlockTraversal.PartKernelFirst
+        else:
+            tens.block_traversal = TensorBlockTraversal.DepthFirst
+
+    # Slice weight stream up depth-ways into bricks and compress
+    full_ofm_depth = quant_buf.shape[-1]
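+    # Illustrative example (assumed values): full_ofm_depth=70 with ofm_depth_step=32
+    # produces three bricks covering output depths 0-31, 32-63 and 64-69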
+    for idx in range(0, full_ofm_depth, ofm_depth_step):
+        # Get the weights necessary for this brick
+        count = min(full_ofm_depth - idx, ofm_depth_step)
+        brick_weights = weights[:, :, :, idx : idx + count]
+
+        # Encode all weights into one chunk
+        raw_stream = generate_brick(arch, brick_weights, ofm_block, tens.block_traversal, ifm_bitdepth)
+        encoded = encode(raw_stream)
+        encoded_streams.append(encoded)
+
+        # Remember maximum encoded length for DoubleBuffering
+        if max_single_buffer_len < len(encoded):
+            max_single_buffer_len = len(encoded)
+
+        # Remember where we put it for linear addressing
+        compressed_offsets.append(offset)
+        offset += len(encoded)
+        assert offset % 16 == 0
+
+        # Compression scale tracking
+        compression_scales.append(len(encoded) / len(raw_stream))
+
+    # Also track complete length in the offsets array
+    compressed_offsets.append(offset)
+
+    if tens.sub_purpose == TensorSubPurpose.DoubleBuffer and len(encoded_streams) > 2:
+        offset = 2 * max_single_buffer_len
+        assert offset % 16 == 0
+
+    tens.storage_shape = [1, 1, 1, offset]
+    tens.weight_compression_scales = compression_scales
+    tens.weight_compression_config = WeightCompressionConfig(npu_block_type, ofm_block, ofm_depth_step)
+    tens.weight_compressed_offsets = compressed_offsets
+    tens.compression_scale_for_worst_weight_stream = np.amax(compression_scales)
+    tens.storage_compression_scale = tens.bandwidth_compression_scale = np.average(compression_scales)
+    tens.compressed_values = encoded_streams
+    tens.brick_size = (weights_shape[0], weights_shape[1], weights_shape[2], min(tens.shape[-1], ofm_depth_step))
+
+
+def calc_scales_and_pack_biases(tens, arch, oc_quantum, rescale_for_faf=False):
+    assert tens.purpose == TensorPurpose.FeatureMap
+    assert tens.format == TensorFormat.NHWC
+    # the connected operator should expect a bias input unless it is a FullyConnected
+    assert "Bias" in tens.consumer_list[0].type or tens.consumer_list[0].type.startswith("FullyConnected")
+    # the input bias tensor is the same as that connected to the operator
+    assert tens is tens.consumer_list[0].inputs[2]
+    # the operator should only have a single output
+    assert len(tens.consumer_list[0].outputs) == 1
+
+    def pack_bias_and_scale(bias, scale, shift):
+        bias = np.int64(bias)
+        assert -(1 << (40 - 1)) <= bias < (1 << (40 - 1))  # signed 40-bit range
+        assert 0 <= scale < (1 << 32)  # unsigned 32-bit range
+        assert 0 <= shift < (1 << 6)  # unsigned 6-bit range
+
+        # pack the 80 bit value = [0(2-bits),shift(6-bits),scale(32-bits),bias(40-bits)]
+        data = bytearray(10)
+        data[0] = (bias >> (0 * 8)) & 0xFF
+        data[1] = (bias >> (1 * 8)) & 0xFF
+        data[2] = (bias >> (2 * 8)) & 0xFF
+        data[3] = (bias >> (3 * 8)) & 0xFF
+        data[4] = (bias >> (4 * 8)) & 0xFF
+        data[5] = (scale >> (0 * 8)) & 0xFF
+        data[6] = (scale >> (1 * 8)) & 0xFF
+        data[7] = (scale >> (2 * 8)) & 0xFF
+        data[8] = (scale >> (3 * 8)) & 0xFF
+        data[9] = shift & 0x3F
+        return data
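+
+    # Illustrative packing (assumed values): bias=-2, scale=0x12345678, shift=20
+    # yields the little-endian byte sequence FE FF FF FF FF 78 56 34 12 14,
+    # i.e. the 40-bit bias, then the 32-bit scale, then the 6-bit shift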
+
+    biases = tens.quant_values
+
+    first_consumer_op = tens.consumer_list[0]
+    ifm_dtype = first_consumer_op.inputs[0].dtype
+    ifm_scale = first_consumer_op.inputs[0].quantization.scale_f32
+    ofm_scale = first_consumer_op.outputs[0].quantization.scale_f32
+    weight_scales = first_consumer_op.inputs[1].quantization.scale_f32
+
+    # Biases can have multiple consumers for RNN cells; if so, check that they are all the same
+    for op in tens.consumer_list[1:]:
+        assert ifm_scale == op.inputs[0].quantization.scale_f32
+        assert ofm_scale == op.outputs[0].quantization.scale_f32
+        assert weight_scales == op.inputs[1].quantization.scale_f32
+
+    if not hasattr(weight_scales, "__iter__"):
+        # If weight_scales is not already an iterable make it into a list
+        weight_scales = [weight_scales]
+
+    # Convert scales to np.double (from np.float32) to conform to TensorFlow Lite which
+    # uses double during scaling calculations
+    # TensorFlow Lite casts the scales slightly differently for uint8 and int8
+    if not rescale_for_faf:
+        if ifm_dtype == DataType.uint8:
+            scales = [np.double(ifm_scale * weight_scale) / np.double(ofm_scale) for weight_scale in weight_scales]
+        elif ifm_dtype == DataType.int8:
+            scales = [
+                (np.double(ifm_scale) * np.double(weight_scale)) / np.double(ofm_scale)
+                for weight_scale in weight_scales
+            ]
+        else:
+            assert False, str(ifm_dtype) + " not implemented"
+    else:
+        if ifm_dtype == DataType.uint8:
+            scales = [np.double(ifm_scale * weight_scale * 0x3000) for weight_scale in weight_scales]
+        elif ifm_dtype == DataType.int8:
+            scales = [(np.double(ifm_scale * 0x3000) * np.double(weight_scale)) for weight_scale in weight_scales]
+        else:
+            assert False, str(ifm_dtype) + " not implemented"
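+
+    # Illustrative example for the non-FAF case (assumed values): ifm_scale=0.5,
+    # weight_scale=0.25 and ofm_scale=0.125 give a rescale factor of
+    # (0.5 * 0.25) / 0.125 = 1.0 before quantisation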
+
+    # quantise all of the weight scales into (scale_factor, shift)
+    quantised_scales = [quantise_scale(scale) for scale in scales]
+
+    for _, shift in quantised_scales:
+        assert shift >= 16
+
+    # pack the biases and scales
+    tens.compressed_values = []
+    if len(quantised_scales) == 1:
+        # If only 1 quantised scale is used, repeat that value for the length of the biases
+        quantised_scales = [quantised_scales[0]] * len(biases)
+
+    assert len(quantised_scales) == len(biases)
+    for i, bias in enumerate(biases):
+        tens.compressed_values.append(pack_bias_and_scale(bias, *quantised_scales[i]))
+
+    tens.element_size_bytes = 10
+
+    # Figure out if we need padded storage (extra whole elements)
+    padding = (len(tens.compressed_values) * tens.element_size_bytes) % 16
+    if padding != 0:
+        padding = 16 - padding
+
+    # This adds enough padding to allow over-reads
+    while padding > 0:
+        tens.compressed_values.append(pack_bias_and_scale(0, 0, 0))
+        padding = padding - tens.element_size_bytes
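+
+    # Illustrative example (assumed count): 7 packed entries occupy 70 bytes;
+    # 70 % 16 = 6, so padding = 10 and one zero entry is appended, giving 80 bytes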
+
+    tens.storage_shape = [len(tens.compressed_values)]
+
+
+def update_pass_weight_and_scale_tensors(nng, arch):
+    def find_npu_usage_of_tensor(tens):
+        # TODO: This function is identical to the one in mark_tensors.py. A common version should be used.
+        for op in tens.consumers():
+            if op.type == "DMA":
+                return find_npu_usage_of_tensor(op.outputs[0])
+            if "npu_block_type" in op.attrs:
+                return op.attrs["npu_block_type"]
+            return NpuBlockType.Default
+
+    for sg in nng.subgraphs:
+        for ps in sg.passes:
+            if ps.weight_tensor is not None:
+                npu_usage_of_tensor = find_npu_usage_of_tensor(ps.weight_tensor)
+                if npu_usage_of_tensor == NpuBlockType.ConvolutionDepthWise:
+                    ps.weight_tensor.quant_values = np.transpose(ps.weight_tensor.quant_values, (0, 1, 3, 2))
+                    ps.weight_tensor.shape = ps.weight_tensor.storage_shape = ps.weight_tensor.bandwidth_shape = list(
+                        ps.weight_tensor.quant_values.shape
+                    )
+                    ps.weight_tensor.weight_transpose_depthwise = True
+
+                needs_dma = len(ps.weight_tensor.ops) == 1 and ps.weight_tensor.ops[0].type == "DMA"
+                if ps.cascade.strategy == SchedulingStrategy.WeightStream and needs_dma:
+                    ofm_depth_step = ps.block_config[-1]
+                else:
+                    ofm_depth_step = ps.weight_tensor.shape[-1]
+
+                compress_weights(
+                    ps.weight_tensor,
+                    arch,
+                    npu_usage_of_tensor,
+                    Block(ps.block_config[-3], ps.block_config[-4], ps.block_config[-1]),
+                    ofm_depth_step,
+                )
+                # Update source tensor
+                if needs_dma:
+                    src_tens = ps.weight_tensor.ops[0].inputs[0]
+                    src_tens.shape = ps.weight_tensor.shape
+                    src_tens.weight_transpose_depthwise = ps.weight_tensor.weight_transpose_depthwise
+                    src_tens.quant_values = ps.weight_tensor.quant_values
+                    src_tens.compressed_values = ps.weight_tensor.compressed_values
+                    src_tens.storage_shape = [1, 1, 1, ps.weight_tensor.weight_compressed_offsets[-1]]
+                    src_tens.brick_size = ps.weight_tensor.brick_size
+                    src_tens.weight_compression_scales = ps.weight_tensor.weight_compression_scales
+                    src_tens.weight_compressed_offsets = ps.weight_tensor.weight_compressed_offsets
+
+            if ps.scale_tensor is not None:
+                rescale_for_faf = False
+                activation_ops = set(("Sigmoid", "Tanh"))
+                if (ps.ops[-1].type in activation_ops) and (ps.npu_block_type != NpuBlockType.ElementWise):
+                    rescale_for_faf = True
+                calc_scales_and_pack_biases(ps.scale_tensor, arch, ps.block_config[3], rescale_for_faf)
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..1a1ae84
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,63 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Packaging for the Vela compiler
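+#
+# Typical developer install from the repository root (illustrative):
+#   pip install -e .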
+
+from os import path
+from setuptools import setup, find_namespace_packages, Extension
+
+# Read the contents of README.md file
+this_directory = path.abspath(path.dirname(__file__))
+with open(path.join(this_directory, "README.md"), encoding="utf-8") as f:
+    long_description = f.read()
+
+mlw_module = Extension(
+    "ethosu.mlw_codec",
+    ["ethosu/mlw_codec/mlw_encode.c", "ethosu/mlw_codec/mlw_decode.c", "ethosu/mlw_codec/mlw_codecmodule.c"],
+)
+
+setup(
+    name="ethos-u-vela",
+    use_scm_version=True,
+    description="Optimise TensorFlow Lite models for Ethos-U55 NPU.",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    url="https://git.mlplatform.org/ml/ethos-u/ethos-u-vela.git/",
+    author="Arm Ltd.",
+    license="Apache License 2.0",
+    classifiers=[
+        "Development Status :: 4 - Beta",
+        "Intended Audience :: Developers",
+        "License :: OSI Approved :: Apache Software License",
+        "Operating System :: POSIX :: Linux",
+        "Programming Language :: C",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.6",
+        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+        "Topic :: Software Development :: Compilers",
+    ],
+    keywords=["ethos-u", "vela compiler", "tflite", "npu"],
+    packages=find_namespace_packages(include=["ethosu.*"]),
+    python_requires="~=3.6",  # We support only 3.6+
+    install_requires=["flatbuffers==1.11.0", "numpy>=1.16.6"],
+    entry_points={"console_scripts": ["vela = ethosu.vela.vela:main"]},
+    ext_modules=[mlw_module],
+    setup_requires=["setuptools_scm"],
+)