[MLBEDSW-297] Setup and run on Microsoft Windows

Various updates to make Vela run on Microsoft Windows and produce
output identical to that on other platforms.
* Fixed overflow errors
* Fixed compile warnings
* Avoided problematic numpy version
* Updated README.md

Signed-off-by: Fredrik Svedberg <Fredrik.Svedberg@arm.com>
Change-Id: Ie48c63a92a00c81b3247d07f05b75d881319ddbb
diff --git a/ethosu/mlw_codec/mlw_codecmodule.c b/ethosu/mlw_codec/mlw_codecmodule.c
index de945ab..6dde12d 100644
--- a/ethosu/mlw_codec/mlw_codecmodule.c
+++ b/ethosu/mlw_codec/mlw_codecmodule.c
@@ -53,7 +53,7 @@
     return NULL;
 
   /* Unpack the length of the input integer list.  */
-  int input_length = PyObject_Length (input_list_object);
+  int input_length = (int)PyObject_Length (input_list_object);
   if (input_length < 0)
     input_length = 0;
 
@@ -73,7 +73,7 @@
       item = PyList_GetItem(input_list_object, i);
       if (!PyLong_Check(item))
         input_buffer[i] = 0;
-      input_buffer[i] = PyLong_AsLong(item);
+      input_buffer[i] = (int16_t)PyLong_AsLong(item);
     }
 
   /* We don't know the output length required, we guess worst case,
@@ -126,7 +126,7 @@
 
   /* Unpack the input buffer and length from the bytearray object.  */
   uint8_t *input_buffer = (uint8_t *) PyByteArray_AsString(input_bytearray_object);
-  int input_length = PyByteArray_Size(input_bytearray_object);
+  int input_length = (int)PyByteArray_Size(input_bytearray_object);
 
   /* We don't know the output length required, we guess, but the guess
    * will be too small, the mlw_decode call will do a resize (upwards)
diff --git a/ethosu/mlw_codec/mlw_encode.c b/ethosu/mlw_codec/mlw_encode.c
index 7820106..04afa3e 100644
--- a/ethosu/mlw_codec/mlw_encode.c
+++ b/ethosu/mlw_codec/mlw_encode.c
@@ -33,8 +33,12 @@
 
 #define ZERO_RUN_THRES  4
 
+#ifndef min
 #define min(a,b) ((a)<(b)?(a):(b))
+#endif
+#ifndef max
 #define max(a,b) ((a)>(b)?(a):(b))
+#endif
 
 typedef struct palette {
     int16_t lut[32];
@@ -258,7 +262,7 @@
     // Setup the 32 entry palette
     int palette_max_val = 0, val, cnt, pal_cnt=0;
     for(i=0; i<max_palette_size; i++) {
-        cnt = freq64[i]>>16;
+        cnt = (int)(freq64[i]>>16);
         val = freq64[i]&0xffff;
         if ( cnt==0 )
             break;
diff --git a/ethosu/tensor_allocator/search_allocator.cpp b/ethosu/tensor_allocator/search_allocator.cpp
index ce5c46d..c7c418a 100644
--- a/ethosu/tensor_allocator/search_allocator.cpp
+++ b/ethosu/tensor_allocator/search_allocator.cpp
@@ -31,7 +31,7 @@
     uint32_t max_end_time = 0;
     for (size_t i = 0; i < lrs.size(); ++i) {
         auto &lr = lrs[i];
-        lr.id = i;
+        lr.id = static_cast<int>(i);
         max_end_time = std::max(max_end_time, lr.end_time);
     }
     lrs_at_time.resize(max_end_time + 1);
diff --git a/ethosu/tensor_allocator/tensor_allocatormodule.cpp b/ethosu/tensor_allocator/tensor_allocatormodule.cpp
index 79ee95a..02488ad 100644
--- a/ethosu/tensor_allocator/tensor_allocatormodule.cpp
+++ b/ethosu/tensor_allocator/tensor_allocatormodule.cpp
@@ -53,7 +53,7 @@
     }
 
     /* Unpack the length of the input integer list. */
-    int input_length = PyObject_Length (input_list_object);
+    int input_length = static_cast<int>(PyObject_Length (input_list_object));
     if (input_length < 0) {
         input_length = 0;
     }
diff --git a/ethosu/vela/fp_math.py b/ethosu/vela/fp_math.py
index 6637561..5228f03 100644
--- a/ethosu/vela/fp_math.py
+++ b/ethosu/vela/fp_math.py
@@ -41,7 +41,7 @@
     if a == b and a == np.iinfo(np.int32).min:
         return np.int32(np.iinfo(np.int32).max)
     divider = 1 << 31
-    ab = a * b
+    ab = np.int64(a) * np.int64(b)
     if ab >= 0:
         nudge = 1 << 30
         return (ab + nudge) // divider
diff --git a/ethosu/vela/softmax.py b/ethosu/vela/softmax.py
index 1bdab74..8b06129 100644
--- a/ethosu/vela/softmax.py
+++ b/ethosu/vela/softmax.py
@@ -520,7 +520,7 @@
                 [1, 1, 1, 512],
                 DataType.int32,
                 self.ONE_OVER_ONE_PLUS_X_LUT,
-                np.int32,
+                np.uint32,
                 TensorPurpose.LUT,
             )
         )