Blame - ethosu/vela/softmax.py - ml/ethos-u/ethos-u-vela

blob: 8c980ad448b356ea13ebc556234a413b2ba5316e [file] [log] [blame]

Fredrik Svedberg	a0c3624	2020-06-03 15:43:31 +0200	[diff] [blame]	1	# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
				2	#
Fredrik Svedberg	1575b94	2020-08-18 13:19:18 +0200	[diff] [blame]	3	# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
				4	#
Fredrik Svedberg	a0c3624	2020-06-03 15:43:31 +0200	[diff] [blame]	5	# SPDX-License-Identifier: Apache-2.0
				6	#
Fredrik Svedberg	1575b94	2020-08-18 13:19:18 +0200	[diff] [blame]	7	# Licensed under the Apache License, Version 2.0 (the "License");
				8	# you may not use this file except in compliance with the License.
Fredrik Svedberg	a0c3624	2020-06-03 15:43:31 +0200	[diff] [blame]	9	# You may obtain a copy of the License at
				10	#
Fredrik Svedberg	1575b94	2020-08-18 13:19:18 +0200	[diff] [blame]	11	# http://www.apache.org/licenses/LICENSE-2.0
Fredrik Svedberg	a0c3624	2020-06-03 15:43:31 +0200	[diff] [blame]	12	#
				13	# Unless required by applicable law or agreed to in writing, software
Fredrik Svedberg	1575b94	2020-08-18 13:19:18 +0200	[diff] [blame]	14	# distributed under the License is distributed on an "AS IS" BASIS,
				15	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
Fredrik Svedberg	a0c3624	2020-06-03 15:43:31 +0200	[diff] [blame]	16	# See the License for the specific language governing permissions and
				17	# limitations under the License.
Fredrik Svedberg	1575b94	2020-08-18 13:19:18 +0200	[diff] [blame]	18	#
Fredrik Svedberg	a0c3624	2020-06-03 15:43:31 +0200	[diff] [blame]	19	# Description:
				20	# Contains SoftMax
Fredrik Svedberg	1575b94	2020-08-18 13:19:18 +0200	[diff] [blame]	21	import math
				22
Fredrik Svedberg	a0c3624	2020-06-03 15:43:31 +0200	[diff] [blame]	23	import numpy as np
				24
Fredrik Svedberg	1575b94	2020-08-18 13:19:18 +0200	[diff] [blame]	25	from . import fp_math
Fredrik Svedberg	a0c3624	2020-06-03 15:43:31 +0200	[diff] [blame]	26	from . import scaling
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	27	from .api import NpuRoundingMode
Fredrik Svedberg	a0c3624	2020-06-03 15:43:31 +0200	[diff] [blame]	28	from .data_type import DataType
Tim Hall	e6ccd87	2020-11-09 16:46:37 +0000	[diff] [blame]	29	from .debug_database import DebugDatabase
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	30	from .operation import ActivationFunction
Louis Verhaard	aee5d75	2020-09-30 09:01:52 +0200	[diff] [blame]	31	from .operation import Op
Fredrik Svedberg	a0c3624	2020-06-03 15:43:31 +0200	[diff] [blame]	32	from .operation import Operation
Michael McGeagh	5778ffd	2020-08-06 17:31:02 +0100	[diff] [blame]	33	from .tensor import create_const_tensor
				34	from .tensor import create_reshape_tensor
Fredrik Svedberg	a0c3624	2020-06-03 15:43:31 +0200	[diff] [blame]	35	from .tensor import Tensor
				36	from .tensor import TensorPurpose
				37
				38
Fredrik Svedberg	a0c3624	2020-06-03 15:43:31 +0200	[diff] [blame]	39	class SoftMax:
				40	# Turn off black formatting for the LUT tables to keep them compact
				41	# fmt: off
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	42
Fredrik Svedberg	a0c3624	2020-06-03 15:43:31 +0200	[diff] [blame]	43	EXP_LUT = [
				44	0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002,
				45	0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002,
				46	0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002,
				47	0x00000002, 0x00000002, 0x00010002, 0x00000003, 0x00000003, 0x00000003, 0x00000003, 0x00000003,
				48	0x00000003, 0x00000003, 0x00000003, 0x00000003, 0x00000003, 0x00000003, 0x00000003, 0x00000003,
				49	0x00000003, 0x00000003, 0x00000003, 0x00010003, 0x00000004, 0x00000004, 0x00000004, 0x00000004,
				50	0x00000004, 0x00000004, 0x00000004, 0x00000004, 0x00000004, 0x00000004, 0x00000004, 0x00000004,
				51	0x00010004, 0x00000005, 0x00000005, 0x00000005, 0x00000005, 0x00000005, 0x00000005, 0x00000005,
				52	0x00000005, 0x00000005, 0x00010005, 0x00000006, 0x00000006, 0x00000006, 0x00000006, 0x00000006,
				53	0x00000006, 0x00000006, 0x00010006, 0x00000007, 0x00000007, 0x00000007, 0x00000007, 0x00000007,
				54	0x00000007, 0x00000007, 0x00010007, 0x00000008, 0x00000008, 0x00000008, 0x00000008, 0x00000008,
				55	0x00010008, 0x00000009, 0x00000009, 0x00000009, 0x00000009, 0x00000009, 0x00010009, 0x0000000a,
				56	0x0000000a, 0x0000000a, 0x0000000a, 0x0001000a, 0x0000000b, 0x0000000b, 0x0000000b, 0x0000000b,
				57	0x0001000b, 0x0000000c, 0x0000000c, 0x0000000c, 0x0001000c, 0x0000000d, 0x0000000d, 0x0000000d,
				58	0x0001000d, 0x0000000e, 0x0000000e, 0x0000000e, 0x0001000e, 0x0000000f, 0x0000000f, 0x0001000f,
				59	0x00000010, 0x00000010, 0x00010010, 0x00000011, 0x00000011, 0x00010011, 0x00000012, 0x00000012,
				60	0x00010012, 0x00000013, 0x00000013, 0x00010013, 0x00000014, 0x00010014, 0x00000015, 0x00000015,
				61	0x00010015, 0x00000016, 0x00010016, 0x00000017, 0x00010017, 0x00000018, 0x00010018, 0x00000019,
				62	0x00010019, 0x0000001a, 0x0001001a, 0x0000001b, 0x0001001b, 0x0000001c, 0x0001001c, 0x0000001d,
				63	0x0001001d, 0x0000001e, 0x0001001e, 0x0001001f, 0x00000020, 0x00010020, 0x00010021, 0x00000022,
				64	0x00010022, 0x00010023, 0x00000024, 0x00010024, 0x00000025, 0x00010025, 0x00010026, 0x00010027,
				65	0x00000028, 0x00020028, 0x0000002a, 0x0001002a, 0x0001002b, 0x0001002c, 0x0000002d, 0x0001002d,
				66	0x0001002e, 0x0001002f, 0x00010030, 0x00010031, 0x00010032, 0x00010033, 0x00010034, 0x00010035,
				67	0x00010036, 0x00010037, 0x00010038, 0x00020039, 0x0001003b, 0x0000003c, 0x0002003c, 0x0001003e,
				68	0x0002003f, 0x00000041, 0x00020041, 0x00010043, 0x00010044, 0x00020045, 0x00020047, 0x00010049,
				69	0x0001004a, 0x0002004b, 0x0001004d, 0x0002004e, 0x00010050, 0x00020051, 0x00020053, 0x00010055,
				70	0x00020056, 0x00020058, 0x0002005a, 0x0001005c, 0x0002005d, 0x0002005f, 0x00020061, 0x00020063,
				71	0x00020065, 0x00020067, 0x00020069, 0x0002006b, 0x0003006d, 0x00020070, 0x00020072, 0x00020074,
				72	0x00030076, 0x00020079, 0x0003007b, 0x0002007e, 0x00030080, 0x00020083, 0x00020085, 0x00040087,
				73	0x0002008b, 0x0003008d, 0x00030090, 0x00020093, 0x00030095, 0x00030098, 0x0003009b, 0x0004009e,
				74	0x000300a2, 0x000300a5, 0x000300a8, 0x000300ab, 0x000400ae, 0x000300b2, 0x000400b5, 0x000400b9,
				75	0x000300bd, 0x000400c0, 0x000400c4, 0x000400c8, 0x000400cc, 0x000400d0, 0x000500d4, 0x000400d9,
				76	0x000400dd, 0x000500e1, 0x000400e6, 0x000500ea, 0x000400ef, 0x000500f3, 0x000500f8, 0x000500fd,
				77	0x00050102, 0x00050107, 0x0005010c, 0x00060111, 0x00050117, 0x0006011c, 0x00060122, 0x00060128,
				78	0x0006012e, 0x00060134, 0x0006013a, 0x00070140, 0x00060147, 0x0007014d, 0x00060154, 0x0007015a,
				79	0x00070161, 0x00060168, 0x0008016e, 0x00070176, 0x0008017d, 0x00080185, 0x0007018d, 0x00090194,
				80	0x0008019d, 0x000801a5, 0x000801ad, 0x000901b5, 0x000901be, 0x000901c7, 0x000901d0, 0x000901d9,
				81	0x000a01e2, 0x000901ec, 0x000a01f5, 0x000b01ff, 0x000a020a, 0x000b0214, 0x000a021f, 0x000b0229,
				82	0x000b0234, 0x000b023f, 0x000c024a, 0x000c0256, 0x000c0262, 0x000c026e, 0x000c027a, 0x000d0286,
				83	0x000d0293, 0x000d02a0, 0x000e02ad, 0x000e02bb, 0x000e02c9, 0x000e02d7, 0x000f02e5, 0x000f02f4,
				84	0x000f0303, 0x000f0312, 0x00100321, 0x00100331, 0x00110341, 0x00100352, 0x00120362, 0x00110374,
				85	0x00120385, 0x00120397, 0x001203a9, 0x001303bb, 0x001303ce, 0x001403e1, 0x001403f5, 0x00140409,
				86	0x0015041d, 0x00150432, 0x00160447, 0x0016045d, 0x00160473, 0x00170489, 0x001704a0, 0x001904b7,
				87	0x001804d0, 0x001904e8, 0x00190501, 0x001a051a, 0x001a0534, 0x001b054e, 0x001b0569, 0x001c0584,
				88	0x001c05a0, 0x001d05bc, 0x001e05d9, 0x001e05f7, 0x001e0615, 0x00200633, 0x00200653, 0x00200673,
				89	0x00210693, 0x002206b4, 0x002306d6, 0x002306f9, 0x0024071c, 0x00240740, 0x00260764, 0x0026078a,
				90	0x002607b0, 0x002807d6, 0x002907fe, 0x00290827, 0x002a0850, 0x002a087a, 0x002c08a4, 0x002c08d0,
				91	0x002e08fc, 0x002e092a, 0x002f0958, 0x00310987, 0x003109b8, 0x003209e9, 0x00330a1b, 0x00340a4e,
				92	0x00350a82, 0x00350ab7, 0x00380aec, 0x00380b24, 0x003a0b5c, 0x003a0b96, 0x003c0bd0, 0x003d0c0c,
				93	0x003e0c49, 0x003f0c87, 0x00400cc6, 0x00420d06, 0x00430d48, 0x00440d8b, 0x00460dcf, 0x00480e15,
				94	0x00480e5d, 0x00490ea5, 0x004c0eee, 0x004d0f3a, 0x004e0f87, 0x00500fd5, 0x00511025, 0x00531076,
				95	0x005610c9, 0x0056111f, 0x00581175, 0x005a11cd, 0x005c1227, 0x005e1283, 0x005e12e1, 0x0061133f,
				96	0x006413a0, 0x00651404, 0x00671469, 0x006914d0, 0x006c1539, 0x006c15a5, 0x00701611, 0x00721681,
				97	0x007416f3, 0x00761767, 0x007917dd, 0x007a1856, 0x007d18d0, 0x0080194d, 0x008319cd, 0x00841a50,
				98	0x00881ad4, 0x00891b5c, 0x008d1be5, 0x00911c72, 0x00911d03, 0x00961d94, 0x00981e2a, 0x009c1ec2,
				99	0x009e1f5e, 0x00a21ffc, 0x00a4209e, 0x00a92142, 0x00ab21eb, 0x00ae2296, 0x00b22344, 0x00b523f6,
				100	0x00b924ab, 0x00be2564, 0x00c02622, 0x00c526e2, 0x00c827a7, 0x00cc286f, 0x00d0293b, 0x00d52a0b,
				101	0x00d72ae0, 0x00dd2bb7, 0x00e12c94, 0x00e62d75, 0x00eb2e5b, 0x00ef2f46, 0x00f23035, 0x00f83127,
				102	0x00fe321f, 0x0101331d, 0x0108341e, 0x010c3526, 0x01123632, 0x01173744, 0x011c385b, 0x01233977,
				103	0x01273a9a, 0x012e3bc1, 0x01343cef, 0x013a3e23, 0x01403f5d, 0x0146409d, 0x014c41e3, 0x0154432f,
				104	0x01594483, 0x016145dc, 0x0168473d, 0x016f48a5, 0x01764a14, 0x017d4b8a, 0x01854d07, 0x018d4e8c,
				105	0x01945019, 0x019d51ad, 0x01a4534a, 0x01ad54ee, 0x01b5569b, 0x01be5850, 0x01c75a0e, 0x01d05bd5,
				106	0x01d85da5, 0x01e35f7d, 0x01eb6160, 0x01f6634b, 0x01ff6541, 0x02096740, 0x02146949, 0x021e6b5d,
				107	0x02296d7b, 0x02336fa4, 0x023f71d7, 0x024a7416, 0x02567660, 0x026278b6, 0x026d7b18, 0x027a7d85,
				108	]
				109
				110	ONE_OVER_ONE_PLUS_X_LUT = [
				111	0xffc17fff, 0xffc07fc0, 0xffc27f80, 0xffc07f42, 0xffc17f02, 0xffc17ec3, 0xffc27e84, 0xffc27e46,
				112	0xffc27e08, 0xffc37dca, 0xffc27d8d, 0xffc37d4f, 0xffc37d12, 0xffc37cd5, 0xffc37c98, 0xffc47c5b,
				113	0xffc47c1f, 0xffc47be3, 0xffc57ba7, 0xffc57b6c, 0xffc37b31, 0xffc67af4, 0xffc57aba, 0xffc67a7f,
				114	0xffc57a45, 0xffc67a0a, 0xffc779d0, 0xffc67997, 0xffc6795d, 0xffc77923, 0xffc778ea, 0xffc778b1,
				115	0xffc87878, 0xffc77840, 0xffc87807, 0xffc877cf, 0xffc97797, 0xffc87760, 0xffc97728, 0xffc976f1,
				116	0xffc976ba, 0xffc87683, 0xffca764b, 0xffca7615, 0xffca75df, 0xffca75a9, 0xffca7573, 0xffcb753d,
				117	0xffca7508, 0xffcb74d2, 0xffcb749d, 0xffca7468, 0xffcc7432, 0xffcc73fe, 0xffcb73ca, 0xffcc7395,
				118	0xffcd7361, 0xffcc732e, 0xffcc72fa, 0xffcd72c6, 0xffcd7293, 0xffcd7260, 0xffcc722d, 0xffce71f9,
				119	0xffcd71c7, 0xffce7194, 0xffce7162, 0xffce7130, 0xffcf70fe, 0xffce70cd, 0xffce709b, 0xffcf7069,
				120	0xffcf7038, 0xffcf7007, 0xffcf6fd6, 0xffcf6fa5, 0xffd06f74, 0xffd06f44, 0xffd06f14, 0xffd06ee4,
				121	0xffd06eb4, 0xffd06e84, 0xffd16e54, 0xffd16e25, 0xffd16df6, 0xffd16dc7, 0xffd06d98, 0xffd26d68,
				122	0xffd16d3a, 0xffd26d0b, 0xffd26cdd, 0xffd26caf, 0xffd26c81, 0xffd26c53, 0xffd36c25, 0xffd26bf8,
				123	0xffd36bca, 0xffd36b9d, 0xffd36b70, 0xffd26b43, 0xffd46b15, 0xffd36ae9, 0xffd46abc, 0xffd46a90,
				124	0xffd46a64, 0xffd46a38, 0xffd46a0c, 0xffd469e0, 0xffd469b4, 0xffd56988, 0xffd5695d, 0xffd56932,
				125	0xffd56907, 0xffd568dc, 0xffd568b1, 0xffd56886, 0xffd6685b, 0xffd56831, 0xffd66806, 0xffd667dc,
				126	0xffd667b2, 0xffd76788, 0xffd6675f, 0xffd76735, 0xffd6670c, 0xffd766e2, 0xffd666b9, 0xffd7668f,
				127	0xffd86666, 0xffd6663e, 0xffd86614, 0xffd765ec, 0xffd865c3, 0xffd8659b, 0xffd86573, 0xffd8654b,
				128	0xffd86523, 0xffd864fb, 0xffd964d3, 0xffd864ac, 0xffd96484, 0xffd8645d, 0xffd96435, 0xffd9640e,
				129	0xffd963e7, 0xffd963c0, 0xffd96399, 0xffda6372, 0xffd9634c, 0xffda6325, 0xffda62ff, 0xffda62d9,
				130	0xffda62b3, 0xffda628d, 0xffda6267, 0xffdb6241, 0xffda621c, 0xffdb61f6, 0xffda61d1, 0xffdc61ab,
				131	0xffd96187, 0xffdc6160, 0xffdb613c, 0xffdb6117, 0xffdb60f2, 0xffdc60cd, 0xffdc60a9, 0xffdb6085,
				132	0xffdc6060, 0xffdc603c, 0xffdc6018, 0xffdc5ff4, 0xffdc5fd0, 0xffdd5fac, 0xffdc5f89, 0xffdc5f65,
				133	0xffdd5f41, 0xffdd5f1e, 0xffdd5efb, 0xffdd5ed8, 0xffdd5eb5, 0xffdd5e92, 0xffdd5e6f, 0xffdd5e4c,
				134	0xffdd5e29, 0xffde5e06, 0xffde5de4, 0xffdd5dc2, 0xffde5d9f, 0xffde5d7d, 0xffde5d5b, 0xffde5d39,
				135	0xffdf5d17, 0xffde5cf6, 0xffde5cd4, 0xffdf5cb2, 0xffdf5c91, 0xffde5c70, 0xffdf5c4e, 0xffdf5c2d,
				136	0xffde5c0c, 0xffe05bea, 0xffdf5bca, 0xffdf5ba9, 0xffdf5b88, 0xffdf5b67, 0xffe05b46, 0xffe05b26,
				137	0xffdf5b06, 0xffe05ae5, 0xffe05ac5, 0xffe05aa5, 0xffe05a85, 0xffe05a65, 0xffe05a45, 0xffe15a25,
				138	0xffe05a06, 0xffe059e6, 0xffe159c6, 0xffe159a7, 0xffe05988, 0xffe15968, 0xffe15949, 0xffe1592a,
				139	0xffe1590b, 0xffe158ec, 0xffe258cd, 0xffe158af, 0xffe15890, 0xffe25871, 0xffe15853, 0xffe25834,
				140	0xffe25816, 0xffe257f8, 0xffe157da, 0xffe257bb, 0xffe3579d, 0xffe25780, 0xffe25762, 0xffe25744,
				141	0xffe35726, 0xffe25709, 0xffe256eb, 0xffe356cd, 0xffe356b0, 0xffe35693, 0xffe25676, 0xffe35658,
				142	0xffe3563b, 0xffe3561e, 0xffe35601, 0xffe355e4, 0xffe455c7, 0xffe355ab, 0xffe4558e, 0xffe35572,
				143	0xffe45555, 0xffe35539, 0xffe4551c, 0xffe45500, 0xffe454e4, 0xffe454c8, 0xffe454ac, 0xffe45490,
				144	0xffe45474, 0xffe55458, 0xffe4543d, 0xffe45421, 0xffe55405, 0xffe553ea, 0xffe453cf, 0xffe553b3,
				145	0xffe45398, 0xffe5537c, 0xffe55361, 0xffe55346, 0xffe5532b, 0xffe55310, 0xffe552f5, 0xffe552da,
				146	0xffe652bf, 0xffe552a5, 0xffe5528a, 0xffe6526f, 0xffe55255, 0xffe6523a, 0xffe65220, 0xffe55206,
				147	0xffe651eb, 0xffe651d1, 0xffe651b7, 0xffe6519d, 0xffe65183, 0xffe65169, 0xffe7514f, 0xffe65136,
				148	0xffe6511c, 0xffe75102, 0xffe650e9, 0xffe750cf, 0xffe650b6, 0xffe7509c, 0xffe75083, 0xffe6506a,
				149	0xffe75050, 0xffe75037, 0xffe7501e, 0xffe75005, 0xffe74fec, 0xffe74fd3, 0xffe74fba, 0xffe74fa1,
				150	0xffe84f88, 0xffe74f70, 0xffe84f57, 0xffe74f3f, 0xffe84f26, 0xffe74f0e, 0xffe84ef5, 0xffe84edd,
				151	0xffe84ec5, 0xffe84ead, 0xffe74e95, 0xffe84e7c, 0xffe84e64, 0xffe94e4c, 0xffe84e35, 0xffe84e1d,
				152	0xffe84e05, 0xffe94ded, 0xffe84dd6, 0xffe84dbe, 0xffe94da6, 0xffe94d8f, 0xffe84d78, 0xffe84d60,
				153	0xffea4d48, 0xffe84d32, 0xffe94d1a, 0xffe94d03, 0xffe84cec, 0xffe94cd4, 0xffe94cbd, 0xffea4ca6,
				154	0xffe94c90, 0xffe84c79, 0xffea4c61, 0xffe94c4b, 0xffe94c34, 0xffea4c1d, 0xffe94c07, 0xffea4bf0,
				155	0xffe94bda, 0xffea4bc3, 0xffea4bad, 0xffe94b97, 0xffea4b80, 0xffea4b6a, 0xffea4b54, 0xffea4b3e,
				156	0xffea4b28, 0xffea4b12, 0xffea4afc, 0xffea4ae6, 0xffea4ad0, 0xffeb4aba, 0xffea4aa5, 0xffea4a8f,
				157	0xffeb4a79, 0xffea4a64, 0xffea4a4e, 0xffeb4a38, 0xffeb4a23, 0xffea4a0e, 0xffeb49f8, 0xffea49e3,
				158	0xffeb49cd, 0xffeb49b8, 0xffeb49a3, 0xffeb498e, 0xffea4979, 0xffeb4963, 0xffeb494e, 0xffec4939,
				159	0xffeb4925, 0xffea4910, 0xffec48fa, 0xffeb48e6, 0xffeb48d1, 0xffec48bc, 0xffeb48a8, 0xffec4893,
				160	0xffeb487f, 0xffec486a, 0xffeb4856, 0xffec4841, 0xffec482d, 0xffeb4819, 0xffec4804, 0xffec47f0,
				161	0xffec47dc, 0xffec47c8, 0xffec47b4, 0xffec47a0, 0xffec478c, 0xffec4778, 0xffec4764, 0xffec4750,
				162	0xffec473c, 0xffed4728, 0xffec4715, 0xffec4701, 0xffed46ed, 0xffec46da, 0xffed46c6, 0xffec46b3,
				163	0xffec469f, 0xffed468b, 0xffed4678, 0xffec4665, 0xffed4651, 0xffed463e, 0xffed462b, 0xffec4618,
				164	0xffed4604, 0xffed45f1, 0xffed45de, 0xffed45cb, 0xffed45b8, 0xffed45a5, 0xffed4592, 0xffed457f,
				165	0xffee456c, 0xffed455a, 0xffed4547, 0xffed4534, 0xffee4521, 0xffed450f, 0xffed44fc, 0xffee44e9,
				166	0xffed44d7, 0xffee44c4, 0xffee44b2, 0xffed44a0, 0xffee448d, 0xffee447b, 0xffed4469, 0xffee4456,
				167	0xffee4444, 0xffee4432, 0xffee4420, 0xffee440e, 0xffee43fc, 0xffee43ea, 0xffee43d8, 0xffee43c6,
				168	0xffee43b4, 0xffee43a2, 0xffee4390, 0xffef437e, 0xffee436d, 0xffee435b, 0xffef4349, 0xffee4338,
				169	0xffee4326, 0xffef4314, 0xffee4303, 0xffef42f1, 0xffee42e0, 0xffef42ce, 0xffee42bd, 0xffef42ab,
				170	0xffef429a, 0xffee4289, 0xfff04277, 0xffee4267, 0xffef4255, 0xffef4244, 0xffef4233, 0xffef4222,
				171	0xffee4211, 0xffef41ff, 0xfff041ee, 0xffef41de, 0xffef41cd, 0xffee41bc, 0xfff041aa, 0xffef419a,
				172	0xffef4189, 0xffef4178, 0xfff04167, 0xffef4157, 0xffef4146, 0xfff04135, 0xffef4125, 0xfff04114,
				173	0xffef4104, 0xfff040f3, 0xffef40e3, 0xfff040d2, 0xfff040c2, 0xffef40b2, 0xfff040a1, 0xfff04091,
				174	0xfff04081, 0xffef4071, 0xfff04060, 0xfff04050, 0xfff04040, 0xfff04030, 0xfff04020, 0xfff04010
				175	]
				176	# fmt: on
				177
				178	def __init__(self, op):
				179	self.op = op
				180
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	181	def generate_exp_table(self, beta, input_scale):
Fredrik Svedberg	1575b94	2020-08-18 13:19:18 +0200	[diff] [blame]	182	integer_bits = 5
				183	total_signed_bits = 31
				184	# Calculate scaling
				185	real_beta = min(
				186	np.double(beta) * np.double(input_scale) * (1 << (31 - integer_bits)), np.double((1 << 31) - 1.0)
				187	)
				188	scale, shift = scaling.quantise_scale(real_beta)
				189	shift = 31 - shift
				190	diff_min = -1.0 * math.floor(
				191	1.0 * ((1 << integer_bits) - 1) * (1 << (total_signed_bits - integer_bits)) / (1 << shift)
				192	)
				193	# Generate the exp LUT
				194	lut = []
				195	for x in range(256):
				196	input_diff = x - 255
				197	if input_diff >= diff_min:
				198	rescale = fp_math.saturating_rounding_mul(input_diff * (1 << shift), scale)
				199	lut.append(fp_math.exp_on_negative_values(rescale))
				200	else:
				201	lut.append(0)
				202	return lut
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	203
Fredrik Svedberg	a0c3624	2020-06-03 15:43:31 +0200	[diff] [blame]	204	def get_graph(self):
				205	ifm = self.op.inputs[0]
				206	ofm = self.op.outputs[0]
				207
Fredrik Svedberg	835d8e1	2020-09-04 09:46:17 +0200	[diff] [blame]	208	# Reshape ifm/ofm (if needed)
				209	full_shape = ifm.get_full_shape()
				210	if full_shape[0] > 1:
				211	full_shape[1] *= full_shape[0]
				212	full_shape[0] = 1
				213	ifm = create_reshape_tensor(ifm, full_shape)
				214	ofm = create_reshape_tensor(ofm, full_shape, False)
				215
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	216	if ifm.dtype in (DataType.uint8, DataType.int8) and ofm.dtype == ifm.dtype:
				217	return self.get_graph_8bit(ifm, ofm)
				218	elif ifm.dtype == DataType.int16 and ofm.dtype == DataType.int16:
Fredrik Svedberg	a0c3624	2020-06-03 15:43:31 +0200	[diff] [blame]	219	return self.get_graph_int16(ifm, ofm)
				220	else:
				221	self.op.run_on_npu = False
				222	return self.op
				223
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	224	def get_graph_8bit(self, ifm, ofm):
				225	exp_lut = self.generate_exp_table(self.op.attrs.get("beta", 1.0), ifm.quantization.scale_f32)
Tim Hall	e6ccd87	2020-11-09 16:46:37 +0000	[diff] [blame]	226	ifm = create_reshape_tensor(ifm, ifm.get_full_shape())
				227	DebugDatabase.add_optimised(self.op, ifm.ops[0])
				228	ofm = create_reshape_tensor(ofm, ofm.get_full_shape(), False)
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	229	no_scale_quant = ifm.quantization.clone()
				230	no_scale_quant.scale_f32 = None
				231	no_scale_quant.zero_point = 0
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	232	activation = ActivationFunction(Op.Clip)
				233	activation.min = ifm.quantization.quant_min
				234	activation.max = ifm.quantization.quant_max
				235	activation2 = activation.clone()
				236	activation2.min = 2 * ifm.quantization.quant_min
				237	activation2.max = 2 * ifm.quantization.quant_max
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	238	one_scale_quant = ifm.quantization.clone()
				239	one_scale_quant.scale_f32 = 1.0
				240	one_scale_quant.zero_point = 0
				241	ifm.quantization.zero_point = 0
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	242	pass_number = 0
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	243
				244	# PASS 0 - Depthwise Maxpool
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	245	maxpool_op = self.op.clone(f"_maxpool{pass_number}")
Louis Verhaard	aee5d75	2020-09-30 09:01:52 +0200	[diff] [blame]	246	maxpool_op.type = Op.MaxPool
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	247	maxpool_h = ifm.shape[1] * ifm.shape[2]
				248	maxpool_w = ifm.shape[3]
				249	maxpool_ifm_shape = [1, maxpool_h, maxpool_w, 1]
				250	maxpool_op.attrs["padding"] = b"VALID"
				251	maxpool_op.attrs["stride_w"] = 1
				252	maxpool_op.attrs["stride_h"] = 1
				253	maxpool_op.attrs["filter_width"] = maxpool_w
				254	maxpool_op.attrs["filter_height"] = 1
				255	maxpool_op.attrs["strides"] = [1, maxpool_op.attrs["stride_h"], maxpool_op.attrs["stride_w"], 1]
				256	maxpool_op.attrs["ksize"] = [1, maxpool_op.attrs["filter_height"], maxpool_op.attrs["filter_width"], 1]
				257	maxpool_op.inputs = [create_reshape_tensor(ifm, maxpool_ifm_shape)]
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	258	ifm_max = Tensor([1, maxpool_h, 1, 1], ifm.dtype, f"{maxpool_op.name}_0")
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	259	ifm_max.quantization = no_scale_quant
				260	maxpool_op.set_output_tensor(ifm_max)
Tim Hall	e6ccd87	2020-11-09 16:46:37 +0000	[diff] [blame]	261	DebugDatabase.add_optimised(self.op, maxpool_op)
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	262	pass_number += 1
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	263
				264	# PASS 1 - Sub+LUT(exp)
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	265	sub_op = Operation(Op.Sub, f"{self.op.name}_sub{pass_number}")
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	266	sub_op.add_input_tensor(ifm)
Fredrik Svedberg	835d8e1	2020-09-04 09:46:17 +0200	[diff] [blame]	267	sub_op.add_input_tensor(create_reshape_tensor(ifm_max, [1, ifm.shape[1], ifm.shape[2], 1]))
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	268	sub_op.set_activation_lut(
				269	create_const_tensor(
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	270	f"{sub_op.name}_lut", [1, 1, 1, 256], DataType.int32, exp_lut, np.int32, TensorPurpose.LUT
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	271	)
				272	)
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	273	ifm_exp = Tensor(ifm.shape, DataType.int32, f"{sub_op.name}_0")
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	274	ifm_exp.quantization = one_scale_quant.clone()
				275	ifm_exp.quantization.zero_point = 127
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	276	sub_op.activation = ActivationFunction(Op.LUT)
				277	# Note: activation.min/max are non-quantized values
				278	sub_op.activation.min = -128 - ifm_exp.quantization.zero_point
				279	sub_op.activation.max = 127 - ifm_exp.quantization.zero_point
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	280	sub_op.set_output_tensor(ifm_exp)
Tim Hall	e6ccd87	2020-11-09 16:46:37 +0000	[diff] [blame]	281	DebugDatabase.add_optimised(self.op, sub_op)
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	282	pass_number += 1
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	283
				284	# PASS 2 - SHR
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	285	shr2_op = Operation(Op.SHR, f"{self.op.name}_shr{pass_number}")
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	286	shr2_op.attrs["rounding_mode"] = NpuRoundingMode.NATURAL
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	287	shr2_op.add_input_tensor(ifm_exp)
				288	shr2_op.add_input_tensor(
				289	create_const_tensor(
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	290	f"{shr2_op.name}_const", [1, 1, 1, 1], DataType.int32, [12], np.int32, quantization=no_scale_quant
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	291	),
				292	)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	293	shr2_op.activation = activation.clone()
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	294	rescaled_exp = Tensor(ifm.shape, ifm_exp.dtype, f"{shr2_op.name}_0")
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	295	rescaled_exp.quantization = no_scale_quant
				296	shr2_op.set_output_tensor(rescaled_exp)
Tim Hall	e6ccd87	2020-11-09 16:46:37 +0000	[diff] [blame]	297	DebugDatabase.add_optimised(self.op, shr2_op)
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	298	pass_number += 1
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	299
				300	# PASS 3 - Reduce sum
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	301	reduce_sum_op = Operation(Op.ReduceSum, f"{self.op.name}_reduce_sum3")
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	302	reduce_sum_op.attrs["padding"] = b"VALID"
				303	reduce_sum_op.attrs["stride_w"] = 1
				304	reduce_sum_op.attrs["stride_h"] = 1
				305	reduce_sum_op.attrs["filter_width"] = 1
				306	reduce_sum_op.attrs["filter_height"] = 1
				307	reduce_sum_op.attrs["strides"] = [1, reduce_sum_op.attrs["stride_h"], reduce_sum_op.attrs["stride_w"], 1]
				308	reduce_sum_op.attrs["ksize"] = [1, reduce_sum_op.attrs["filter_height"], reduce_sum_op.attrs["filter_width"], 1]
				309	reduce_sum_op.add_input_tensor(rescaled_exp)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	310	reduce_sum_op.activation = activation.clone()
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	311
				312	reduce_sum_shape = [1, rescaled_exp.shape[1], rescaled_exp.shape[2], 1]
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	313	sum_of_exp = Tensor(reduce_sum_shape, DataType.int32, f"{reduce_sum_op.name}_0")
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	314	sum_of_exp.quantization = no_scale_quant
				315	reduce_sum_op.set_output_tensor(sum_of_exp)
Tim Hall	e6ccd87	2020-11-09 16:46:37 +0000	[diff] [blame]	316	DebugDatabase.add_optimised(self.op, reduce_sum_op)
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	317	pass_number += 1
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	318
				319	# PASS 4 - CLZ
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	320	clz_op = Operation(Op.CLZ, f"{self.op.name}_clz{pass_number}")
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	321	clz_op.add_input_tensor(sum_of_exp)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	322	clz_op.activation = activation.clone()
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	323	headroom_plus_one = Tensor(sum_of_exp.shape, DataType.int32, f"{clz_op.name}_0")
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	324	headroom_plus_one.quantization = no_scale_quant
				325	clz_op.set_output_tensor(headroom_plus_one)
Tim Hall	e6ccd87	2020-11-09 16:46:37 +0000	[diff] [blame]	326	DebugDatabase.add_optimised(self.op, clz_op)
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	327	pass_number += 1
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	328
				329	# PASS 5 - Sub
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	330	sub5_op = Operation(Op.Sub, f"{self.op.name}_sub{pass_number}")
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	331	sub5_op.add_input_tensor(
				332	create_const_tensor(
Fredrik Svedberg	1575b94	2020-08-18 13:19:18 +0200	[diff] [blame]	333	"headroom_offset_const",
				334	[1, 1, 1, 1],
				335	DataType.int32,
				336	[12 + 31 - 8],
				337	np.int32,
				338	quantization=no_scale_quant,
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	339	),
				340	)
				341	sub5_op.add_input_tensor(headroom_plus_one)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	342	sub5_op.activation = activation.clone()
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	343	right_shift = Tensor(sum_of_exp.shape, DataType.int32, f"{sub5_op.name}_0")
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	344	right_shift.quantization = no_scale_quant
				345	sub5_op.set_output_tensor(right_shift)
Tim Hall	e6ccd87	2020-11-09 16:46:37 +0000	[diff] [blame]	346	DebugDatabase.add_optimised(self.op, sub5_op)
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	347	pass_number += 1
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	348
				349	# PASS 6 - Sub
Fredrik Svedberg	1575b94	2020-08-18 13:19:18 +0200	[diff] [blame]	350	one = create_const_tensor("one_const", [1, 1, 1, 1], DataType.int32, [1], np.int32, quantization=no_scale_quant)
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	351	sub6_op = Operation(Op.Sub, f"{self.op.name}_sub{pass_number}")
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	352	sub6_op.add_input_tensor(headroom_plus_one)
				353	sub6_op.add_input_tensor(one)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	354	sub6_op.activation = activation.clone()
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	355	headroom = Tensor(sum_of_exp.shape, DataType.int32, f"{sub6_op.name}_0")
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	356	headroom.quantization = no_scale_quant
				357	sub6_op.set_output_tensor(headroom)
Tim Hall	e6ccd87	2020-11-09 16:46:37 +0000	[diff] [blame]	358	DebugDatabase.add_optimised(self.op, sub6_op)
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	359	pass_number += 1
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	360
				361	# PASS 7 - SHL
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	362	shl7_op = Operation(Op.SHL, f"{self.op.name}_shl{pass_number}")
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	363	shl7_op.add_input_tensor(sum_of_exp)
				364	shl7_op.add_input_tensor(headroom)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	365	shl7_op.activation = activation.clone()
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	366	shifted_sum = Tensor(sum_of_exp.shape, DataType.int32, f"{shl7_op.name}_0")
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	367	shifted_sum.quantization = no_scale_quant
				368	shl7_op.set_output_tensor(shifted_sum)
Tim Hall	e6ccd87	2020-11-09 16:46:37 +0000	[diff] [blame]	369	DebugDatabase.add_optimised(self.op, shl7_op)
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	370	pass_number += 1
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	371
				372	# PASS 8 - Sub
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	373	sub8_op = Operation(Op.Sub, f"{self.op.name}_sub{pass_number}")
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	374	sub8_op.add_input_tensor(shifted_sum)
				375	sub8_op.add_input_tensor(
				376	create_const_tensor(
				377	"shifted_one_const", [1, 1, 1, 1], DataType.int32, [1 << 30], np.int32, quantization=no_scale_quant
				378	),
				379	)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	380	sub8_op.activation = activation.clone()
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	381	shifted_sum_minus_one = Tensor(sum_of_exp.shape, DataType.int32, f"{sub8_op.name}_0")
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	382	shifted_sum_minus_one.quantization = no_scale_quant
				383	sub8_op.set_output_tensor(shifted_sum_minus_one)
Tim Hall	e6ccd87	2020-11-09 16:46:37 +0000	[diff] [blame]	384	DebugDatabase.add_optimised(self.op, sub8_op)
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	385	pass_number += 1
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	386
				387	# PASS 9 - SHL
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	388	shl9_op = Operation(Op.SHL, f"{self.op.name}_shl{pass_number}")
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	389	shl9_op.add_input_tensor(shifted_sum_minus_one)
				390	shl9_op.add_input_tensor(one)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	391	shl9_op.activation = activation.clone()
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	392	shifted_sum_minus_one = Tensor(sum_of_exp.shape, DataType.int32, f"{shl9_op.name}_0")
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	393	shifted_sum_minus_one.quantization = no_scale_quant
				394	shl9_op.set_output_tensor(shifted_sum_minus_one)
Tim Hall	e6ccd87	2020-11-09 16:46:37 +0000	[diff] [blame]	395	DebugDatabase.add_optimised(self.op, shl9_op)
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	396	pass_number += 1
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	397
				398	# PASS 10 - Add
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	399	add10_op = Operation(Op.Add, f"{self.op.name}_add{pass_number}")
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	400	add10_op.add_input_tensor(
				401	create_const_tensor(
				402	"F0_one_const", [1, 1, 1, 1], DataType.int32, [(1 << 31) - 1], np.int32, quantization=no_scale_quant
				403	),
				404	)
				405	add10_op.add_input_tensor(shifted_sum_minus_one)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	406	add10_op.activation = activation.clone()
				407	add10_op.attrs["rescale"] = (1, 1)
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	408	half_denominator = Tensor(sum_of_exp.shape, DataType.int32, f"{add10_op.name}_0")
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	409	half_denominator.quantization = one_scale_quant
				410	add10_op.set_output_tensor(half_denominator)
Tim Hall	e6ccd87	2020-11-09 16:46:37 +0000	[diff] [blame]	411	DebugDatabase.add_optimised(self.op, add10_op)
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	412	pass_number += 1
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	413
				414	# PASS 11 - Multiply
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	415	mul11_op = Operation(Op.Mul, f"{self.op.name}_mul{pass_number}")
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	416	mul11_op.add_input_tensor(half_denominator)
				417	mul11_op.add_input_tensor(
				418	create_const_tensor(
Fredrik Svedberg	1575b94	2020-08-18 13:19:18 +0200	[diff] [blame]	419	"neg_32_over_17_const",
				420	[1, 1, 1, 1],
				421	DataType.int32,
				422	[-1010580540],
				423	np.int32,
				424	quantization=one_scale_quant,
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	425	),
				426	)
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	427	rescaled = Tensor(sum_of_exp.shape, DataType.int32, f"{mul11_op.name}_0")
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	428	rescaled.quantization = one_scale_quant.clone()
				429	rescaled.quantization.scale_f32 = 2.0
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	430	mul11_op.activation = activation2.clone()
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	431	mul11_op.set_output_tensor(rescaled)
Tim Hall	e6ccd87	2020-11-09 16:46:37 +0000	[diff] [blame]	432	DebugDatabase.add_optimised(self.op, mul11_op)
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	433	pass_number += 1
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	434
				435	# PASS 12 - Add
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	436	add12_op = Operation(Op.Add, f"{self.op.name}_add{pass_number}")
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	437	add12_op.add_input_tensor(rescaled)
				438	add12_op.add_input_tensor(
				439	create_const_tensor(
				440	"48_over_17_const", [1, 1, 1, 1], DataType.int32, [1515870810], np.int32, quantization=no_scale_quant
				441	),
				442	)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	443	add12_op.activation = activation.clone()
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	444	rescale_w_offset = Tensor(sum_of_exp.shape, DataType.int32, f"{add12_op.name}_0")
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	445	rescale_w_offset.quantization = one_scale_quant
				446	add12_op.set_output_tensor(rescale_w_offset)
Tim Hall	e6ccd87	2020-11-09 16:46:37 +0000	[diff] [blame]	447	DebugDatabase.add_optimised(self.op, add12_op)
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	448	pass_number += 1
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	449
				450	nr_x = rescale_w_offset
				451	F2_one = create_const_tensor(
				452	"F2_one_const", [1, 1, 1, 1], DataType.int32, [(1 << 29)], np.int32, quantization=no_scale_quant
				453	)
Fredrik Svedberg	880e735	2020-08-25 11:31:47 +0200	[diff] [blame]	454	four = create_const_tensor(
				455	"four_const", [1, 1, 1, 1], DataType.int32, [4], np.int32, quantization=no_scale_quant
				456	)
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	457	for _ in range(3):
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	458	# PASS 13, 18, 23 - MUL
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	459	mul_op = Operation(Op.Mul, f"{self.op.name}_mul{pass_number}")
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	460	mul_op.add_input_tensor(nr_x)
				461	mul_op.add_input_tensor(half_denominator)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	462	mul_op.activation = activation2.clone()
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	463	half_denominator_times_x = Tensor(sum_of_exp.shape, DataType.int32, f"{mul_op.name}_0")
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	464	half_denominator_times_x.quantization = one_scale_quant.clone()
				465	half_denominator_times_x.quantization.scale_f32 = 2.0
				466	mul_op.set_output_tensor(half_denominator_times_x)
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	467	pass_number += 1
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	468	# PASS 14, 19, 24 - SUB
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	469	sub_op = Operation(Op.Sub, f"{self.op.name}_sub{pass_number}")
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	470	sub_op.add_input_tensor(F2_one)
				471	sub_op.add_input_tensor(half_denominator_times_x)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	472	sub_op.activation = activation.clone()
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	473	one_minus_half_denominator_times_x = Tensor(sum_of_exp.shape, DataType.int32, f"{sub_op.name}_0")
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	474	one_minus_half_denominator_times_x.quantization = one_scale_quant
				475	sub_op.set_output_tensor(one_minus_half_denominator_times_x)
Tim Hall	e6ccd87	2020-11-09 16:46:37 +0000	[diff] [blame]	476	DebugDatabase.add_optimised(self.op, sub_op)
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	477	pass_number += 1
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	478	# PASS 15, 20, 25 - MUL
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	479	mul_op = Operation(Op.Mul, f"{self.op.name}_mul{pass_number}")
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	480	mul_op.add_input_tensor(nr_x)
				481	mul_op.add_input_tensor(one_minus_half_denominator_times_x)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	482	mul_op.activation = activation2.clone()
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	483	to_rescale = Tensor(sum_of_exp.shape, DataType.int32, f"{mul_op.name}_0")
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	484	to_rescale.quantization = one_scale_quant.clone()
				485	to_rescale.quantization.scale_f32 = 2.0
				486	mul_op.set_output_tensor(to_rescale)
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	487	pass_number += 1
Fredrik Svedberg	880e735	2020-08-25 11:31:47 +0200	[diff] [blame]	488	# PASS 16, 21, 26 - MUL
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	489	shl_op = Operation(Op.Mul, f"{self.op.name}_mul{pass_number}")
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	490	shl_op.add_input_tensor(to_rescale)
Fredrik Svedberg	880e735	2020-08-25 11:31:47 +0200	[diff] [blame]	491	shl_op.add_input_tensor(four)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	492	shl_op.activation = activation.clone()
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	493	to_add = Tensor(sum_of_exp.shape, DataType.int32, f"{shl_op.name}_0")
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	494	to_add.quantization = no_scale_quant
				495	shl_op.set_output_tensor(to_add)
Tim Hall	e6ccd87	2020-11-09 16:46:37 +0000	[diff] [blame]	496	DebugDatabase.add_optimised(self.op, shl_op)
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	497	pass_number += 1
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	498	# PASS 17, 22, 27 - ADD
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	499	add_op = Operation(Op.Add, f"{self.op.name}_add{pass_number}")
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	500	add_op.add_input_tensor(nr_x)
				501	add_op.add_input_tensor(to_add)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	502	add_op.activation = activation.clone()
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	503	nr_x = Tensor(sum_of_exp.shape, DataType.int32, f"{add_op.name}_0")
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	504	nr_x.quantization = one_scale_quant
				505	add_op.set_output_tensor(nr_x)
Tim Hall	e6ccd87	2020-11-09 16:46:37 +0000	[diff] [blame]	506	DebugDatabase.add_optimised(self.op, add_op)
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	507	pass_number += 1
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	508
Fredrik Svedberg	880e735	2020-08-25 11:31:47 +0200	[diff] [blame]	509	# PASS 28 - Multiply
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	510	mul28_op = Operation(Op.Mul, f"{self.op.name}_mul{pass_number}")
Fredrik Svedberg	880e735	2020-08-25 11:31:47 +0200	[diff] [blame]	511	mul28_op.add_input_tensor(nr_x)
				512	mul28_op.add_input_tensor(
				513	create_const_tensor("two_const", [1, 1, 1, 1], DataType.int32, [2], np.int32, quantization=no_scale_quant)
				514	)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	515	mul28_op.activation = activation.clone()
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	516	scale_factor = Tensor(sum_of_exp.shape, DataType.int32, f"{mul28_op.name}_0")
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	517	scale_factor.quantization = one_scale_quant
Fredrik Svedberg	880e735	2020-08-25 11:31:47 +0200	[diff] [blame]	518	mul28_op.set_output_tensor(scale_factor)
Tim Hall	e6ccd87	2020-11-09 16:46:37 +0000	[diff] [blame]	519	DebugDatabase.add_optimised(self.op, mul28_op)
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	520	pass_number += 1
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	521
				522	# PASS 29 - Multiply
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	523	mul_op = Operation(Op.Mul, f"{self.op.name}_mul{pass_number}")
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	524	mul_op.add_input_tensor(ifm_exp)
				525	mul_op.add_input_tensor(scale_factor)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	526	mul_op.activation = activation2.clone()
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	527	scaled_exp = Tensor(ifm_exp.shape, DataType.int32, f"{mul_op.name}_0")
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	528	scaled_exp.quantization = one_scale_quant.clone()
				529	scaled_exp.quantization.scale_f32 = 2.0
				530	mul_op.set_output_tensor(scaled_exp)
Tim Hall	e6ccd87	2020-11-09 16:46:37 +0000	[diff] [blame]	531	DebugDatabase.add_optimised(self.op, mul_op)
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	532	pass_number += 1
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	533
				534	# PASS 30 - SHR
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	535	shr30_op = Operation(Op.SHR, f"{self.op.name}_shr{pass_number}")
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	536	shr30_op.attrs["rounding_mode"] = NpuRoundingMode.NATURAL
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	537	shr30_op.add_input_tensor(scaled_exp)
				538	shr30_op.add_input_tensor(right_shift)
				539	shr30_op.set_output_tensor(ofm)
Tim Hall	e6ccd87	2020-11-09 16:46:37 +0000	[diff] [blame]	540	DebugDatabase.add_optimised(self.op, shr30_op)
Fredrik Svedberg	32c7f5b	2020-12-02 09:24:29 +0100	[diff] [blame^]	541	pass_number += 1
Fredrik Svedberg	597fd3f	2020-08-13 10:02:53 +0200	[diff] [blame]	542
				543	return shr30_op
				544
def get_graph_int16(self, ifm, ofm):
    """Build the chain of operations that replaces ``self.op`` for the int16 softmax path.

    The softmax is decomposed into 14 passes (0-13). Each pass creates one
    Operation, wires its inputs/outputs and registers it with the
    DebugDatabase as an optimised replacement of ``self.op``.

    Args:
        ifm: Input feature map tensor. Indexed as a 4D shape
            (``shape[1]``, ``shape[2]`` and ``shape[3]`` are read); the
            reduction runs over ``shape[3]``.
        ofm: Output feature map tensor; becomes the output of the last pass.

    Returns:
        The final SHR Operation, whose output tensor is ``ofm``.
    """
    # Quantization used for all intermediate tensors that must not be rescaled.
    no_scale_quant = ifm.quantization.clone()
    no_scale_quant.scale_f32 = None
    pass_number = 0

    # PASS 0 - Depthwise Maxpool
    # The IFM is viewed as [1, H*W, C, 1] and pooled with a 1 x C kernel, giving one maximum per
    # (h, w) position.
    maxpool_op = self.op.clone(f"_maxpool{pass_number}")
    maxpool_op.type = Op.MaxPool
    maxpool_h = ifm.shape[1] * ifm.shape[2]
    maxpool_w = ifm.shape[3]
    maxpool_ifm_shape = [1, maxpool_h, maxpool_w, 1]
    maxpool_op.attrs["padding"] = b"VALID"
    maxpool_op.attrs["stride_w"] = 1
    maxpool_op.attrs["stride_h"] = 1
    maxpool_op.attrs["filter_width"] = maxpool_w
    maxpool_op.attrs["filter_height"] = 1
    maxpool_op.attrs["strides"] = [1, maxpool_op.attrs["stride_h"], maxpool_op.attrs["stride_w"], 1]
    maxpool_op.attrs["ksize"] = [1, maxpool_op.attrs["filter_height"], maxpool_op.attrs["filter_width"], 1]
    maxpool_op.inputs = [create_reshape_tensor(ifm, maxpool_ifm_shape)]
    ifm_max = Tensor([1, maxpool_h, 1, 1], ifm.dtype, f"{maxpool_op.name}_0")
    ifm_max.quantization = no_scale_quant
    maxpool_op.set_output_tensor(ifm_max)
    # Registered once, after the op is fully configured, like every other pass below.
    DebugDatabase.add_optimised(self.op, maxpool_op)
    pass_number += 1

    # PASS 1 - Sub
    # Inputs: the IFM and the per-position maximum reshaped back to [1, H, W, 1].
    sub1_op = Operation(Op.Sub, f"{self.op.name}_sub{pass_number}")
    sub1_op.add_input_tensor(ifm)
    sub1_op.add_input_tensor(create_reshape_tensor(ifm_max, [1, ifm.shape[1], ifm.shape[2], 1]))
    sub1_ofm = Tensor(ifm.shape, DataType.int32, f"{sub1_op.name}_0")
    sub1_ofm.quantization = ifm.quantization.clone()
    sub1_op.set_output_tensor(sub1_ofm)
    DebugDatabase.add_optimised(self.op, sub1_op)
    pass_number += 1

    # PASS 2 - Mul
    # Multiplies the difference by a constant derived from the input scale, beta and the fixed
    # output scale of 10.0 / 65535.0.
    beta = self.op.attrs.get("beta", 1.0)
    mul2_out_range = 10.0 / 65535.0
    mul2_scale, _ = scaling.elementwise_mul_scale(sub1_ofm.quantization.scale_f32, beta, mul2_out_range)
    mul2_quant = ifm.quantization.clone()
    mul2_quant.scale_f32 = beta
    mul2_op = Operation(Op.Mul, f"{self.op.name}_mul{pass_number}")
    mul2_op.add_input_tensor(sub1_ofm)
    mul2_op.add_input_tensor(
        create_const_tensor(
            f"{mul2_op.name}_const", [1, 1, 1, 1], DataType.int32, [mul2_scale], np.int32, quantization=mul2_quant
        ),
    )
    # NOTE(review): unlike every other output tensor in this method, this name has no "_0" suffix
    # and therefore equals the op name. Kept as-is to avoid changing emitted tensor names.
    mul2_ofm = Tensor(ifm.shape, DataType.int32, f"{self.op.name}_mul{pass_number}")
    mul2_ofm.quantization = ofm.quantization.clone()
    mul2_ofm.quantization.scale_f32 = mul2_out_range
    mul2_op.set_output_tensor(mul2_ofm)
    DebugDatabase.add_optimised(self.op, mul2_op)
    pass_number += 1

    # PASS 3 - Add+LUT(exp)
    # Adds the constant 32767 and applies the 512-entry EXP_LUT as activation; the result is int16.
    add_op = Operation(Op.Add, f"{self.op.name}_add{pass_number}")
    add_op.add_input_tensor(mul2_ofm)
    add_op.add_input_tensor(
        create_const_tensor(
            f"{add_op.name}_const", [1, 1, 1, 1], DataType.int32, [32767], np.int32, quantization=no_scale_quant
        ),
    )
    add_op.set_activation_lut(
        create_const_tensor(
            f"{add_op.name}_lut", [1, 1, 1, 512], DataType.int32, self.EXP_LUT, np.int32, TensorPurpose.LUT
        )
    )
    exp_ofm = Tensor(mul2_ofm.shape, DataType.int16, f"{add_op.name}_0")
    exp_ofm.quantization = mul2_ofm.quantization.clone()
    add_op.set_output_tensor(exp_ofm)
    DebugDatabase.add_optimised(self.op, add_op)
    pass_number += 1

    # PASS 4 - Reduce sum
    # The op name is derived from pass_number like all other passes (pass_number is 4 here).
    reduce_sum_op = Operation(Op.ReduceSum, f"{self.op.name}_reduce_sum{pass_number}")
    reduce_sum_op.attrs["padding"] = b"VALID"
    reduce_sum_op.attrs["stride_w"] = 1
    reduce_sum_op.attrs["stride_h"] = 1
    reduce_sum_op.attrs["filter_width"] = 1
    reduce_sum_op.attrs["filter_height"] = 1
    reduce_sum_op.attrs["strides"] = [1, reduce_sum_op.attrs["stride_h"], reduce_sum_op.attrs["stride_w"], 1]
    reduce_sum_op.attrs["ksize"] = [1, reduce_sum_op.attrs["filter_height"], reduce_sum_op.attrs["filter_width"], 1]
    reduce_sum_op.add_input_tensor(exp_ofm)

    reduce_sum_shape = [1, exp_ofm.shape[1], exp_ofm.shape[2], 1]
    sum_of_exp = Tensor(reduce_sum_shape, DataType.int32, f"{reduce_sum_op.name}_0")
    sum_of_exp.quantization = no_scale_quant
    reduce_sum_op.set_output_tensor(sum_of_exp)
    DebugDatabase.add_optimised(self.op, reduce_sum_op)
    pass_number += 1

    # PASS 5 - CLZ
    # Count leading zeros of the sum of exponentials.
    clz_op = Operation(Op.CLZ, f"{self.op.name}_clz{pass_number}")
    clz_op.add_input_tensor(sum_of_exp)
    headroom_plus_one = Tensor(sum_of_exp.shape, DataType.int32, f"{clz_op.name}_0")
    headroom_plus_one.quantization = no_scale_quant
    clz_op.set_output_tensor(headroom_plus_one)
    DebugDatabase.add_optimised(self.op, clz_op)
    pass_number += 1

    # PASS 6 - Sub
    # Inputs: constant 31 and the CLZ result; the output is reused as the shift amount of PASS 13.
    sub6_op = Operation(Op.Sub, f"{self.op.name}_sub{pass_number}")
    sub6_op.add_input_tensor(
        create_const_tensor(
            f"{sub6_op.name}_const", [1, 1, 1, 1], DataType.int32, [31], np.int32, quantization=no_scale_quant
        ),
    )
    sub6_op.add_input_tensor(headroom_plus_one)
    reciprocal_right_shift = Tensor(sum_of_exp.shape, DataType.int32, f"{sub6_op.name}_0")
    reciprocal_right_shift.quantization = no_scale_quant
    sub6_op.set_output_tensor(reciprocal_right_shift)
    DebugDatabase.add_optimised(self.op, sub6_op)
    pass_number += 1

    # PASS 7 - SHL
    # Inputs: constant 1 and the shift amount from PASS 6.
    shl7_op = Operation(Op.SHL, f"{self.op.name}_shl{pass_number}")
    shl7_op.add_input_tensor(
        create_const_tensor(
            f"{shl7_op.name}_const", [1, 1, 1, 1], DataType.int32, [1], np.int32, quantization=no_scale_quant
        ),
    )
    shl7_op.add_input_tensor(reciprocal_right_shift)
    constant_one = Tensor(sum_of_exp.shape, DataType.int32, f"{shl7_op.name}_0")
    constant_one.quantization = no_scale_quant
    shl7_op.set_output_tensor(constant_one)
    DebugDatabase.add_optimised(self.op, shl7_op)
    pass_number += 1

    # PASS 8 - Sub
    # Inputs: the sum of exponentials and the shifted one from PASS 7.
    sub8_op = Operation(Op.Sub, f"{self.op.name}_sub{pass_number}")
    sub8_op.add_input_tensor(sum_of_exp)
    sub8_op.add_input_tensor(constant_one)
    sum_of_exps_minus_one = Tensor(sum_of_exp.shape, DataType.int32, f"{sub8_op.name}_0")
    sum_of_exps_minus_one.quantization = no_scale_quant
    sub8_op.set_output_tensor(sum_of_exps_minus_one)
    DebugDatabase.add_optimised(self.op, sub8_op)
    pass_number += 1

    # PASS 9 - SHL
    # Inputs: the PASS 8 result and the CLZ result from PASS 5.
    shl9_op = Operation(Op.SHL, f"{self.op.name}_shl{pass_number}")
    shl9_op.add_input_tensor(sum_of_exps_minus_one)
    shl9_op.add_input_tensor(headroom_plus_one)
    shifted_sum_minus_one = Tensor(sum_of_exp.shape, DataType.int32, f"{shl9_op.name}_0")
    shifted_sum_minus_one.quantization = no_scale_quant
    shl9_op.set_output_tensor(shifted_sum_minus_one)
    DebugDatabase.add_optimised(self.op, shl9_op)
    pass_number += 1

    # PASS 10 - SHR
    # Inputs: the PASS 9 result and the constant 15.
    shr10_op = Operation(Op.SHR, f"{self.op.name}_shr{pass_number}")
    shr10_op.add_input_tensor(shifted_sum_minus_one)
    shr10_op.add_input_tensor(
        create_const_tensor(
            f"{shr10_op.name}_const", [1, 1, 1, 1], DataType.int32, [15], np.int32, quantization=no_scale_quant
        ),
    )
    shifted_sum_minus_one_16 = Tensor(sum_of_exp.shape, DataType.int32, f"{shr10_op.name}_0")
    shifted_sum_minus_one_16.quantization = shifted_sum_minus_one.quantization.clone()
    shr10_op.set_output_tensor(shifted_sum_minus_one_16)
    DebugDatabase.add_optimised(self.op, shr10_op)
    pass_number += 1

    # PASS 11 - Sub+LUT(one over one plus x)
    # Inputs: the PASS 10 result and the constant 32768; the 512-entry ONE_OVER_ONE_PLUS_X_LUT is
    # applied as activation and the result is int16.
    sub11_op = Operation(Op.Sub, f"{self.op.name}_sub{pass_number}")
    sub11_op.add_input_tensor(shifted_sum_minus_one_16)
    sub11_op.add_input_tensor(
        create_const_tensor(
            f"{sub11_op.name}_const", [1, 1, 1, 1], DataType.int32, [32768], np.int32, quantization=no_scale_quant
        ),
    )
    sub11_op.set_activation_lut(
        create_const_tensor(
            f"{sub11_op.name}_lut",
            [1, 1, 1, 512],
            DataType.int32,
            self.ONE_OVER_ONE_PLUS_X_LUT,
            np.int32,
            TensorPurpose.LUT,
        )
    )
    reciprocal_scale = Tensor(sum_of_exp.shape, DataType.int16, f"{sub11_op.name}_0")
    reciprocal_scale.quantization = no_scale_quant
    sub11_op.set_output_tensor(reciprocal_scale)
    DebugDatabase.add_optimised(self.op, sub11_op)
    pass_number += 1

    # PASS 12 - Multiply
    # Inputs: the exponentials from PASS 3 and the reciprocal scale from PASS 11.
    mul_op = Operation(Op.Mul, f"{self.op.name}_mul{pass_number}")
    mul_op.add_input_tensor(exp_ofm)
    mul_op.add_input_tensor(reciprocal_scale)
    mul_ofm = Tensor(exp_ofm.shape, DataType.int32, f"{mul_op.name}_0")
    mul_ofm.quantization = no_scale_quant
    mul_op.set_output_tensor(mul_ofm)
    DebugDatabase.add_optimised(self.op, mul_op)
    pass_number += 1

    # PASS 13 - SHR
    # Inputs: the PASS 12 product and the shift amount from PASS 6; writes the final OFM.
    shr13_op = Operation(Op.SHR, f"{self.op.name}_shr{pass_number}")
    shr13_op.add_input_tensor(mul_ofm)
    shr13_op.add_input_tensor(reciprocal_right_shift)
    shr13_op.set_output_tensor(ofm)
    DebugDatabase.add_optimised(self.op, shr13_op)

    return shr13_op