# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
#
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# Contains SoftMax
Fredrik Svedberg1575b942020-08-18 13:19:18 +020021import math
22
Fredrik Svedberga0c36242020-06-03 15:43:31 +020023import numpy as np
24
Fredrik Svedberg1575b942020-08-18 13:19:18 +020025from . import fp_math
Fredrik Svedberga0c36242020-06-03 15:43:31 +020026from . import scaling
27from .data_type import DataType
Tim Halle6ccd872020-11-09 16:46:37 +000028from .debug_database import DebugDatabase
Louis Verhaarde8a5a782020-11-02 18:04:27 +010029from .operation import ActivationFunction
Fredrik Svedberg4a434cb2022-09-27 14:13:01 +020030from .operation import ExplicitScaling
Louis Verhaardaee5d752020-09-30 09:01:52 +020031from .operation import Op
Fredrik Svedberga0c36242020-06-03 15:43:31 +020032from .operation import Operation
Tim Hall5ff4cd12023-05-16 22:39:14 +010033from .operation import RoundingMode
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +010034from .operation_util import create_add
35from .operation_util import create_clz
36from .operation_util import create_depthwise_maxpool
37from .operation_util import create_mul
38from .operation_util import create_reduce_sum
39from .operation_util import create_shl
40from .operation_util import create_shr
41from .operation_util import create_sub
Patrik Gustavsson3a269202021-01-21 08:28:55 +010042from .shape4d import Shape4D
Michael McGeagh5778ffd2020-08-06 17:31:02 +010043from .tensor import create_const_tensor
Fredrik Svedberga0c36242020-06-03 15:43:31 +020044from .tensor import TensorPurpose
45
46
Fredrik Svedberga0c36242020-06-03 15:43:31 +020047class SoftMax:
48 # Turn off black formatting for the LUT tables to keep them compact
49 # fmt: off
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +020050
Fredrik Svedberga0c36242020-06-03 15:43:31 +020051 EXP_LUT = [
52 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002,
53 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002,
54 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002,
55 0x00000002, 0x00000002, 0x00010002, 0x00000003, 0x00000003, 0x00000003, 0x00000003, 0x00000003,
56 0x00000003, 0x00000003, 0x00000003, 0x00000003, 0x00000003, 0x00000003, 0x00000003, 0x00000003,
57 0x00000003, 0x00000003, 0x00000003, 0x00010003, 0x00000004, 0x00000004, 0x00000004, 0x00000004,
58 0x00000004, 0x00000004, 0x00000004, 0x00000004, 0x00000004, 0x00000004, 0x00000004, 0x00000004,
59 0x00010004, 0x00000005, 0x00000005, 0x00000005, 0x00000005, 0x00000005, 0x00000005, 0x00000005,
60 0x00000005, 0x00000005, 0x00010005, 0x00000006, 0x00000006, 0x00000006, 0x00000006, 0x00000006,
61 0x00000006, 0x00000006, 0x00010006, 0x00000007, 0x00000007, 0x00000007, 0x00000007, 0x00000007,
62 0x00000007, 0x00000007, 0x00010007, 0x00000008, 0x00000008, 0x00000008, 0x00000008, 0x00000008,
63 0x00010008, 0x00000009, 0x00000009, 0x00000009, 0x00000009, 0x00000009, 0x00010009, 0x0000000a,
64 0x0000000a, 0x0000000a, 0x0000000a, 0x0001000a, 0x0000000b, 0x0000000b, 0x0000000b, 0x0000000b,
65 0x0001000b, 0x0000000c, 0x0000000c, 0x0000000c, 0x0001000c, 0x0000000d, 0x0000000d, 0x0000000d,
66 0x0001000d, 0x0000000e, 0x0000000e, 0x0000000e, 0x0001000e, 0x0000000f, 0x0000000f, 0x0001000f,
67 0x00000010, 0x00000010, 0x00010010, 0x00000011, 0x00000011, 0x00010011, 0x00000012, 0x00000012,
68 0x00010012, 0x00000013, 0x00000013, 0x00010013, 0x00000014, 0x00010014, 0x00000015, 0x00000015,
69 0x00010015, 0x00000016, 0x00010016, 0x00000017, 0x00010017, 0x00000018, 0x00010018, 0x00000019,
70 0x00010019, 0x0000001a, 0x0001001a, 0x0000001b, 0x0001001b, 0x0000001c, 0x0001001c, 0x0000001d,
71 0x0001001d, 0x0000001e, 0x0001001e, 0x0001001f, 0x00000020, 0x00010020, 0x00010021, 0x00000022,
72 0x00010022, 0x00010023, 0x00000024, 0x00010024, 0x00000025, 0x00010025, 0x00010026, 0x00010027,
73 0x00000028, 0x00020028, 0x0000002a, 0x0001002a, 0x0001002b, 0x0001002c, 0x0000002d, 0x0001002d,
74 0x0001002e, 0x0001002f, 0x00010030, 0x00010031, 0x00010032, 0x00010033, 0x00010034, 0x00010035,
75 0x00010036, 0x00010037, 0x00010038, 0x00020039, 0x0001003b, 0x0000003c, 0x0002003c, 0x0001003e,
76 0x0002003f, 0x00000041, 0x00020041, 0x00010043, 0x00010044, 0x00020045, 0x00020047, 0x00010049,
77 0x0001004a, 0x0002004b, 0x0001004d, 0x0002004e, 0x00010050, 0x00020051, 0x00020053, 0x00010055,
78 0x00020056, 0x00020058, 0x0002005a, 0x0001005c, 0x0002005d, 0x0002005f, 0x00020061, 0x00020063,
79 0x00020065, 0x00020067, 0x00020069, 0x0002006b, 0x0003006d, 0x00020070, 0x00020072, 0x00020074,
80 0x00030076, 0x00020079, 0x0003007b, 0x0002007e, 0x00030080, 0x00020083, 0x00020085, 0x00040087,
81 0x0002008b, 0x0003008d, 0x00030090, 0x00020093, 0x00030095, 0x00030098, 0x0003009b, 0x0004009e,
82 0x000300a2, 0x000300a5, 0x000300a8, 0x000300ab, 0x000400ae, 0x000300b2, 0x000400b5, 0x000400b9,
83 0x000300bd, 0x000400c0, 0x000400c4, 0x000400c8, 0x000400cc, 0x000400d0, 0x000500d4, 0x000400d9,
84 0x000400dd, 0x000500e1, 0x000400e6, 0x000500ea, 0x000400ef, 0x000500f3, 0x000500f8, 0x000500fd,
85 0x00050102, 0x00050107, 0x0005010c, 0x00060111, 0x00050117, 0x0006011c, 0x00060122, 0x00060128,
86 0x0006012e, 0x00060134, 0x0006013a, 0x00070140, 0x00060147, 0x0007014d, 0x00060154, 0x0007015a,
87 0x00070161, 0x00060168, 0x0008016e, 0x00070176, 0x0008017d, 0x00080185, 0x0007018d, 0x00090194,
88 0x0008019d, 0x000801a5, 0x000801ad, 0x000901b5, 0x000901be, 0x000901c7, 0x000901d0, 0x000901d9,
89 0x000a01e2, 0x000901ec, 0x000a01f5, 0x000b01ff, 0x000a020a, 0x000b0214, 0x000a021f, 0x000b0229,
90 0x000b0234, 0x000b023f, 0x000c024a, 0x000c0256, 0x000c0262, 0x000c026e, 0x000c027a, 0x000d0286,
91 0x000d0293, 0x000d02a0, 0x000e02ad, 0x000e02bb, 0x000e02c9, 0x000e02d7, 0x000f02e5, 0x000f02f4,
92 0x000f0303, 0x000f0312, 0x00100321, 0x00100331, 0x00110341, 0x00100352, 0x00120362, 0x00110374,
93 0x00120385, 0x00120397, 0x001203a9, 0x001303bb, 0x001303ce, 0x001403e1, 0x001403f5, 0x00140409,
94 0x0015041d, 0x00150432, 0x00160447, 0x0016045d, 0x00160473, 0x00170489, 0x001704a0, 0x001904b7,
95 0x001804d0, 0x001904e8, 0x00190501, 0x001a051a, 0x001a0534, 0x001b054e, 0x001b0569, 0x001c0584,
96 0x001c05a0, 0x001d05bc, 0x001e05d9, 0x001e05f7, 0x001e0615, 0x00200633, 0x00200653, 0x00200673,
97 0x00210693, 0x002206b4, 0x002306d6, 0x002306f9, 0x0024071c, 0x00240740, 0x00260764, 0x0026078a,
98 0x002607b0, 0x002807d6, 0x002907fe, 0x00290827, 0x002a0850, 0x002a087a, 0x002c08a4, 0x002c08d0,
99 0x002e08fc, 0x002e092a, 0x002f0958, 0x00310987, 0x003109b8, 0x003209e9, 0x00330a1b, 0x00340a4e,
100 0x00350a82, 0x00350ab7, 0x00380aec, 0x00380b24, 0x003a0b5c, 0x003a0b96, 0x003c0bd0, 0x003d0c0c,
101 0x003e0c49, 0x003f0c87, 0x00400cc6, 0x00420d06, 0x00430d48, 0x00440d8b, 0x00460dcf, 0x00480e15,
102 0x00480e5d, 0x00490ea5, 0x004c0eee, 0x004d0f3a, 0x004e0f87, 0x00500fd5, 0x00511025, 0x00531076,
103 0x005610c9, 0x0056111f, 0x00581175, 0x005a11cd, 0x005c1227, 0x005e1283, 0x005e12e1, 0x0061133f,
104 0x006413a0, 0x00651404, 0x00671469, 0x006914d0, 0x006c1539, 0x006c15a5, 0x00701611, 0x00721681,
105 0x007416f3, 0x00761767, 0x007917dd, 0x007a1856, 0x007d18d0, 0x0080194d, 0x008319cd, 0x00841a50,
106 0x00881ad4, 0x00891b5c, 0x008d1be5, 0x00911c72, 0x00911d03, 0x00961d94, 0x00981e2a, 0x009c1ec2,
107 0x009e1f5e, 0x00a21ffc, 0x00a4209e, 0x00a92142, 0x00ab21eb, 0x00ae2296, 0x00b22344, 0x00b523f6,
108 0x00b924ab, 0x00be2564, 0x00c02622, 0x00c526e2, 0x00c827a7, 0x00cc286f, 0x00d0293b, 0x00d52a0b,
109 0x00d72ae0, 0x00dd2bb7, 0x00e12c94, 0x00e62d75, 0x00eb2e5b, 0x00ef2f46, 0x00f23035, 0x00f83127,
110 0x00fe321f, 0x0101331d, 0x0108341e, 0x010c3526, 0x01123632, 0x01173744, 0x011c385b, 0x01233977,
111 0x01273a9a, 0x012e3bc1, 0x01343cef, 0x013a3e23, 0x01403f5d, 0x0146409d, 0x014c41e3, 0x0154432f,
112 0x01594483, 0x016145dc, 0x0168473d, 0x016f48a5, 0x01764a14, 0x017d4b8a, 0x01854d07, 0x018d4e8c,
113 0x01945019, 0x019d51ad, 0x01a4534a, 0x01ad54ee, 0x01b5569b, 0x01be5850, 0x01c75a0e, 0x01d05bd5,
114 0x01d85da5, 0x01e35f7d, 0x01eb6160, 0x01f6634b, 0x01ff6541, 0x02096740, 0x02146949, 0x021e6b5d,
115 0x02296d7b, 0x02336fa4, 0x023f71d7, 0x024a7416, 0x02567660, 0x026278b6, 0x026d7b18, 0x027a7d85,
116 ]
117
118 ONE_OVER_ONE_PLUS_X_LUT = [
119 0xffc17fff, 0xffc07fc0, 0xffc27f80, 0xffc07f42, 0xffc17f02, 0xffc17ec3, 0xffc27e84, 0xffc27e46,
120 0xffc27e08, 0xffc37dca, 0xffc27d8d, 0xffc37d4f, 0xffc37d12, 0xffc37cd5, 0xffc37c98, 0xffc47c5b,
121 0xffc47c1f, 0xffc47be3, 0xffc57ba7, 0xffc57b6c, 0xffc37b31, 0xffc67af4, 0xffc57aba, 0xffc67a7f,
122 0xffc57a45, 0xffc67a0a, 0xffc779d0, 0xffc67997, 0xffc6795d, 0xffc77923, 0xffc778ea, 0xffc778b1,
123 0xffc87878, 0xffc77840, 0xffc87807, 0xffc877cf, 0xffc97797, 0xffc87760, 0xffc97728, 0xffc976f1,
124 0xffc976ba, 0xffc87683, 0xffca764b, 0xffca7615, 0xffca75df, 0xffca75a9, 0xffca7573, 0xffcb753d,
125 0xffca7508, 0xffcb74d2, 0xffcb749d, 0xffca7468, 0xffcc7432, 0xffcc73fe, 0xffcb73ca, 0xffcc7395,
126 0xffcd7361, 0xffcc732e, 0xffcc72fa, 0xffcd72c6, 0xffcd7293, 0xffcd7260, 0xffcc722d, 0xffce71f9,
127 0xffcd71c7, 0xffce7194, 0xffce7162, 0xffce7130, 0xffcf70fe, 0xffce70cd, 0xffce709b, 0xffcf7069,
128 0xffcf7038, 0xffcf7007, 0xffcf6fd6, 0xffcf6fa5, 0xffd06f74, 0xffd06f44, 0xffd06f14, 0xffd06ee4,
129 0xffd06eb4, 0xffd06e84, 0xffd16e54, 0xffd16e25, 0xffd16df6, 0xffd16dc7, 0xffd06d98, 0xffd26d68,
130 0xffd16d3a, 0xffd26d0b, 0xffd26cdd, 0xffd26caf, 0xffd26c81, 0xffd26c53, 0xffd36c25, 0xffd26bf8,
131 0xffd36bca, 0xffd36b9d, 0xffd36b70, 0xffd26b43, 0xffd46b15, 0xffd36ae9, 0xffd46abc, 0xffd46a90,
132 0xffd46a64, 0xffd46a38, 0xffd46a0c, 0xffd469e0, 0xffd469b4, 0xffd56988, 0xffd5695d, 0xffd56932,
133 0xffd56907, 0xffd568dc, 0xffd568b1, 0xffd56886, 0xffd6685b, 0xffd56831, 0xffd66806, 0xffd667dc,
134 0xffd667b2, 0xffd76788, 0xffd6675f, 0xffd76735, 0xffd6670c, 0xffd766e2, 0xffd666b9, 0xffd7668f,
135 0xffd86666, 0xffd6663e, 0xffd86614, 0xffd765ec, 0xffd865c3, 0xffd8659b, 0xffd86573, 0xffd8654b,
136 0xffd86523, 0xffd864fb, 0xffd964d3, 0xffd864ac, 0xffd96484, 0xffd8645d, 0xffd96435, 0xffd9640e,
137 0xffd963e7, 0xffd963c0, 0xffd96399, 0xffda6372, 0xffd9634c, 0xffda6325, 0xffda62ff, 0xffda62d9,
138 0xffda62b3, 0xffda628d, 0xffda6267, 0xffdb6241, 0xffda621c, 0xffdb61f6, 0xffda61d1, 0xffdc61ab,
139 0xffd96187, 0xffdc6160, 0xffdb613c, 0xffdb6117, 0xffdb60f2, 0xffdc60cd, 0xffdc60a9, 0xffdb6085,
140 0xffdc6060, 0xffdc603c, 0xffdc6018, 0xffdc5ff4, 0xffdc5fd0, 0xffdd5fac, 0xffdc5f89, 0xffdc5f65,
141 0xffdd5f41, 0xffdd5f1e, 0xffdd5efb, 0xffdd5ed8, 0xffdd5eb5, 0xffdd5e92, 0xffdd5e6f, 0xffdd5e4c,
142 0xffdd5e29, 0xffde5e06, 0xffde5de4, 0xffdd5dc2, 0xffde5d9f, 0xffde5d7d, 0xffde5d5b, 0xffde5d39,
143 0xffdf5d17, 0xffde5cf6, 0xffde5cd4, 0xffdf5cb2, 0xffdf5c91, 0xffde5c70, 0xffdf5c4e, 0xffdf5c2d,
144 0xffde5c0c, 0xffe05bea, 0xffdf5bca, 0xffdf5ba9, 0xffdf5b88, 0xffdf5b67, 0xffe05b46, 0xffe05b26,
145 0xffdf5b06, 0xffe05ae5, 0xffe05ac5, 0xffe05aa5, 0xffe05a85, 0xffe05a65, 0xffe05a45, 0xffe15a25,
146 0xffe05a06, 0xffe059e6, 0xffe159c6, 0xffe159a7, 0xffe05988, 0xffe15968, 0xffe15949, 0xffe1592a,
147 0xffe1590b, 0xffe158ec, 0xffe258cd, 0xffe158af, 0xffe15890, 0xffe25871, 0xffe15853, 0xffe25834,
148 0xffe25816, 0xffe257f8, 0xffe157da, 0xffe257bb, 0xffe3579d, 0xffe25780, 0xffe25762, 0xffe25744,
149 0xffe35726, 0xffe25709, 0xffe256eb, 0xffe356cd, 0xffe356b0, 0xffe35693, 0xffe25676, 0xffe35658,
150 0xffe3563b, 0xffe3561e, 0xffe35601, 0xffe355e4, 0xffe455c7, 0xffe355ab, 0xffe4558e, 0xffe35572,
151 0xffe45555, 0xffe35539, 0xffe4551c, 0xffe45500, 0xffe454e4, 0xffe454c8, 0xffe454ac, 0xffe45490,
152 0xffe45474, 0xffe55458, 0xffe4543d, 0xffe45421, 0xffe55405, 0xffe553ea, 0xffe453cf, 0xffe553b3,
153 0xffe45398, 0xffe5537c, 0xffe55361, 0xffe55346, 0xffe5532b, 0xffe55310, 0xffe552f5, 0xffe552da,
154 0xffe652bf, 0xffe552a5, 0xffe5528a, 0xffe6526f, 0xffe55255, 0xffe6523a, 0xffe65220, 0xffe55206,
155 0xffe651eb, 0xffe651d1, 0xffe651b7, 0xffe6519d, 0xffe65183, 0xffe65169, 0xffe7514f, 0xffe65136,
156 0xffe6511c, 0xffe75102, 0xffe650e9, 0xffe750cf, 0xffe650b6, 0xffe7509c, 0xffe75083, 0xffe6506a,
157 0xffe75050, 0xffe75037, 0xffe7501e, 0xffe75005, 0xffe74fec, 0xffe74fd3, 0xffe74fba, 0xffe74fa1,
158 0xffe84f88, 0xffe74f70, 0xffe84f57, 0xffe74f3f, 0xffe84f26, 0xffe74f0e, 0xffe84ef5, 0xffe84edd,
159 0xffe84ec5, 0xffe84ead, 0xffe74e95, 0xffe84e7c, 0xffe84e64, 0xffe94e4c, 0xffe84e35, 0xffe84e1d,
160 0xffe84e05, 0xffe94ded, 0xffe84dd6, 0xffe84dbe, 0xffe94da6, 0xffe94d8f, 0xffe84d78, 0xffe84d60,
161 0xffea4d48, 0xffe84d32, 0xffe94d1a, 0xffe94d03, 0xffe84cec, 0xffe94cd4, 0xffe94cbd, 0xffea4ca6,
162 0xffe94c90, 0xffe84c79, 0xffea4c61, 0xffe94c4b, 0xffe94c34, 0xffea4c1d, 0xffe94c07, 0xffea4bf0,
163 0xffe94bda, 0xffea4bc3, 0xffea4bad, 0xffe94b97, 0xffea4b80, 0xffea4b6a, 0xffea4b54, 0xffea4b3e,
164 0xffea4b28, 0xffea4b12, 0xffea4afc, 0xffea4ae6, 0xffea4ad0, 0xffeb4aba, 0xffea4aa5, 0xffea4a8f,
165 0xffeb4a79, 0xffea4a64, 0xffea4a4e, 0xffeb4a38, 0xffeb4a23, 0xffea4a0e, 0xffeb49f8, 0xffea49e3,
166 0xffeb49cd, 0xffeb49b8, 0xffeb49a3, 0xffeb498e, 0xffea4979, 0xffeb4963, 0xffeb494e, 0xffec4939,
167 0xffeb4925, 0xffea4910, 0xffec48fa, 0xffeb48e6, 0xffeb48d1, 0xffec48bc, 0xffeb48a8, 0xffec4893,
168 0xffeb487f, 0xffec486a, 0xffeb4856, 0xffec4841, 0xffec482d, 0xffeb4819, 0xffec4804, 0xffec47f0,
169 0xffec47dc, 0xffec47c8, 0xffec47b4, 0xffec47a0, 0xffec478c, 0xffec4778, 0xffec4764, 0xffec4750,
170 0xffec473c, 0xffed4728, 0xffec4715, 0xffec4701, 0xffed46ed, 0xffec46da, 0xffed46c6, 0xffec46b3,
171 0xffec469f, 0xffed468b, 0xffed4678, 0xffec4665, 0xffed4651, 0xffed463e, 0xffed462b, 0xffec4618,
172 0xffed4604, 0xffed45f1, 0xffed45de, 0xffed45cb, 0xffed45b8, 0xffed45a5, 0xffed4592, 0xffed457f,
173 0xffee456c, 0xffed455a, 0xffed4547, 0xffed4534, 0xffee4521, 0xffed450f, 0xffed44fc, 0xffee44e9,
174 0xffed44d7, 0xffee44c4, 0xffee44b2, 0xffed44a0, 0xffee448d, 0xffee447b, 0xffed4469, 0xffee4456,
175 0xffee4444, 0xffee4432, 0xffee4420, 0xffee440e, 0xffee43fc, 0xffee43ea, 0xffee43d8, 0xffee43c6,
176 0xffee43b4, 0xffee43a2, 0xffee4390, 0xffef437e, 0xffee436d, 0xffee435b, 0xffef4349, 0xffee4338,
177 0xffee4326, 0xffef4314, 0xffee4303, 0xffef42f1, 0xffee42e0, 0xffef42ce, 0xffee42bd, 0xffef42ab,
178 0xffef429a, 0xffee4289, 0xfff04277, 0xffee4267, 0xffef4255, 0xffef4244, 0xffef4233, 0xffef4222,
179 0xffee4211, 0xffef41ff, 0xfff041ee, 0xffef41de, 0xffef41cd, 0xffee41bc, 0xfff041aa, 0xffef419a,
180 0xffef4189, 0xffef4178, 0xfff04167, 0xffef4157, 0xffef4146, 0xfff04135, 0xffef4125, 0xfff04114,
181 0xffef4104, 0xfff040f3, 0xffef40e3, 0xfff040d2, 0xfff040c2, 0xffef40b2, 0xfff040a1, 0xfff04091,
182 0xfff04081, 0xffef4071, 0xfff04060, 0xfff04050, 0xfff04040, 0xfff04030, 0xfff04020, 0xfff04010
183 ]
184 # fmt: on
185
186 def __init__(self, op):
187 self.op = op
188
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200189 def generate_exp_table(self, beta, input_scale):
Fredrik Svedberg1575b942020-08-18 13:19:18 +0200190 integer_bits = 5
191 total_signed_bits = 31
192 # Calculate scaling
193 real_beta = min(
194 np.double(beta) * np.double(input_scale) * (1 << (31 - integer_bits)), np.double((1 << 31) - 1.0)
195 )
196 scale, shift = scaling.quantise_scale(real_beta)
197 shift = 31 - shift
198 diff_min = -1.0 * math.floor(
199 1.0 * ((1 << integer_bits) - 1) * (1 << (total_signed_bits - integer_bits)) / (1 << shift)
200 )
201 # Generate the exp LUT
202 lut = []
203 for x in range(256):
204 input_diff = x - 255
205 if input_diff >= diff_min:
Diqing Zhong189f7482021-01-26 12:12:51 +0100206 rescale = fp_math.saturating_rounding_mul32(input_diff * (1 << shift), scale)
Fredrik Svedberg1575b942020-08-18 13:19:18 +0200207 lut.append(fp_math.exp_on_negative_values(rescale))
208 else:
209 lut.append(0)
210 return lut
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200211
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200212 def get_graph(self):
213 ifm = self.op.inputs[0]
214 ofm = self.op.outputs[0]
215
Fredrik Svedberg835d8e12020-09-04 09:46:17 +0200216 # Reshape ifm/ofm (if needed)
Patrik Gustavsson3a269202021-01-21 08:28:55 +0100217 ifm_shape = self.op.ifm_shapes[0]
218 if ifm_shape.batch > 1:
Tim Hall73e843f2021-02-04 22:47:46 +0000219 self.op.ifm_shapes[0] = ifm_shape.with_height(ifm_shape.batch * ifm_shape.height).with_batch(1)
Tim Hall73e843f2021-02-04 22:47:46 +0000220 self.op.ofm_shapes[0] = self.op.ifm_shapes[0]
Fredrik Svedberg835d8e12020-09-04 09:46:17 +0200221
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200222 if ifm.dtype in (DataType.uint8, DataType.int8) and ofm.dtype == ifm.dtype:
223 return self.get_graph_8bit(ifm, ofm)
224 elif ifm.dtype == DataType.int16 and ofm.dtype == DataType.int16:
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200225 return self.get_graph_int16(ifm, ofm)
226 else:
227 self.op.run_on_npu = False
228 return self.op
229
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200230 def get_graph_8bit(self, ifm, ofm):
231 exp_lut = self.generate_exp_table(self.op.attrs.get("beta", 1.0), ifm.quantization.scale_f32)
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200232 no_scale_quant = ifm.quantization.clone()
233 no_scale_quant.scale_f32 = None
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100234 activation = ActivationFunction(Op.Clip)
235 activation.min = ifm.quantization.quant_min
236 activation.max = ifm.quantization.quant_max
237 activation2 = activation.clone()
238 activation2.min = 2 * ifm.quantization.quant_min
239 activation2.max = 2 * ifm.quantization.quant_max
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200240 one_scale_quant = ifm.quantization.clone()
241 one_scale_quant.scale_f32 = 1.0
242 one_scale_quant.zero_point = 0
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100243 two_scale_quant = one_scale_quant.clone()
244 two_scale_quant.scale_f32 = 2.0
Fredrik Svedberg32c7f5b2020-12-02 09:24:29 +0100245 pass_number = 0
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200246
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100247 def add_op_get_ofm(op):
248 DebugDatabase.add_optimised(self.op, op)
249 nonlocal pass_number
250 pass_number += 1
251 return op.ofm
252
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200253 # PASS 0 - Depthwise Maxpool
Patrik Gustavsson3a269202021-01-21 08:28:55 +0100254 ifm_shape = self.op.ifm_shapes[0]
255 ifm_max = add_op_get_ofm(
256 create_depthwise_maxpool(f"{self.op.name}_maxpool{pass_number}", ifm, ifm_shape, no_scale_quant)
257 )
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200258
259 # PASS 1 - Sub+LUT(exp)
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100260 sub_op_quantization = one_scale_quant.clone()
261 sub_op_quantization.zero_point = 127
Patrik Gustavsson3a269202021-01-21 08:28:55 +0100262 ifm_max_shape = Shape4D([1, ifm_shape.height, ifm_shape.width, 1])
Patrik Gustavsson3a269202021-01-21 08:28:55 +0100263 sub_op = create_sub(
264 f"{self.op.name}_sub{pass_number}",
265 ifm,
266 ifm_max,
267 sub_op_quantization,
268 dtype=DataType.int32,
269 ifm_shape=ifm_shape,
270 ifm2_shape=ifm_max_shape,
271 )
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200272 sub_op.set_activation_lut(
Tim Hall1c590482023-01-26 17:27:00 +0000273 create_const_tensor(f"{sub_op.name}_exp_lut", [1, 1, 1, 256], DataType.uint32, exp_lut, TensorPurpose.LUT)
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200274 )
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100275 ifm_exp = add_op_get_ofm(sub_op)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100276 # Note: activation.min/max are non-quantized values
277 sub_op.activation.min = -128 - ifm_exp.quantization.zero_point
278 sub_op.activation.max = 127 - ifm_exp.quantization.zero_point
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200279
280 # PASS 2 - SHR
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100281 name = f"{self.op.name}_shr{pass_number}"
Tim Hall3b1578e2023-01-13 17:57:25 +0000282 shift = create_const_tensor(f"{name}_const", [1, 1, 1, 1], DataType.int32, [12], quantization=no_scale_quant)
Louis Verhaard1a92f782021-02-09 16:08:26 +0100283 shr_op = create_shr(name, ifm_exp, shift, no_scale_quant, activation)
Tim Hall5ff4cd12023-05-16 22:39:14 +0100284 shr_op.rounding_mode = RoundingMode.HalfUp
Louis Verhaard1a92f782021-02-09 16:08:26 +0100285 rescaled_exp = add_op_get_ofm(shr_op)
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200286
287 # PASS 3 - Reduce sum
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100288 sum_of_exp = add_op_get_ofm(
289 create_reduce_sum(f"{self.op.name}_reduce_sum{pass_number}", rescaled_exp, no_scale_quant, activation)
290 )
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200291
292 # PASS 4 - CLZ
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100293 headroom_plus_one = add_op_get_ofm(
294 create_clz(f"{self.op.name}_clz{pass_number}", sum_of_exp, no_scale_quant, activation)
295 )
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200296
297 # PASS 5 - Sub
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100298 headroom_offset = create_const_tensor(
Jonas Ohlssond8575072022-03-30 10:30:25 +0200299 "headroom_offset_const",
300 [1, 1, 1, 1],
301 DataType.int32,
302 [12 + 31 - 8],
Jonas Ohlssond8575072022-03-30 10:30:25 +0200303 quantization=no_scale_quant,
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200304 )
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100305 right_shift = add_op_get_ofm(
306 create_sub(
Jonas Ohlssond8575072022-03-30 10:30:25 +0200307 f"{self.op.name}_sub{pass_number}",
308 headroom_offset,
309 headroom_plus_one,
310 no_scale_quant,
311 activation,
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100312 )
313 )
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200314
315 # PASS 6 - Sub
Tim Hall3b1578e2023-01-13 17:57:25 +0000316 one = create_const_tensor("one_const", [1, 1, 1, 1], DataType.int32, [1], quantization=no_scale_quant)
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100317 headroom = add_op_get_ofm(
318 create_sub(f"{self.op.name}_sub{pass_number}", headroom_plus_one, one, no_scale_quant, activation)
319 )
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200320
321 # PASS 7 - SHL
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100322 shifted_sum = add_op_get_ofm(
323 create_shl(f"{self.op.name}_shl{pass_number}", sum_of_exp, headroom, no_scale_quant, activation)
324 )
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200325
326 # PASS 8 - Sub
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100327 shifted_one = create_const_tensor(
Tim Hall3b1578e2023-01-13 17:57:25 +0000328 "shifted_one_const", [1, 1, 1, 1], DataType.int32, [1 << 30], quantization=no_scale_quant
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200329 )
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100330 shifted_sum_minus_one = add_op_get_ofm(
331 create_sub(f"{self.op.name}_sub{pass_number}", shifted_sum, shifted_one, no_scale_quant, activation)
332 )
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200333
334 # PASS 9 - SHL
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100335 shifted_sum_minus_one = add_op_get_ofm(
Jonas Ohlssond8575072022-03-30 10:30:25 +0200336 create_shl(
337 f"{self.op.name}_shl{pass_number}",
338 shifted_sum_minus_one,
339 one,
340 no_scale_quant,
341 activation,
342 )
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100343 )
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200344
345 # PASS 10 - Add
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100346 f0_one_const = create_const_tensor(
Tim Hall3b1578e2023-01-13 17:57:25 +0000347 "F0_one_const", [1, 1, 1, 1], DataType.int32, [(1 << 31) - 1], quantization=no_scale_quant
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200348 )
Fredrik Svedberg4a434cb2022-09-27 14:13:01 +0200349 add_op = create_add(
350 f"{self.op.name}_add{pass_number}",
Fredrik Svedberg4a434cb2022-09-27 14:13:01 +0200351 shifted_sum_minus_one,
Fredrik Svedbergb81e1bb2022-10-11 21:50:51 +0200352 f0_one_const,
Fredrik Svedberg4a434cb2022-09-27 14:13:01 +0200353 one_scale_quant,
354 activation,
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100355 )
Fredrik Svedberg4a434cb2022-09-27 14:13:01 +0200356 add_op.explicit_scaling = ExplicitScaling(False, shift=[1], multiplier=[1]) # Custom rescale
357 half_denominator = add_op_get_ofm(add_op)
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200358
359 # PASS 11 - Multiply
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100360 neg_32_over_17 = create_const_tensor(
Tim Hall3b1578e2023-01-13 17:57:25 +0000361 "neg_32_over_17_const", [1, 1, 1, 1], DataType.int32, [-1010580540], quantization=one_scale_quant
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200362 )
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100363 rescaled = add_op_get_ofm(
364 create_mul(
Jonas Ohlssond8575072022-03-30 10:30:25 +0200365 f"{self.op.name}_mul{pass_number}",
366 half_denominator,
367 neg_32_over_17,
368 two_scale_quant,
369 activation2,
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100370 )
371 )
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200372
373 # PASS 12 - Add
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100374 const_48_over_17 = create_const_tensor(
Tim Hall3b1578e2023-01-13 17:57:25 +0000375 "48_over_17_const", [1, 1, 1, 1], DataType.int32, [1515870810], quantization=no_scale_quant
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200376 )
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100377 rescale_w_offset = add_op_get_ofm(
Jonas Ohlssond8575072022-03-30 10:30:25 +0200378 create_add(
379 f"{self.op.name}_add{pass_number}",
380 rescaled,
381 const_48_over_17,
382 one_scale_quant,
383 activation,
384 )
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100385 )
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200386
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100387 # PASS 13 - 27
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200388 nr_x = rescale_w_offset
389 F2_one = create_const_tensor(
Tim Hall3b1578e2023-01-13 17:57:25 +0000390 "F2_one_const", [1, 1, 1, 1], DataType.int32, [(1 << 29)], quantization=no_scale_quant
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200391 )
Tim Hall3b1578e2023-01-13 17:57:25 +0000392 four = create_const_tensor("four_const", [1, 1, 1, 1], DataType.int32, [4], quantization=no_scale_quant)
Fredrik Svedberg32c7f5b2020-12-02 09:24:29 +0100393 for _ in range(3):
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200394 # PASS 13, 18, 23 - MUL
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100395 half_denominator_times_x = add_op_get_ofm(
Jonas Ohlssond8575072022-03-30 10:30:25 +0200396 create_mul(
397 f"{self.op.name}_mul{pass_number}",
398 nr_x,
399 half_denominator,
400 two_scale_quant,
401 activation2,
402 )
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100403 )
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200404 # PASS 14, 19, 24 - SUB
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100405 one_minus_half_denominator_times_x = add_op_get_ofm(
406 create_sub(
Jonas Ohlssond8575072022-03-30 10:30:25 +0200407 f"{self.op.name}_sub{pass_number}",
408 F2_one,
409 half_denominator_times_x,
410 one_scale_quant,
411 activation,
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100412 )
413 )
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200414 # PASS 15, 20, 25 - MUL
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100415 to_rescale = add_op_get_ofm(
416 create_mul(
417 f"{self.op.name}_mul{pass_number}",
418 nr_x,
419 one_minus_half_denominator_times_x,
420 two_scale_quant,
421 activation2,
422 )
423 )
Fredrik Svedberg880e7352020-08-25 11:31:47 +0200424 # PASS 16, 21, 26 - MUL
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100425 to_add = add_op_get_ofm(
426 create_mul(f"{self.op.name}_mul{pass_number}", to_rescale, four, no_scale_quant, activation)
427 )
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200428 # PASS 17, 22, 27 - ADD
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100429 nr_x = add_op_get_ofm(
430 create_add(f"{self.op.name}_add{pass_number}", nr_x, to_add, one_scale_quant, activation)
431 )
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200432
Fredrik Svedberg880e7352020-08-25 11:31:47 +0200433 # PASS 28 - Multiply
Tim Hall3b1578e2023-01-13 17:57:25 +0000434 two = create_const_tensor("two_const", [1, 1, 1, 1], DataType.int32, [2], quantization=no_scale_quant)
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100435 scale_factor = add_op_get_ofm(
436 create_mul(f"{self.op.name}_mul{pass_number}", nr_x, two, one_scale_quant, activation)
Fredrik Svedberg880e7352020-08-25 11:31:47 +0200437 )
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200438
439 # PASS 29 - Multiply
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100440 scaled_exp = add_op_get_ofm(
441 create_mul(f"{self.op.name}_mul{pass_number}", ifm_exp, scale_factor, two_scale_quant, activation2)
442 )
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200443
444 # PASS 30 - SHR
Fredrik Svedberg32c7f5b2020-12-02 09:24:29 +0100445 shr30_op = Operation(Op.SHR, f"{self.op.name}_shr{pass_number}")
Tim Hall5ff4cd12023-05-16 22:39:14 +0100446 shr30_op.rounding_mode = RoundingMode.HalfUp
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200447 shr30_op.add_input_tensor(scaled_exp)
448 shr30_op.add_input_tensor(right_shift)
449 shr30_op.set_output_tensor(ofm)
Patrik Gustavsson3a269202021-01-21 08:28:55 +0100450 shr30_op.ifm_shapes.append(Shape4D(scaled_exp.shape))
451 shr30_op.ifm_shapes.append(Shape4D(right_shift.shape))
452 shr30_op.ofm_shapes.append(Shape4D(scaled_exp.shape))
Tim Halle6ccd872020-11-09 16:46:37 +0000453 DebugDatabase.add_optimised(self.op, shr30_op)
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200454
455 return shr30_op
456
def get_graph_int16(self, ifm, ofm):
    """Build the chain of operations (passes 0-13) that replaces this softmax in the int16 variant.

    Each operation created here is registered with the DebugDatabase as an optimised form of ``self.op``.
    The pass counter is advanced every time an operation is registered and is embedded in the name of the
    next operation, so the order of the statements below determines the generated names.

    ifm: input tensor of the softmax; clones of its quantization are used for the intermediate tensors.
    ofm: output tensor of the softmax; it is set as the output of the final SHR operation.

    Returns the final SHR operation (pass 13).
    """
    # Clone of the IFM quantization with the scale removed, shared by most intermediate results
    unscaled_quant = ifm.quantization.clone()
    unscaled_quant.scale_f32 = None
    pass_number = 0

    def stage_name(kind):
        # Name of the operation for the pass that is currently being built
        return f"{self.op.name}_{kind}{pass_number}"

    def int32_scalar(tensor_name, value, quant=unscaled_quant):
        # Single element int32 constant with shape [1, 1, 1, 1]
        return create_const_tensor(tensor_name, [1, 1, 1, 1], DataType.int32, [value], quantization=quant)

    def emit(new_op):
        # Record the operation in the debug database, move on to the next pass and hand back its output
        nonlocal pass_number
        DebugDatabase.add_optimised(self.op, new_op)
        pass_number += 1
        return new_op.ofm

    # PASS 0 - Depthwise Maxpool
    in_shape = self.op.ifm_shapes[0]
    maxpool_op = create_depthwise_maxpool(stage_name("maxpool"), ifm, in_shape, unscaled_quant)
    max_values = emit(maxpool_op)

    # PASS 1 - Sub
    max_shape = Shape4D([1, in_shape.height, in_shape.width, 1])
    diff_op = create_sub(
        stage_name("sub"),
        ifm,
        max_values,
        ifm.quantization.clone(),
        dtype=DataType.int32,
        ifm_shape=in_shape,
        ifm2_shape=max_shape,
    )
    diff = emit(diff_op)

    # PASS 2 - Mul
    name = stage_name("mul")
    beta = self.op.attrs.get("beta", 1.0)
    out_range = 10.0 / 65535.0
    rescale, _ = scaling.elementwise_mul_scale(diff.quantization.scale_f32, beta, out_range)
    beta_quant = ifm.quantization.clone()
    beta_quant.scale_f32 = beta
    scaled_quant = ofm.quantization.clone()
    scaled_quant.scale_f32 = out_range
    rescale_const = int32_scalar(f"{name}_scale_const", rescale, beta_quant)
    scaled_diff = emit(create_mul(name, diff, rescale_const, scaled_quant))

    # PASS 3 - Add+LUT(exp)
    name = stage_name("add")
    offset_const = int32_scalar(f"{name}_const", 32767)
    exp_op = create_add(name, scaled_diff, offset_const, scaled_diff.quantization.clone(), dtype=DataType.int16)
    # The LUT entries are int32 values written as Python ints; converting them to numpy.int32 could raise an
    # overflow error, so the tensor is created as uint32 instead
    exp_lut = create_const_tensor(f"{name}_exp_lut", [1, 1, 1, 512], DataType.uint32, self.EXP_LUT, TensorPurpose.LUT)
    exp_op.set_activation_lut(exp_lut)
    exp_values = emit(exp_op)

    # PASS 4 - Reduce sum
    exp_sum = emit(create_reduce_sum(stage_name("reduce_sum"), exp_values, unscaled_quant))

    # PASS 5 - CLZ
    leading_zeros = emit(create_clz(stage_name("clz"), exp_sum, unscaled_quant))

    # PASS 6 - Sub
    name = stage_name("sub")
    thirty_one = int32_scalar(f"{name}_const", 31)
    right_shift = emit(create_sub(name, thirty_one, leading_zeros, unscaled_quant))

    # PASS 7 - SHL
    unit = int32_scalar("one_const", 1)
    shifted_unit = emit(create_shl(stage_name("shl"), unit, right_shift, unscaled_quant))

    # PASS 8 - Sub
    sum_less_unit = emit(create_sub(stage_name("sub"), exp_sum, shifted_unit, unscaled_quant))

    # PASS 9 - SHL
    normalised = emit(create_shl(stage_name("shl"), sum_less_unit, leading_zeros, unscaled_quant))

    # PASS 10 - SHR
    name = stage_name("shr")
    fifteen = int32_scalar(f"{name}_const", 15)
    normalised_16 = emit(create_shr(name, normalised, fifteen, unscaled_quant))

    # PASS 11 - Sub+LUT(one over one plus x)
    name = stage_name("sub")
    bias_const = int32_scalar(f"{name}_const", 32768)
    recip_op = create_sub(name, normalised_16, bias_const, unscaled_quant, dtype=DataType.int16)
    # Same uint32 workaround as for the exp LUT: avoids an overflow error when the Python ints are converted
    recip_lut = create_const_tensor(
        f"{name}_one_over_one_plus_x_lut",
        [1, 1, 1, 512],
        DataType.uint32,
        self.ONE_OVER_ONE_PLUS_X_LUT,
        TensorPurpose.LUT,
    )
    recip_op.set_activation_lut(recip_lut)
    reciprocal = emit(recip_op)

    # PASS 12 - Multiply
    product = emit(create_mul(stage_name("mul"), exp_values, reciprocal, unscaled_quant, dtype=DataType.int32))

    # PASS 13 - SHR
    # Assembled by hand (not through a create_* helper) so that it writes straight into the caller's OFM
    final_op = Operation(Op.SHR, stage_name("shr"))
    final_op.add_input_tensor(product)
    final_op.add_input_tensor(right_shift)
    final_op.set_output_tensor(ofm)
    final_op.ifm_shapes.append(Shape4D(product.shape))
    final_op.ifm_shapes.append(Shape4D(right_shift.shape))
    final_op.ofm_shapes.append(Shape4D(product.shape))
    DebugDatabase.add_optimised(self.op, final_op)

    return final_op