blob: 1bdab7408006d12d34e9332f0a5b95ceed61d8cd [file] [log] [blame]
Fredrik Svedberga0c36242020-06-03 15:43:31 +02001# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
2#
Fredrik Svedberg1575b942020-08-18 13:19:18 +02003# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
4#
Fredrik Svedberga0c36242020-06-03 15:43:31 +02005# SPDX-License-Identifier: Apache-2.0
6#
Fredrik Svedberg1575b942020-08-18 13:19:18 +02007# Licensed under the Apache License, Version 2.0 (the "License");
8# you may not use this file except in compliance with the License.
Fredrik Svedberga0c36242020-06-03 15:43:31 +02009# You may obtain a copy of the License at
10#
Fredrik Svedberg1575b942020-08-18 13:19:18 +020011# http://www.apache.org/licenses/LICENSE-2.0
Fredrik Svedberga0c36242020-06-03 15:43:31 +020012#
13# Unless required by applicable law or agreed to in writing, software
Fredrik Svedberg1575b942020-08-18 13:19:18 +020014# distributed under the License is distributed on an "AS IS" BASIS,
15# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
Fredrik Svedberga0c36242020-06-03 15:43:31 +020016# See the License for the specific language governing permissions and
17# limitations under the License.
Fredrik Svedberg1575b942020-08-18 13:19:18 +020018#
Fredrik Svedberga0c36242020-06-03 15:43:31 +020019# Description:
20# Contains SoftMax
Fredrik Svedberg1575b942020-08-18 13:19:18 +020021import math
22
Fredrik Svedberga0c36242020-06-03 15:43:31 +020023import numpy as np
24
Fredrik Svedberg1575b942020-08-18 13:19:18 +020025from . import fp_math
Fredrik Svedberga0c36242020-06-03 15:43:31 +020026from . import scaling
Louis Verhaarde8a5a782020-11-02 18:04:27 +010027from .api import NpuRoundingMode
Fredrik Svedberga0c36242020-06-03 15:43:31 +020028from .data_type import DataType
Tim Halle6ccd872020-11-09 16:46:37 +000029from .debug_database import DebugDatabase
Louis Verhaarde8a5a782020-11-02 18:04:27 +010030from .operation import ActivationFunction
Louis Verhaardaee5d752020-09-30 09:01:52 +020031from .operation import Op
Fredrik Svedberga0c36242020-06-03 15:43:31 +020032from .operation import Operation
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +010033from .operation_util import create_add
34from .operation_util import create_clz
35from .operation_util import create_depthwise_maxpool
36from .operation_util import create_mul
37from .operation_util import create_reduce_sum
38from .operation_util import create_shl
39from .operation_util import create_shr
40from .operation_util import create_sub
Michael McGeagh5778ffd2020-08-06 17:31:02 +010041from .tensor import create_const_tensor
42from .tensor import create_reshape_tensor
Fredrik Svedberga0c36242020-06-03 15:43:31 +020043from .tensor import TensorPurpose
44
45
Fredrik Svedberga0c36242020-06-03 15:43:31 +020046class SoftMax:
47 # Turn off black formatting for the LUT tables to keep them compact
48 # fmt: off
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +020049
Fredrik Svedberga0c36242020-06-03 15:43:31 +020050 EXP_LUT = [
51 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002,
52 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002,
53 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002,
54 0x00000002, 0x00000002, 0x00010002, 0x00000003, 0x00000003, 0x00000003, 0x00000003, 0x00000003,
55 0x00000003, 0x00000003, 0x00000003, 0x00000003, 0x00000003, 0x00000003, 0x00000003, 0x00000003,
56 0x00000003, 0x00000003, 0x00000003, 0x00010003, 0x00000004, 0x00000004, 0x00000004, 0x00000004,
57 0x00000004, 0x00000004, 0x00000004, 0x00000004, 0x00000004, 0x00000004, 0x00000004, 0x00000004,
58 0x00010004, 0x00000005, 0x00000005, 0x00000005, 0x00000005, 0x00000005, 0x00000005, 0x00000005,
59 0x00000005, 0x00000005, 0x00010005, 0x00000006, 0x00000006, 0x00000006, 0x00000006, 0x00000006,
60 0x00000006, 0x00000006, 0x00010006, 0x00000007, 0x00000007, 0x00000007, 0x00000007, 0x00000007,
61 0x00000007, 0x00000007, 0x00010007, 0x00000008, 0x00000008, 0x00000008, 0x00000008, 0x00000008,
62 0x00010008, 0x00000009, 0x00000009, 0x00000009, 0x00000009, 0x00000009, 0x00010009, 0x0000000a,
63 0x0000000a, 0x0000000a, 0x0000000a, 0x0001000a, 0x0000000b, 0x0000000b, 0x0000000b, 0x0000000b,
64 0x0001000b, 0x0000000c, 0x0000000c, 0x0000000c, 0x0001000c, 0x0000000d, 0x0000000d, 0x0000000d,
65 0x0001000d, 0x0000000e, 0x0000000e, 0x0000000e, 0x0001000e, 0x0000000f, 0x0000000f, 0x0001000f,
66 0x00000010, 0x00000010, 0x00010010, 0x00000011, 0x00000011, 0x00010011, 0x00000012, 0x00000012,
67 0x00010012, 0x00000013, 0x00000013, 0x00010013, 0x00000014, 0x00010014, 0x00000015, 0x00000015,
68 0x00010015, 0x00000016, 0x00010016, 0x00000017, 0x00010017, 0x00000018, 0x00010018, 0x00000019,
69 0x00010019, 0x0000001a, 0x0001001a, 0x0000001b, 0x0001001b, 0x0000001c, 0x0001001c, 0x0000001d,
70 0x0001001d, 0x0000001e, 0x0001001e, 0x0001001f, 0x00000020, 0x00010020, 0x00010021, 0x00000022,
71 0x00010022, 0x00010023, 0x00000024, 0x00010024, 0x00000025, 0x00010025, 0x00010026, 0x00010027,
72 0x00000028, 0x00020028, 0x0000002a, 0x0001002a, 0x0001002b, 0x0001002c, 0x0000002d, 0x0001002d,
73 0x0001002e, 0x0001002f, 0x00010030, 0x00010031, 0x00010032, 0x00010033, 0x00010034, 0x00010035,
74 0x00010036, 0x00010037, 0x00010038, 0x00020039, 0x0001003b, 0x0000003c, 0x0002003c, 0x0001003e,
75 0x0002003f, 0x00000041, 0x00020041, 0x00010043, 0x00010044, 0x00020045, 0x00020047, 0x00010049,
76 0x0001004a, 0x0002004b, 0x0001004d, 0x0002004e, 0x00010050, 0x00020051, 0x00020053, 0x00010055,
77 0x00020056, 0x00020058, 0x0002005a, 0x0001005c, 0x0002005d, 0x0002005f, 0x00020061, 0x00020063,
78 0x00020065, 0x00020067, 0x00020069, 0x0002006b, 0x0003006d, 0x00020070, 0x00020072, 0x00020074,
79 0x00030076, 0x00020079, 0x0003007b, 0x0002007e, 0x00030080, 0x00020083, 0x00020085, 0x00040087,
80 0x0002008b, 0x0003008d, 0x00030090, 0x00020093, 0x00030095, 0x00030098, 0x0003009b, 0x0004009e,
81 0x000300a2, 0x000300a5, 0x000300a8, 0x000300ab, 0x000400ae, 0x000300b2, 0x000400b5, 0x000400b9,
82 0x000300bd, 0x000400c0, 0x000400c4, 0x000400c8, 0x000400cc, 0x000400d0, 0x000500d4, 0x000400d9,
83 0x000400dd, 0x000500e1, 0x000400e6, 0x000500ea, 0x000400ef, 0x000500f3, 0x000500f8, 0x000500fd,
84 0x00050102, 0x00050107, 0x0005010c, 0x00060111, 0x00050117, 0x0006011c, 0x00060122, 0x00060128,
85 0x0006012e, 0x00060134, 0x0006013a, 0x00070140, 0x00060147, 0x0007014d, 0x00060154, 0x0007015a,
86 0x00070161, 0x00060168, 0x0008016e, 0x00070176, 0x0008017d, 0x00080185, 0x0007018d, 0x00090194,
87 0x0008019d, 0x000801a5, 0x000801ad, 0x000901b5, 0x000901be, 0x000901c7, 0x000901d0, 0x000901d9,
88 0x000a01e2, 0x000901ec, 0x000a01f5, 0x000b01ff, 0x000a020a, 0x000b0214, 0x000a021f, 0x000b0229,
89 0x000b0234, 0x000b023f, 0x000c024a, 0x000c0256, 0x000c0262, 0x000c026e, 0x000c027a, 0x000d0286,
90 0x000d0293, 0x000d02a0, 0x000e02ad, 0x000e02bb, 0x000e02c9, 0x000e02d7, 0x000f02e5, 0x000f02f4,
91 0x000f0303, 0x000f0312, 0x00100321, 0x00100331, 0x00110341, 0x00100352, 0x00120362, 0x00110374,
92 0x00120385, 0x00120397, 0x001203a9, 0x001303bb, 0x001303ce, 0x001403e1, 0x001403f5, 0x00140409,
93 0x0015041d, 0x00150432, 0x00160447, 0x0016045d, 0x00160473, 0x00170489, 0x001704a0, 0x001904b7,
94 0x001804d0, 0x001904e8, 0x00190501, 0x001a051a, 0x001a0534, 0x001b054e, 0x001b0569, 0x001c0584,
95 0x001c05a0, 0x001d05bc, 0x001e05d9, 0x001e05f7, 0x001e0615, 0x00200633, 0x00200653, 0x00200673,
96 0x00210693, 0x002206b4, 0x002306d6, 0x002306f9, 0x0024071c, 0x00240740, 0x00260764, 0x0026078a,
97 0x002607b0, 0x002807d6, 0x002907fe, 0x00290827, 0x002a0850, 0x002a087a, 0x002c08a4, 0x002c08d0,
98 0x002e08fc, 0x002e092a, 0x002f0958, 0x00310987, 0x003109b8, 0x003209e9, 0x00330a1b, 0x00340a4e,
99 0x00350a82, 0x00350ab7, 0x00380aec, 0x00380b24, 0x003a0b5c, 0x003a0b96, 0x003c0bd0, 0x003d0c0c,
100 0x003e0c49, 0x003f0c87, 0x00400cc6, 0x00420d06, 0x00430d48, 0x00440d8b, 0x00460dcf, 0x00480e15,
101 0x00480e5d, 0x00490ea5, 0x004c0eee, 0x004d0f3a, 0x004e0f87, 0x00500fd5, 0x00511025, 0x00531076,
102 0x005610c9, 0x0056111f, 0x00581175, 0x005a11cd, 0x005c1227, 0x005e1283, 0x005e12e1, 0x0061133f,
103 0x006413a0, 0x00651404, 0x00671469, 0x006914d0, 0x006c1539, 0x006c15a5, 0x00701611, 0x00721681,
104 0x007416f3, 0x00761767, 0x007917dd, 0x007a1856, 0x007d18d0, 0x0080194d, 0x008319cd, 0x00841a50,
105 0x00881ad4, 0x00891b5c, 0x008d1be5, 0x00911c72, 0x00911d03, 0x00961d94, 0x00981e2a, 0x009c1ec2,
106 0x009e1f5e, 0x00a21ffc, 0x00a4209e, 0x00a92142, 0x00ab21eb, 0x00ae2296, 0x00b22344, 0x00b523f6,
107 0x00b924ab, 0x00be2564, 0x00c02622, 0x00c526e2, 0x00c827a7, 0x00cc286f, 0x00d0293b, 0x00d52a0b,
108 0x00d72ae0, 0x00dd2bb7, 0x00e12c94, 0x00e62d75, 0x00eb2e5b, 0x00ef2f46, 0x00f23035, 0x00f83127,
109 0x00fe321f, 0x0101331d, 0x0108341e, 0x010c3526, 0x01123632, 0x01173744, 0x011c385b, 0x01233977,
110 0x01273a9a, 0x012e3bc1, 0x01343cef, 0x013a3e23, 0x01403f5d, 0x0146409d, 0x014c41e3, 0x0154432f,
111 0x01594483, 0x016145dc, 0x0168473d, 0x016f48a5, 0x01764a14, 0x017d4b8a, 0x01854d07, 0x018d4e8c,
112 0x01945019, 0x019d51ad, 0x01a4534a, 0x01ad54ee, 0x01b5569b, 0x01be5850, 0x01c75a0e, 0x01d05bd5,
113 0x01d85da5, 0x01e35f7d, 0x01eb6160, 0x01f6634b, 0x01ff6541, 0x02096740, 0x02146949, 0x021e6b5d,
114 0x02296d7b, 0x02336fa4, 0x023f71d7, 0x024a7416, 0x02567660, 0x026278b6, 0x026d7b18, 0x027a7d85,
115 ]
116
117 ONE_OVER_ONE_PLUS_X_LUT = [
118 0xffc17fff, 0xffc07fc0, 0xffc27f80, 0xffc07f42, 0xffc17f02, 0xffc17ec3, 0xffc27e84, 0xffc27e46,
119 0xffc27e08, 0xffc37dca, 0xffc27d8d, 0xffc37d4f, 0xffc37d12, 0xffc37cd5, 0xffc37c98, 0xffc47c5b,
120 0xffc47c1f, 0xffc47be3, 0xffc57ba7, 0xffc57b6c, 0xffc37b31, 0xffc67af4, 0xffc57aba, 0xffc67a7f,
121 0xffc57a45, 0xffc67a0a, 0xffc779d0, 0xffc67997, 0xffc6795d, 0xffc77923, 0xffc778ea, 0xffc778b1,
122 0xffc87878, 0xffc77840, 0xffc87807, 0xffc877cf, 0xffc97797, 0xffc87760, 0xffc97728, 0xffc976f1,
123 0xffc976ba, 0xffc87683, 0xffca764b, 0xffca7615, 0xffca75df, 0xffca75a9, 0xffca7573, 0xffcb753d,
124 0xffca7508, 0xffcb74d2, 0xffcb749d, 0xffca7468, 0xffcc7432, 0xffcc73fe, 0xffcb73ca, 0xffcc7395,
125 0xffcd7361, 0xffcc732e, 0xffcc72fa, 0xffcd72c6, 0xffcd7293, 0xffcd7260, 0xffcc722d, 0xffce71f9,
126 0xffcd71c7, 0xffce7194, 0xffce7162, 0xffce7130, 0xffcf70fe, 0xffce70cd, 0xffce709b, 0xffcf7069,
127 0xffcf7038, 0xffcf7007, 0xffcf6fd6, 0xffcf6fa5, 0xffd06f74, 0xffd06f44, 0xffd06f14, 0xffd06ee4,
128 0xffd06eb4, 0xffd06e84, 0xffd16e54, 0xffd16e25, 0xffd16df6, 0xffd16dc7, 0xffd06d98, 0xffd26d68,
129 0xffd16d3a, 0xffd26d0b, 0xffd26cdd, 0xffd26caf, 0xffd26c81, 0xffd26c53, 0xffd36c25, 0xffd26bf8,
130 0xffd36bca, 0xffd36b9d, 0xffd36b70, 0xffd26b43, 0xffd46b15, 0xffd36ae9, 0xffd46abc, 0xffd46a90,
131 0xffd46a64, 0xffd46a38, 0xffd46a0c, 0xffd469e0, 0xffd469b4, 0xffd56988, 0xffd5695d, 0xffd56932,
132 0xffd56907, 0xffd568dc, 0xffd568b1, 0xffd56886, 0xffd6685b, 0xffd56831, 0xffd66806, 0xffd667dc,
133 0xffd667b2, 0xffd76788, 0xffd6675f, 0xffd76735, 0xffd6670c, 0xffd766e2, 0xffd666b9, 0xffd7668f,
134 0xffd86666, 0xffd6663e, 0xffd86614, 0xffd765ec, 0xffd865c3, 0xffd8659b, 0xffd86573, 0xffd8654b,
135 0xffd86523, 0xffd864fb, 0xffd964d3, 0xffd864ac, 0xffd96484, 0xffd8645d, 0xffd96435, 0xffd9640e,
136 0xffd963e7, 0xffd963c0, 0xffd96399, 0xffda6372, 0xffd9634c, 0xffda6325, 0xffda62ff, 0xffda62d9,
137 0xffda62b3, 0xffda628d, 0xffda6267, 0xffdb6241, 0xffda621c, 0xffdb61f6, 0xffda61d1, 0xffdc61ab,
138 0xffd96187, 0xffdc6160, 0xffdb613c, 0xffdb6117, 0xffdb60f2, 0xffdc60cd, 0xffdc60a9, 0xffdb6085,
139 0xffdc6060, 0xffdc603c, 0xffdc6018, 0xffdc5ff4, 0xffdc5fd0, 0xffdd5fac, 0xffdc5f89, 0xffdc5f65,
140 0xffdd5f41, 0xffdd5f1e, 0xffdd5efb, 0xffdd5ed8, 0xffdd5eb5, 0xffdd5e92, 0xffdd5e6f, 0xffdd5e4c,
141 0xffdd5e29, 0xffde5e06, 0xffde5de4, 0xffdd5dc2, 0xffde5d9f, 0xffde5d7d, 0xffde5d5b, 0xffde5d39,
142 0xffdf5d17, 0xffde5cf6, 0xffde5cd4, 0xffdf5cb2, 0xffdf5c91, 0xffde5c70, 0xffdf5c4e, 0xffdf5c2d,
143 0xffde5c0c, 0xffe05bea, 0xffdf5bca, 0xffdf5ba9, 0xffdf5b88, 0xffdf5b67, 0xffe05b46, 0xffe05b26,
144 0xffdf5b06, 0xffe05ae5, 0xffe05ac5, 0xffe05aa5, 0xffe05a85, 0xffe05a65, 0xffe05a45, 0xffe15a25,
145 0xffe05a06, 0xffe059e6, 0xffe159c6, 0xffe159a7, 0xffe05988, 0xffe15968, 0xffe15949, 0xffe1592a,
146 0xffe1590b, 0xffe158ec, 0xffe258cd, 0xffe158af, 0xffe15890, 0xffe25871, 0xffe15853, 0xffe25834,
147 0xffe25816, 0xffe257f8, 0xffe157da, 0xffe257bb, 0xffe3579d, 0xffe25780, 0xffe25762, 0xffe25744,
148 0xffe35726, 0xffe25709, 0xffe256eb, 0xffe356cd, 0xffe356b0, 0xffe35693, 0xffe25676, 0xffe35658,
149 0xffe3563b, 0xffe3561e, 0xffe35601, 0xffe355e4, 0xffe455c7, 0xffe355ab, 0xffe4558e, 0xffe35572,
150 0xffe45555, 0xffe35539, 0xffe4551c, 0xffe45500, 0xffe454e4, 0xffe454c8, 0xffe454ac, 0xffe45490,
151 0xffe45474, 0xffe55458, 0xffe4543d, 0xffe45421, 0xffe55405, 0xffe553ea, 0xffe453cf, 0xffe553b3,
152 0xffe45398, 0xffe5537c, 0xffe55361, 0xffe55346, 0xffe5532b, 0xffe55310, 0xffe552f5, 0xffe552da,
153 0xffe652bf, 0xffe552a5, 0xffe5528a, 0xffe6526f, 0xffe55255, 0xffe6523a, 0xffe65220, 0xffe55206,
154 0xffe651eb, 0xffe651d1, 0xffe651b7, 0xffe6519d, 0xffe65183, 0xffe65169, 0xffe7514f, 0xffe65136,
155 0xffe6511c, 0xffe75102, 0xffe650e9, 0xffe750cf, 0xffe650b6, 0xffe7509c, 0xffe75083, 0xffe6506a,
156 0xffe75050, 0xffe75037, 0xffe7501e, 0xffe75005, 0xffe74fec, 0xffe74fd3, 0xffe74fba, 0xffe74fa1,
157 0xffe84f88, 0xffe74f70, 0xffe84f57, 0xffe74f3f, 0xffe84f26, 0xffe74f0e, 0xffe84ef5, 0xffe84edd,
158 0xffe84ec5, 0xffe84ead, 0xffe74e95, 0xffe84e7c, 0xffe84e64, 0xffe94e4c, 0xffe84e35, 0xffe84e1d,
159 0xffe84e05, 0xffe94ded, 0xffe84dd6, 0xffe84dbe, 0xffe94da6, 0xffe94d8f, 0xffe84d78, 0xffe84d60,
160 0xffea4d48, 0xffe84d32, 0xffe94d1a, 0xffe94d03, 0xffe84cec, 0xffe94cd4, 0xffe94cbd, 0xffea4ca6,
161 0xffe94c90, 0xffe84c79, 0xffea4c61, 0xffe94c4b, 0xffe94c34, 0xffea4c1d, 0xffe94c07, 0xffea4bf0,
162 0xffe94bda, 0xffea4bc3, 0xffea4bad, 0xffe94b97, 0xffea4b80, 0xffea4b6a, 0xffea4b54, 0xffea4b3e,
163 0xffea4b28, 0xffea4b12, 0xffea4afc, 0xffea4ae6, 0xffea4ad0, 0xffeb4aba, 0xffea4aa5, 0xffea4a8f,
164 0xffeb4a79, 0xffea4a64, 0xffea4a4e, 0xffeb4a38, 0xffeb4a23, 0xffea4a0e, 0xffeb49f8, 0xffea49e3,
165 0xffeb49cd, 0xffeb49b8, 0xffeb49a3, 0xffeb498e, 0xffea4979, 0xffeb4963, 0xffeb494e, 0xffec4939,
166 0xffeb4925, 0xffea4910, 0xffec48fa, 0xffeb48e6, 0xffeb48d1, 0xffec48bc, 0xffeb48a8, 0xffec4893,
167 0xffeb487f, 0xffec486a, 0xffeb4856, 0xffec4841, 0xffec482d, 0xffeb4819, 0xffec4804, 0xffec47f0,
168 0xffec47dc, 0xffec47c8, 0xffec47b4, 0xffec47a0, 0xffec478c, 0xffec4778, 0xffec4764, 0xffec4750,
169 0xffec473c, 0xffed4728, 0xffec4715, 0xffec4701, 0xffed46ed, 0xffec46da, 0xffed46c6, 0xffec46b3,
170 0xffec469f, 0xffed468b, 0xffed4678, 0xffec4665, 0xffed4651, 0xffed463e, 0xffed462b, 0xffec4618,
171 0xffed4604, 0xffed45f1, 0xffed45de, 0xffed45cb, 0xffed45b8, 0xffed45a5, 0xffed4592, 0xffed457f,
172 0xffee456c, 0xffed455a, 0xffed4547, 0xffed4534, 0xffee4521, 0xffed450f, 0xffed44fc, 0xffee44e9,
173 0xffed44d7, 0xffee44c4, 0xffee44b2, 0xffed44a0, 0xffee448d, 0xffee447b, 0xffed4469, 0xffee4456,
174 0xffee4444, 0xffee4432, 0xffee4420, 0xffee440e, 0xffee43fc, 0xffee43ea, 0xffee43d8, 0xffee43c6,
175 0xffee43b4, 0xffee43a2, 0xffee4390, 0xffef437e, 0xffee436d, 0xffee435b, 0xffef4349, 0xffee4338,
176 0xffee4326, 0xffef4314, 0xffee4303, 0xffef42f1, 0xffee42e0, 0xffef42ce, 0xffee42bd, 0xffef42ab,
177 0xffef429a, 0xffee4289, 0xfff04277, 0xffee4267, 0xffef4255, 0xffef4244, 0xffef4233, 0xffef4222,
178 0xffee4211, 0xffef41ff, 0xfff041ee, 0xffef41de, 0xffef41cd, 0xffee41bc, 0xfff041aa, 0xffef419a,
179 0xffef4189, 0xffef4178, 0xfff04167, 0xffef4157, 0xffef4146, 0xfff04135, 0xffef4125, 0xfff04114,
180 0xffef4104, 0xfff040f3, 0xffef40e3, 0xfff040d2, 0xfff040c2, 0xffef40b2, 0xfff040a1, 0xfff04091,
181 0xfff04081, 0xffef4071, 0xfff04060, 0xfff04050, 0xfff04040, 0xfff04030, 0xfff04020, 0xfff04010
182 ]
183 # fmt: on
184
def __init__(self, op):
    """Remember the operation this helper works on.

    The stored operation is the one whose ``inputs``, ``outputs``, ``attrs``
    and ``name`` are read by the graph-building methods of this class.
    """
    self.op = op
187
def generate_exp_table(self, beta, input_scale):
    """Build the 256-entry exp lookup table used by the 8-bit softmax graph.

    ``beta * input_scale`` is scaled by ``2 ** (31 - integer_bits)``, clamped
    to ``2 ** 31 - 1`` and turned into a (scale, shift) pair by
    ``scaling.quantise_scale``.  Table index ``x`` stands for the input
    difference ``x - 255`` (so the differences run from -255 to 0).

    Returns a list of 256 values: the result of
    ``fp_math.exp_on_negative_values`` for each rescaled difference, or 0
    for differences below the computed ``diff_min`` threshold.
    """
    integer_bits = 5
    total_signed_bits = 31

    # Calculate scaling
    upper_limit = np.double((1 << 31) - 1.0)
    unclamped_beta = np.double(beta) * np.double(input_scale) * (1 << (31 - integer_bits))
    real_beta = min(unclamped_beta, upper_limit)
    scale, shift = scaling.quantise_scale(real_beta)
    shift = 31 - shift
    shift_factor = 1 << shift
    diff_min = -1.0 * math.floor(
        1.0 * ((1 << integer_bits) - 1) * (1 << (total_signed_bits - integer_bits)) / shift_factor
    )

    def table_entry(input_diff):
        # Differences below the threshold are mapped straight to zero
        if input_diff < diff_min:
            return 0
        rescale = fp_math.saturating_rounding_mul(input_diff * shift_factor, scale)
        return fp_math.exp_on_negative_values(rescale)

    # Generate the exp LUT
    return [table_entry(index - 255) for index in range(256)]
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200210
def get_graph(self):
    """Pick and build the softmax graph matching the tensor data types.

    Uses the first input and first output of ``self.op``.  When the first
    dimension of the input's full shape is greater than 1 it is multiplied
    into the second dimension and both tensors are reshaped accordingly.

    Returns the result of ``get_graph_8bit`` when both tensors share a
    uint8/int8 type, the result of ``get_graph_int16`` when both are int16,
    and otherwise ``self.op`` itself after setting ``run_on_npu`` to False.
    """
    ifm, ofm = self.op.inputs[0], self.op.outputs[0]

    # Reshape ifm/ofm (if needed)
    shape = ifm.get_full_shape()
    leading_dim = shape[0]
    if leading_dim > 1:
        # Updated in place, exactly as the list returned by get_full_shape()
        shape[1] *= leading_dim
        shape[0] = 1
        ifm = create_reshape_tensor(ifm, shape)
        ofm = create_reshape_tensor(ofm, shape, False)

    if ifm.dtype in (DataType.uint8, DataType.int8) and ofm.dtype == ifm.dtype:
        return self.get_graph_8bit(ifm, ofm)

    if ifm.dtype == DataType.int16 and ofm.dtype == DataType.int16:
        return self.get_graph_int16(ifm, ofm)

    # No matching implementation: hand the original operation back
    self.op.run_on_npu = False
    return self.op
230
def get_graph_8bit(self, ifm, ofm):
    """Build the chain of operations that implements softmax for 8-bit tensors.

    The exp lookup table is generated from the op's "beta" attribute
    (default 1.0) and the ifm quantization scale.  The graph is then built
    as numbered passes (see the PASS comments below); every op created along
    the way is registered with ``DebugDatabase`` against ``self.op``.

    Returns the final SHR operation, whose output tensor is ``ofm``.

    NOTE(review): the constants and the three-iteration loop look like the
    fixed-point Newton-Raphson reciprocal used by the TensorFlow Lite
    reference softmax - confirm against that implementation.
    """
    exp_lut = self.generate_exp_table(self.op.attrs.get("beta", 1.0), ifm.quantization.scale_f32)
    ifm = create_reshape_tensor(ifm, ifm.get_full_shape())
    # Register the first op attached to the reshaped ifm
    # (presumably the reshape op just created - TODO confirm)
    DebugDatabase.add_optimised(self.op, ifm.ops[0])
    ofm = create_reshape_tensor(ofm, ofm.get_full_shape(), False)
    # Quantization variants derived from the ifm: no scale, scale 1.0 and scale 2.0,
    # all with zero point 0
    no_scale_quant = ifm.quantization.clone()
    no_scale_quant.scale_f32 = None
    no_scale_quant.zero_point = 0
    # Clip activation spanning the ifm quantization range, plus a variant with twice that range
    activation = ActivationFunction(Op.Clip)
    activation.min = ifm.quantization.quant_min
    activation.max = ifm.quantization.quant_max
    activation2 = activation.clone()
    activation2.min = 2 * ifm.quantization.quant_min
    activation2.max = 2 * ifm.quantization.quant_max
    one_scale_quant = ifm.quantization.clone()
    one_scale_quant.scale_f32 = 1.0
    one_scale_quant.zero_point = 0
    two_scale_quant = one_scale_quant.clone()
    two_scale_quant.scale_f32 = 2.0
    # The (reshaped) ifm's own zero point is cleared as well, after the clones above were taken
    ifm.quantization.zero_point = 0
    pass_number = 0

    def add_op_get_ofm(op):
        # Register the op, advance the pass counter and hand back the op's ofm.
        # Op names are formatted by the caller before this runs, so each name
        # carries the pass number from before the increment.
        DebugDatabase.add_optimised(self.op, op)
        nonlocal pass_number
        pass_number += 1
        return op.ofm

    # PASS 0 - Depthwise Maxpool
    ifm_max = add_op_get_ofm(create_depthwise_maxpool(f"{self.op.name}_maxpool{pass_number}", ifm, no_scale_quant))

    # PASS 1 - Sub+LUT(exp)
    sub_op_quantization = one_scale_quant.clone()
    sub_op_quantization.zero_point = 127
    ifm_max = create_reshape_tensor(ifm_max, [1, ifm.shape[1], ifm.shape[2], 1])
    sub_op = create_sub(f"{self.op.name}_sub{pass_number}", ifm, ifm_max, sub_op_quantization, dtype=DataType.int32)
    sub_op.set_activation_lut(
        create_const_tensor(
            f"{sub_op.name}_exp_lut", [1, 1, 1, 256], DataType.int32, exp_lut, np.int32, TensorPurpose.LUT
        )
    )
    ifm_exp = add_op_get_ofm(sub_op)
    # Note: activation.min/max are non-quantized values
    sub_op.activation.min = -128 - ifm_exp.quantization.zero_point
    sub_op.activation.max = 127 - ifm_exp.quantization.zero_point

    # PASS 2 - SHR
    name = f"{self.op.name}_shr{pass_number}"
    shift = create_const_tensor(
        f"{name}_const", [1, 1, 1, 1], DataType.int32, [12], np.int32, quantization=no_scale_quant
    )
    rescaled_exp = add_op_get_ofm(
        create_shr(
            name, ifm_exp, shift, no_scale_quant, activation, attrs={"rounding_mode": NpuRoundingMode.NATURAL},
        )
    )

    # PASS 3 - Reduce sum
    sum_of_exp = add_op_get_ofm(
        create_reduce_sum(f"{self.op.name}_reduce_sum{pass_number}", rescaled_exp, no_scale_quant, activation)
    )

    # PASS 4 - CLZ
    headroom_plus_one = add_op_get_ofm(
        create_clz(f"{self.op.name}_clz{pass_number}", sum_of_exp, no_scale_quant, activation)
    )

    # PASS 5 - Sub
    # right_shift = (12 + 31 - 8) - headroom_plus_one; it is only consumed by the final SHR (PASS 30)
    headroom_offset = create_const_tensor(
        "headroom_offset_const", [1, 1, 1, 1], DataType.int32, [12 + 31 - 8], np.int32, quantization=no_scale_quant,
    )
    right_shift = add_op_get_ofm(
        create_sub(
            f"{self.op.name}_sub{pass_number}", headroom_offset, headroom_plus_one, no_scale_quant, activation,
        )
    )

    # PASS 6 - Sub
    one = create_const_tensor("one_const", [1, 1, 1, 1], DataType.int32, [1], np.int32, quantization=no_scale_quant)
    headroom = add_op_get_ofm(
        create_sub(f"{self.op.name}_sub{pass_number}", headroom_plus_one, one, no_scale_quant, activation)
    )

    # PASS 7 - SHL
    shifted_sum = add_op_get_ofm(
        create_shl(f"{self.op.name}_shl{pass_number}", sum_of_exp, headroom, no_scale_quant, activation)
    )

    # PASS 8 - Sub
    shifted_one = create_const_tensor(
        "shifted_one_const", [1, 1, 1, 1], DataType.int32, [1 << 30], np.int32, quantization=no_scale_quant
    )
    shifted_sum_minus_one = add_op_get_ofm(
        create_sub(f"{self.op.name}_sub{pass_number}", shifted_sum, shifted_one, no_scale_quant, activation)
    )

    # PASS 9 - SHL
    shifted_sum_minus_one = add_op_get_ofm(
        create_shl(f"{self.op.name}_shl{pass_number}", shifted_sum_minus_one, one, no_scale_quant, activation,)
    )

    # PASS 10 - Add
    f0_one_const = create_const_tensor(
        "F0_one_const", [1, 1, 1, 1], DataType.int32, [(1 << 31) - 1], np.int32, quantization=no_scale_quant
    )
    half_denominator = add_op_get_ofm(
        create_add(
            f"{self.op.name}_add{pass_number}",
            f0_one_const,
            shifted_sum_minus_one,
            one_scale_quant,
            activation,
            attrs={"rescale": (1, 1)},
        )
    )

    # PASS 11 - Multiply
    # NOTE(review): -1010580540 is presumably -32/17 in fixed point, going by the tensor name - confirm
    neg_32_over_17 = create_const_tensor(
        "neg_32_over_17_const", [1, 1, 1, 1], DataType.int32, [-1010580540], np.int32, quantization=one_scale_quant
    )
    rescaled = add_op_get_ofm(
        create_mul(
            f"{self.op.name}_mul{pass_number}", half_denominator, neg_32_over_17, two_scale_quant, activation2,
        )
    )

    # PASS 12 - Add
    # NOTE(review): 1515870810 is presumably 48/17 in fixed point, going by the tensor name - confirm
    const_48_over_17 = create_const_tensor(
        "48_over_17_const", [1, 1, 1, 1], DataType.int32, [1515870810], np.int32, quantization=no_scale_quant
    )
    rescale_w_offset = add_op_get_ofm(
        create_add(f"{self.op.name}_add{pass_number}", rescaled, const_48_over_17, one_scale_quant, activation,)
    )

    # PASS 13 - 27
    # Three iterations of five ops each; every iteration replaces nr_x with
    # nr_x + 4 * (nr_x * (F2_one - nr_x * half_denominator))
    nr_x = rescale_w_offset
    F2_one = create_const_tensor(
        "F2_one_const", [1, 1, 1, 1], DataType.int32, [(1 << 29)], np.int32, quantization=no_scale_quant
    )
    four = create_const_tensor(
        "four_const", [1, 1, 1, 1], DataType.int32, [4], np.int32, quantization=no_scale_quant
    )
    for _ in range(3):
        # PASS 13, 18, 23 - MUL
        half_denominator_times_x = add_op_get_ofm(
            create_mul(f"{self.op.name}_mul{pass_number}", nr_x, half_denominator, two_scale_quant, activation2,)
        )
        # PASS 14, 19, 24 - SUB
        one_minus_half_denominator_times_x = add_op_get_ofm(
            create_sub(
                f"{self.op.name}_sub{pass_number}", F2_one, half_denominator_times_x, one_scale_quant, activation,
            )
        )
        # PASS 15, 20, 25 - MUL
        to_rescale = add_op_get_ofm(
            create_mul(
                f"{self.op.name}_mul{pass_number}",
                nr_x,
                one_minus_half_denominator_times_x,
                two_scale_quant,
                activation2,
            )
        )
        # PASS 16, 21, 26 - MUL
        to_add = add_op_get_ofm(
            create_mul(f"{self.op.name}_mul{pass_number}", to_rescale, four, no_scale_quant, activation)
        )
        # PASS 17, 22, 27 - ADD
        nr_x = add_op_get_ofm(
            create_add(f"{self.op.name}_add{pass_number}", nr_x, to_add, one_scale_quant, activation)
        )

    # PASS 28 - Multiply
    two = create_const_tensor("two_const", [1, 1, 1, 1], DataType.int32, [2], np.int32, quantization=no_scale_quant)
    scale_factor = add_op_get_ofm(
        create_mul(f"{self.op.name}_mul{pass_number}", nr_x, two, one_scale_quant, activation)
    )

    # PASS 29 - Multiply
    scaled_exp = add_op_get_ofm(
        create_mul(f"{self.op.name}_mul{pass_number}", ifm_exp, scale_factor, two_scale_quant, activation2)
    )

    # PASS 30 - SHR
    # Built by hand rather than through a create_* helper so that the op writes directly into ofm;
    # it is registered with DebugDatabase explicitly and does not advance pass_number
    shr30_op = Operation(Op.SHR, f"{self.op.name}_shr{pass_number}")
    shr30_op.attrs["rounding_mode"] = NpuRoundingMode.NATURAL
    shr30_op.add_input_tensor(scaled_exp)
    shr30_op.add_input_tensor(right_shift)
    shr30_op.set_output_tensor(ofm)
    DebugDatabase.add_optimised(self.op, shr30_op)

    return shr30_op
423
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200424 def get_graph_int16(self, ifm, ofm):
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200425 no_scale_quant = ifm.quantization.clone()
426 no_scale_quant.scale_f32 = None
Fredrik Svedberg32c7f5b2020-12-02 09:24:29 +0100427 pass_number = 0
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200428
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100429 def add_op_get_ofm(op):
430 DebugDatabase.add_optimised(self.op, op)
431 nonlocal pass_number
432 pass_number += 1
433 return op.ofm
434
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200435 # PASS 0 - Depthwise Maxpool
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100436 ifm_max = add_op_get_ofm(create_depthwise_maxpool(f"{self.op.name}_maxpool{pass_number}", ifm, no_scale_quant))
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200437
438 # PASS 1 - Sub
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100439 ifm_max = create_reshape_tensor(ifm_max, [1, ifm.shape[1], ifm.shape[2], 1])
440 sub1_ofm = add_op_get_ofm(
441 create_sub(f"{self.op.name}_sub{pass_number}", ifm, ifm_max, ifm.quantization.clone(), dtype=DataType.int32)
442 )
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200443
444 # PASS 2 - Mul
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100445 name = f"{self.op.name}_mul{pass_number}"
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200446 beta = self.op.attrs.get("beta", 1.0)
447 mul2_out_range = 10.0 / 65535.0
448 mul2_scale, _ = scaling.elementwise_mul_scale(sub1_ofm.quantization.scale_f32, beta, mul2_out_range)
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100449 scale_quant = ifm.quantization.clone()
450 scale_quant.scale_f32 = beta
451 mul2_quant = ofm.quantization.clone()
452 mul2_quant.scale_f32 = mul2_out_range
453 scale = create_const_tensor(
454 f"{name}_scale_const", [1, 1, 1, 1], DataType.int32, [mul2_scale], np.int32, quantization=scale_quant
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200455 )
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100456 mul2_ofm = add_op_get_ofm(create_mul(name, sub1_ofm, scale, mul2_quant))
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200457
458 # PASS 3 - Add+LUT(exp)
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100459 name = f"{self.op.name}_add{pass_number}"
460 const_add = create_const_tensor(
461 f"{name}_const", [1, 1, 1, 1], DataType.int32, [32767], np.int32, quantization=no_scale_quant
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200462 )
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100463 add_op = create_add(name, mul2_ofm, const_add, mul2_ofm.quantization.clone(), dtype=DataType.int16)
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200464 add_op.set_activation_lut(
Michael McGeagh5778ffd2020-08-06 17:31:02 +0100465 create_const_tensor(
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100466 f"{name}_exp_lut", [1, 1, 1, 512], DataType.int32, self.EXP_LUT, np.int32, TensorPurpose.LUT
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200467 )
468 )
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100469 ifm_exp = add_op_get_ofm(add_op)
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200470
471 # PASS 4 - Reduce sum
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100472 sum_of_exp = add_op_get_ofm(
473 create_reduce_sum(f"{self.op.name}_reduce_sum{pass_number}", ifm_exp, no_scale_quant)
474 )
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200475
476 # PASS 5 - CLZ
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100477 headroom_plus_one = add_op_get_ofm(create_clz(f"{self.op.name}_clz{pass_number}", sum_of_exp, no_scale_quant))
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200478
479 # PASS 6 - Sub
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100480 name = f"{self.op.name}_sub{pass_number}"
481 const_31 = create_const_tensor(
482 f"{name}_const", [1, 1, 1, 1], DataType.int32, [31], np.int32, quantization=no_scale_quant
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200483 )
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100484 reciprocal_right_shift = add_op_get_ofm(create_sub(name, const_31, headroom_plus_one, no_scale_quant))
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200485
486 # PASS 7 - SHL
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100487 one = create_const_tensor(
488 f"one_const", [1, 1, 1, 1], DataType.int32, [1], np.int32, quantization=no_scale_quant
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200489 )
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100490 constant_one = add_op_get_ofm(
491 create_shl(f"{self.op.name}_shl{pass_number}", one, reciprocal_right_shift, no_scale_quant)
492 )
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200493
494 # PASS 8 - Sub
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100495 sum_of_exps_minus_one = add_op_get_ofm(
496 create_sub(f"{self.op.name}_sub{pass_number}", sum_of_exp, constant_one, no_scale_quant)
497 )
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200498
499 # PASS 9 - SHL
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100500 shifted_sum_minus_one = add_op_get_ofm(
501 create_shl(f"{self.op.name}_shl{pass_number}", sum_of_exps_minus_one, headroom_plus_one, no_scale_quant)
502 )
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200503
504 # PASS 10 - SHR
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100505 name = f"{self.op.name}_shr{pass_number}"
506 shift = create_const_tensor(
507 f"{name}_const", [1, 1, 1, 1], DataType.int32, [15], np.int32, quantization=no_scale_quant
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200508 )
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100509 shifted_sum_minus_one_16 = add_op_get_ofm(create_shr(name, shifted_sum_minus_one, shift, no_scale_quant))
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200510
511 # PASS 11 - Sub+LUT(one over one plus x)
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100512 name = f"{self.op.name}_sub{pass_number}"
513 sub11_const = create_const_tensor(
514 f"{name}_const", [1, 1, 1, 1], DataType.int32, [32768], np.int32, quantization=no_scale_quant
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200515 )
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100516 sub11_op = create_sub(name, shifted_sum_minus_one_16, sub11_const, no_scale_quant, dtype=DataType.int16)
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200517 sub11_op.set_activation_lut(
Michael McGeagh5778ffd2020-08-06 17:31:02 +0100518 create_const_tensor(
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100519 f"{name}_one_over_one_plus_x_lut",
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200520 [1, 1, 1, 512],
521 DataType.int32,
522 self.ONE_OVER_ONE_PLUS_X_LUT,
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200523 np.int32,
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200524 TensorPurpose.LUT,
525 )
526 )
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100527 reciprocal_scale = add_op_get_ofm(sub11_op)
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200528
529 # PASS 12 - Multiply
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100530 mul_ofm = add_op_get_ofm(
531 create_mul(
532 f"{self.op.name}_mul{pass_number}", ifm_exp, reciprocal_scale, no_scale_quant, dtype=DataType.int32
533 )
534 )
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200535
536 # PASS 13 - SHR
Fredrik Svedberg32c7f5b2020-12-02 09:24:29 +0100537 shr13_op = Operation(Op.SHR, f"{self.op.name}_shr{pass_number}")
Michael McGeagh5778ffd2020-08-06 17:31:02 +0100538 shr13_op.add_input_tensor(mul_ofm)
539 shr13_op.add_input_tensor(reciprocal_right_shift)
540 shr13_op.set_output_tensor(ofm)
Tim Halle6ccd872020-11-09 16:46:37 +0000541 DebugDatabase.add_optimised(self.op, shr13_op)
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200542
543 return shr13_op