blob: 3b4bace98949f359f658438db5d6d78dda994cd9 [file] [log] [blame]
# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# Contains SoftMax
Fredrik Svedberg1575b942020-08-18 13:19:18 +020021import math
22
Fredrik Svedberga0c36242020-06-03 15:43:31 +020023import numpy as np
24
Fredrik Svedberg1575b942020-08-18 13:19:18 +020025from . import fp_math
Fredrik Svedberga0c36242020-06-03 15:43:31 +020026from . import scaling
Louis Verhaarde8a5a782020-11-02 18:04:27 +010027from .api import NpuRoundingMode
Fredrik Svedberga0c36242020-06-03 15:43:31 +020028from .data_type import DataType
Tim Halle6ccd872020-11-09 16:46:37 +000029from .debug_database import DebugDatabase
Louis Verhaarde8a5a782020-11-02 18:04:27 +010030from .operation import ActivationFunction
Louis Verhaardaee5d752020-09-30 09:01:52 +020031from .operation import Op
Fredrik Svedberga0c36242020-06-03 15:43:31 +020032from .operation import Operation
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +010033from .operation_util import create_add
34from .operation_util import create_clz
35from .operation_util import create_depthwise_maxpool
36from .operation_util import create_mul
37from .operation_util import create_reduce_sum
38from .operation_util import create_shl
39from .operation_util import create_shr
40from .operation_util import create_sub
Michael McGeagh5778ffd2020-08-06 17:31:02 +010041from .tensor import create_const_tensor
42from .tensor import create_reshape_tensor
Fredrik Svedberga0c36242020-06-03 15:43:31 +020043from .tensor import TensorPurpose
44
45
Fredrik Svedberga0c36242020-06-03 15:43:31 +020046class SoftMax:
47 # Turn off black formatting for the LUT tables to keep them compact
48 # fmt: off
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +020049
Fredrik Svedberga0c36242020-06-03 15:43:31 +020050 EXP_LUT = [
51 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002,
52 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002,
53 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002,
54 0x00000002, 0x00000002, 0x00010002, 0x00000003, 0x00000003, 0x00000003, 0x00000003, 0x00000003,
55 0x00000003, 0x00000003, 0x00000003, 0x00000003, 0x00000003, 0x00000003, 0x00000003, 0x00000003,
56 0x00000003, 0x00000003, 0x00000003, 0x00010003, 0x00000004, 0x00000004, 0x00000004, 0x00000004,
57 0x00000004, 0x00000004, 0x00000004, 0x00000004, 0x00000004, 0x00000004, 0x00000004, 0x00000004,
58 0x00010004, 0x00000005, 0x00000005, 0x00000005, 0x00000005, 0x00000005, 0x00000005, 0x00000005,
59 0x00000005, 0x00000005, 0x00010005, 0x00000006, 0x00000006, 0x00000006, 0x00000006, 0x00000006,
60 0x00000006, 0x00000006, 0x00010006, 0x00000007, 0x00000007, 0x00000007, 0x00000007, 0x00000007,
61 0x00000007, 0x00000007, 0x00010007, 0x00000008, 0x00000008, 0x00000008, 0x00000008, 0x00000008,
62 0x00010008, 0x00000009, 0x00000009, 0x00000009, 0x00000009, 0x00000009, 0x00010009, 0x0000000a,
63 0x0000000a, 0x0000000a, 0x0000000a, 0x0001000a, 0x0000000b, 0x0000000b, 0x0000000b, 0x0000000b,
64 0x0001000b, 0x0000000c, 0x0000000c, 0x0000000c, 0x0001000c, 0x0000000d, 0x0000000d, 0x0000000d,
65 0x0001000d, 0x0000000e, 0x0000000e, 0x0000000e, 0x0001000e, 0x0000000f, 0x0000000f, 0x0001000f,
66 0x00000010, 0x00000010, 0x00010010, 0x00000011, 0x00000011, 0x00010011, 0x00000012, 0x00000012,
67 0x00010012, 0x00000013, 0x00000013, 0x00010013, 0x00000014, 0x00010014, 0x00000015, 0x00000015,
68 0x00010015, 0x00000016, 0x00010016, 0x00000017, 0x00010017, 0x00000018, 0x00010018, 0x00000019,
69 0x00010019, 0x0000001a, 0x0001001a, 0x0000001b, 0x0001001b, 0x0000001c, 0x0001001c, 0x0000001d,
70 0x0001001d, 0x0000001e, 0x0001001e, 0x0001001f, 0x00000020, 0x00010020, 0x00010021, 0x00000022,
71 0x00010022, 0x00010023, 0x00000024, 0x00010024, 0x00000025, 0x00010025, 0x00010026, 0x00010027,
72 0x00000028, 0x00020028, 0x0000002a, 0x0001002a, 0x0001002b, 0x0001002c, 0x0000002d, 0x0001002d,
73 0x0001002e, 0x0001002f, 0x00010030, 0x00010031, 0x00010032, 0x00010033, 0x00010034, 0x00010035,
74 0x00010036, 0x00010037, 0x00010038, 0x00020039, 0x0001003b, 0x0000003c, 0x0002003c, 0x0001003e,
75 0x0002003f, 0x00000041, 0x00020041, 0x00010043, 0x00010044, 0x00020045, 0x00020047, 0x00010049,
76 0x0001004a, 0x0002004b, 0x0001004d, 0x0002004e, 0x00010050, 0x00020051, 0x00020053, 0x00010055,
77 0x00020056, 0x00020058, 0x0002005a, 0x0001005c, 0x0002005d, 0x0002005f, 0x00020061, 0x00020063,
78 0x00020065, 0x00020067, 0x00020069, 0x0002006b, 0x0003006d, 0x00020070, 0x00020072, 0x00020074,
79 0x00030076, 0x00020079, 0x0003007b, 0x0002007e, 0x00030080, 0x00020083, 0x00020085, 0x00040087,
80 0x0002008b, 0x0003008d, 0x00030090, 0x00020093, 0x00030095, 0x00030098, 0x0003009b, 0x0004009e,
81 0x000300a2, 0x000300a5, 0x000300a8, 0x000300ab, 0x000400ae, 0x000300b2, 0x000400b5, 0x000400b9,
82 0x000300bd, 0x000400c0, 0x000400c4, 0x000400c8, 0x000400cc, 0x000400d0, 0x000500d4, 0x000400d9,
83 0x000400dd, 0x000500e1, 0x000400e6, 0x000500ea, 0x000400ef, 0x000500f3, 0x000500f8, 0x000500fd,
84 0x00050102, 0x00050107, 0x0005010c, 0x00060111, 0x00050117, 0x0006011c, 0x00060122, 0x00060128,
85 0x0006012e, 0x00060134, 0x0006013a, 0x00070140, 0x00060147, 0x0007014d, 0x00060154, 0x0007015a,
86 0x00070161, 0x00060168, 0x0008016e, 0x00070176, 0x0008017d, 0x00080185, 0x0007018d, 0x00090194,
87 0x0008019d, 0x000801a5, 0x000801ad, 0x000901b5, 0x000901be, 0x000901c7, 0x000901d0, 0x000901d9,
88 0x000a01e2, 0x000901ec, 0x000a01f5, 0x000b01ff, 0x000a020a, 0x000b0214, 0x000a021f, 0x000b0229,
89 0x000b0234, 0x000b023f, 0x000c024a, 0x000c0256, 0x000c0262, 0x000c026e, 0x000c027a, 0x000d0286,
90 0x000d0293, 0x000d02a0, 0x000e02ad, 0x000e02bb, 0x000e02c9, 0x000e02d7, 0x000f02e5, 0x000f02f4,
91 0x000f0303, 0x000f0312, 0x00100321, 0x00100331, 0x00110341, 0x00100352, 0x00120362, 0x00110374,
92 0x00120385, 0x00120397, 0x001203a9, 0x001303bb, 0x001303ce, 0x001403e1, 0x001403f5, 0x00140409,
93 0x0015041d, 0x00150432, 0x00160447, 0x0016045d, 0x00160473, 0x00170489, 0x001704a0, 0x001904b7,
94 0x001804d0, 0x001904e8, 0x00190501, 0x001a051a, 0x001a0534, 0x001b054e, 0x001b0569, 0x001c0584,
95 0x001c05a0, 0x001d05bc, 0x001e05d9, 0x001e05f7, 0x001e0615, 0x00200633, 0x00200653, 0x00200673,
96 0x00210693, 0x002206b4, 0x002306d6, 0x002306f9, 0x0024071c, 0x00240740, 0x00260764, 0x0026078a,
97 0x002607b0, 0x002807d6, 0x002907fe, 0x00290827, 0x002a0850, 0x002a087a, 0x002c08a4, 0x002c08d0,
98 0x002e08fc, 0x002e092a, 0x002f0958, 0x00310987, 0x003109b8, 0x003209e9, 0x00330a1b, 0x00340a4e,
99 0x00350a82, 0x00350ab7, 0x00380aec, 0x00380b24, 0x003a0b5c, 0x003a0b96, 0x003c0bd0, 0x003d0c0c,
100 0x003e0c49, 0x003f0c87, 0x00400cc6, 0x00420d06, 0x00430d48, 0x00440d8b, 0x00460dcf, 0x00480e15,
101 0x00480e5d, 0x00490ea5, 0x004c0eee, 0x004d0f3a, 0x004e0f87, 0x00500fd5, 0x00511025, 0x00531076,
102 0x005610c9, 0x0056111f, 0x00581175, 0x005a11cd, 0x005c1227, 0x005e1283, 0x005e12e1, 0x0061133f,
103 0x006413a0, 0x00651404, 0x00671469, 0x006914d0, 0x006c1539, 0x006c15a5, 0x00701611, 0x00721681,
104 0x007416f3, 0x00761767, 0x007917dd, 0x007a1856, 0x007d18d0, 0x0080194d, 0x008319cd, 0x00841a50,
105 0x00881ad4, 0x00891b5c, 0x008d1be5, 0x00911c72, 0x00911d03, 0x00961d94, 0x00981e2a, 0x009c1ec2,
106 0x009e1f5e, 0x00a21ffc, 0x00a4209e, 0x00a92142, 0x00ab21eb, 0x00ae2296, 0x00b22344, 0x00b523f6,
107 0x00b924ab, 0x00be2564, 0x00c02622, 0x00c526e2, 0x00c827a7, 0x00cc286f, 0x00d0293b, 0x00d52a0b,
108 0x00d72ae0, 0x00dd2bb7, 0x00e12c94, 0x00e62d75, 0x00eb2e5b, 0x00ef2f46, 0x00f23035, 0x00f83127,
109 0x00fe321f, 0x0101331d, 0x0108341e, 0x010c3526, 0x01123632, 0x01173744, 0x011c385b, 0x01233977,
110 0x01273a9a, 0x012e3bc1, 0x01343cef, 0x013a3e23, 0x01403f5d, 0x0146409d, 0x014c41e3, 0x0154432f,
111 0x01594483, 0x016145dc, 0x0168473d, 0x016f48a5, 0x01764a14, 0x017d4b8a, 0x01854d07, 0x018d4e8c,
112 0x01945019, 0x019d51ad, 0x01a4534a, 0x01ad54ee, 0x01b5569b, 0x01be5850, 0x01c75a0e, 0x01d05bd5,
113 0x01d85da5, 0x01e35f7d, 0x01eb6160, 0x01f6634b, 0x01ff6541, 0x02096740, 0x02146949, 0x021e6b5d,
114 0x02296d7b, 0x02336fa4, 0x023f71d7, 0x024a7416, 0x02567660, 0x026278b6, 0x026d7b18, 0x027a7d85,
115 ]
116
117 ONE_OVER_ONE_PLUS_X_LUT = [
118 0xffc17fff, 0xffc07fc0, 0xffc27f80, 0xffc07f42, 0xffc17f02, 0xffc17ec3, 0xffc27e84, 0xffc27e46,
119 0xffc27e08, 0xffc37dca, 0xffc27d8d, 0xffc37d4f, 0xffc37d12, 0xffc37cd5, 0xffc37c98, 0xffc47c5b,
120 0xffc47c1f, 0xffc47be3, 0xffc57ba7, 0xffc57b6c, 0xffc37b31, 0xffc67af4, 0xffc57aba, 0xffc67a7f,
121 0xffc57a45, 0xffc67a0a, 0xffc779d0, 0xffc67997, 0xffc6795d, 0xffc77923, 0xffc778ea, 0xffc778b1,
122 0xffc87878, 0xffc77840, 0xffc87807, 0xffc877cf, 0xffc97797, 0xffc87760, 0xffc97728, 0xffc976f1,
123 0xffc976ba, 0xffc87683, 0xffca764b, 0xffca7615, 0xffca75df, 0xffca75a9, 0xffca7573, 0xffcb753d,
124 0xffca7508, 0xffcb74d2, 0xffcb749d, 0xffca7468, 0xffcc7432, 0xffcc73fe, 0xffcb73ca, 0xffcc7395,
125 0xffcd7361, 0xffcc732e, 0xffcc72fa, 0xffcd72c6, 0xffcd7293, 0xffcd7260, 0xffcc722d, 0xffce71f9,
126 0xffcd71c7, 0xffce7194, 0xffce7162, 0xffce7130, 0xffcf70fe, 0xffce70cd, 0xffce709b, 0xffcf7069,
127 0xffcf7038, 0xffcf7007, 0xffcf6fd6, 0xffcf6fa5, 0xffd06f74, 0xffd06f44, 0xffd06f14, 0xffd06ee4,
128 0xffd06eb4, 0xffd06e84, 0xffd16e54, 0xffd16e25, 0xffd16df6, 0xffd16dc7, 0xffd06d98, 0xffd26d68,
129 0xffd16d3a, 0xffd26d0b, 0xffd26cdd, 0xffd26caf, 0xffd26c81, 0xffd26c53, 0xffd36c25, 0xffd26bf8,
130 0xffd36bca, 0xffd36b9d, 0xffd36b70, 0xffd26b43, 0xffd46b15, 0xffd36ae9, 0xffd46abc, 0xffd46a90,
131 0xffd46a64, 0xffd46a38, 0xffd46a0c, 0xffd469e0, 0xffd469b4, 0xffd56988, 0xffd5695d, 0xffd56932,
132 0xffd56907, 0xffd568dc, 0xffd568b1, 0xffd56886, 0xffd6685b, 0xffd56831, 0xffd66806, 0xffd667dc,
133 0xffd667b2, 0xffd76788, 0xffd6675f, 0xffd76735, 0xffd6670c, 0xffd766e2, 0xffd666b9, 0xffd7668f,
134 0xffd86666, 0xffd6663e, 0xffd86614, 0xffd765ec, 0xffd865c3, 0xffd8659b, 0xffd86573, 0xffd8654b,
135 0xffd86523, 0xffd864fb, 0xffd964d3, 0xffd864ac, 0xffd96484, 0xffd8645d, 0xffd96435, 0xffd9640e,
136 0xffd963e7, 0xffd963c0, 0xffd96399, 0xffda6372, 0xffd9634c, 0xffda6325, 0xffda62ff, 0xffda62d9,
137 0xffda62b3, 0xffda628d, 0xffda6267, 0xffdb6241, 0xffda621c, 0xffdb61f6, 0xffda61d1, 0xffdc61ab,
138 0xffd96187, 0xffdc6160, 0xffdb613c, 0xffdb6117, 0xffdb60f2, 0xffdc60cd, 0xffdc60a9, 0xffdb6085,
139 0xffdc6060, 0xffdc603c, 0xffdc6018, 0xffdc5ff4, 0xffdc5fd0, 0xffdd5fac, 0xffdc5f89, 0xffdc5f65,
140 0xffdd5f41, 0xffdd5f1e, 0xffdd5efb, 0xffdd5ed8, 0xffdd5eb5, 0xffdd5e92, 0xffdd5e6f, 0xffdd5e4c,
141 0xffdd5e29, 0xffde5e06, 0xffde5de4, 0xffdd5dc2, 0xffde5d9f, 0xffde5d7d, 0xffde5d5b, 0xffde5d39,
142 0xffdf5d17, 0xffde5cf6, 0xffde5cd4, 0xffdf5cb2, 0xffdf5c91, 0xffde5c70, 0xffdf5c4e, 0xffdf5c2d,
143 0xffde5c0c, 0xffe05bea, 0xffdf5bca, 0xffdf5ba9, 0xffdf5b88, 0xffdf5b67, 0xffe05b46, 0xffe05b26,
144 0xffdf5b06, 0xffe05ae5, 0xffe05ac5, 0xffe05aa5, 0xffe05a85, 0xffe05a65, 0xffe05a45, 0xffe15a25,
145 0xffe05a06, 0xffe059e6, 0xffe159c6, 0xffe159a7, 0xffe05988, 0xffe15968, 0xffe15949, 0xffe1592a,
146 0xffe1590b, 0xffe158ec, 0xffe258cd, 0xffe158af, 0xffe15890, 0xffe25871, 0xffe15853, 0xffe25834,
147 0xffe25816, 0xffe257f8, 0xffe157da, 0xffe257bb, 0xffe3579d, 0xffe25780, 0xffe25762, 0xffe25744,
148 0xffe35726, 0xffe25709, 0xffe256eb, 0xffe356cd, 0xffe356b0, 0xffe35693, 0xffe25676, 0xffe35658,
149 0xffe3563b, 0xffe3561e, 0xffe35601, 0xffe355e4, 0xffe455c7, 0xffe355ab, 0xffe4558e, 0xffe35572,
150 0xffe45555, 0xffe35539, 0xffe4551c, 0xffe45500, 0xffe454e4, 0xffe454c8, 0xffe454ac, 0xffe45490,
151 0xffe45474, 0xffe55458, 0xffe4543d, 0xffe45421, 0xffe55405, 0xffe553ea, 0xffe453cf, 0xffe553b3,
152 0xffe45398, 0xffe5537c, 0xffe55361, 0xffe55346, 0xffe5532b, 0xffe55310, 0xffe552f5, 0xffe552da,
153 0xffe652bf, 0xffe552a5, 0xffe5528a, 0xffe6526f, 0xffe55255, 0xffe6523a, 0xffe65220, 0xffe55206,
154 0xffe651eb, 0xffe651d1, 0xffe651b7, 0xffe6519d, 0xffe65183, 0xffe65169, 0xffe7514f, 0xffe65136,
155 0xffe6511c, 0xffe75102, 0xffe650e9, 0xffe750cf, 0xffe650b6, 0xffe7509c, 0xffe75083, 0xffe6506a,
156 0xffe75050, 0xffe75037, 0xffe7501e, 0xffe75005, 0xffe74fec, 0xffe74fd3, 0xffe74fba, 0xffe74fa1,
157 0xffe84f88, 0xffe74f70, 0xffe84f57, 0xffe74f3f, 0xffe84f26, 0xffe74f0e, 0xffe84ef5, 0xffe84edd,
158 0xffe84ec5, 0xffe84ead, 0xffe74e95, 0xffe84e7c, 0xffe84e64, 0xffe94e4c, 0xffe84e35, 0xffe84e1d,
159 0xffe84e05, 0xffe94ded, 0xffe84dd6, 0xffe84dbe, 0xffe94da6, 0xffe94d8f, 0xffe84d78, 0xffe84d60,
160 0xffea4d48, 0xffe84d32, 0xffe94d1a, 0xffe94d03, 0xffe84cec, 0xffe94cd4, 0xffe94cbd, 0xffea4ca6,
161 0xffe94c90, 0xffe84c79, 0xffea4c61, 0xffe94c4b, 0xffe94c34, 0xffea4c1d, 0xffe94c07, 0xffea4bf0,
162 0xffe94bda, 0xffea4bc3, 0xffea4bad, 0xffe94b97, 0xffea4b80, 0xffea4b6a, 0xffea4b54, 0xffea4b3e,
163 0xffea4b28, 0xffea4b12, 0xffea4afc, 0xffea4ae6, 0xffea4ad0, 0xffeb4aba, 0xffea4aa5, 0xffea4a8f,
164 0xffeb4a79, 0xffea4a64, 0xffea4a4e, 0xffeb4a38, 0xffeb4a23, 0xffea4a0e, 0xffeb49f8, 0xffea49e3,
165 0xffeb49cd, 0xffeb49b8, 0xffeb49a3, 0xffeb498e, 0xffea4979, 0xffeb4963, 0xffeb494e, 0xffec4939,
166 0xffeb4925, 0xffea4910, 0xffec48fa, 0xffeb48e6, 0xffeb48d1, 0xffec48bc, 0xffeb48a8, 0xffec4893,
167 0xffeb487f, 0xffec486a, 0xffeb4856, 0xffec4841, 0xffec482d, 0xffeb4819, 0xffec4804, 0xffec47f0,
168 0xffec47dc, 0xffec47c8, 0xffec47b4, 0xffec47a0, 0xffec478c, 0xffec4778, 0xffec4764, 0xffec4750,
169 0xffec473c, 0xffed4728, 0xffec4715, 0xffec4701, 0xffed46ed, 0xffec46da, 0xffed46c6, 0xffec46b3,
170 0xffec469f, 0xffed468b, 0xffed4678, 0xffec4665, 0xffed4651, 0xffed463e, 0xffed462b, 0xffec4618,
171 0xffed4604, 0xffed45f1, 0xffed45de, 0xffed45cb, 0xffed45b8, 0xffed45a5, 0xffed4592, 0xffed457f,
172 0xffee456c, 0xffed455a, 0xffed4547, 0xffed4534, 0xffee4521, 0xffed450f, 0xffed44fc, 0xffee44e9,
173 0xffed44d7, 0xffee44c4, 0xffee44b2, 0xffed44a0, 0xffee448d, 0xffee447b, 0xffed4469, 0xffee4456,
174 0xffee4444, 0xffee4432, 0xffee4420, 0xffee440e, 0xffee43fc, 0xffee43ea, 0xffee43d8, 0xffee43c6,
175 0xffee43b4, 0xffee43a2, 0xffee4390, 0xffef437e, 0xffee436d, 0xffee435b, 0xffef4349, 0xffee4338,
176 0xffee4326, 0xffef4314, 0xffee4303, 0xffef42f1, 0xffee42e0, 0xffef42ce, 0xffee42bd, 0xffef42ab,
177 0xffef429a, 0xffee4289, 0xfff04277, 0xffee4267, 0xffef4255, 0xffef4244, 0xffef4233, 0xffef4222,
178 0xffee4211, 0xffef41ff, 0xfff041ee, 0xffef41de, 0xffef41cd, 0xffee41bc, 0xfff041aa, 0xffef419a,
179 0xffef4189, 0xffef4178, 0xfff04167, 0xffef4157, 0xffef4146, 0xfff04135, 0xffef4125, 0xfff04114,
180 0xffef4104, 0xfff040f3, 0xffef40e3, 0xfff040d2, 0xfff040c2, 0xffef40b2, 0xfff040a1, 0xfff04091,
181 0xfff04081, 0xffef4071, 0xfff04060, 0xfff04050, 0xfff04040, 0xfff04030, 0xfff04020, 0xfff04010
182 ]
183 # fmt: on
184
185 def __init__(self, op):
186 self.op = op
187
def generate_exp_table(self, beta, input_scale):
    """Build the 256-entry exp() lookup table used by the 8-bit softmax graph.

    Entry ``x`` is computed from the input difference ``x - 255`` (so the
    differences run from -255 to 0). Differences below the representable
    minimum map to 0; all others are rescaled with the quantised
    ``beta * input_scale`` multiplier and passed through
    ``fp_math.exp_on_negative_values``.

    Args:
        beta: softmax beta attribute.
        input_scale: scale of the quantised input tensor.

    Returns:
        A list of 256 table entries.
    """
    int_bits = 5
    signed_bits = 31

    # Fold beta and the input scale into a single multiplier, capped at 2^31 - 1
    scaled_beta = np.double(beta) * np.double(input_scale) * (1 << (31 - int_bits))
    real_beta = min(scaled_beta, np.double((1 << 31) - 1.0))
    multiplier, left_shift = scaling.quantise_scale(real_beta)
    left_shift = 31 - left_shift

    # Smallest input difference that can still be rescaled; anything below yields 0
    numerator = 1.0 * ((1 << int_bits) - 1) * (1 << (signed_bits - int_bits))
    diff_min = -1.0 * math.floor(numerator / (1 << left_shift))

    def table_entry(diff):
        # Compute one LUT entry for a single (non-positive) input difference
        if diff >= diff_min:
            rescaled = fp_math.saturating_rounding_mul(diff * (1 << left_shift), multiplier)
            return fp_math.exp_on_negative_values(rescaled)
        return 0

    return [table_entry(index - 255) for index in range(256)]
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200210
def get_graph(self):
    """Build the replacement graph for the wrapped operation.

    Dispatches on the ifm/ofm data types: matching uint8/int8 tensors go to
    get_graph_8bit(), int16 tensors go to get_graph_int16(). For any other
    combination the operation is flagged with ``run_on_npu = False`` and
    returned unchanged.

    Returns:
        The operation returned by the selected graph builder, or ``self.op``
        when the data types are not handled.
    """
    op = self.op
    ifm, ofm = op.inputs[0], op.outputs[0]

    # Reshape ifm/ofm (if needed): fold dimension 0 into dimension 1
    # NOTE(review): dimension 0 is presumably the batch - confirm against as_list()
    shape = op.ifm_shapes[0].as_list()
    leading = shape[0]
    if leading > 1:
        shape[1] *= leading
        shape[0] = 1
        ifm = create_reshape_tensor(ifm, shape)
        ofm = create_reshape_tensor(ofm, shape, False)

    if ifm.dtype in (DataType.uint8, DataType.int8) and ofm.dtype == ifm.dtype:
        return self.get_graph_8bit(ifm, ofm)

    if ifm.dtype == DataType.int16 and ofm.dtype == DataType.int16:
        return self.get_graph_int16(ifm, ofm)

    # No graph builder for this data type combination
    op.run_on_npu = False
    return op
230
def get_graph_8bit(self, ifm, ofm):
    """Build the sequence of operations (PASS 0 - 30) for 8-bit softmax.

    Every created operation is registered with DebugDatabase against the
    original operation, and a running pass counter is appended to the
    generated operation names. Note that the zero point of
    ``ifm.quantization`` is set to 0 as a side effect.

    Args:
        ifm: input tensor (uint8 or int8 according to get_graph()).
        ofm: output tensor that the final SHR operation writes to.

    Returns:
        The final SHR operation producing ``ofm``.
    """
    exp_table = self.generate_exp_table(self.op.attrs.get("beta", 1.0), ifm.quantization.scale_f32)

    # Quantisation without any scale and with zero point 0
    quant_none = ifm.quantization.clone()
    quant_none.scale_f32 = None
    quant_none.zero_point = 0

    # Clip to the ifm quantised range, and a second clip with twice that range
    clip = ActivationFunction(Op.Clip)
    clip.min = ifm.quantization.quant_min
    clip.max = ifm.quantization.quant_max
    clip_x2 = clip.clone()
    clip_x2.min = 2 * ifm.quantization.quant_min
    clip_x2.max = 2 * ifm.quantization.quant_max

    # Quantisations with scale 1.0 and 2.0, both with zero point 0
    quant_one = ifm.quantization.clone()
    quant_one.scale_f32 = 1.0
    quant_one.zero_point = 0
    quant_two = quant_one.clone()
    quant_two.scale_f32 = 2.0

    ifm.quantization.zero_point = 0
    pass_idx = 0

    def emit(new_op):
        # Register the op in the debug database, advance the pass counter, return the op's ofm
        nonlocal pass_idx
        DebugDatabase.add_optimised(self.op, new_op)
        pass_idx += 1
        return new_op.ofm

    def pass_name(kind):
        # Name for the op of the current pass: <op name>_<kind><pass number>
        return f"{self.op.name}_{kind}{pass_idx}"

    def scalar_i32(name, value, quant):
        # Single-element int32 constant tensor of shape [1, 1, 1, 1]
        return create_const_tensor(name, [1, 1, 1, 1], DataType.int32, [value], np.int32, quantization=quant)

    # PASS 0 - Depthwise Maxpool
    ifm_max = emit(create_depthwise_maxpool(pass_name("maxpool"), ifm, quant_none))

    # PASS 1 - Sub+LUT(exp)
    sub_quant = quant_one.clone()
    sub_quant.zero_point = 127
    ifm_max = create_reshape_tensor(ifm_max, [1, ifm.shape[1], ifm.shape[2], 1])
    sub_op = create_sub(pass_name("sub"), ifm, ifm_max, sub_quant, dtype=DataType.int32)
    exp_lut_tens = create_const_tensor(
        f"{sub_op.name}_exp_lut", [1, 1, 1, 256], DataType.int32, exp_table, np.int32, TensorPurpose.LUT
    )
    sub_op.set_activation_lut(exp_lut_tens)
    ifm_exp = emit(sub_op)
    # Note: activation.min/max are non-quantized values
    sub_op.activation.min = -128 - ifm_exp.quantization.zero_point
    sub_op.activation.max = 127 - ifm_exp.quantization.zero_point

    # PASS 2 - SHR
    shr_name = pass_name("shr")
    shift = scalar_i32(f"{shr_name}_const", 12, quant_none)
    shr_op = create_shr(
        shr_name, ifm_exp, shift, quant_none, clip, attrs={"rounding_mode": NpuRoundingMode.NATURAL}
    )
    rescaled_exp = emit(shr_op)

    # PASS 3 - Reduce sum
    sum_of_exp = emit(create_reduce_sum(pass_name("reduce_sum"), rescaled_exp, quant_none, clip))

    # PASS 4 - CLZ
    headroom_plus_one = emit(create_clz(pass_name("clz"), sum_of_exp, quant_none, clip))

    # PASS 5 - Sub
    headroom_offset = scalar_i32("headroom_offset_const", 12 + 31 - 8, quant_none)
    right_shift = emit(create_sub(pass_name("sub"), headroom_offset, headroom_plus_one, quant_none, clip))

    # PASS 6 - Sub
    one = scalar_i32("one_const", 1, quant_none)
    headroom = emit(create_sub(pass_name("sub"), headroom_plus_one, one, quant_none, clip))

    # PASS 7 - SHL
    shifted_sum = emit(create_shl(pass_name("shl"), sum_of_exp, headroom, quant_none, clip))

    # PASS 8 - Sub
    shifted_one = scalar_i32("shifted_one_const", 1 << 30, quant_none)
    shifted_sum_minus_one = emit(create_sub(pass_name("sub"), shifted_sum, shifted_one, quant_none, clip))

    # PASS 9 - SHL
    shifted_sum_minus_one = emit(create_shl(pass_name("shl"), shifted_sum_minus_one, one, quant_none, clip))

    # PASS 10 - Add
    f0_one_const = scalar_i32("F0_one_const", (1 << 31) - 1, quant_none)
    add_op = create_add(
        pass_name("add"), f0_one_const, shifted_sum_minus_one, quant_one, clip, attrs={"rescale": (1, 1)}
    )
    half_denominator = emit(add_op)

    # PASS 11 - Multiply
    neg_32_over_17 = scalar_i32("neg_32_over_17_const", -1010580540, quant_one)
    rescaled = emit(create_mul(pass_name("mul"), half_denominator, neg_32_over_17, quant_two, clip_x2))

    # PASS 12 - Add
    const_48_over_17 = scalar_i32("48_over_17_const", 1515870810, quant_none)
    rescale_w_offset = emit(create_add(pass_name("add"), rescaled, const_48_over_17, quant_one, clip))

    # PASS 13 - 27
    # NOTE(review): presumably three Newton-Raphson refinement steps of the reciprocal - confirm
    nr_x = rescale_w_offset
    F2_one = scalar_i32("F2_one_const", (1 << 29), quant_none)
    four = scalar_i32("four_const", 4, quant_none)
    for _ in range(3):
        # PASS 13, 18, 23 - MUL
        half_denominator_times_x = emit(create_mul(pass_name("mul"), nr_x, half_denominator, quant_two, clip_x2))
        # PASS 14, 19, 24 - SUB
        one_minus_half_denominator_times_x = emit(
            create_sub(pass_name("sub"), F2_one, half_denominator_times_x, quant_one, clip)
        )
        # PASS 15, 20, 25 - MUL
        to_rescale = emit(
            create_mul(pass_name("mul"), nr_x, one_minus_half_denominator_times_x, quant_two, clip_x2)
        )
        # PASS 16, 21, 26 - MUL
        to_add = emit(create_mul(pass_name("mul"), to_rescale, four, quant_none, clip))
        # PASS 17, 22, 27 - ADD
        nr_x = emit(create_add(pass_name("add"), nr_x, to_add, quant_one, clip))

    # PASS 28 - Multiply
    two = scalar_i32("two_const", 2, quant_none)
    scale_factor = emit(create_mul(pass_name("mul"), nr_x, two, quant_one, clip))

    # PASS 29 - Multiply
    scaled_exp = emit(create_mul(pass_name("mul"), ifm_exp, scale_factor, quant_two, clip_x2))

    # PASS 30 - SHR
    # Built by hand so that the result is written directly to the supplied ofm
    final_shr = Operation(Op.SHR, pass_name("shr"))
    final_shr.attrs["rounding_mode"] = NpuRoundingMode.NATURAL
    final_shr.add_input_tensor(scaled_exp)
    final_shr.add_input_tensor(right_shift)
    final_shr.set_output_tensor(ofm)
    final_shr.set_ifm_ofm_shapes()
    DebugDatabase.add_optimised(self.op, final_shr)

    return final_shr
421
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200422 def get_graph_int16(self, ifm, ofm):
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200423 no_scale_quant = ifm.quantization.clone()
424 no_scale_quant.scale_f32 = None
Fredrik Svedberg32c7f5b2020-12-02 09:24:29 +0100425 pass_number = 0
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200426
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100427 def add_op_get_ofm(op):
428 DebugDatabase.add_optimised(self.op, op)
429 nonlocal pass_number
430 pass_number += 1
431 return op.ofm
432
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200433 # PASS 0 - Depthwise Maxpool
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100434 ifm_max = add_op_get_ofm(create_depthwise_maxpool(f"{self.op.name}_maxpool{pass_number}", ifm, no_scale_quant))
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200435
436 # PASS 1 - Sub
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100437 ifm_max = create_reshape_tensor(ifm_max, [1, ifm.shape[1], ifm.shape[2], 1])
438 sub1_ofm = add_op_get_ofm(
439 create_sub(f"{self.op.name}_sub{pass_number}", ifm, ifm_max, ifm.quantization.clone(), dtype=DataType.int32)
440 )
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200441
442 # PASS 2 - Mul
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100443 name = f"{self.op.name}_mul{pass_number}"
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200444 beta = self.op.attrs.get("beta", 1.0)
445 mul2_out_range = 10.0 / 65535.0
446 mul2_scale, _ = scaling.elementwise_mul_scale(sub1_ofm.quantization.scale_f32, beta, mul2_out_range)
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100447 scale_quant = ifm.quantization.clone()
448 scale_quant.scale_f32 = beta
449 mul2_quant = ofm.quantization.clone()
450 mul2_quant.scale_f32 = mul2_out_range
451 scale = create_const_tensor(
452 f"{name}_scale_const", [1, 1, 1, 1], DataType.int32, [mul2_scale], np.int32, quantization=scale_quant
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200453 )
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100454 mul2_ofm = add_op_get_ofm(create_mul(name, sub1_ofm, scale, mul2_quant))
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200455
456 # PASS 3 - Add+LUT(exp)
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100457 name = f"{self.op.name}_add{pass_number}"
458 const_add = create_const_tensor(
459 f"{name}_const", [1, 1, 1, 1], DataType.int32, [32767], np.int32, quantization=no_scale_quant
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200460 )
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100461 add_op = create_add(name, mul2_ofm, const_add, mul2_ofm.quantization.clone(), dtype=DataType.int16)
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200462 add_op.set_activation_lut(
Michael McGeagh5778ffd2020-08-06 17:31:02 +0100463 create_const_tensor(
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100464 f"{name}_exp_lut", [1, 1, 1, 512], DataType.int32, self.EXP_LUT, np.int32, TensorPurpose.LUT
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200465 )
466 )
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100467 ifm_exp = add_op_get_ofm(add_op)
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200468
469 # PASS 4 - Reduce sum
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100470 sum_of_exp = add_op_get_ofm(
471 create_reduce_sum(f"{self.op.name}_reduce_sum{pass_number}", ifm_exp, no_scale_quant)
472 )
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200473
474 # PASS 5 - CLZ
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100475 headroom_plus_one = add_op_get_ofm(create_clz(f"{self.op.name}_clz{pass_number}", sum_of_exp, no_scale_quant))
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200476
477 # PASS 6 - Sub
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100478 name = f"{self.op.name}_sub{pass_number}"
479 const_31 = create_const_tensor(
480 f"{name}_const", [1, 1, 1, 1], DataType.int32, [31], np.int32, quantization=no_scale_quant
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200481 )
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100482 reciprocal_right_shift = add_op_get_ofm(create_sub(name, const_31, headroom_plus_one, no_scale_quant))
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200483
484 # PASS 7 - SHL
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100485 one = create_const_tensor(
486 f"one_const", [1, 1, 1, 1], DataType.int32, [1], np.int32, quantization=no_scale_quant
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200487 )
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100488 constant_one = add_op_get_ofm(
489 create_shl(f"{self.op.name}_shl{pass_number}", one, reciprocal_right_shift, no_scale_quant)
490 )
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200491
492 # PASS 8 - Sub
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100493 sum_of_exps_minus_one = add_op_get_ofm(
494 create_sub(f"{self.op.name}_sub{pass_number}", sum_of_exp, constant_one, no_scale_quant)
495 )
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200496
497 # PASS 9 - SHL
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100498 shifted_sum_minus_one = add_op_get_ofm(
499 create_shl(f"{self.op.name}_shl{pass_number}", sum_of_exps_minus_one, headroom_plus_one, no_scale_quant)
500 )
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200501
502 # PASS 10 - SHR
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100503 name = f"{self.op.name}_shr{pass_number}"
504 shift = create_const_tensor(
505 f"{name}_const", [1, 1, 1, 1], DataType.int32, [15], np.int32, quantization=no_scale_quant
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200506 )
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100507 shifted_sum_minus_one_16 = add_op_get_ofm(create_shr(name, shifted_sum_minus_one, shift, no_scale_quant))
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200508
509 # PASS 11 - Sub+LUT(one over one plus x)
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100510 name = f"{self.op.name}_sub{pass_number}"
511 sub11_const = create_const_tensor(
512 f"{name}_const", [1, 1, 1, 1], DataType.int32, [32768], np.int32, quantization=no_scale_quant
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200513 )
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100514 sub11_op = create_sub(name, shifted_sum_minus_one_16, sub11_const, no_scale_quant, dtype=DataType.int16)
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200515 sub11_op.set_activation_lut(
Michael McGeagh5778ffd2020-08-06 17:31:02 +0100516 create_const_tensor(
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100517 f"{name}_one_over_one_plus_x_lut",
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200518 [1, 1, 1, 512],
519 DataType.int32,
520 self.ONE_OVER_ONE_PLUS_X_LUT,
Fredrik Svedberg5b513882020-12-11 13:42:22 +0100521 np.uint32,
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200522 TensorPurpose.LUT,
523 )
524 )
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100525 reciprocal_scale = add_op_get_ofm(sub11_op)
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200526
527 # PASS 12 - Multiply
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100528 mul_ofm = add_op_get_ofm(
529 create_mul(
530 f"{self.op.name}_mul{pass_number}", ifm_exp, reciprocal_scale, no_scale_quant, dtype=DataType.int32
531 )
532 )
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200533
534 # PASS 13 - SHR
Fredrik Svedberg32c7f5b2020-12-02 09:24:29 +0100535 shr13_op = Operation(Op.SHR, f"{self.op.name}_shr{pass_number}")
Michael McGeagh5778ffd2020-08-06 17:31:02 +0100536 shr13_op.add_input_tensor(mul_ofm)
537 shr13_op.add_input_tensor(reciprocal_right_shift)
538 shr13_op.set_output_tensor(ofm)
patrik.gustavssoneeb85152020-12-21 17:10:40 +0000539 shr13_op.set_ifm_ofm_shapes()
Tim Halle6ccd872020-11-09 16:46:37 +0000540 DebugDatabase.add_optimised(self.op, shr13_op)
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200541
542 return shr13_op