blob: 8a1770e1e6830a68bcce9892b5d0a29f6de9cd4f [file] [log] [blame]
# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# Contains SoftMax
Fredrik Svedberg1575b942020-08-18 13:19:18 +020021import math
22
Fredrik Svedberga0c36242020-06-03 15:43:31 +020023import numpy as np
24
Fredrik Svedberg1575b942020-08-18 13:19:18 +020025from . import fp_math
Fredrik Svedberga0c36242020-06-03 15:43:31 +020026from . import scaling
Louis Verhaarde8a5a782020-11-02 18:04:27 +010027from .api import NpuRoundingMode
Fredrik Svedberga0c36242020-06-03 15:43:31 +020028from .data_type import DataType
Tim Halle6ccd872020-11-09 16:46:37 +000029from .debug_database import DebugDatabase
Louis Verhaarde8a5a782020-11-02 18:04:27 +010030from .operation import ActivationFunction
Louis Verhaardaee5d752020-09-30 09:01:52 +020031from .operation import Op
Fredrik Svedberga0c36242020-06-03 15:43:31 +020032from .operation import Operation
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +010033from .operation_util import create_add
34from .operation_util import create_clz
35from .operation_util import create_depthwise_maxpool
36from .operation_util import create_mul
37from .operation_util import create_reduce_sum
Fredrik Svedberge82be7c2021-01-18 15:21:03 +010038from .operation_util import create_rescale_add
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +010039from .operation_util import create_shl
40from .operation_util import create_shr
41from .operation_util import create_sub
Michael McGeagh5778ffd2020-08-06 17:31:02 +010042from .tensor import create_const_tensor
43from .tensor import create_reshape_tensor
Fredrik Svedberga0c36242020-06-03 15:43:31 +020044from .tensor import TensorPurpose
45
46
Fredrik Svedberga0c36242020-06-03 15:43:31 +020047class SoftMax:
48 # Turn off black formatting for the LUT tables to keep them compact
49 # fmt: off
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +020050
Fredrik Svedberga0c36242020-06-03 15:43:31 +020051 EXP_LUT = [
52 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002,
53 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002,
54 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002,
55 0x00000002, 0x00000002, 0x00010002, 0x00000003, 0x00000003, 0x00000003, 0x00000003, 0x00000003,
56 0x00000003, 0x00000003, 0x00000003, 0x00000003, 0x00000003, 0x00000003, 0x00000003, 0x00000003,
57 0x00000003, 0x00000003, 0x00000003, 0x00010003, 0x00000004, 0x00000004, 0x00000004, 0x00000004,
58 0x00000004, 0x00000004, 0x00000004, 0x00000004, 0x00000004, 0x00000004, 0x00000004, 0x00000004,
59 0x00010004, 0x00000005, 0x00000005, 0x00000005, 0x00000005, 0x00000005, 0x00000005, 0x00000005,
60 0x00000005, 0x00000005, 0x00010005, 0x00000006, 0x00000006, 0x00000006, 0x00000006, 0x00000006,
61 0x00000006, 0x00000006, 0x00010006, 0x00000007, 0x00000007, 0x00000007, 0x00000007, 0x00000007,
62 0x00000007, 0x00000007, 0x00010007, 0x00000008, 0x00000008, 0x00000008, 0x00000008, 0x00000008,
63 0x00010008, 0x00000009, 0x00000009, 0x00000009, 0x00000009, 0x00000009, 0x00010009, 0x0000000a,
64 0x0000000a, 0x0000000a, 0x0000000a, 0x0001000a, 0x0000000b, 0x0000000b, 0x0000000b, 0x0000000b,
65 0x0001000b, 0x0000000c, 0x0000000c, 0x0000000c, 0x0001000c, 0x0000000d, 0x0000000d, 0x0000000d,
66 0x0001000d, 0x0000000e, 0x0000000e, 0x0000000e, 0x0001000e, 0x0000000f, 0x0000000f, 0x0001000f,
67 0x00000010, 0x00000010, 0x00010010, 0x00000011, 0x00000011, 0x00010011, 0x00000012, 0x00000012,
68 0x00010012, 0x00000013, 0x00000013, 0x00010013, 0x00000014, 0x00010014, 0x00000015, 0x00000015,
69 0x00010015, 0x00000016, 0x00010016, 0x00000017, 0x00010017, 0x00000018, 0x00010018, 0x00000019,
70 0x00010019, 0x0000001a, 0x0001001a, 0x0000001b, 0x0001001b, 0x0000001c, 0x0001001c, 0x0000001d,
71 0x0001001d, 0x0000001e, 0x0001001e, 0x0001001f, 0x00000020, 0x00010020, 0x00010021, 0x00000022,
72 0x00010022, 0x00010023, 0x00000024, 0x00010024, 0x00000025, 0x00010025, 0x00010026, 0x00010027,
73 0x00000028, 0x00020028, 0x0000002a, 0x0001002a, 0x0001002b, 0x0001002c, 0x0000002d, 0x0001002d,
74 0x0001002e, 0x0001002f, 0x00010030, 0x00010031, 0x00010032, 0x00010033, 0x00010034, 0x00010035,
75 0x00010036, 0x00010037, 0x00010038, 0x00020039, 0x0001003b, 0x0000003c, 0x0002003c, 0x0001003e,
76 0x0002003f, 0x00000041, 0x00020041, 0x00010043, 0x00010044, 0x00020045, 0x00020047, 0x00010049,
77 0x0001004a, 0x0002004b, 0x0001004d, 0x0002004e, 0x00010050, 0x00020051, 0x00020053, 0x00010055,
78 0x00020056, 0x00020058, 0x0002005a, 0x0001005c, 0x0002005d, 0x0002005f, 0x00020061, 0x00020063,
79 0x00020065, 0x00020067, 0x00020069, 0x0002006b, 0x0003006d, 0x00020070, 0x00020072, 0x00020074,
80 0x00030076, 0x00020079, 0x0003007b, 0x0002007e, 0x00030080, 0x00020083, 0x00020085, 0x00040087,
81 0x0002008b, 0x0003008d, 0x00030090, 0x00020093, 0x00030095, 0x00030098, 0x0003009b, 0x0004009e,
82 0x000300a2, 0x000300a5, 0x000300a8, 0x000300ab, 0x000400ae, 0x000300b2, 0x000400b5, 0x000400b9,
83 0x000300bd, 0x000400c0, 0x000400c4, 0x000400c8, 0x000400cc, 0x000400d0, 0x000500d4, 0x000400d9,
84 0x000400dd, 0x000500e1, 0x000400e6, 0x000500ea, 0x000400ef, 0x000500f3, 0x000500f8, 0x000500fd,
85 0x00050102, 0x00050107, 0x0005010c, 0x00060111, 0x00050117, 0x0006011c, 0x00060122, 0x00060128,
86 0x0006012e, 0x00060134, 0x0006013a, 0x00070140, 0x00060147, 0x0007014d, 0x00060154, 0x0007015a,
87 0x00070161, 0x00060168, 0x0008016e, 0x00070176, 0x0008017d, 0x00080185, 0x0007018d, 0x00090194,
88 0x0008019d, 0x000801a5, 0x000801ad, 0x000901b5, 0x000901be, 0x000901c7, 0x000901d0, 0x000901d9,
89 0x000a01e2, 0x000901ec, 0x000a01f5, 0x000b01ff, 0x000a020a, 0x000b0214, 0x000a021f, 0x000b0229,
90 0x000b0234, 0x000b023f, 0x000c024a, 0x000c0256, 0x000c0262, 0x000c026e, 0x000c027a, 0x000d0286,
91 0x000d0293, 0x000d02a0, 0x000e02ad, 0x000e02bb, 0x000e02c9, 0x000e02d7, 0x000f02e5, 0x000f02f4,
92 0x000f0303, 0x000f0312, 0x00100321, 0x00100331, 0x00110341, 0x00100352, 0x00120362, 0x00110374,
93 0x00120385, 0x00120397, 0x001203a9, 0x001303bb, 0x001303ce, 0x001403e1, 0x001403f5, 0x00140409,
94 0x0015041d, 0x00150432, 0x00160447, 0x0016045d, 0x00160473, 0x00170489, 0x001704a0, 0x001904b7,
95 0x001804d0, 0x001904e8, 0x00190501, 0x001a051a, 0x001a0534, 0x001b054e, 0x001b0569, 0x001c0584,
96 0x001c05a0, 0x001d05bc, 0x001e05d9, 0x001e05f7, 0x001e0615, 0x00200633, 0x00200653, 0x00200673,
97 0x00210693, 0x002206b4, 0x002306d6, 0x002306f9, 0x0024071c, 0x00240740, 0x00260764, 0x0026078a,
98 0x002607b0, 0x002807d6, 0x002907fe, 0x00290827, 0x002a0850, 0x002a087a, 0x002c08a4, 0x002c08d0,
99 0x002e08fc, 0x002e092a, 0x002f0958, 0x00310987, 0x003109b8, 0x003209e9, 0x00330a1b, 0x00340a4e,
100 0x00350a82, 0x00350ab7, 0x00380aec, 0x00380b24, 0x003a0b5c, 0x003a0b96, 0x003c0bd0, 0x003d0c0c,
101 0x003e0c49, 0x003f0c87, 0x00400cc6, 0x00420d06, 0x00430d48, 0x00440d8b, 0x00460dcf, 0x00480e15,
102 0x00480e5d, 0x00490ea5, 0x004c0eee, 0x004d0f3a, 0x004e0f87, 0x00500fd5, 0x00511025, 0x00531076,
103 0x005610c9, 0x0056111f, 0x00581175, 0x005a11cd, 0x005c1227, 0x005e1283, 0x005e12e1, 0x0061133f,
104 0x006413a0, 0x00651404, 0x00671469, 0x006914d0, 0x006c1539, 0x006c15a5, 0x00701611, 0x00721681,
105 0x007416f3, 0x00761767, 0x007917dd, 0x007a1856, 0x007d18d0, 0x0080194d, 0x008319cd, 0x00841a50,
106 0x00881ad4, 0x00891b5c, 0x008d1be5, 0x00911c72, 0x00911d03, 0x00961d94, 0x00981e2a, 0x009c1ec2,
107 0x009e1f5e, 0x00a21ffc, 0x00a4209e, 0x00a92142, 0x00ab21eb, 0x00ae2296, 0x00b22344, 0x00b523f6,
108 0x00b924ab, 0x00be2564, 0x00c02622, 0x00c526e2, 0x00c827a7, 0x00cc286f, 0x00d0293b, 0x00d52a0b,
109 0x00d72ae0, 0x00dd2bb7, 0x00e12c94, 0x00e62d75, 0x00eb2e5b, 0x00ef2f46, 0x00f23035, 0x00f83127,
110 0x00fe321f, 0x0101331d, 0x0108341e, 0x010c3526, 0x01123632, 0x01173744, 0x011c385b, 0x01233977,
111 0x01273a9a, 0x012e3bc1, 0x01343cef, 0x013a3e23, 0x01403f5d, 0x0146409d, 0x014c41e3, 0x0154432f,
112 0x01594483, 0x016145dc, 0x0168473d, 0x016f48a5, 0x01764a14, 0x017d4b8a, 0x01854d07, 0x018d4e8c,
113 0x01945019, 0x019d51ad, 0x01a4534a, 0x01ad54ee, 0x01b5569b, 0x01be5850, 0x01c75a0e, 0x01d05bd5,
114 0x01d85da5, 0x01e35f7d, 0x01eb6160, 0x01f6634b, 0x01ff6541, 0x02096740, 0x02146949, 0x021e6b5d,
115 0x02296d7b, 0x02336fa4, 0x023f71d7, 0x024a7416, 0x02567660, 0x026278b6, 0x026d7b18, 0x027a7d85,
116 ]
117
118 ONE_OVER_ONE_PLUS_X_LUT = [
119 0xffc17fff, 0xffc07fc0, 0xffc27f80, 0xffc07f42, 0xffc17f02, 0xffc17ec3, 0xffc27e84, 0xffc27e46,
120 0xffc27e08, 0xffc37dca, 0xffc27d8d, 0xffc37d4f, 0xffc37d12, 0xffc37cd5, 0xffc37c98, 0xffc47c5b,
121 0xffc47c1f, 0xffc47be3, 0xffc57ba7, 0xffc57b6c, 0xffc37b31, 0xffc67af4, 0xffc57aba, 0xffc67a7f,
122 0xffc57a45, 0xffc67a0a, 0xffc779d0, 0xffc67997, 0xffc6795d, 0xffc77923, 0xffc778ea, 0xffc778b1,
123 0xffc87878, 0xffc77840, 0xffc87807, 0xffc877cf, 0xffc97797, 0xffc87760, 0xffc97728, 0xffc976f1,
124 0xffc976ba, 0xffc87683, 0xffca764b, 0xffca7615, 0xffca75df, 0xffca75a9, 0xffca7573, 0xffcb753d,
125 0xffca7508, 0xffcb74d2, 0xffcb749d, 0xffca7468, 0xffcc7432, 0xffcc73fe, 0xffcb73ca, 0xffcc7395,
126 0xffcd7361, 0xffcc732e, 0xffcc72fa, 0xffcd72c6, 0xffcd7293, 0xffcd7260, 0xffcc722d, 0xffce71f9,
127 0xffcd71c7, 0xffce7194, 0xffce7162, 0xffce7130, 0xffcf70fe, 0xffce70cd, 0xffce709b, 0xffcf7069,
128 0xffcf7038, 0xffcf7007, 0xffcf6fd6, 0xffcf6fa5, 0xffd06f74, 0xffd06f44, 0xffd06f14, 0xffd06ee4,
129 0xffd06eb4, 0xffd06e84, 0xffd16e54, 0xffd16e25, 0xffd16df6, 0xffd16dc7, 0xffd06d98, 0xffd26d68,
130 0xffd16d3a, 0xffd26d0b, 0xffd26cdd, 0xffd26caf, 0xffd26c81, 0xffd26c53, 0xffd36c25, 0xffd26bf8,
131 0xffd36bca, 0xffd36b9d, 0xffd36b70, 0xffd26b43, 0xffd46b15, 0xffd36ae9, 0xffd46abc, 0xffd46a90,
132 0xffd46a64, 0xffd46a38, 0xffd46a0c, 0xffd469e0, 0xffd469b4, 0xffd56988, 0xffd5695d, 0xffd56932,
133 0xffd56907, 0xffd568dc, 0xffd568b1, 0xffd56886, 0xffd6685b, 0xffd56831, 0xffd66806, 0xffd667dc,
134 0xffd667b2, 0xffd76788, 0xffd6675f, 0xffd76735, 0xffd6670c, 0xffd766e2, 0xffd666b9, 0xffd7668f,
135 0xffd86666, 0xffd6663e, 0xffd86614, 0xffd765ec, 0xffd865c3, 0xffd8659b, 0xffd86573, 0xffd8654b,
136 0xffd86523, 0xffd864fb, 0xffd964d3, 0xffd864ac, 0xffd96484, 0xffd8645d, 0xffd96435, 0xffd9640e,
137 0xffd963e7, 0xffd963c0, 0xffd96399, 0xffda6372, 0xffd9634c, 0xffda6325, 0xffda62ff, 0xffda62d9,
138 0xffda62b3, 0xffda628d, 0xffda6267, 0xffdb6241, 0xffda621c, 0xffdb61f6, 0xffda61d1, 0xffdc61ab,
139 0xffd96187, 0xffdc6160, 0xffdb613c, 0xffdb6117, 0xffdb60f2, 0xffdc60cd, 0xffdc60a9, 0xffdb6085,
140 0xffdc6060, 0xffdc603c, 0xffdc6018, 0xffdc5ff4, 0xffdc5fd0, 0xffdd5fac, 0xffdc5f89, 0xffdc5f65,
141 0xffdd5f41, 0xffdd5f1e, 0xffdd5efb, 0xffdd5ed8, 0xffdd5eb5, 0xffdd5e92, 0xffdd5e6f, 0xffdd5e4c,
142 0xffdd5e29, 0xffde5e06, 0xffde5de4, 0xffdd5dc2, 0xffde5d9f, 0xffde5d7d, 0xffde5d5b, 0xffde5d39,
143 0xffdf5d17, 0xffde5cf6, 0xffde5cd4, 0xffdf5cb2, 0xffdf5c91, 0xffde5c70, 0xffdf5c4e, 0xffdf5c2d,
144 0xffde5c0c, 0xffe05bea, 0xffdf5bca, 0xffdf5ba9, 0xffdf5b88, 0xffdf5b67, 0xffe05b46, 0xffe05b26,
145 0xffdf5b06, 0xffe05ae5, 0xffe05ac5, 0xffe05aa5, 0xffe05a85, 0xffe05a65, 0xffe05a45, 0xffe15a25,
146 0xffe05a06, 0xffe059e6, 0xffe159c6, 0xffe159a7, 0xffe05988, 0xffe15968, 0xffe15949, 0xffe1592a,
147 0xffe1590b, 0xffe158ec, 0xffe258cd, 0xffe158af, 0xffe15890, 0xffe25871, 0xffe15853, 0xffe25834,
148 0xffe25816, 0xffe257f8, 0xffe157da, 0xffe257bb, 0xffe3579d, 0xffe25780, 0xffe25762, 0xffe25744,
149 0xffe35726, 0xffe25709, 0xffe256eb, 0xffe356cd, 0xffe356b0, 0xffe35693, 0xffe25676, 0xffe35658,
150 0xffe3563b, 0xffe3561e, 0xffe35601, 0xffe355e4, 0xffe455c7, 0xffe355ab, 0xffe4558e, 0xffe35572,
151 0xffe45555, 0xffe35539, 0xffe4551c, 0xffe45500, 0xffe454e4, 0xffe454c8, 0xffe454ac, 0xffe45490,
152 0xffe45474, 0xffe55458, 0xffe4543d, 0xffe45421, 0xffe55405, 0xffe553ea, 0xffe453cf, 0xffe553b3,
153 0xffe45398, 0xffe5537c, 0xffe55361, 0xffe55346, 0xffe5532b, 0xffe55310, 0xffe552f5, 0xffe552da,
154 0xffe652bf, 0xffe552a5, 0xffe5528a, 0xffe6526f, 0xffe55255, 0xffe6523a, 0xffe65220, 0xffe55206,
155 0xffe651eb, 0xffe651d1, 0xffe651b7, 0xffe6519d, 0xffe65183, 0xffe65169, 0xffe7514f, 0xffe65136,
156 0xffe6511c, 0xffe75102, 0xffe650e9, 0xffe750cf, 0xffe650b6, 0xffe7509c, 0xffe75083, 0xffe6506a,
157 0xffe75050, 0xffe75037, 0xffe7501e, 0xffe75005, 0xffe74fec, 0xffe74fd3, 0xffe74fba, 0xffe74fa1,
158 0xffe84f88, 0xffe74f70, 0xffe84f57, 0xffe74f3f, 0xffe84f26, 0xffe74f0e, 0xffe84ef5, 0xffe84edd,
159 0xffe84ec5, 0xffe84ead, 0xffe74e95, 0xffe84e7c, 0xffe84e64, 0xffe94e4c, 0xffe84e35, 0xffe84e1d,
160 0xffe84e05, 0xffe94ded, 0xffe84dd6, 0xffe84dbe, 0xffe94da6, 0xffe94d8f, 0xffe84d78, 0xffe84d60,
161 0xffea4d48, 0xffe84d32, 0xffe94d1a, 0xffe94d03, 0xffe84cec, 0xffe94cd4, 0xffe94cbd, 0xffea4ca6,
162 0xffe94c90, 0xffe84c79, 0xffea4c61, 0xffe94c4b, 0xffe94c34, 0xffea4c1d, 0xffe94c07, 0xffea4bf0,
163 0xffe94bda, 0xffea4bc3, 0xffea4bad, 0xffe94b97, 0xffea4b80, 0xffea4b6a, 0xffea4b54, 0xffea4b3e,
164 0xffea4b28, 0xffea4b12, 0xffea4afc, 0xffea4ae6, 0xffea4ad0, 0xffeb4aba, 0xffea4aa5, 0xffea4a8f,
165 0xffeb4a79, 0xffea4a64, 0xffea4a4e, 0xffeb4a38, 0xffeb4a23, 0xffea4a0e, 0xffeb49f8, 0xffea49e3,
166 0xffeb49cd, 0xffeb49b8, 0xffeb49a3, 0xffeb498e, 0xffea4979, 0xffeb4963, 0xffeb494e, 0xffec4939,
167 0xffeb4925, 0xffea4910, 0xffec48fa, 0xffeb48e6, 0xffeb48d1, 0xffec48bc, 0xffeb48a8, 0xffec4893,
168 0xffeb487f, 0xffec486a, 0xffeb4856, 0xffec4841, 0xffec482d, 0xffeb4819, 0xffec4804, 0xffec47f0,
169 0xffec47dc, 0xffec47c8, 0xffec47b4, 0xffec47a0, 0xffec478c, 0xffec4778, 0xffec4764, 0xffec4750,
170 0xffec473c, 0xffed4728, 0xffec4715, 0xffec4701, 0xffed46ed, 0xffec46da, 0xffed46c6, 0xffec46b3,
171 0xffec469f, 0xffed468b, 0xffed4678, 0xffec4665, 0xffed4651, 0xffed463e, 0xffed462b, 0xffec4618,
172 0xffed4604, 0xffed45f1, 0xffed45de, 0xffed45cb, 0xffed45b8, 0xffed45a5, 0xffed4592, 0xffed457f,
173 0xffee456c, 0xffed455a, 0xffed4547, 0xffed4534, 0xffee4521, 0xffed450f, 0xffed44fc, 0xffee44e9,
174 0xffed44d7, 0xffee44c4, 0xffee44b2, 0xffed44a0, 0xffee448d, 0xffee447b, 0xffed4469, 0xffee4456,
175 0xffee4444, 0xffee4432, 0xffee4420, 0xffee440e, 0xffee43fc, 0xffee43ea, 0xffee43d8, 0xffee43c6,
176 0xffee43b4, 0xffee43a2, 0xffee4390, 0xffef437e, 0xffee436d, 0xffee435b, 0xffef4349, 0xffee4338,
177 0xffee4326, 0xffef4314, 0xffee4303, 0xffef42f1, 0xffee42e0, 0xffef42ce, 0xffee42bd, 0xffef42ab,
178 0xffef429a, 0xffee4289, 0xfff04277, 0xffee4267, 0xffef4255, 0xffef4244, 0xffef4233, 0xffef4222,
179 0xffee4211, 0xffef41ff, 0xfff041ee, 0xffef41de, 0xffef41cd, 0xffee41bc, 0xfff041aa, 0xffef419a,
180 0xffef4189, 0xffef4178, 0xfff04167, 0xffef4157, 0xffef4146, 0xfff04135, 0xffef4125, 0xfff04114,
181 0xffef4104, 0xfff040f3, 0xffef40e3, 0xfff040d2, 0xfff040c2, 0xffef40b2, 0xfff040a1, 0xfff04091,
182 0xfff04081, 0xffef4071, 0xfff04060, 0xfff04050, 0xfff04040, 0xfff04030, 0xfff04020, 0xfff04010
183 ]
184 # fmt: on
185
186 def __init__(self, op):
187 self.op = op
188
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200189 def generate_exp_table(self, beta, input_scale):
Fredrik Svedberg1575b942020-08-18 13:19:18 +0200190 integer_bits = 5
191 total_signed_bits = 31
192 # Calculate scaling
193 real_beta = min(
194 np.double(beta) * np.double(input_scale) * (1 << (31 - integer_bits)), np.double((1 << 31) - 1.0)
195 )
196 scale, shift = scaling.quantise_scale(real_beta)
197 shift = 31 - shift
198 diff_min = -1.0 * math.floor(
199 1.0 * ((1 << integer_bits) - 1) * (1 << (total_signed_bits - integer_bits)) / (1 << shift)
200 )
201 # Generate the exp LUT
202 lut = []
203 for x in range(256):
204 input_diff = x - 255
205 if input_diff >= diff_min:
206 rescale = fp_math.saturating_rounding_mul(input_diff * (1 << shift), scale)
207 lut.append(fp_math.exp_on_negative_values(rescale))
208 else:
209 lut.append(0)
210 return lut
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200211
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200212 def get_graph(self):
213 ifm = self.op.inputs[0]
214 ofm = self.op.outputs[0]
215
Fredrik Svedberg835d8e12020-09-04 09:46:17 +0200216 # Reshape ifm/ofm (if needed)
patrik.gustavssoneeb85152020-12-21 17:10:40 +0000217 full_shape = self.op.ifm_shapes[0].as_list()
Fredrik Svedberg835d8e12020-09-04 09:46:17 +0200218 if full_shape[0] > 1:
219 full_shape[1] *= full_shape[0]
220 full_shape[0] = 1
221 ifm = create_reshape_tensor(ifm, full_shape)
222 ofm = create_reshape_tensor(ofm, full_shape, False)
223
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200224 if ifm.dtype in (DataType.uint8, DataType.int8) and ofm.dtype == ifm.dtype:
225 return self.get_graph_8bit(ifm, ofm)
226 elif ifm.dtype == DataType.int16 and ofm.dtype == DataType.int16:
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200227 return self.get_graph_int16(ifm, ofm)
228 else:
229 self.op.run_on_npu = False
230 return self.op
231
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200232 def get_graph_8bit(self, ifm, ofm):
233 exp_lut = self.generate_exp_table(self.op.attrs.get("beta", 1.0), ifm.quantization.scale_f32)
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200234 no_scale_quant = ifm.quantization.clone()
235 no_scale_quant.scale_f32 = None
236 no_scale_quant.zero_point = 0
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100237 activation = ActivationFunction(Op.Clip)
238 activation.min = ifm.quantization.quant_min
239 activation.max = ifm.quantization.quant_max
240 activation2 = activation.clone()
241 activation2.min = 2 * ifm.quantization.quant_min
242 activation2.max = 2 * ifm.quantization.quant_max
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200243 one_scale_quant = ifm.quantization.clone()
244 one_scale_quant.scale_f32 = 1.0
245 one_scale_quant.zero_point = 0
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100246 two_scale_quant = one_scale_quant.clone()
247 two_scale_quant.scale_f32 = 2.0
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200248 ifm.quantization.zero_point = 0
Fredrik Svedberg32c7f5b2020-12-02 09:24:29 +0100249 pass_number = 0
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200250
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100251 def add_op_get_ofm(op):
252 DebugDatabase.add_optimised(self.op, op)
253 nonlocal pass_number
254 pass_number += 1
255 return op.ofm
256
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200257 # PASS 0 - Depthwise Maxpool
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100258 ifm_max = add_op_get_ofm(create_depthwise_maxpool(f"{self.op.name}_maxpool{pass_number}", ifm, no_scale_quant))
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200259
260 # PASS 1 - Sub+LUT(exp)
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100261 sub_op_quantization = one_scale_quant.clone()
262 sub_op_quantization.zero_point = 127
263 ifm_max = create_reshape_tensor(ifm_max, [1, ifm.shape[1], ifm.shape[2], 1])
264 sub_op = create_sub(f"{self.op.name}_sub{pass_number}", ifm, ifm_max, sub_op_quantization, dtype=DataType.int32)
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200265 sub_op.set_activation_lut(
266 create_const_tensor(
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100267 f"{sub_op.name}_exp_lut", [1, 1, 1, 256], DataType.int32, exp_lut, np.int32, TensorPurpose.LUT
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200268 )
269 )
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100270 ifm_exp = add_op_get_ofm(sub_op)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100271 # Note: activation.min/max are non-quantized values
272 sub_op.activation.min = -128 - ifm_exp.quantization.zero_point
273 sub_op.activation.max = 127 - ifm_exp.quantization.zero_point
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200274
275 # PASS 2 - SHR
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100276 name = f"{self.op.name}_shr{pass_number}"
277 shift = create_const_tensor(
278 f"{name}_const", [1, 1, 1, 1], DataType.int32, [12], np.int32, quantization=no_scale_quant
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200279 )
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100280 rescaled_exp = add_op_get_ofm(
281 create_shr(
282 name, ifm_exp, shift, no_scale_quant, activation, attrs={"rounding_mode": NpuRoundingMode.NATURAL},
283 )
284 )
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200285
286 # PASS 3 - Reduce sum
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100287 sum_of_exp = add_op_get_ofm(
288 create_reduce_sum(f"{self.op.name}_reduce_sum{pass_number}", rescaled_exp, no_scale_quant, activation)
289 )
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200290
291 # PASS 4 - CLZ
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100292 headroom_plus_one = add_op_get_ofm(
293 create_clz(f"{self.op.name}_clz{pass_number}", sum_of_exp, no_scale_quant, activation)
294 )
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200295
296 # PASS 5 - Sub
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100297 headroom_offset = create_const_tensor(
298 "headroom_offset_const", [1, 1, 1, 1], DataType.int32, [12 + 31 - 8], np.int32, quantization=no_scale_quant,
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200299 )
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100300 right_shift = add_op_get_ofm(
301 create_sub(
302 f"{self.op.name}_sub{pass_number}", headroom_offset, headroom_plus_one, no_scale_quant, activation,
303 )
304 )
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200305
306 # PASS 6 - Sub
Fredrik Svedberg1575b942020-08-18 13:19:18 +0200307 one = create_const_tensor("one_const", [1, 1, 1, 1], DataType.int32, [1], np.int32, quantization=no_scale_quant)
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100308 headroom = add_op_get_ofm(
309 create_sub(f"{self.op.name}_sub{pass_number}", headroom_plus_one, one, no_scale_quant, activation)
310 )
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200311
312 # PASS 7 - SHL
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100313 shifted_sum = add_op_get_ofm(
314 create_shl(f"{self.op.name}_shl{pass_number}", sum_of_exp, headroom, no_scale_quant, activation)
315 )
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200316
317 # PASS 8 - Sub
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100318 shifted_one = create_const_tensor(
319 "shifted_one_const", [1, 1, 1, 1], DataType.int32, [1 << 30], np.int32, quantization=no_scale_quant
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200320 )
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100321 shifted_sum_minus_one = add_op_get_ofm(
322 create_sub(f"{self.op.name}_sub{pass_number}", shifted_sum, shifted_one, no_scale_quant, activation)
323 )
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200324
325 # PASS 9 - SHL
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100326 shifted_sum_minus_one = add_op_get_ofm(
327 create_shl(f"{self.op.name}_shl{pass_number}", shifted_sum_minus_one, one, no_scale_quant, activation,)
328 )
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200329
330 # PASS 10 - Add
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100331 f0_one_const = create_const_tensor(
332 "F0_one_const", [1, 1, 1, 1], DataType.int32, [(1 << 31) - 1], np.int32, quantization=no_scale_quant
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200333 )
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100334 half_denominator = add_op_get_ofm(
Fredrik Svedberge82be7c2021-01-18 15:21:03 +0100335 create_rescale_add(
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100336 f"{self.op.name}_add{pass_number}",
337 f0_one_const,
338 shifted_sum_minus_one,
Fredrik Svedberge82be7c2021-01-18 15:21:03 +0100339 (1, 1), # Custom rescale
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100340 one_scale_quant,
341 activation,
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100342 )
343 )
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200344
345 # PASS 11 - Multiply
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100346 neg_32_over_17 = create_const_tensor(
347 "neg_32_over_17_const", [1, 1, 1, 1], DataType.int32, [-1010580540], np.int32, quantization=one_scale_quant
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200348 )
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100349 rescaled = add_op_get_ofm(
350 create_mul(
351 f"{self.op.name}_mul{pass_number}", half_denominator, neg_32_over_17, two_scale_quant, activation2,
352 )
353 )
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200354
355 # PASS 12 - Add
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100356 const_48_over_17 = create_const_tensor(
357 "48_over_17_const", [1, 1, 1, 1], DataType.int32, [1515870810], np.int32, quantization=no_scale_quant
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200358 )
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100359 rescale_w_offset = add_op_get_ofm(
360 create_add(f"{self.op.name}_add{pass_number}", rescaled, const_48_over_17, one_scale_quant, activation,)
361 )
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200362
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100363 # PASS 13 - 27
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200364 nr_x = rescale_w_offset
365 F2_one = create_const_tensor(
366 "F2_one_const", [1, 1, 1, 1], DataType.int32, [(1 << 29)], np.int32, quantization=no_scale_quant
367 )
Fredrik Svedberg880e7352020-08-25 11:31:47 +0200368 four = create_const_tensor(
369 "four_const", [1, 1, 1, 1], DataType.int32, [4], np.int32, quantization=no_scale_quant
370 )
Fredrik Svedberg32c7f5b2020-12-02 09:24:29 +0100371 for _ in range(3):
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200372 # PASS 13, 18, 23 - MUL
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100373 half_denominator_times_x = add_op_get_ofm(
374 create_mul(f"{self.op.name}_mul{pass_number}", nr_x, half_denominator, two_scale_quant, activation2,)
375 )
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200376 # PASS 14, 19, 24 - SUB
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100377 one_minus_half_denominator_times_x = add_op_get_ofm(
378 create_sub(
379 f"{self.op.name}_sub{pass_number}", F2_one, half_denominator_times_x, one_scale_quant, activation,
380 )
381 )
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200382 # PASS 15, 20, 25 - MUL
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100383 to_rescale = add_op_get_ofm(
384 create_mul(
385 f"{self.op.name}_mul{pass_number}",
386 nr_x,
387 one_minus_half_denominator_times_x,
388 two_scale_quant,
389 activation2,
390 )
391 )
Fredrik Svedberg880e7352020-08-25 11:31:47 +0200392 # PASS 16, 21, 26 - MUL
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100393 to_add = add_op_get_ofm(
394 create_mul(f"{self.op.name}_mul{pass_number}", to_rescale, four, no_scale_quant, activation)
395 )
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200396 # PASS 17, 22, 27 - ADD
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100397 nr_x = add_op_get_ofm(
398 create_add(f"{self.op.name}_add{pass_number}", nr_x, to_add, one_scale_quant, activation)
399 )
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200400
Fredrik Svedberg880e7352020-08-25 11:31:47 +0200401 # PASS 28 - Multiply
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100402 two = create_const_tensor("two_const", [1, 1, 1, 1], DataType.int32, [2], np.int32, quantization=no_scale_quant)
403 scale_factor = add_op_get_ofm(
404 create_mul(f"{self.op.name}_mul{pass_number}", nr_x, two, one_scale_quant, activation)
Fredrik Svedberg880e7352020-08-25 11:31:47 +0200405 )
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200406
407 # PASS 29 - Multiply
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100408 scaled_exp = add_op_get_ofm(
409 create_mul(f"{self.op.name}_mul{pass_number}", ifm_exp, scale_factor, two_scale_quant, activation2)
410 )
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200411
412 # PASS 30 - SHR
Fredrik Svedberg32c7f5b2020-12-02 09:24:29 +0100413 shr30_op = Operation(Op.SHR, f"{self.op.name}_shr{pass_number}")
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100414 shr30_op.attrs["rounding_mode"] = NpuRoundingMode.NATURAL
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200415 shr30_op.add_input_tensor(scaled_exp)
416 shr30_op.add_input_tensor(right_shift)
417 shr30_op.set_output_tensor(ofm)
patrik.gustavssoneeb85152020-12-21 17:10:40 +0000418 shr30_op.set_ifm_ofm_shapes()
Tim Halle6ccd872020-11-09 16:46:37 +0000419 DebugDatabase.add_optimised(self.op, shr30_op)
Fredrik Svedberg597fd3f2020-08-13 10:02:53 +0200420
421 return shr30_op
422
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200423 def get_graph_int16(self, ifm, ofm):
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200424 no_scale_quant = ifm.quantization.clone()
425 no_scale_quant.scale_f32 = None
Fredrik Svedberg32c7f5b2020-12-02 09:24:29 +0100426 pass_number = 0
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200427
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100428 def add_op_get_ofm(op):
429 DebugDatabase.add_optimised(self.op, op)
430 nonlocal pass_number
431 pass_number += 1
432 return op.ofm
433
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200434 # PASS 0 - Depthwise Maxpool
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100435 ifm_max = add_op_get_ofm(create_depthwise_maxpool(f"{self.op.name}_maxpool{pass_number}", ifm, no_scale_quant))
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200436
437 # PASS 1 - Sub
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100438 ifm_max = create_reshape_tensor(ifm_max, [1, ifm.shape[1], ifm.shape[2], 1])
439 sub1_ofm = add_op_get_ofm(
440 create_sub(f"{self.op.name}_sub{pass_number}", ifm, ifm_max, ifm.quantization.clone(), dtype=DataType.int32)
441 )
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200442
443 # PASS 2 - Mul
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100444 name = f"{self.op.name}_mul{pass_number}"
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200445 beta = self.op.attrs.get("beta", 1.0)
446 mul2_out_range = 10.0 / 65535.0
447 mul2_scale, _ = scaling.elementwise_mul_scale(sub1_ofm.quantization.scale_f32, beta, mul2_out_range)
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100448 scale_quant = ifm.quantization.clone()
449 scale_quant.scale_f32 = beta
450 mul2_quant = ofm.quantization.clone()
451 mul2_quant.scale_f32 = mul2_out_range
452 scale = create_const_tensor(
453 f"{name}_scale_const", [1, 1, 1, 1], DataType.int32, [mul2_scale], np.int32, quantization=scale_quant
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200454 )
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100455 mul2_ofm = add_op_get_ofm(create_mul(name, sub1_ofm, scale, mul2_quant))
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200456
457 # PASS 3 - Add+LUT(exp)
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100458 name = f"{self.op.name}_add{pass_number}"
459 const_add = create_const_tensor(
460 f"{name}_const", [1, 1, 1, 1], DataType.int32, [32767], np.int32, quantization=no_scale_quant
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200461 )
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100462 add_op = create_add(name, mul2_ofm, const_add, mul2_ofm.quantization.clone(), dtype=DataType.int16)
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200463 add_op.set_activation_lut(
Michael McGeagh5778ffd2020-08-06 17:31:02 +0100464 create_const_tensor(
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100465 f"{name}_exp_lut", [1, 1, 1, 512], DataType.int32, self.EXP_LUT, np.int32, TensorPurpose.LUT
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200466 )
467 )
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100468 ifm_exp = add_op_get_ofm(add_op)
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200469
470 # PASS 4 - Reduce sum
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100471 sum_of_exp = add_op_get_ofm(
472 create_reduce_sum(f"{self.op.name}_reduce_sum{pass_number}", ifm_exp, no_scale_quant)
473 )
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200474
475 # PASS 5 - CLZ
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100476 headroom_plus_one = add_op_get_ofm(create_clz(f"{self.op.name}_clz{pass_number}", sum_of_exp, no_scale_quant))
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200477
478 # PASS 6 - Sub
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100479 name = f"{self.op.name}_sub{pass_number}"
480 const_31 = create_const_tensor(
481 f"{name}_const", [1, 1, 1, 1], DataType.int32, [31], np.int32, quantization=no_scale_quant
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200482 )
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100483 reciprocal_right_shift = add_op_get_ofm(create_sub(name, const_31, headroom_plus_one, no_scale_quant))
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200484
485 # PASS 7 - SHL
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100486 one = create_const_tensor(
487 f"one_const", [1, 1, 1, 1], DataType.int32, [1], np.int32, quantization=no_scale_quant
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200488 )
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100489 constant_one = add_op_get_ofm(
490 create_shl(f"{self.op.name}_shl{pass_number}", one, reciprocal_right_shift, no_scale_quant)
491 )
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200492
493 # PASS 8 - Sub
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100494 sum_of_exps_minus_one = add_op_get_ofm(
495 create_sub(f"{self.op.name}_sub{pass_number}", sum_of_exp, constant_one, no_scale_quant)
496 )
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200497
498 # PASS 9 - SHL
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100499 shifted_sum_minus_one = add_op_get_ofm(
500 create_shl(f"{self.op.name}_shl{pass_number}", sum_of_exps_minus_one, headroom_plus_one, no_scale_quant)
501 )
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200502
503 # PASS 10 - SHR
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100504 name = f"{self.op.name}_shr{pass_number}"
505 shift = create_const_tensor(
506 f"{name}_const", [1, 1, 1, 1], DataType.int32, [15], np.int32, quantization=no_scale_quant
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200507 )
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100508 shifted_sum_minus_one_16 = add_op_get_ofm(create_shr(name, shifted_sum_minus_one, shift, no_scale_quant))
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200509
510 # PASS 11 - Sub+LUT(one over one plus x)
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100511 name = f"{self.op.name}_sub{pass_number}"
512 sub11_const = create_const_tensor(
513 f"{name}_const", [1, 1, 1, 1], DataType.int32, [32768], np.int32, quantization=no_scale_quant
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200514 )
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100515 sub11_op = create_sub(name, shifted_sum_minus_one_16, sub11_const, no_scale_quant, dtype=DataType.int16)
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200516 sub11_op.set_activation_lut(
Michael McGeagh5778ffd2020-08-06 17:31:02 +0100517 create_const_tensor(
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100518 f"{name}_one_over_one_plus_x_lut",
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200519 [1, 1, 1, 512],
520 DataType.int32,
521 self.ONE_OVER_ONE_PLUS_X_LUT,
Fredrik Svedberg5b513882020-12-11 13:42:22 +0100522 np.uint32,
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200523 TensorPurpose.LUT,
524 )
525 )
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100526 reciprocal_scale = add_op_get_ofm(sub11_op)
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200527
528 # PASS 12 - Multiply
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +0100529 mul_ofm = add_op_get_ofm(
530 create_mul(
531 f"{self.op.name}_mul{pass_number}", ifm_exp, reciprocal_scale, no_scale_quant, dtype=DataType.int32
532 )
533 )
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200534
535 # PASS 13 - SHR
Fredrik Svedberg32c7f5b2020-12-02 09:24:29 +0100536 shr13_op = Operation(Op.SHR, f"{self.op.name}_shr{pass_number}")
Michael McGeagh5778ffd2020-08-06 17:31:02 +0100537 shr13_op.add_input_tensor(mul_ofm)
538 shr13_op.add_input_tensor(reciprocal_right_shift)
539 shr13_op.set_output_tensor(ofm)
patrik.gustavssoneeb85152020-12-21 17:10:40 +0000540 shr13_op.set_ifm_ofm_shapes()
Tim Halle6ccd872020-11-09 16:46:37 +0000541 DebugDatabase.add_optimised(self.op, shr13_op)
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200542
543 return shr13_op