# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Functionality for lookup table support.
import uuid
from functools import lru_cache

import numpy as np

from . import numeric_util
from .high_level_command_stream import CommandType
from .tensor import create_const_tensor
from .tensor import TensorPurpose


@lru_cache(maxsize=None)
def create_equivalence_id(key):
    # Generates an equivalence_id based on key; the lru_cache decorator guarantees that
    # repeated calls with the same key return the same id.
    # The DMA optimization of LUT-s assumes that 2 LUT tensors are identical
    # if they have the same equivalence_id.
    # So for example all created 256-byte tanh LUT tensors should have
    # the same equivalence id.
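    # Illustrative sketch (hypothetical keys, shown only as a comment): thanks to the
    # memoisation, equal keys share one id while different keys get fresh ids:
    #     create_equivalence_id(("tanh", 256)) == create_equivalence_id(("tanh", 256))     -> True
    #     create_equivalence_id(("tanh", 256)) == create_equivalence_id(("sigmoid", 256))  -> False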
    return uuid.uuid4()


class LUTState:
    # Tracks which LUT-s are located in SHRAM.
    def __init__(self):
        self.tensors = []

    def get_equivalent(self, lut_tens):
        # Returns existing lut with the same values, None if not found
        for t in self.tensors:
            if np.array_equal(t.values, lut_tens.values):
                return t
        return None

    def put(self, lut_tens):
        # Returns new LUT state containing given tensor + all tensors in this state
        # that do not overlap with the given tensor
        new_state = LUTState()
        new_state.tensors.append(lut_tens)
        start = lut_tens.address
        end = start + lut_tens.storage_size()
        for tens in self.tensors:
            start2 = tens.address
            end2 = start2 + tens.storage_size()
            if not numeric_util.overlaps(start, end, start2, end2):
                new_state.tensors.append(tens)

        return new_state

    def find_best_address(self, start, stop, step):
        # Finds the address in the given range that overlaps with the minimum number of
        # currently present LUT-s.
        # An improvement would be to also take future LUT usage into account
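        # Illustrative sketch (hypothetical numbers): with 256-byte LUTs already present at
        # addresses 0 and 256, find_best_address(0, 2048, 256) returns 512, the first slot that
        # overlaps none of them (assuming numeric_util.overlaps treats the ranges as half-open).
        # The search starts with best_nr_overlaps = stop, which simply serves as an upper bound
        # that any real overlap count will beat.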
        best_addr = start
        best_nr_overlaps = stop
        for addr in range(start, stop, step):
            nr_overlaps = 0
            for tens in self.tensors:
                start2 = tens.address
                end2 = start2 + tens.storage_size()
                if numeric_util.overlaps(addr, addr + step, start2, end2):
                    nr_overlaps += 1
            if nr_overlaps < best_nr_overlaps:
                best_nr_overlaps = nr_overlaps
                best_addr = addr
        return best_addr


def get_lut_index(arch, lut_tensor):
    # Returns the index in SHRAM where the given LUT is stored, a value between 0 and 7
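    # For example, a 256-byte LUT placed at arch.shram_lut_address + 512 is in slot 512 // 256 == 2.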
    slot = (lut_tensor.address - arch.shram_lut_address) // lut_tensor.storage_size()
    assert 0 <= slot < 8
    return slot


def create_lut_tensor(name, values, dtype):
    # Creates constant LUT tensor with the given values as lookup table.
    # The tensor's equivalence_id is based on these values, so if multiple
    # LUT tensors are created with identical values, they will get the same
    # address in constant memory, and unnecessary DMA operations can be avoided.
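    # Illustrative usage (hypothetical caller; DataType.int8 refers to vela's DataType, which
    # is not imported in this module):
    #     tanh_lut = create_lut_tensor("tanh_lut", tanh_values, DataType.int8)
    # gives a [1, 1, 1, 256] constant tensor with purpose TensorPurpose.LUT; a second call
    # with the same 256 values produces a tensor with the same equivalence_id.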
    sz = len(values)
    assert sz in (256, 512)
    ntype = np.uint8 if dtype.size_in_bytes() == 1 else np.uint32
    tens = create_const_tensor(name, [1, 1, 1, sz], dtype, values, ntype, TensorPurpose.LUT)
    tens.equivalence_id = create_equivalence_id(tuple(values))
    return tens


def optimize_high_level_cmd_stream(sg, arch):
    # - Allocates SHRAM address/lut index to LUT tensors
    # - Removes unnecessary DMA operations from sg's command stream for LUT-s that are
    #   already present in SHRAM
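    # Example of the effect: if two NpuStripe commands use the same 256-byte LUT, only the
    # first DMA of that LUT is kept; the second operation is given the lut_index of the copy
    # already resident in SHRAM and its DMA command is dropped.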
    cmd_stream = []  # will contain existing command stream minus unneeded DMA operations
    lut_state = LUTState()
    slot_size = 256
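    # the LUT area is handled as 256-byte slots; lut_index below is expressed in these slots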
    lut_start = arch.shram_lut_address
    lut_end = lut_start + arch.shram_lut_size
    for cmd in sg.high_level_command_stream:
        if cmd.cmdtype == CommandType.NpuStripe and cmd.ps.lut_tensor is None and arch.shram_reserved_unused_banks == 0:
            # The command overwrites the last 2 banks containing the LUT; next LUT operation will require DMA
            # TODO: check the command's SHRAM usage in more detail to determine if the LUT is overwritten or not
            lut_state = LUTState()
        if cmd.cmdtype != CommandType.DMA or cmd.out_tensor.purpose != TensorPurpose.LUT:
            # Non-LUT operation; leave untouched
            cmd_stream.append(cmd)
            continue
        # LUT DMA operation
        lut_tens = cmd.out_tensor
        existing_tens = lut_state.get_equivalent(lut_tens)
        if existing_tens is not None:
            # LUT is already in SHRAM, no need to perform DMA
            lut_tens.address = existing_tens.address
            cmd.ps.primary_op.attrs["lut_index"] = get_lut_index(arch, existing_tens)
            continue
        # Place the LUT in the last 2 banks of SHRAM
        # Alignment is always on the size of the LUT, 256 for 256-byte LUT, 1K for 1K LUT, etc
        address = lut_state.find_best_address(lut_start, lut_end, lut_tens.storage_size())
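        # give this placed LUT instance an equivalence id of its own (uuid4() is unique per call)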
        lut_tens.equivalence_id = uuid.uuid4()
        lut_tens.address = address
        cmd.ps.primary_op.attrs["lut_index"] = (address - lut_start) // slot_size
        lut_state = lut_state.put(lut_tens)
        cmd_stream.append(cmd)
    sg.high_level_command_stream = cmd_stream