# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Functionality for lookup table support.
import uuid
from functools import lru_cache

import numpy as np

from . import numeric_util
from .high_level_command_stream import CommandType
from .tensor import create_const_tensor
from .tensor import TensorPurpose


@lru_cache(maxsize=None)
def create_equivalence_id(key):
    # Generates an equivalence_id based on the given key.
    # The DMA optimization of LUTs assumes that two LUT tensors are identical
    # if they have the same equivalence_id.
    # For example, all created 256-byte tanh LUT tensors should get
    # the same equivalence_id.
    return uuid.uuid4()
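
# Example: because of the lru_cache, repeated calls with an equal key return
# the same id, while a different key yields a fresh random UUID:
#
#   tanh_id = create_equivalence_id(("tanh", 256))
#   assert tanh_id == create_equivalence_id(("tanh", 256))     # cached -> same id
#   assert tanh_id != create_equivalence_id(("sigmoid", 256))  # new key -> new id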


class LUTState:
    # Tracks which LUTs are located in SHRAM.
    def __init__(self):
        self.tensors = []

    def get_equivalent(self, lut_tens):
        # Returns an existing LUT with the same equivalence_id, or None if not found
        for t in self.tensors:
            if t.equivalent(lut_tens):
                return t
        return None

    def put(self, lut_tens):
        # Returns a new LUT state containing the given tensor plus all tensors
        # in this state that do not overlap with the given tensor
        new_state = LUTState()
        new_state.tensors.append(lut_tens)
        start = lut_tens.address
        end = start + lut_tens.storage_size()
        for tens in self.tensors:
            start2 = tens.address
            end2 = start2 + tens.storage_size()
            if not numeric_util.overlaps(start, end, start2, end2):
                new_state.tensors.append(tens)
        return new_state

    def find_best_address(self, start, stop, step):
        # Finds the address in the given range that overlaps with the minimum number of
        # currently present LUTs.
        # An improvement would be to also take future LUT usage into account.
        best_addr = start
        best_nr_overlaps = stop
        for addr in range(start, stop, step):
            nr_overlaps = 0
            for tens in self.tensors:
                start2 = tens.address
                end2 = start2 + tens.storage_size()
                if numeric_util.overlaps(addr, addr + step, start2, end2):
                    nr_overlaps += 1
            if nr_overlaps < best_nr_overlaps:
                best_nr_overlaps = nr_overlaps
                best_addr = addr
        return best_addr
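
# Illustrative sketch of how LUTState is used (hypothetical addresses; assumes
# LUT tensors whose storage_size() is 256):
#
#   state = LUTState()
#   state = state.put(lut_a)                            # lut_a.address == 0x400
#   addr = state.find_best_address(0x400, 0xC00, 256)   # 0x400 overlaps lut_a,
#                                                       # returns 0x500, the first
#                                                       # slot with no overlap
#   lut_b.address = addr
#   state = state.put(lut_b)   # keeps lut_a too, since the two do not overlap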


def get_lut_index(arch, lut_tensor):
    # Returns the index in SHRAM where the given LUT is stored, a value between 0 and 7
    slot = (lut_tensor.address - arch.shram_lut_address) // lut_tensor.storage_size()
    assert 0 <= slot < 8
    return slot
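
# Worked example (hypothetical values): with arch.shram_lut_address == 0x1400
# and a 256-byte LUT that was placed at address 0x1500:
#
#   slot = (0x1500 - 0x1400) // 256  ->  1
#
# i.e. the LUT occupies the second 256-byte slot of the SHRAM LUT area.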


def create_lut_tensor(name, values, dtype):
    # Creates a constant LUT tensor with the given values as lookup table.
    # The tensor's equivalence_id is based on these values, so if multiple
    # LUT tensors are created with identical values, they will get the same
    # address in constant memory, and unnecessary DMA operations can be avoided.
    sz = len(values)
    assert sz in (256, 512)
    ntype = np.uint8 if dtype.size_in_bytes() == 1 else np.uint32
    tens = create_const_tensor(name, [1, 1, 1, sz], dtype, values, ntype, TensorPurpose.LUT)
    tens.equivalence_id = create_equivalence_id(tuple(values))
    return tens
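
# Usage sketch (assumes an 8-bit dtype, i.e. size_in_bytes() == 1, and a
# 256-entry table; the names below are illustrative, not part of this module):
#
#   values = [int(round(127 * math.tanh((i - 128) / 16.0))) for i in range(256)]
#   lut = create_lut_tensor("tanh_lut", values, int8_dtype)
#
# Two calls with identical values produce tensors with the same equivalence_id,
# so a repeated DMA of the table can be elided by optimize_high_level_cmd_stream.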


def optimize_high_level_cmd_stream(sg, arch):
    # - Allocates SHRAM address/LUT index to LUT tensors
    # - Removes unnecessary DMA operations of LUTs that are already present in SHRAM from sg's command stream
    cmd_stream = []  # will contain the existing command stream minus unneeded DMA operations
    lut_state = LUTState()
    slot_size = 256
    lut_start = arch.shram_lut_address
    lut_end = lut_start + arch.shram_lut_size
    for cmd in sg.high_level_command_stream:
        if cmd.cmdtype == CommandType.NpuStripe and cmd.ps.lut_tensor is None and arch.shram_reserved_unused_banks == 0:
            # The command overwrites the last 2 banks containing the LUT; the next LUT operation will require DMA
            # TODO: check the command's SHRAM usage in more detail to determine if the LUT is overwritten or not
            lut_state = LUTState()
        if cmd.cmdtype != CommandType.DMA or cmd.out_tensor.purpose != TensorPurpose.LUT:
            # Non-LUT operation; leave untouched
            cmd_stream.append(cmd)
            continue
        # LUT DMA operation
        lut_tens = cmd.out_tensor
        existing_tens = lut_state.get_equivalent(lut_tens)
        if existing_tens is not None:
            # LUT is already in SHRAM, no need to perform DMA
            lut_tens.address = existing_tens.address
            cmd.ps.primary_op.attrs["lut_index"] = get_lut_index(arch, existing_tens)
            continue
        # Place the LUT in the last 2 banks of SHRAM
        # Alignment is always on the size of the LUT: 256 for a 256-byte LUT, 1K for a 1K LUT, etc.
        address = lut_state.find_best_address(lut_start, lut_end, lut_tens.storage_size())
        lut_tens.address = address
        cmd.ps.primary_op.attrs["lut_index"] = (address - lut_start) // slot_size
        lut_state = lut_state.put(lut_tens)
        cmd_stream.append(cmd)
    sg.high_level_command_stream = cmd_stream
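
# Illustrative before/after (hypothetical command stream, one 256-byte tanh LUT,
# where the tanh stripes reference that LUT so the state is not reset):
#
#   before: DMA(tanh_lut) -> NpuStripe(tanh) -> DMA(tanh_lut) -> NpuStripe(tanh)
#   after:  DMA(tanh_lut) -> NpuStripe(tanh) -> NpuStripe(tanh)
#
# The second DMA is dropped because an equivalent LUT is already resident in
# SHRAM; both stripes get the same "lut_index" attribute pointing at that slot.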