# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Insert DMA operations into the graph for transferring weights.
from . import rewrite_graph
from .operation import NpuBlockType
from .operation import Op
from .operation import Operation
from .tensor import MemArea
from .tensor import MemType
from .tensor import TensorPurpose
from .weight_compressor import compress_weights


def weights_fit_sram(arch, op, tens, nng):
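    """Check that a weight tensor can be double-buffered in SRAM.

    Non-weight tensors always pass. For weights, a worst-case compressed
    double-buffer size is estimated and compared against the SRAM size.
    """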
    if tens.purpose != TensorPurpose.Weights:
        return True

    # Size of the smallest weight slice that can be scheduled separately:
    # OFMSplitDepth output channels of the kernel.
    min_weight_size = 0
    if len(tens.shape) == 4:
        min_weight_size = tens.shape[0] * tens.shape[1] * tens.shape[2] * arch.OFMSplitDepth
    elif len(tens.shape) == 2:
        min_weight_size = tens.shape[0] * arch.OFMSplitDepth

    # The weights need to fit in SRAM as a double buffer;
    # only evaluate the worst case when the compression test limit makes it possible that they will not fit.
    w_comp_test_limit = 2
    if (w_comp_test_limit * min_weight_size * 2) > arch.sram_size:
        # Check the worst compression ratio
        npu_block_type = op.attrs.get("npu_block_type", NpuBlockType.Default)
        compress_weights(arch, nng, tens, npu_block_type, 16, 16, op.get_dilation_h_w())

        worst_buffer_size = tens.compression_scale_for_worst_weight_stream * min_weight_size * 2
        if worst_buffer_size > arch.sram_size:
            print(
                "Weights, {}, are too big to be DMAed to SRAM, estimated minimum size is {} bytes".format(
                    tens.name, worst_buffer_size
                )
            )
            return False
    return True


def insert_dma_cmd(op, arch, nng):
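    """Rewrite op so that inputs worth moving to fast storage get a DMA operation.

    Weights, LUTs and oversized broadcast IFMs of binary elementwise ops are
    candidates; see the conditions below.
    """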
    if op.type == Op.DMA or not op.run_on_npu:
        return op

    is_lut_used = any(inp.purpose == TensorPurpose.LUT for inp in op.inputs)
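    # SHRAM bytes usable for one IFM: the banks not reserved for the OFM,
    # halved (presumably because binary elementwise ops read two IFMs).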
    max_ifm_shram_avail = (
        (arch.available_shram_banks(is_lut_used) - arch.shram_reserved_output_banks) * arch.shram_bank_size // 2
    )

    for idx, tens in enumerate(op.inputs):
        if tens.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
            # Tensor is in permanent storage.
            # Moving the data is only worthwhile when permanent storage differs from fast storage.
            if (
                tens.mem_area in (MemArea.Dram, MemArea.OffChipFlash)
                and arch.permanent_storage_mem_area != arch.fast_storage_mem_area
            ) or tens.purpose == TensorPurpose.LUT:
                if tens.purpose in (TensorPurpose.Weights, TensorPurpose.LUT) or (
                    tens.purpose == TensorPurpose.FeatureMap
                    and op.type.is_binary_elementwise_op()
                    and tens.shape != []
                    and tens.shape != op.outputs[0].shape
                    and tens.storage_size() > max_ifm_shram_avail
                ):
                    only_vector_product_consumers = True
                    for oper in tens.consumers():
                        if oper is None or oper.type.npu_block_type != NpuBlockType.VectorProduct:
                            only_vector_product_consumers = False
                            break

                    # Vector products have no need for DMA: their tensors are only read once and can stay in flash.
                    # Other operations re-read tensors, which is better done from SRAM.
                    # LUTs must be placed in the last 2 blocks of SHRAM.
                    if (
                        not only_vector_product_consumers and weights_fit_sram(arch, op, tens, nng)
                    ) or tens.purpose == TensorPurpose.LUT:
                        # Insert a DMA command here, as well as a new tensor situated in SRAM of the same size.
                        new_tens = tens.clone_into_fast_storage(arch)
                        dma_cmd = Operation(Op.DMA, tens.ops[0].name + "_dma")
                        dma_cmd.inputs = [tens]
                        dma_cmd.set_output_tensor(new_tens)
                        dma_cmd.attrs["source"] = tens.mem_area
                        dma_cmd.attrs["destination"] = new_tens.mem_area
                        dma_cmd.run_on_npu = True
                        if tens.purpose == TensorPurpose.LUT:
                            new_tens.mem_area = MemArea.Shram
                        op.inputs[idx] = new_tens
    return op


def insert_dma_commands(nng, arch, verbose_graph=False):
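    """Graph-rewrite pass: apply insert_dma_cmd to every op in every subgraph."""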
    for idx, sg in enumerate(nng.subgraphs):
        nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(nng, sg, arch, [], [insert_dma_cmd])
    if verbose_graph:
        nng.print_graph()
    return nng
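

# Sketch of how this pass can be invoked (the call site and pass order in the
# compiler driver are assumptions, not taken from this file):
#
#     from . import insert_dma
#     nng = insert_dma.insert_dma_commands(nng, arch, verbose_graph=False)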