Blame - ethosu/vela/high_level_command_stream_generator.py - ml/ethos-u/ethos-u-vela

2020-04-27 18:20:16 +0100

[diff] [blame]

2

#

3

# SPDX-License-Identifier: Apache-2.0

4

#

5

# Licensed under the Apache License, Version 2.0 (the License); you may

6

# not use this file except in compliance with the License.

7

# You may obtain a copy of the License at

8

#

9

# www.apache.org/licenses/LICENSE-2.0

10

#

11

# Unless required by applicable law or agreed to in writing, software

12

# distributed under the License is distributed on an AS IS BASIS, WITHOUT

13

# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

# See the License for the specific language governing permissions and

15

# limitations under the License.

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

16

# Description:

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

17

# Generate a high-level command stream from a schedule

Diego Russo

e8a1045

2020-04-21 17:39:10 +0100

[diff] [blame]

18

from .high_level_command_stream import Box

19

from .high_level_command_stream import DMA

20

from .high_level_command_stream import NpuStripe

Charles Xu

89a6bbf

2020-08-11 12:31:58 +0200

[diff] [blame]

21

from .numeric_util import round_up_divide

Louis Verhaard

e8a5a78

2020-11-02 18:04:27 +0100

[diff] [blame]

22

from .operation import create_activation_function

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

23

from .operation import NpuBlockType

Louis Verhaard

aee5d75

2020-09-30 09:01:52 +0200

[diff] [blame]

24

from .operation import Op

patrik.gustavsson

eeb8515

2020-12-21 17:10:40 +0000

[diff] [blame]

25

from .shape4d import Shape4D

Charles Xu

7879222

2020-05-13 10:15:26 +0200

[diff] [blame]

26

from .tensor import TensorPurpose

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

27

28

Charles Xu

7879222

2020-05-13 10:15:26 +0200

[diff] [blame]

29

def dma_if_necessary(ps, box, tensor):
    """Yield a DMA command for ``tensor`` if it has to be copied from its source.

    A single ``DMA(ps, src_tensor, tensor, box)`` command is produced only when
    ``tensor.src_tensor`` is truthy and its ``mem_area`` differs from
    ``tensor.mem_area``. In every other case the generator yields nothing.

    Args:
        ps: Passed through unchanged as the first argument of the DMA command.
        box: Passed through unchanged as the last argument of the DMA command.
        tensor: Destination tensor; its ``src_tensor`` and ``mem_area``
            attributes are read.
    """
    source = tensor.src_tensor
    if not source:
        # No source tensor recorded - nothing to transfer
        return

    # A transfer is only needed when source and destination memory areas differ
    needs_copy = tensor.mem_area != source.mem_area
    if needs_copy:
        yield DMA(ps, source, tensor, box)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

33

Tim Hall

c30f495

2020-06-15 20:47:35 +0100

[diff] [blame]

34

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

35

def generate_high_level_command_stream_for_schedule(nng, sg, arch, verbose_high_level_command_stream):
    """Build the high-level command stream for ``sg`` and store it on ``sg``.

    Walks ``sg.sched_ops`` and collects the commands produced by
    ``generate_high_level_commands_for_sched_op``. An op whose cost-map entry
    has ``cascade == 0`` is generated on its own. For any other cascade value,
    generation is started once from the op at the cascade's ``end`` index, and
    the cascade is then remembered so its remaining ops are skipped.

    The resulting list is assigned to ``sg.high_level_command_stream``. Nothing
    is returned.

    Args:
        nng: Not used by this function.
        sg: Object providing ``sched_ops``, ``schedule`` (with ``cost_map`` and
            ``cascades``) and ``print_high_level_command_stream()``.
        arch: Not used by this function.
        verbose_high_level_command_stream: When truthy, the generated stream is
            printed via ``sg.print_high_level_command_stream()``.
    """
    commands = []
    finished_cascades = set()

    # sg.sched_ops are ordered by execution
    for current_op in sg.sched_ops:
        schedule = sg.schedule
        cascade_id = schedule.cost_map[current_op].cascade

        if cascade_id in finished_cascades:
            # Commands for this cascade were already emitted
            continue

        if cascade_id != 0:
            # Whole cascade: generation is started from the cascade's last Op
            last_index = schedule.cascades[cascade_id].end
            commands.extend(generate_high_level_commands_for_sched_op(sg.sched_ops[last_index], schedule))
            finished_cascades.add(cascade_id)
        else:
            # Stand-alone Op: generate its commands in isolation
            commands.extend(generate_high_level_commands_for_sched_op(current_op, schedule))

    sg.high_level_command_stream = commands

    if verbose_high_level_command_stream:
        sg.print_high_level_command_stream()

58

59

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

60

def generate_high_level_commands_for_sched_op(sched_op, schedule):

61

op_info = schedule.cost_map[sched_op]

62

cascade_info = schedule.cascades.get(op_info.cascade)

63

npu_block_type = sched_op.parent_ps.npu_block_type

64

block_config = op_info.block_config

65

ps = sched_op.parent_ps

66

parent_op = sched_op.parent_op

67

ofm_tensor = ps.ofm_tensor

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

68

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

69

# Get Tensors and Full Shapes

Jonas Ohlsson

d857507

2022-03-30 10:30:25 +0200

[diff] [blame]

(

ifm_tensor,

ifm2_tensor,

uncomp_weight_tensor,

74

_,

75

_,

76

) = parent_op.get_ifm_ifm2_weights_biases_ofm()

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

77

ifm = sched_op.ifm

78

ifm2 = sched_op.ifm2

79

ofm_shape = sched_op.ofm.shape

80

81

# Get Kernel strides and upscaling factor

82

kernel_stride = sched_op.kernel.stride

83

strides = [1, kernel_stride.y, kernel_stride.x, 1]

84

skirt = parent_op.attrs.get("skirt", None)

85

upscaling = 1

86

if sched_op.op_type == Op.Conv2DBackpropInputSwitchedBias:

87

upscaling = ofm_shape.height // ifm.shape.height

88

elif sched_op.op_type == Op.ResizeBilinear:

89

upscaling = round_up_divide(ofm_shape.height, ifm.shape.height)

90

Rickard Bolin

2022-01-07 14:22:52 +0000

[diff] [blame]

91

# Get kernel height and height dilation

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

92

k_height = 1

93

if npu_block_type in (NpuBlockType.Pooling, NpuBlockType.ReduceSum):

94

if parent_op is not None:

95

k_height = parent_op.attrs["ksize"][1]

96

else:

97

if uncomp_weight_tensor is not None:

98

k_height = uncomp_weight_tensor.shape[0]

99

Rickard Bolin

2022-01-07 14:22:52 +0000

[diff] [blame]

100

k_height_dilation = parent_op.attrs.get("dilation", (_, 1, _, _))[-3]

101

102

# Calculate dilated kernel height

103

k_dilated_height = k_height_dilation * (k_height - 1) + 1

104

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

105

# Define Start and End coordinates for the OFM

106

ofm_start = Shape4D(0, 0, 0, op_info.ofm_depth_slices[0])

107

ofm_end = ofm_shape

108

109

ofm_depth_slices = op_info.ofm_depth_slices

110

111

# Read/Write offsets

112

read_offsets = list(parent_op.read_offsets) # offset for [ifm, ifm2]

113

read_shapes = list(parent_op.read_shapes) # read shapes for [ifm, ifm2]

114

write_offset = Shape4D(0, 0, 0, 0)

115

if parent_op.write_offset is not None:

116

write_offset = parent_op.write_offset

117

ofm_start = write_offset

118

ofm_end = parent_op.write_offset + parent_op.write_shape

119

120

# Create activation function if needed

121

for op in ps.ops:

122

if op.type.is_relu_op() or op.type in (Op.Tanh, Op.Sigmoid):

Patrik Gustavsson

8f1f9aa

2021-06-28 07:41:58 +0200

[diff] [blame]

123

ps.primary_op.activation = create_activation_function(

124

op.type, min=op.attrs.get("min", None), max=op.attrs.get("max", None)

125

)

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

126

127

# Generate commands for the Op that produces this Op's IFM, if applicable

128

if cascade_info is None or cascade_info.start == sched_op.index:

129

# Lone Op or First Op in cascade - all IFM data is present

130

ifm_present = Box([0, 0, 0, 0], ifm.shape.as_list())

producer_op = None

prev_cmd_gen = []

else:

ifm_present = Box([0, 0, 0, 0], [0, 0, 0, 0])

135

producer_op = sched_op.ifm.connection.producers[0]

136

prev_cmd_gen = generate_high_level_commands_for_sched_op(producer_op, schedule)

137

138

ofm_step = op_info.stripe

139

for start_height in range(ofm_start.height, ofm_end.height, ofm_step.height):

140

end_height = min(start_height + ofm_step.height, ofm_end.height)

141

for start_width in range(ofm_start.width, ofm_end.width, ofm_step.width):

142

end_width = min(start_width + ofm_step.width, ofm_end.width)

143

Dwight Lidman

8f78ac2

2021-08-13 14:04:30 +0200

[diff] [blame]

144

lut_dma_done = False

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

145

for depth_idx, start_channel in enumerate(ofm_depth_slices[:-1]):

146

start_channel = max(start_channel, ofm_start.depth)

147

end_channel = min(ofm_depth_slices[depth_idx + 1], ofm_end.depth)

148

149

# Construct the OFM box for the current stripe

150

ofm_box_start = Shape4D(ofm_start.batch, start_height, start_width, start_channel)

151

ofm_box_end = Shape4D(ofm_end.batch, end_height, end_width, end_channel)

152

ofm_box = Box(ofm_box_start.as_list(), ofm_box_end.as_list())

153

ifm_box = Box([], [])

154

ifm2_box = Box([], [])

155

156

# Calculate IFM input box based on the OFM box

157

if ifm:

158

ifm_box, pad_top, pad_bottom = ofm_box.transform_with_strides_and_skirt(

strides,

skirt,

ifm.shape,

npu_block_type,

write_offset.as_list(),

Rickard Bolin

2022-01-07 14:22:52 +0000

[diff] [blame]

164

k_dilated_height,

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

165

read_offsets[0],

166

read_shapes[0],

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

upscaling,

)

# Calculate IFM2 input box based on the OFM box

171

if ifm2:

172

ifm2_box, pad_top, pad_bottom = ofm_box.transform_with_strides_and_skirt(

strides,

skirt,

ifm2.shape,

npu_block_type,

write_offset.as_list(),

Rickard Bolin

2022-01-07 14:22:52 +0000

[diff] [blame]

178

k_dilated_height,

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

179

read_offsets[1],

180

read_shapes[1],

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

upscaling,

)

ifm_required = ifm_box

185

# Get the Op that produces this Op's IFM data - only applicable within cascades

186

if producer_op:

187

assert op_info.cascade != 0

188

assert op_info.cascade == schedule.cost_map[producer_op].cascade

189

for prev_cmd in prev_cmd_gen:

190

yield prev_cmd

191

if prev_cmd.is_npu_pass_command() and prev_cmd.ps == producer_op.parent_ps:

192

ifm_present.end_coord = prev_cmd.ofm_box.end_coord

193

if ifm_required.is_subbox_of(ifm_present):

194

# There is enough IFM data - exit loop

195

break

196

197

# Information about the current stripe's location in the cascade

198

is_first_h_stripe = ofm_box_start.height == ofm_start.height

199

is_last_h_stripe = ofm_box_end.height >= ofm_end.height

200

201

# Calculate the weight box - i.e. the subshape of weights needed for this NpuStripe command

202

weight_tensor = op_info.npu_weights_tensor

Tim Hall

d784af7

2021-06-08 21:25:57 +0100

[diff] [blame]

203

scale_tensor = op_info.npu_scales_tensor

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

204

if op_info.npu_weights_tensor:

205

weight_box = Box([0, 0, 0, start_channel], [1, 1, 1, end_channel])

206

Rickard Bolin

fd8b500

2022-05-16 09:11:06 +0000

[diff] [blame]

207

if op_info.buffered_weight_tensors and is_first_h_stripe:

208

idx = depth_idx % len(op_info.buffered_weight_tensors)

209

yield from dma_if_necessary(

210

sched_op.parent_ps, weight_box, op_info.buffered_weight_tensors[idx]

211

)

212

weight_tensor = op_info.buffered_weight_tensors[idx]

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

else:

weight_box = None

Dwight Lidman

2021-08-13 14:04:30 +0200

[diff] [blame]

216

# Should only be done once per loop but not before weights above

217

if parent_op.activation_lut and not lut_dma_done:

218

lut_tensor = [tens for tens in parent_op.inputs if tens.purpose == TensorPurpose.LUT][0]

219

lut_box = Box([0] * len(lut_tensor.shape), list(lut_tensor.shape))

220

lut_dma_done = True

221

yield from dma_if_necessary(sched_op.parent_ps, lut_box, lut_tensor)

222

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

223

yield NpuStripe(

224

sched_op.parent_ps,

225

block_config.old_style_representation(),

is_first_h_stripe,

is_last_h_stripe,

ifm_tensor,

ifm_box,

ofm_tensor,

ofm_box,

weight_tensor,

weight_box,

Tim Hall

d784af7

2021-06-08 21:25:57 +0100

[diff] [blame]

234

scale_tensor,

Tim Hall