| // |
| // This confidential and proprietary software may be used only as |
| // authorised by a licensing agreement from ARM Limited |
| // (C) COPYRIGHT 2020-2022 ARM Limited |
| // ALL RIGHTS RESERVED |
| // The entire notice above must be reproduced on all authorised |
| // copies and copies may only be made to the extent permitted |
| // by a licensing agreement from ARM Limited. |
| |
| === Tensor Operators |
| |
| ==== ARGMAX |
| |
| This returns the index with the largest value across the given axis of the input tensor. |
| |
| include::{generated}/operators/ARGMAX.adoc[] |
| |
| [source,c++] |
| ---- |
| ERROR_IF(axis < 0 || axis >= rank(shape1)); |
| if (axis == 0) { |
| left_shape = []; |
| } else { |
| left_shape = shape1[0:axis - 1]; |
| } |
| if (axis == rank(shape1)-1) { |
| right_shape = []; |
| } else { |
| right_shape = shape1[axis+1:rank(shape1) - 1]; |
| } |
| ERROR_IF(flatten(left_shape, right_shape) != shape); |
| for_each(left_index in left_shape) { |
| for_each(right_index in right_shape) { |
| in_t max_value = minimum_value<in_t>; |
| out_t max_index = 0; |
| for (i = 0; i < shape1[axis]; i++) { |
| dim_t index = flatten(left_index, [i], right_index); |
| in_t value = tensor_read<in_t>(input, shape1, index); |
| if (value > max_value) { max_value = value; max_index = i; } |
| } |
| dim_t index = flatten(left_index, right_index); |
| tensor_write<out_t>(output, shape, index, max_index); |
| } |
| } |
| ---- |
| |
| ==== AVG_POOL2D |
| |
| This performs an average pooling over the given input tensor. |
| A sliding window of size given by <kernel size> is passed over the input tensor, with the mean value being placed in the output tensor. |
| When calculating the average, only valid input tensor values are counted to form the divisor; padding positions are not counted. |
| |
| include::{generated}/operators/AVG_POOL2D.adoc[] |
| |
| [source,c++] |
| ---- |
| ERROR_IF(in_out_t != int8_t && input_zp != 0); // Zero point only for int8_t |
| ERROR_IF(in_out_t != int8_t && output_zp != 0); // Zero point only for int8_t |
| ERROR_IF(kernel_y < 1 || kernel_x < 1); // kernel size must be >= 1 |
| ERROR_IF(stride_y < 1 || stride_x < 1); |
| ERROR_IF(pad_top < 0 || pad_bottom < 0 || pad_left < 0 || pad_right < 0); |
| // Padding must be less than kernel size to avoid |
| // a divide-by-zero. |
| ERROR_IF(pad_right >= kernel_x || pad_left >= kernel_x); |
| ERROR_IF(pad_top >= kernel_y || pad_bottom >= kernel_y); |
| ERROR_IF(OH != idiv_check(IH + pad_top + pad_bottom - kernel_y, stride_y) + 1); |
| ERROR_IF(OW != idiv_check(IW + pad_left + pad_right - kernel_x, stride_x) + 1); |
| |
| for_each(0 <= n < N, 0 <= oy < OH, 0 <= ox < OW, 0 <= c < C ) { |
| in_out_t output_val; |
| acc_t acc = 0; |
| int count = 0; |
| index_t iy = oy * stride_y - pad_top; |
| index_t ix = ox * stride_x - pad_left; |
| for_each(0 <= ky < kernel_y, 0 <= kx < kernel_x) { |
| index_t y = iy + ky; |
| index_t x = ix + kx; |
| // Only values from the input tensor are used to calculate the |
| // average, padding does not count |
| if (0 <= y < IH && 0 <= x < IW) { |
| count++; |
| acc_t value = tensor_read<in_out_t>(input, [N,IH,IW,C], [n,y,x,c]); |
| value = value - input_zp; |
| acc = apply_add<acc_t>(acc, value); |
| } |
| } |
| if (is_float(in_out_t)) { |
| output_val = acc / (float)count; |
| } else { |
| scale_t scale = reciprocal_scale(count); |
| acc = apply_scale_32(acc, scale.multiplier, scale.shift, false); |
| output_val = (in_out_t)apply_clip<acc_t>(acc + output_zp, minimum<in_out_t>, maximum<in_out_t>); |
| } |
| tensor_write<in_out_t>(output, [N,OH,OW,C], [n,oy,ox,c], output_val); |
| } |
| ---- |
| |
| ==== CONV2D |
| |
| Performs a 2D convolution over the given tensor input, using the weight tensor. |
| |
| include::{generated}/operators/CONV2D.adoc[] |
| |
| [source,c++] |
| ---- |
| ERROR_IF(in_t != int8_t && input_zp != 0); // Zero point only for int8_t |
| ERROR_IF(weight_t != int8_t && weight_zp != 0); |
| ERROR_IF(pad_top < 0 || pad_bottom < 0 || pad_left < 0 || pad_right < 0); |
| ERROR_IF(stride_y < 1 || stride_x < 1); |
| ERROR_IF(dilation_y < 1 || dilation_x < 1); |
| ERROR_IF(OH != idiv_check(IH - 1 + pad_top + pad_bottom - (KH - 1) * dilation_y, stride_y) + 1); |
| ERROR_IF(OW != idiv_check(IW - 1 + pad_left + pad_right - (KW - 1) * dilation_x, stride_x) + 1); |
| ERROR_IF(BC != OC && BC != 1); |
| |
| for_each(0 <= n < N, 0 <= oy < OH, 0 <= ox < OW, 0 <= oc < OC) { |
| out_t acc = 0; |
| index_t iy = oy * stride_y - pad_top; |
| index_t ix = ox * stride_x - pad_left; |
| for_each(0 <= ky < KH, 0 <= kx < KW, 0 <= ic < IC) { |
| index_t y = iy + ky * dilation_y; |
| index_t x = ix + kx * dilation_x; |
| if (0 <= y < IH && 0 <= x < IW) { |
| out_t value = tensor_read<in_t>(input, [N,IH,IW,IC], [n,y,x,ic]); |
| out_t weight = tensor_read<weight_t>(weight, [OC,KH,KW,IC], [oc,ky,kx,ic]); |
| value = value - input_zp; |
| weight = weight - weight_zp; |
| acc = apply_add<out_t>(acc, value * weight); |
| } |
| } |
| acc = apply_add<out_t>(acc, bias[(BC == 1) ? 0 : oc]); |
| tensor_write<out_t>(output, [N,OH,OW,OC], [n,oy,ox,oc], acc); |
| } |
| ---- |
| |
| ==== CONV3D |
| |
| Performs a 3D convolution over the given input tensor. |
| |
| include::{generated}/operators/CONV3D.adoc[] |
| |
| [source,c++] |
| ---- |
| ERROR_IF(in_t != int8_t && input_zp != 0); // Zero point only for int8_t |
| ERROR_IF(weight_t != int8_t && weight_zp != 0); |
| ERROR_IF(pad_d0 < 0 || pad_d1 < 0 || pad_top < 0 || pad_bottom < 0 || pad_left < 0 || pad_right < 0); |
| ERROR_IF(stride_d < 1 || stride_y < 1 || stride_x < 1); |
| ERROR_IF(dilation_d < 1 || dilation_y < 1 || dilation_x < 1); |
| ERROR_IF(OD != idiv_check(ID - 1 + pad_d0 + pad_d1 - (KD - 1) * dilation_d, stride_d) + 1); |
| ERROR_IF(OH != idiv_check(IH - 1 + pad_top + pad_bottom - (KH - 1) * dilation_y, stride_y) + 1); |
| ERROR_IF(OW != idiv_check(IW - 1 + pad_left + pad_right - (KW - 1) * dilation_x, stride_x) + 1); |
| ERROR_IF(BC != OC && BC != 1); |
| |
| for_each(0 <= n < N, 0 <= od < OD, 0 <= oy < OH, 0 <= ox < OW, 0 <= oc < OC) { |
| out_t acc = 0; |
| index_t id = od * stride_d - pad_d0; |
| index_t iy = oy * stride_y - pad_top; |
| index_t ix = ox * stride_x - pad_left; |
| for_each(0 <= kd < KD, 0 <= ky < KH, 0 <= kx < KW, 0 <= ic < IC) { |
| index_t d = id + kd * dilation_d; |
| index_t y = iy + ky * dilation_y; |
| index_t x = ix + kx * dilation_x; |
| if (0 <= x < IW && 0 <= y < IH && 0 <= d < ID) { |
| out_t value = tensor_read<in_t>(input, [N,ID,IH,IW,IC], [n,d,y,x,ic]); |
| out_t weight = tensor_read<weight_t>(weight,[OC,KD,KH,KW,IC],[oc,kd,ky,kx,ic]); |
| value = value - input_zp; |
| weight = weight - weight_zp; |
| acc = apply_add<out_t>(acc, value * weight); |
| } |
| } |
| acc = apply_add<out_t>(acc, bias[(BC == 1) ? 0 : oc]); |
| tensor_write<out_t>(output, [N,OD,OH,OW,OC], [n,od,oy,ox,oc], acc); |
| } |
| ---- |
| |
| ==== DEPTHWISE_CONV2D |
| |
| Performs 2D convolutions separately over each channel of the given tensor input, using the weight tensor. |
| |
| include::{generated}/operators/DEPTHWISE_CONV2D.adoc[] |
| |
| [source,c++] |
| ---- |
| ERROR_IF(in_t != int8_t && input_zp != 0); // Zero point only for int8_t |
| ERROR_IF(weight_t != int8_t && weight_zp != 0); |
| ERROR_IF(pad_top < 0 || pad_bottom < 0 || pad_left < 0 || pad_right < 0); |
| ERROR_IF(stride_y < 1 || stride_x < 1); |
| ERROR_IF(dilation_y < 1 || dilation_x < 1); |
| ERROR_IF(OH != idiv_check(IH - 1 + pad_top + pad_bottom - (KH - 1) * dilation_y, stride_y) + 1); |
| ERROR_IF(OW != idiv_check(IW - 1 + pad_left + pad_right - (KW - 1) * dilation_x, stride_x) + 1); |
| ERROR_IF(BC != C*M && BC != 1); |
| |
| for_each(0 <= n < N, 0 <= oy < OH, 0 <= ox < OW, 0 <= c < C, 0 <= m < M) { |
| out_t acc = 0; |
| index_t iy = oy * stride_y - pad_top; |
| index_t ix = ox * stride_x - pad_left; |
| for_each(0 <= ky < KH, 0 <= kx < KW) { |
| index_t y = iy + ky * dilation_y; |
| index_t x = ix + kx * dilation_x; |
| if (0 <= y < IH && 0 <= x < IW) { |
| out_t value = tensor_read<in_t>(input, [N,IH,IW,C], [n,y,x,c]); |
| out_t weight = tensor_read<weight_t>(weight, [KH,KW,C,M], [ky,kx,c,m]); |
| value = value - input_zp; |
| weight = weight - weight_zp; |
| acc = apply_add<out_t>(acc, value * weight); |
| } |
| } |
| acc = apply_add<out_t>(acc, bias[(BC == 1) ? 0 : (c * M) + m]); |
| tensor_write<out_t>(output, [N,OH,OW,C * M], [n,oy,ox,c * M + m], acc); |
| } |
| ---- |
| |
| ==== FFT2D |
| |
| Performs a batched complex 2D Fast Fourier Transform over the input. |
| The complex input values are constructed from the corresponding values in the input_real and input_imag tensors. |
| The resulting values in the output are split into the output_real and output_imag tensors. |
| No normalization is applied on either the forward or inverse versions of the operation. |
| |
| // output[h][w] = \sum_{m=0}^{H-1}\sum_{n=0}^{W-1}input[m][n] * \exp\left(-2\pi i\left(\frac{mh}{H} + \frac{nw}{W}\right)\right) |
| |
| .Calculation for the forward FFT2D calculation (inverse=false) |
| image::forward_fft2d.svg["forward FFT definition", align="center"] |
| |
| // output[h][w] = \sum_{m=0}^{H-1}\sum_{n=0}^{W-1}input[m][n] * \exp\left(2\pi i\left(\frac{mh}{H} + \frac{nw}{W}\right)\right) |
| |
| .Calculation for the inverse FFT2D calculation (inverse=true) |
| image::inverse_fft2d.svg["inverse FFT definition", align="center"] |
| |
| include::{generated}/operators/FFT2D.adoc[] |
| |
| [source,c++] |
| ---- |
| ERROR_IF(!power_of_two(H)); |
| ERROR_IF(!power_of_two(W)); |
| |
| float sign_val = 1.0; |
| |
| if (inverse) { |
| sign_val = -1.0; |
| } |
| |
| for_each(0 <= n < N, 0 <= oy < H, 0 <= ox < W) { |
| in_out_t sum_real = 0.0; |
| in_out_t sum_imag = 0.0; |
| for_each(0 <= iy < H, 0 <= ix < W) { |
| in_out_t val_real = tensor_read<in_out_t>(input_real, [N,H,W], [n,iy,ix]); |
| in_out_t val_imag = tensor_read<in_out_t>(input_imag, [N,H,W], [n,iy,ix]); |
| float_t a = sign_val * 2 * pi() * ((float_t)(iy * oy) / H + (float_t)(ix * ox) / W); |
| sum_real += val_real * cos(a) + val_imag * sin(a); |
| sum_imag += -val_real * sin(a) + val_imag * cos(a); |
| } |
| tensor_write<in_out_t>(output_real, [N,H,W], [n,oy,ox], sum_real); |
| tensor_write<in_out_t>(output_imag, [N,H,W], [n,oy,ox], sum_imag); |
| } |
| ---- |
| |
| ==== FULLY_CONNECTED |
| |
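| Performs a fully connected network layer: each output channel is the sum of the input values multiplied by the corresponding weights, plus a bias. |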
| |
| include::{generated}/operators/FULLY_CONNECTED.adoc[] |
| |
| [source,c++] |
| ---- |
| ERROR_IF(in_t != int8_t && input_zp != 0); // Zero point only for int8_t |
| ERROR_IF(weight_t != int8_t && weight_zp != 0); |
| ERROR_IF(BC != OC && BC != 1); |
| |
| for_each(0 <= n < N, 0 <= oc < OC) { |
| out_t acc = 0; |
| for_each(0 <= ic < IC) { |
| out_t value = tensor_read<in_t>(input, [N,IC], [n,ic]); |
| out_t weight = tensor_read<weight_t>(weight, [OC,IC], [oc,ic]); |
| value = value - input_zp; |
| weight = weight - weight_zp; |
| acc = apply_add<out_t>(acc, value * weight); |
| } |
| acc = apply_add<out_t>(acc, bias[(BC == 1) ? 0 : oc]); |
| tensor_write<out_t>(output, [N,OC], [n,oc], acc); |
| } |
| ---- |
| |
| ==== MATMUL |
| |
| Performs two-dimensional matrix multiplications. Both inputs may be activations, unlike the FULLY_CONNECTED operator, where one operand is reserved for weights. |
| |
| include::{generated}/operators/MATMUL.adoc[] |
| |
| [source,c++] |
| ---- |
| ERROR_IF(in_t != int8_t && (A_zp != 0 || B_zp != 0)); // Zero point only for int8_t |
| for_each(0 <= n < N, 0 <= h < H, 0 <= w < W) { |
| out_t acc = 0; |
| for_each(0 <= c < C) { |
| out_t value1 = tensor_read<in_t>(A, [N,H,C], [n,h,c]); |
| out_t value2 = tensor_read<in_t>(B, [N,C,W], [n,c,w]); |
| value1 = value1 - A_zp; |
| value2 = value2 - B_zp; |
| acc = apply_add<out_t>(acc, value1 * value2); |
| } |
| tensor_write<out_t>(output, [N,H,W], [n,h,w], acc); |
| } |
| ---- |
| |
| ==== MAX_POOL2D |
| |
| This performs a max pooling over the given input tensor. A sliding window of size given by <kernel size> is passed over the input tensor, with the maximum value being placed in the output tensor. |
| |
| include::{generated}/operators/MAX_POOL2D.adoc[] |
| |
| [source,c++] |
| ---- |
| ERROR_IF(kernel_y < 1 || kernel_x < 1); // kernel size must be >= 1 |
| ERROR_IF(stride_y < 1 || stride_x < 1); |
| ERROR_IF(pad_top < 0 || pad_bottom < 0 || pad_left < 0 || pad_right < 0); |
| // Padding must be less than kernel size, otherwise no |
| // input values will be used. |
| ERROR_IF(pad_right >= kernel_x || pad_left >= kernel_x); |
| ERROR_IF(pad_top >= kernel_y || pad_bottom >= kernel_y); |
| ERROR_IF(OH != idiv_check(IH + pad_top + pad_bottom - kernel_y, stride_y) + 1); |
| ERROR_IF(OW != idiv_check(IW + pad_left + pad_right - kernel_x, stride_x) + 1); |
| |
| for_each(0 <= n < N, 0 <= oy < OH, 0 <= ox < OW, 0 <= c < C ) { |
| in_out_t acc = minimum_value<in_out_t>; |
| index_t iy = oy * stride_y - pad_top; |
| index_t ix = ox * stride_x - pad_left; |
| for_each( 0 <= ky < kernel_y, 0 <= kx < kernel_x ) { |
| index_t y = iy + ky; |
| index_t x = ix + kx; |
| if (y >= 0 && y < IH && x >= 0 && x < IW) { |
| in_out_t value = tensor_read<in_out_t>(input, [N,IH,IW,C], [n,y,x,c]); |
| acc = apply_max(acc, value); |
| } |
| } |
| tensor_write<in_out_t>(output, [N,OH,OW,C], [n,oy,ox,c], acc); |
| } |
| ---- |
| |
| ==== RFFT2D |
| |
| Performs a batched 2D real-valued Fast Fourier Transform over the input where the input tensor consists of real values producing complex valued output. |
| The complex output values will be split into the output_real and output_imag tensor arguments. |
| RFFT2D takes advantage of Hermitian symmetry to only calculate the first half of the final output axis. |
| Imaginary values with locations (0,0), (0,W/2), (H/2,0) and (H/2,W/2) are zero. |
| |
| // output[h][w] = \sum_{m=0}^{H-1}\sum_{n=0}^{W-1}input[m][n] * \exp\left(-2\pi i\left(\frac{mh}{H} + \frac{nw}{W}\right)\right) |
| |
| image::forward_fft2d.svg["forward RFFT definition", align="center"] |
| |
| include::{generated}/operators/RFFT2D.adoc[] |
| |
| [source,c++] |
| ---- |
| ERROR_IF(!power_of_two(H)); |
| ERROR_IF(!power_of_two(W)); |
| |
| for_each(0 <= n < N, 0 <= oy < H, 0 <= ox < W/2 + 1) { |
| in_out_t sum_real = 0.0; |
| in_out_t sum_imag = 0.0; |
| for_each(0 <= iy < H, 0 <= ix < W) { |
| in_out_t val_real = tensor_read<in_out_t>(input_real, [N,H,W], [n,iy,ix]); |
| float_t a = 2 * pi() * ((float_t)(iy * oy) / H + (float_t)(ix * ox) / W); |
| sum_real += val_real * cos(a); |
| sum_imag += -val_real * sin(a); |
| } |
| tensor_write<in_out_t>(output_real, [N,H,W/2 + 1], [n,oy,ox], sum_real); |
| tensor_write<in_out_t>(output_imag, [N,H,W/2 + 1], [n,oy,ox], sum_imag); |
| } |
| ---- |
| |
| ==== TRANSPOSE_CONV2D |
| |
| Performs a 2D transposed convolution over the given tensor input, using the weight tensor. |
| |
| include::{generated}/operators/TRANSPOSE_CONV2D.adoc[] |
| |
| [source,c++] |
| ---- |
| ERROR_IF(in_t != int8_t && input_zp != 0); // Zero point only allowed for int8_t |
| ERROR_IF(weight_t != int8_t && weight_zp != 0); |
| ERROR_IF(out_pad_top <= -KH || out_pad_bottom <= -KH); |
| ERROR_IF(out_pad_left <= -KW || out_pad_right <= -KW); |
| ERROR_IF(stride_y < 1 || stride_x < 1); |
| ERROR_IF(OH != (IH - 1) * stride_y + out_pad_top + out_pad_bottom + KH); |
| ERROR_IF(OW != (IW - 1) * stride_x + out_pad_left + out_pad_right + KW); |
| ERROR_IF(BC != OC && BC != 1); |
| |
| for_each(index in out_shape) { |
| tensor_write<out_t>(output, [N,OH,OW,OC], index, bias[(BC == 1) ? 0 : index[3]]); |
| } |
| for_each(0 <= n < N, 0 <= iy < IH, 0 <= ix < IW, 0 <= oc < OC, |
| 0 <= ic < IC, 0 <= ky < KH, 0 <= kx < KW) { |
| index_t oy = iy * stride_y + out_pad_top + ky; |
| index_t ox = ix * stride_x + out_pad_left + kx; |
| if (oy >= 0 && oy < OH && ox >= 0 && ox < OW) { |
| out_t acc = tensor_read<out_t>(output, [N,OH,OW,OC], [n,oy,ox,oc]); |
| out_t value = tensor_read<in_t>(input, [N,IH,IW,IC], [n,iy,ix,ic]); |
| out_t weight = tensor_read<weight_t>(weight, [OC,KH,KW,IC], [oc,ky,kx,ic]); |
| value = value - input_zp; |
| weight = weight - weight_zp; |
| acc = apply_add<out_t>(acc, value * weight); |
| tensor_write<out_t>(output, [N,OH,OW,OC], [n,oy,ox,oc], acc); |
| } |
| } |
| ---- |