ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (c) 2017-2021 Arm Limited. |
| 3 | * |
| 4 | * SPDX-License-Identifier: MIT |
| 5 | * |
| 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 7 | * of this software and associated documentation files (the "Software"), to |
| 8 | * deal in the Software without restriction, including without limitation the |
| 9 | * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or |
| 10 | * sell copies of the Software, and to permit persons to whom the Software is |
| 11 | * furnished to do so, subject to the following conditions: |
| 12 | * |
| 13 | * The above copyright notice and this permission notice shall be included in all |
| 14 | * copies or substantial portions of the Software. |
| 15 | * |
| 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| 22 | * SOFTWARE. |
| 23 | */ |
| 24 | #include "gemm_helpers.h" |
Adnan AlSinan | 3e155a5 | 2021-12-10 12:34:02 +0000 | [diff] [blame] | 25 | #include "helpers.h" |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 26 | #include "repeat.h" |
Adnan AlSinan | 3e155a5 | 2021-12-10 12:34:02 +0000 | [diff] [blame] | 27 | #include "tile_helpers.h" |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 28 | |
Adnan AlSinan | 3e155a5 | 2021-12-10 12:34:02 +0000 | [diff] [blame] | 29 | #if defined(RESHAPE_LHS_NT) |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 30 | /** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (not transposed) in |
| 31 | * the output matrix unrolling the values. |
| 32 | * |
| 33 | * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float) |
| 34 | * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16) |
| 35 | * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16) |
| 36 | * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2). |
Adnan AlSinan | 3e155a5 | 2021-12-10 12:34:02 +0000 | [diff] [blame] | 37 | * @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_M0 (e.g. -DPARTIAL_M0=1) |
| 38 | * @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_K0 (e.g. -DPARTIAL_K0=1) |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 39 | * @note Only the following values for M0, K0 and V0 are supported: |
| 40 | * M0: 2,3,4,5,6,7,8 |
| 41 | * K0: 2,3,4,8,16 |
| 42 | * V0: greater than 0 |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 43 | * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time. |
| 44 | * |
Adnan AlSinan | 3e155a5 | 2021-12-10 12:34:02 +0000 | [diff] [blame] | 45 | * @param[in] src_ptr Pointer to the source tensor. Supported data types: All |
| 46 | * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) |
| 47 | * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) |
| 48 | * @param[in] src_w The size of the width dimension of the source tensor |
| 49 | * @param[in] src_h The size of the height dimension of the source tensor |
| 50 | * @param[in] src_n The size of the depth dimension of the source tensor |
| 51 | * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor |
| 52 | * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: All |
| 53 | * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) |
| 54 | * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) |
| 55 | * @param[in] dst_w The size of the width dimension of the destination tensor |
| 56 | * @param[in] dst_h The size of the height dimension of the destination tensor |
| 57 | * @param[in] dst_n The size of the depth dimension of the destination tensor |
| 58 | * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor |
| 59 | * @param[in] M The size of height dimension of the source tensor, affected by reinterpret_input_as_3d |
| 60 | * @param[in] V0 The number of blocks to place on the same row. It must be greater than 0. |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 61 | */ |
Adnan AlSinan | 3e155a5 | 2021-12-10 12:34:02 +0000 | [diff] [blame] | 62 | __kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_T(src, BUFFER), |
| 63 | TENSOR3D_T(dst, BUFFER), |
| 64 | const int M, |
| 65 | const int V0) |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 66 | { |
| 67 | // Block size |
| 68 | #define BLOCK_SIZE ((M0) * (K0)) |
| 69 | |
| 70 | // Output offset X |
| 71 | #if defined(INTERLEAVE) |
| 72 | #define OUTPUT_OFFSET_X (K0) |
| 73 | #else // defined(INTERLEAVE) |
| 74 | #define OUTPUT_OFFSET_X (BLOCK_SIZE) |
| 75 | #endif // defined(INTERLEAVE) |
| 76 | |
| 77 | // Output step X |
| 78 | #if defined(INTERLEAVE) |
| 79 | #define OUTPUT_STEP_X (K0) * (V0) |
| 80 | #else // Do not interleave |
| 81 | #define OUTPUT_STEP_X (K0) |
| 82 | #endif // defined(INTERLEAVE) |
| 83 | |
Adnan AlSinan | 3e155a5 | 2021-12-10 12:34:02 +0000 | [diff] [blame] | 84 | const int x = GET_SPATIAL_IDX(0, 1, 0); // K |
| 85 | const int y = GET_SPATIAL_IDX(1, 1, 0); // M |
| 86 | const int z = GET_SPATIAL_IDX(2, 1, 0); // Batch size |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 87 | |
Adnan AlSinan | 3e155a5 | 2021-12-10 12:34:02 +0000 | [diff] [blame] | 88 | const int xi = x * K0; |
| 89 | const int yi = y * M0; |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 90 | |
Adnan AlSinan | 3e155a5 | 2021-12-10 12:34:02 +0000 | [diff] [blame] | 91 | const int xo = x * BLOCK_SIZE * V0 + (y % V0) * OUTPUT_OFFSET_X; |
| 92 | const int yo = (y / V0); |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 93 | |
Adnan AlSinan | 3e155a5 | 2021-12-10 12:34:02 +0000 | [diff] [blame] | 94 | // src_stride_z is expressed as M * src_stride_y, to handle case where reinterpret_input_as_3d=true |
| 95 | src_offset_first_element_in_bytes += yi * src_stride_y + z * M * src_stride_y; |
| 96 | dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z; |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 97 | |
Adnan AlSinan | 3e155a5 | 2021-12-10 12:34:02 +0000 | [diff] [blame] | 98 | TILE(DATA_TYPE, M0, K0, in); |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 99 | |
Adnan AlSinan | 3e155a5 | 2021-12-10 12:34:02 +0000 | [diff] [blame] | 100 | // Initialize the input tile to zero |
| 101 | LOOP_UNROLLING(int, _i, 0, 1, M0, |
| 102 | { |
| 103 | in[_i].v = 0; |
| 104 | }); |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 105 | |
Adnan AlSinan | 3e155a5 | 2021-12-10 12:34:02 +0000 | [diff] [blame] | 106 | bool x_cond = (xi + K0 >= src_w) && (PARTIAL_K0 != 0); |
| 107 | bool y_cond = (yi + M0 >= M) && (PARTIAL_M0 != 0); |
| 108 | // Load input tile |
| 109 | TILE(uint, M0, 1, in_indirect_y); |
| 110 | LOOP_UNROLLING(int, _i, 0, 1, M0, |
| 111 | { |
| 112 | in_indirect_y[_i].v = _i; |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 113 | |
Adnan AlSinan | 3e155a5 | 2021-12-10 12:34:02 +0000 | [diff] [blame] | 114 | }); |
| 115 | #if PARTIAL_M0 != 0 |
| 116 | if(y_cond) |
| 117 | { |
| 118 | T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, PARTIAL_M0, K0, PARTIAL_K0, BUFFER, src, xi, src_stride_y, x_cond, in, in_indirect_y); |
| 119 | } |
| 120 | else |
| 121 | #endif // PARTIAL_M0 != 0 |
| 122 | { |
| 123 | T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, K0, PARTIAL_K0, BUFFER, src, xi, src_stride_y, x_cond, in, in_indirect_y); |
| 124 | } |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 125 | |
Adnan AlSinan | 3e155a5 | 2021-12-10 12:34:02 +0000 | [diff] [blame] | 126 | // Store output tile |
| 127 | TILE(uint, M0, 1, dst_indirect_y); |
| 128 | LOOP_UNROLLING(int, _i, 0, 1, M0, |
| 129 | { |
| 130 | dst_indirect_y[_i].v = _i; |
| 131 | }); |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 132 | |
Adnan AlSinan | 3e155a5 | 2021-12-10 12:34:02 +0000 | [diff] [blame] | 133 | T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, K0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in, dst_indirect_y); |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 134 | #undef BLOCK_SIZE |
| 135 | #undef OUTPUT_OFFSET_X |
| 136 | #undef OUTPUT_STEP_X |
| 137 | } |
Adnan AlSinan | 3e155a5 | 2021-12-10 12:34:02 +0000 | [diff] [blame] | 138 | #endif // defined(RESHAPE_LHS_NT) |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 139 | |
Adnan AlSinan | 3e155a5 | 2021-12-10 12:34:02 +0000 | [diff] [blame] | 140 | #if defined(RESHAPE_LHS_T) |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 141 | /** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (transposed) in |
| 142 | * the output matrix unrolling the values. |
| 143 | * |
| 144 | * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float) |
| 145 | * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16) |
| 146 | * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16) |
| 147 | * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2). |
Adnan AlSinan | 3e155a5 | 2021-12-10 12:34:02 +0000 | [diff] [blame] | 148 | * @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_M0 (e.g. -DPARTIAL_M0=1) |
| 149 | * @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_K0 (e.g. -DPARTIAL_K0=1) |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 150 | * @note Only the following values for M0, K0 and V0 are supported: |
Adnan AlSinan | 3e155a5 | 2021-12-10 12:34:02 +0000 | [diff] [blame] | 151 | * M0: 2,3,4,8,16 |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 152 | * K0: 2,3,4,8,16 |
| 153 | * V0: greater than 0 |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 154 | * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time. |
| 155 | * |
Adnan AlSinan | 3e155a5 | 2021-12-10 12:34:02 +0000 | [diff] [blame] | 156 | * @param[in] src_ptr Pointer to the source tensor. Supported data types: All |
| 157 | * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) |
| 158 | * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) |
| 159 | * @param[in] src_w The size of the width dimension of the source tensor |
| 160 | * @param[in] src_h The size of the height dimension of the source tensor |
| 161 | * @param[in] src_n The size of the depth dimension of the source tensor |
| 162 | * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor |
| 163 | * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: All |
| 164 | * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) |
| 165 | * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) |
| 166 | * @param[in] dst_w The size of the width dimension of the destination tensor |
| 167 | * @param[in] dst_h The size of the height dimension of the destination tensor |
| 168 | * @param[in] dst_n The size of the depth dimension of the destination tensor |
| 169 | * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor |
| 170 | * @param[in] M The size of height dimension of the source tensor, affected by reinterpret_input_as_3d |
| 171 | * @param[in] V0 The number of blocks to place on the same row. It must be greater than 0 |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 172 | */ |
Adnan AlSinan | 3e155a5 | 2021-12-10 12:34:02 +0000 | [diff] [blame] | 173 | __kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_T(src, BUFFER), |
| 174 | TENSOR3D_T(dst, BUFFER), |
| 175 | const int M, |
| 176 | const int V0) |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 177 | { |
| 178 | // Block size |
| 179 | #define BLOCK_SIZE ((M0) * (K0)) |
| 180 | |
| 181 | // Output offset X |
| 182 | #if defined(INTERLEAVE) |
| 183 | #define OUTPUT_OFFSET_X (M0) |
| 184 | #else // defined(INTERLEAVE) |
| 185 | #define OUTPUT_OFFSET_X (BLOCK_SIZE) |
| 186 | #endif // defined(INTERLEAVE) |
| 187 | |
| 188 | // Output step X |
| 189 | #if defined(INTERLEAVE) |
| 190 | #define OUTPUT_STEP_X (M0) * (V0) |
| 191 | #else // Do not interleave |
| 192 | #define OUTPUT_STEP_X (M0) |
| 193 | #endif // defined(INTERLEAVE) |
| 194 | |
Adnan AlSinan | 3e155a5 | 2021-12-10 12:34:02 +0000 | [diff] [blame] | 195 | const int x = GET_SPATIAL_IDX(0, 1, 0); // K |
| 196 | const int y = GET_SPATIAL_IDX(1, 1, 0); // M |
| 197 | const int z = GET_SPATIAL_IDX(2, 1, 0); // Batch size |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 198 | |
Adnan AlSinan | 3e155a5 | 2021-12-10 12:34:02 +0000 | [diff] [blame] | 199 | const int xi = x * K0; |
| 200 | const int yi = y * M0; |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 201 | |
Adnan AlSinan | 3e155a5 | 2021-12-10 12:34:02 +0000 | [diff] [blame] | 202 | const int xo = x * BLOCK_SIZE * V0 + ((y % V0) * OUTPUT_OFFSET_X); |
| 203 | const int yo = (y / V0); |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 204 | |
Adnan AlSinan | 3e155a5 | 2021-12-10 12:34:02 +0000 | [diff] [blame] | 205 | // src_stride_z is expressed as M * src_stride_y, to handle case where reinterpret_input_as_3d=true |
| 206 | src_offset_first_element_in_bytes += yi * src_stride_y + z * M * src_stride_y; |
| 207 | dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z; |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 208 | |
Adnan AlSinan | 3e155a5 | 2021-12-10 12:34:02 +0000 | [diff] [blame] | 209 | TILE(DATA_TYPE, M0, K0, in); |
| 210 | TILE(DATA_TYPE, K0, M0, in_tr); |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 211 | |
Adnan AlSinan | 3e155a5 | 2021-12-10 12:34:02 +0000 | [diff] [blame] | 212 | // Initialize the tile to zero |
| 213 | LOOP_UNROLLING(int, _i, 0, 1, M0, |
| 214 | { |
| 215 | in[_i].v = 0; |
| 216 | }); |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 217 | |
Adnan AlSinan | 3e155a5 | 2021-12-10 12:34:02 +0000 | [diff] [blame] | 218 | // Load input tile |
| 219 | bool x_cond = (xi + K0 >= src_w) && (PARTIAL_K0 != 0); |
| 220 | bool y_cond = (yi + M0 >= M) && (PARTIAL_M0 != 0); |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 221 | |
Adnan AlSinan | 3e155a5 | 2021-12-10 12:34:02 +0000 | [diff] [blame] | 222 | TILE(uint, M0, 1, in_indirect_y); |
| 223 | LOOP_UNROLLING(int, _i, 0, 1, M0, |
| 224 | { |
| 225 | in_indirect_y[_i].v = _i; |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 226 | |
Adnan AlSinan | 3e155a5 | 2021-12-10 12:34:02 +0000 | [diff] [blame] | 227 | }); |
| 228 | #if PARTIAL_M0 != 0 |
| 229 | if(y_cond) |
| 230 | { |
| 231 | T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, PARTIAL_M0, K0, PARTIAL_K0, BUFFER, src, xi, src_stride_y, x_cond, in, in_indirect_y); |
| 232 | } |
| 233 | else |
| 234 | #endif // PARTIAL_M0 != 0 |
| 235 | { |
| 236 | T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, K0, PARTIAL_K0, BUFFER, src, xi, src_stride_y, x_cond, in, in_indirect_y); |
| 237 | } |
| 238 | // Transpose input tile |
| 239 | LOOP_UNROLLING(int, m0, 0, 1, M0, |
| 240 | { |
| 241 | LOOP_UNROLLING(int, k0, 0, 1, K0, |
| 242 | { |
| 243 | in_tr[k0].s[m0] = in[m0].s[k0]; |
| 244 | }) |
| 245 | }); |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 246 | |
Adnan AlSinan | 3e155a5 | 2021-12-10 12:34:02 +0000 | [diff] [blame] | 247 | TILE(uint, K0, 1, dst_indirect_y); |
| 248 | LOOP_UNROLLING(int, _i, 0, 1, K0, |
| 249 | { |
| 250 | dst_indirect_y[_i].v = _i; |
| 251 | }); |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 252 | |
Adnan AlSinan | 3e155a5 | 2021-12-10 12:34:02 +0000 | [diff] [blame] | 253 | // Store output tile |
| 254 | T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, K0, M0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in_tr, dst_indirect_y); |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 255 | |
| 256 | #undef BLOCK_SIZE |
| 257 | #undef OUTPUT_OFFSET_X |
| 258 | #undef OUTPUT_STEP_X |
| 259 | } |
Adnan AlSinan | 3e155a5 | 2021-12-10 12:34:02 +0000 | [diff] [blame] | 260 | #endif // defined(RESHAPE_LHS_T) |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 261 | |
Gian Marco Iodice | 4fb5670 | 2021-11-10 11:18:50 +0000 | [diff] [blame] | 262 | #if defined(RESHAPE_RHS_NT) |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 263 | /** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (not transposed) in |
| 264 | * the output matrix unrolling the values. |
| 265 | * |
| 266 | * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float) |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 267 | * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2). |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 268 | * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time. |
| 269 | * @note Only the following values for K0, N0 and H0 are supported: |
| 270 | * N0: 2,3,4,8,16 |
| 271 | * K0: 1,2,3,4,8,16 |
| 272 | * H0: greater than 0 |
| 273 | * |
Gian Marco Iodice | 4fb5670 | 2021-11-10 11:18:50 +0000 | [diff] [blame] | 274 | * @param[in] src_ptr Pointer to the source tensor. Supported data types: All |
| 275 | * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) |
| 276 | * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) |
| 277 | * @param[in] src_w The size of the width dimension of the source tensor |
| 278 | * @param[in] src_h The size of the height dimension of the source tensor |
| 279 | * @param[in] src_n The size of the depth dimension of the source tensor |
| 280 | * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor |
| 281 | * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: All |
| 282 | * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) |
| 283 | * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) |
| 284 | * @param[in] dst_w The size of the width dimension of the destination tensor |
| 285 | * @param[in] dst_h The size of the height dimension of the destination tensor |
| 286 | * @param[in] dst_n The size of the depth dimension of the destination tensor |
| 287 | * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor |
Adnan AlSinan | 3e155a5 | 2021-12-10 12:34:02 +0000 | [diff] [blame] | 288 | * @param[in] H0 The number of blocks to place on the same row. It must be greater than 0 |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 289 | */ |
Gian Marco Iodice | 4fb5670 | 2021-11-10 11:18:50 +0000 | [diff] [blame] | 290 | __kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_T(src, BUFFER), |
| 291 | TENSOR3D_T(dst, BUFFER), |
| 292 | const int H0) |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 293 | { |
| 294 | // Block size |
| 295 | #define BLOCK_SIZE ((K0) * (N0)) |
| 296 | |
| 297 | // Output offset X |
| 298 | #if defined(INTERLEAVE) |
| 299 | #define OUTPUT_OFFSET_X (N0) |
| 300 | #else // defined(INTERLEAVE) |
| 301 | #define OUTPUT_OFFSET_X (BLOCK_SIZE) |
| 302 | #endif // defined(INTERLEAVE) |
| 303 | |
| 304 | // Output step X |
| 305 | #if defined(INTERLEAVE) |
| 306 | #define OUTPUT_STEP_X (N0) * (H0) |
| 307 | #else // Do not interleave |
| 308 | #define OUTPUT_STEP_X (N0) |
| 309 | #endif // defined(INTERLEAVE) |
| 310 | |
Gian Marco Iodice | 4fb5670 | 2021-11-10 11:18:50 +0000 | [diff] [blame] | 311 | const int x = GET_SPATIAL_IDX(0, 1, 0); |
| 312 | const int y = GET_SPATIAL_IDX(1, 1, 0); |
| 313 | const int z = GET_SPATIAL_IDX(2, 1, 0); |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 314 | |
Gian Marco Iodice | 4fb5670 | 2021-11-10 11:18:50 +0000 | [diff] [blame] | 315 | const int xi = x * N0; |
| 316 | const int yi = y * K0; |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 317 | |
Gian Marco Iodice | 4fb5670 | 2021-11-10 11:18:50 +0000 | [diff] [blame] | 318 | const int xo = y * BLOCK_SIZE * H0 + (x % H0) * OUTPUT_OFFSET_X; |
| 319 | const int yo = (x / H0); |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 320 | |
Gian Marco Iodice | 4fb5670 | 2021-11-10 11:18:50 +0000 | [diff] [blame] | 321 | src_offset_first_element_in_bytes += yi * src_stride_y + z * src_stride_z; |
| 322 | dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z; |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 323 | |
Gian Marco Iodice | 4fb5670 | 2021-11-10 11:18:50 +0000 | [diff] [blame] | 324 | TILE(DATA_TYPE, K0, N0, in); |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 325 | |
Gian Marco Iodice | 4fb5670 | 2021-11-10 11:18:50 +0000 | [diff] [blame] | 326 | // Initialize the tile to zero |
| 327 | for(int i = 0; i < K0; ++i) |
| 328 | { |
| 329 | in[i].v = 0; |
| 330 | } |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 331 | |
Gian Marco Iodice | 4fb5670 | 2021-11-10 11:18:50 +0000 | [diff] [blame] | 332 | // Load input tile |
| 333 | for(int i = 0; i < K0; ++i) |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 334 | { |
Gian Marco Iodice | 4fb5670 | 2021-11-10 11:18:50 +0000 | [diff] [blame] | 335 | if(yi + i < src_h) |
| 336 | { |
| 337 | in[i].v = V_LOAD(DATA_TYPE, N0, BUFFER, src, xi, i, src_stride_y); |
| 338 | } |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 339 | } |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 340 | |
Gian Marco Iodice | 4fb5670 | 2021-11-10 11:18:50 +0000 | [diff] [blame] | 341 | TILE(uint, K0, 1, dst_indirect_y); |
| 342 | for(int i = 0; i < K0; ++i) |
| 343 | { |
| 344 | dst_indirect_y[i].v = i; |
| 345 | } |
| 346 | |
| 347 | T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, K0, N0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in, dst_indirect_y); |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 348 | |
| 349 | #undef BLOCK_SIZE |
| 350 | #undef OUTPUT_OFFSET_X |
| 351 | #undef OUTPUT_STEP_X |
| 352 | } |
Gian Marco Iodice | 4fb5670 | 2021-11-10 11:18:50 +0000 | [diff] [blame] | 353 | #endif // defined(RESHAPE_RHS_NT) |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 354 | |
Gian Marco Iodice | 4fb5670 | 2021-11-10 11:18:50 +0000 | [diff] [blame] | 355 | #if defined(RESHAPE_RHS_T) |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 356 | /** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (transposed) in |
| 357 | * the output matrix unrolling the values. |
| 358 | * |
| 359 | * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float) |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 360 | * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2). |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 361 | * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time. |
| 362 | * @note The option -DTRANSPOSE must passed at compile time. |
| 363 | * @note Only the following values for K0, N0 and H0 are supported: |
| 364 | * N0: 2,3,4,8,16 |
| 365 | * K0: 2,3,4,8,16 |
| 366 | * H0: greater than 0 |
| 367 | * |
Gian Marco Iodice | 4fb5670 | 2021-11-10 11:18:50 +0000 | [diff] [blame] | 368 | * @param[in] src_ptr Pointer to the source tensor. Supported data types: All |
| 369 | * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) |
| 370 | * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) |
| 371 | * @param[in] src_w The size of the width dimension of the source tensor |
| 372 | * @param[in] src_h The size of the height dimension of the source tensor |
| 373 | * @param[in] src_n The size of the depth dimension of the source tensor |
| 374 | * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor |
| 375 | * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: All |
| 376 | * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) |
| 377 | * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) |
| 378 | * @param[in] dst_w The size of the width dimension of the destination tensor |
| 379 | * @param[in] dst_h The size of the height dimension of the destination tensor |
| 380 | * @param[in] dst_n The size of the depth dimension of the destination tensor |
| 381 | * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor |
| 382 | * @param[in] H0 The number of blocks to place on the same row. It must be greater than 0. |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 383 | */ |
Gian Marco Iodice | 4fb5670 | 2021-11-10 11:18:50 +0000 | [diff] [blame] | 384 | __kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_T(src, BUFFER), |
| 385 | TENSOR3D_T(dst, BUFFER), |
| 386 | const int H0) |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 387 | { |
| 388 | // Block size |
| 389 | #define BLOCK_SIZE ((K0) * (N0)) |
| 390 | |
| 391 | // Output offset X |
| 392 | #if defined(INTERLEAVE) |
| 393 | #define OUTPUT_OFFSET_X (K0) |
| 394 | #else // defined(INTERLEAVE) |
| 395 | #define OUTPUT_OFFSET_X (BLOCK_SIZE) |
| 396 | #endif // defined(INTERLEAVE) |
| 397 | |
| 398 | // Output step X |
| 399 | #if defined(INTERLEAVE) |
| 400 | #define OUTPUT_STEP_X (K0) * (H0) |
| 401 | #else // Do not interleave |
| 402 | #define OUTPUT_STEP_X (K0) |
| 403 | #endif // defined(INTERLEAVE) |
| 404 | |
Gian Marco Iodice | 4fb5670 | 2021-11-10 11:18:50 +0000 | [diff] [blame] | 405 | const int x = GET_SPATIAL_IDX(0, 1, 0); |
| 406 | const int y = GET_SPATIAL_IDX(1, 1, 0); |
| 407 | const int z = GET_SPATIAL_IDX(2, 1, 0); |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 408 | |
Gian Marco Iodice | 4fb5670 | 2021-11-10 11:18:50 +0000 | [diff] [blame] | 409 | const int xi = x * N0; |
| 410 | const int yi = y * K0; |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 411 | |
Gian Marco Iodice | 4fb5670 | 2021-11-10 11:18:50 +0000 | [diff] [blame] | 412 | const int xo = y * BLOCK_SIZE * H0 + (x % H0) * OUTPUT_OFFSET_X; |
| 413 | const int yo = (x / H0); |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 414 | |
Gian Marco Iodice | 4fb5670 | 2021-11-10 11:18:50 +0000 | [diff] [blame] | 415 | src_offset_first_element_in_bytes += yi * src_stride_y + z * src_stride_z; |
| 416 | dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z; |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 417 | |
Gian Marco Iodice | 4fb5670 | 2021-11-10 11:18:50 +0000 | [diff] [blame] | 418 | TILE(DATA_TYPE, K0, N0, in); |
| 419 | TILE(DATA_TYPE, N0, K0, in_tr); |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 420 | |
Gian Marco Iodice | 4fb5670 | 2021-11-10 11:18:50 +0000 | [diff] [blame] | 421 | // Initialize the tile to zero |
| 422 | for(int i = 0; i < K0; ++i) |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 423 | { |
Gian Marco Iodice | 4fb5670 | 2021-11-10 11:18:50 +0000 | [diff] [blame] | 424 | in[i].v = 0; |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 425 | } |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 426 | |
Gian Marco Iodice | 4fb5670 | 2021-11-10 11:18:50 +0000 | [diff] [blame] | 427 | // Load input tile |
| 428 | for(int i = 0; i < K0; ++i) |
| 429 | { |
| 430 | if(yi + i < src_h) |
| 431 | { |
| 432 | in[i].v = V_LOAD(DATA_TYPE, N0, BUFFER, src, xi, i, src_stride_y); |
| 433 | } |
| 434 | } |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 435 | |
Gian Marco Iodice | 4fb5670 | 2021-11-10 11:18:50 +0000 | [diff] [blame] | 436 | // Transpose input tile |
| 437 | for(int k0 = 0; k0 < K0; ++k0) |
| 438 | { |
| 439 | for(int n0 = 0; n0 < N0; ++n0) |
| 440 | { |
| 441 | in_tr[n0].s[k0] = in[k0].s[n0]; |
| 442 | } |
| 443 | } |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 444 | |
Gian Marco Iodice | 4fb5670 | 2021-11-10 11:18:50 +0000 | [diff] [blame] | 445 | TILE(uint, N0, 1, dst_indirect_y); |
| 446 | for(int i = 0; i < N0; ++i) |
| 447 | { |
| 448 | dst_indirect_y[i].v = i; |
| 449 | } |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 450 | |
Gian Marco Iodice | 4fb5670 | 2021-11-10 11:18:50 +0000 | [diff] [blame] | 451 | T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, N0, K0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in_tr, dst_indirect_y); |
ramelg01 | 9cca592 | 2021-11-11 10:05:00 +0000 | [diff] [blame] | 452 | |
| 453 | #undef BLOCK_SIZE |
| 454 | #undef OUTPUT_OFFSET_X |
| 455 | #undef OUTPUT_STEP_X |
| 456 | } |
Gian Marco Iodice | 4fb5670 | 2021-11-10 11:18:50 +0000 | [diff] [blame] | 457 | |
| 458 | #endif // defined(RESHAPE_RHS_T) |