Anthony Barbier | 6ff3b19 | 2017-09-04 18:44:23 +0100 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (c) 2017 ARM Limited. |
| 3 | * |
| 4 | * SPDX-License-Identifier: MIT |
| 5 | * |
| 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 7 | * of this software and associated documentation files (the "Software"), to |
| 8 | * deal in the Software without restriction, including without limitation the |
| 9 | * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or |
| 10 | * sell copies of the Software, and to permit persons to whom the Software is |
| 11 | * furnished to do so, subject to the following conditions: |
| 12 | * |
| 13 | * The above copyright notice and this permission notice shall be included in all |
| 14 | * copies or substantial portions of the Software. |
| 15 | * |
| 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| 22 | * SOFTWARE. |
| 23 | */ |
| 24 | #include "helpers.h" |
| 25 | |
| 26 | #define SWAP_ROW(u0, l0) \ |
| 27 | ({ \ |
| 28 | tmp_swap = u0; \ |
| 29 | u0 = l0; \ |
| 30 | l0 = tmp_swap; \ |
| 31 | }) |
| 32 | |
| 33 | #define SWAP_4x4(u0, u1, u2, u3, l0, l1, l2, l3) \ |
| 34 | ({ \ |
| 35 | VEC_DATA_TYPE(DATA_TYPE, 4) \ |
| 36 | tmp_swap; \ |
| 37 | SWAP_ROW(u0, l0); \ |
| 38 | SWAP_ROW(u1, l1); \ |
| 39 | SWAP_ROW(u2, l2); \ |
| 40 | SWAP_ROW(u3, l3); \ |
| 41 | }) |
| 42 | |
| 43 | #define SWAP_8x8(u0, u1, u2, u3, u4, u5, u6, u7, l0, l1, l2, l3, l4, l5, l6, l7) \ |
| 44 | ({ \ |
| 45 | VEC_DATA_TYPE(DATA_TYPE, 8) \ |
| 46 | tmp_swap; \ |
| 47 | SWAP_ROW(u0, l0); \ |
| 48 | SWAP_ROW(u1, l1); \ |
| 49 | SWAP_ROW(u2, l2); \ |
| 50 | SWAP_ROW(u3, l3); \ |
| 51 | SWAP_ROW(u4, l4); \ |
| 52 | SWAP_ROW(u5, l5); \ |
| 53 | SWAP_ROW(u6, l6); \ |
| 54 | SWAP_ROW(u7, l7); \ |
| 55 | }) |
| 56 | |
| 57 | #define TRANSPOSE_4x4(u0, u1, u2, u3) \ |
| 58 | ({ \ |
| 59 | VEC_DATA_TYPE(DATA_TYPE, 4) \ |
| 60 | tmp; \ |
| 61 | tmp.s012 = u0.s123; \ |
| 62 | u0.s1 = u1.s0; \ |
| 63 | u0.s2 = u2.s0; \ |
| 64 | u0.s3 = u3.s0; \ |
| 65 | u1.s0 = tmp.s0; \ |
| 66 | u2.s0 = tmp.s1; \ |
| 67 | u3.s0 = tmp.s2; \ |
| 68 | \ |
| 69 | tmp.s01 = u1.s23; \ |
| 70 | u1.s2 = u2.s1; \ |
| 71 | u1.s3 = u3.s1; \ |
| 72 | u2.s1 = tmp.s0; \ |
| 73 | u3.s1 = tmp.s1; \ |
| 74 | \ |
| 75 | tmp.s0 = u2.s3; \ |
| 76 | u2.s3 = u3.s2; \ |
| 77 | u3.s2 = tmp.s0; \ |
| 78 | }) |
| 79 | |
| 80 | #define TRANSPOSE_8x8(u0, u1, u2, u3, u4, u5, u6, u7) \ |
| 81 | ({ \ |
| 82 | TRANSPOSE_4x4(u0.s0123, u1.s0123, u2.s0123, u3.s0123); \ |
| 83 | TRANSPOSE_4x4(u0.s4567, u1.s4567, u2.s4567, u3.s4567); \ |
| 84 | TRANSPOSE_4x4(u4.s0123, u5.s0123, u6.s0123, u7.s0123); \ |
| 85 | TRANSPOSE_4x4(u4.s4567, u5.s4567, u6.s4567, u7.s4567); \ |
| 86 | SWAP_4x4(u0.s4567, u1.s4567, u2.s4567, u3.s4567, u4.s0123, u5.s0123, u6.s0123, u7.s0123); \ |
| 87 | }) |
| 88 | |
| 89 | #define TRANSPOSE_16x16(u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15) \ |
| 90 | ({ \ |
| 91 | TRANSPOSE_8x8(u0.s01234567, u1.s01234567, u2.s01234567, u3.s01234567, u4.s01234567, u5.s01234567, u6.s01234567, u7.s01234567); \ |
| 92 | TRANSPOSE_8x8(u0.s89ABCDEF, u1.s89ABCDEF, u2.s89ABCDEF, u3.s89ABCDEF, u4.s89ABCDEF, u5.s89ABCDEF, u6.s89ABCDEF, u7.s89ABCDEF); \ |
| 93 | TRANSPOSE_8x8(u8.s01234567, u9.s01234567, u10.s01234567, u11.s01234567, u12.s01234567, u13.s01234567, u14.s01234567, u15.s01234567); \ |
| 94 | TRANSPOSE_8x8(u8.s89ABCDEF, u9.s89ABCDEF, u10.s89ABCDEF, u11.s89ABCDEF, u12.s89ABCDEF, u13.s89ABCDEF, u14.s89ABCDEF, u15.s89ABCDEF); \ |
| 95 | SWAP_8x8(u0.s89ABCDEF, u1.s89ABCDEF, u2.s89ABCDEF, u3.s89ABCDEF, u4.s89ABCDEF, u5.s89ABCDEF, u6.s89ABCDEF, u7.s89ABCDEF, \ |
| 96 | u8.s01234567, u9.s01234567, u10.s01234567, u11.s01234567, u12.s01234567, u13.s01234567, u14.s01234567, u15.s01234567); \ |
| 97 | }) |
| 98 | |
| 99 | #ifndef DATA_TYPE_IN_BYTES |
| 100 | #error DATA_TYPE_IN_BYTES not set for the transpose OpenCL kernel |
Anthony Barbier | ac69aa1 | 2017-07-03 17:39:37 +0100 | [diff] [blame] | 101 | #endif /* not DATA_TYPE_IN_BYTES */ |
Anthony Barbier | 6ff3b19 | 2017-09-04 18:44:23 +0100 | [diff] [blame] | 102 | |
Moritz Pflanzer | 54f366a | 2017-09-25 15:36:14 +0100 | [diff] [blame] | 103 | #undef VLOAD |
| 104 | #undef VSTORE |
| 105 | |
Anthony Barbier | 6ff3b19 | 2017-09-04 18:44:23 +0100 | [diff] [blame] | 106 | #if DATA_TYPE_IN_BYTES == 4 |
| 107 | #define DATA_TYPE uint |
| 108 | #define TRANSPOSE() TRANSPOSE_4x4(u0, u1, u2, u3) |
| 109 | #define VLOAD(x, y) vload4(x, y) |
| 110 | #define VSTORE(x, y, z) vstore4(x, y, z) |
| 111 | #define BLOCK_SIZE 4 |
| 112 | #elif DATA_TYPE_IN_BYTES == 2 |
| 113 | #define DATA_TYPE ushort |
| 114 | #define TRANSPOSE() TRANSPOSE_8x8(u0, u1, u2, u3, u4, u5, u6, u7) |
| 115 | #define VLOAD(x, y) vload8(x, y) |
| 116 | #define VSTORE(x, y, z) vstore8(x, y, z) |
| 117 | #define BLOCK_SIZE 8 |
| 118 | #elif DATA_TYPE_IN_BYTES == 1 |
| 119 | #define DATA_TYPE uchar |
| 120 | #define TRANSPOSE() TRANSPOSE_16x16(u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15) |
| 121 | #define VLOAD(x, y) vload16(x, y) |
| 122 | #define VSTORE(x, y, z) vstore16(x, y, z) |
| 123 | #define BLOCK_SIZE 16 |
Anthony Barbier | ac69aa1 | 2017-07-03 17:39:37 +0100 | [diff] [blame] | 124 | #else /* switch DATA_TYPE_IN_BYTES */ |
Anthony Barbier | 6ff3b19 | 2017-09-04 18:44:23 +0100 | [diff] [blame] | 125 | #error DATA_TYPE_IN_BYTES not supported for transpose |
Anthony Barbier | ac69aa1 | 2017-07-03 17:39:37 +0100 | [diff] [blame] | 126 | #endif /* switch DATA_TYPE_IN_BYTES */ |
Anthony Barbier | 6ff3b19 | 2017-09-04 18:44:23 +0100 | [diff] [blame] | 127 | |
| 128 | /** This OpenCL kernel computes the matrix transposition of input matrix |
| 129 | * |
| 130 | * @attention The number of bytes of the data type need to be passed at compile time using -DDATA_TYPE_IN_BYTES. DATA_TYPE_IN_BYTES can be: |
| 131 | * -# -DDATA_TYPE_IN_BYTES=1 for transposing U8 or S8 matrices |
| 132 | * -# -DDATA_TYPE_IN_BYTES=2 for transposing U16, S16 or FP16 matrices |
| 133 | * -# -DDATA_TYPE_IN_BYTES=4 for transposing U32, S32 or FP32 matrices |
| 134 | * |
| 135 | * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32 |
| 136 | * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes) |
| 137 | * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) |
| 138 | * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes) |
| 139 | * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) |
| 140 | * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix |
| 141 | * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as src_ptr |
| 142 | * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) |
| 143 | * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) |
| 144 | * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) |
| 145 | * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) |
| 146 | * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix |
| 147 | */ |
| 148 | __kernel void transpose(IMAGE_DECLARATION(src), |
| 149 | IMAGE_DECLARATION(dst)) |
| 150 | { |
| 151 | uint x = get_global_id(0) * BLOCK_SIZE; |
| 152 | uint y = get_global_id(1) * BLOCK_SIZE; |
| 153 | |
| 154 | // Compute source address |
| 155 | Image src = CONVERT_TO_IMAGE_STRUCT(src); |
| 156 | |
| 157 | // Load the NxN block at (x, y) |
| 158 | VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE) |
| 159 | u0 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 0))); |
| 160 | VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE) |
| 161 | u1 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 1))); |
| 162 | VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE) |
| 163 | u2 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 2))); |
| 164 | VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE) |
| 165 | u3 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 3))); |
| 166 | #if BLOCK_SIZE > 4 |
| 167 | VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE) |
| 168 | u4 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 4))); |
| 169 | VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE) |
| 170 | u5 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 5))); |
| 171 | VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE) |
| 172 | u6 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 6))); |
| 173 | VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE) |
| 174 | u7 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 7))); |
| 175 | #if BLOCK_SIZE == 16 |
| 176 | VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE) |
| 177 | u8 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 8))); |
| 178 | VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE) |
| 179 | u9 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 9))); |
| 180 | VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE) |
| 181 | u10 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 10))); |
| 182 | VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE) |
| 183 | u11 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 11))); |
| 184 | VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE) |
| 185 | u12 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 12))); |
| 186 | VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE) |
| 187 | u13 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 13))); |
| 188 | VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE) |
| 189 | u14 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 14))); |
| 190 | VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE) |
| 191 | u15 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 15))); |
| 192 | #endif /* BLOCK_SIZE == 16 */ |
| 193 | #endif /* BLOCK_SIZE > 4 */ |
| 194 | |
| 195 | // Transpose the block |
| 196 | TRANSPOSE(); |
| 197 | |
| 198 | // Store the block at (y, x) |
| 199 | uint dst_offset_in_bytes = y * DATA_TYPE_IN_BYTES + x * dst_stride_y + dst_offset_first_element_in_bytes; |
| 200 | VSTORE(u0, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 0 * dst_stride_y)); |
| 201 | VSTORE(u1, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 1 * dst_stride_y)); |
| 202 | VSTORE(u2, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 2 * dst_stride_y)); |
| 203 | VSTORE(u3, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 3 * dst_stride_y)); |
| 204 | #if BLOCK_SIZE > 4 |
| 205 | VSTORE(u4, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 4 * dst_stride_y)); |
| 206 | VSTORE(u5, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 5 * dst_stride_y)); |
| 207 | VSTORE(u6, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 6 * dst_stride_y)); |
| 208 | VSTORE(u7, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 7 * dst_stride_y)); |
| 209 | #if BLOCK_SIZE == 16 |
| 210 | VSTORE(u8, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 8 * dst_stride_y)); |
| 211 | VSTORE(u9, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 9 * dst_stride_y)); |
| 212 | VSTORE(u10, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 10 * dst_stride_y)); |
| 213 | VSTORE(u11, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 11 * dst_stride_y)); |
| 214 | VSTORE(u12, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 12 * dst_stride_y)); |
| 215 | VSTORE(u13, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 13 * dst_stride_y)); |
| 216 | VSTORE(u14, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 14 * dst_stride_y)); |
| 217 | VSTORE(u15, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 15 * dst_stride_y)); |
| 218 | #endif /* BLOCK_SIZE == 16 */ |
| 219 | #endif /* BLOCK_SIZE > 4 */ |
| 220 | } |