blob: cbebc88668ffca023fd536feb1442b999f22decf [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
Michele Di Giorgiod9eaf612020-07-08 11:12:57 +01002 * Copyright (c) 2016-2018 Arm Limited.
Anthony Barbier6ff3b192017-09-04 18:44:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
25
26/** Convert an RGB888 image to RGBX8888
27 *
28 * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
29 * No offset.
30 *
31 * @param[in] input_ptr Pointer to the source image. Supported Format: U8
32 * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
33 * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
34 * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
35 * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
36 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
37 * @param[out] output_ptr Pointer to the destination image. Supported Format: U8
38 * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
39 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
40 * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
41 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
42 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
43 */
__kernel void RGB888_to_RGBA8888_bt709(
    IMAGE_DECLARATION(input),
    IMAGE_DECLARATION(output))
{
    Image src = CONVERT_TO_IMAGE_STRUCT(input);
    Image dst = CONVERT_TO_IMAGE_STRUCT(output);

    // Value written into the fourth byte of every output pixel
    const uchar alpha = 255;

    // Each work-item expands 16 packed RGB pixels: 48 bytes read, 64 bytes written
    const uchar16 packed_a = vload16(0, src.ptr);
    const uchar16 packed_b = vload16(1, src.ptr);
    const uchar16 packed_c = vload16(2, src.ptr);

    // Pixels 0-3 come entirely from the first 16-byte chunk
    const uchar16 quad_0 = (uchar16)(packed_a.s012, alpha, packed_a.s345, alpha, packed_a.s678, alpha, packed_a.s9ab, alpha);
    // Pixel 5 straddles the first and second chunk
    const uchar16 quad_1 = (uchar16)(packed_a.scde, alpha, packed_a.sf, packed_b.s01, alpha, packed_b.s234, alpha, packed_b.s567, alpha);
    // Pixel 10 straddles the second and third chunk
    const uchar16 quad_2 = (uchar16)(packed_b.s89a, alpha, packed_b.sbcd, alpha, packed_b.sef, packed_c.s0, alpha, packed_c.s123, alpha);
    // Pixels 12-15 come entirely from the third chunk
    const uchar16 quad_3 = (uchar16)(packed_c.s456, alpha, packed_c.s789, alpha, packed_c.sabc, alpha, packed_c.sdef, alpha);

    vstore16(quad_0, 0, dst.ptr);
    vstore16(quad_1, 1, dst.ptr);
    vstore16(quad_2, 2, dst.ptr);
    vstore16(quad_3, 3, dst.ptr);
}
66
Manuel Bottiniacaf21d2018-09-26 17:38:19 +010067/** Convert an RGB888 image to U8
68 *
69 * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
70 * No offset.
71 *
72 * @param[in] input_ptr Pointer to the source image. Supported Format: RGB888
73 * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
74 * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
75 * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
76 * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
77 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
78 * @param[out] output_ptr Pointer to the destination image. Supported Format: U8
79 * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
80 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
81 * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
82 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
83 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
84 */
__kernel void RGB888_to_U8_bt709(
    IMAGE_DECLARATION(input),
    IMAGE_DECLARATION(output))
{
    Image src = CONVERT_TO_IMAGE_STRUCT(input);
    Image dst = CONVERT_TO_IMAGE_STRUCT(output);

    // Weights applied to the R, G and B planes respectively
    const float weight_r = 0.2126f;
    const float weight_g = 0.7152f;
    const float weight_b = 0.0722f;

    // 16 interleaved RGB pixels occupy 48 consecutive bytes
    const uchar16 chunk_0 = vload16(0, src.ptr);
    const uchar16 chunk_1 = vload16(1, src.ptr);
    const uchar16 chunk_2 = vload16(2, src.ptr);

    // De-interleave into one 16-wide plane per channel and widen to float
    const float16 plane_r = convert_float16((uchar16)(chunk_0.s0369, chunk_0.scf, chunk_1.s258b, chunk_1.se, chunk_2.s147a, chunk_2.sd));
    const float16 plane_g = convert_float16((uchar16)(chunk_0.s147a, chunk_0.sd, chunk_1.s0369, chunk_1.scf, chunk_2.s258b, chunk_2.se));
    const float16 plane_b = convert_float16((uchar16)(chunk_0.s258b, chunk_0.se, chunk_1.s147a, chunk_1.sd, chunk_2.s0369, chunk_2.scf));

    // Weighted sum of the three planes gives 16 greyscale values
    const float16 grey_f = weight_r * plane_r + weight_g * plane_g + weight_b * plane_b;

    // Truncate towards zero and saturate to the uchar range before storing
    vstore16(convert_uchar16_sat_rtz(grey_f), 0, dst.ptr);
}
114
/** Convert an RGBA8888 image to RGB888
116 *
117 * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
118 * No offset.
119 *
120 * @param[in] input_ptr Pointer to the source image. Supported Format: U8
121 * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
122 * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
123 * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
124 * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
125 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
126 * @param[out] output_ptr Pointer to the destination image. Supported Format: U8
127 * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
128 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
129 * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
130 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
131 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
132 */
__kernel void RGBA8888_to_RGB888_bt709(
    IMAGE_DECLARATION(input),
    IMAGE_DECLARATION(output))
{
    Image src = CONVERT_TO_IMAGE_STRUCT(input);
    Image dst = CONVERT_TO_IMAGE_STRUCT(output);

    // Each work-item compacts 16 four-byte pixels (64 bytes) into 48 bytes
    const uchar16 quad_0 = vload16(0, src.ptr);
    const uchar16 quad_1 = vload16(1, src.ptr);
    const uchar16 quad_2 = vload16(2, src.ptr);
    const uchar16 quad_3 = vload16(3, src.ptr);

    // For every 4-pixel chunk drop bytes 3, 7, 11 and 15 (the fourth byte of each pixel).
    // The 12 surviving bytes are kept as an 8-byte head plus a 4-byte tail.
    const uchar8 head_0 = quad_0.s01245689;
    const uchar4 tail_0 = quad_0.sacde;
    const uchar8 head_1 = quad_1.s01245689;
    const uchar4 tail_1 = quad_1.sacde;
    const uchar8 head_2 = quad_2.s01245689;
    const uchar4 tail_2 = quad_2.sacde;
    const uchar8 head_3 = quad_3.s01245689;
    const uchar4 tail_3 = quad_3.sacde;

    // Re-pack the 4 x 12 surviving bytes into three 16-byte stores
    vstore16((uchar16)(head_0, tail_0, head_1.lo), 0, dst.ptr);
    vstore16((uchar16)(head_1.hi, tail_1, head_2), 1, dst.ptr);
    vstore16((uchar16)(tail_2, head_3, tail_3), 2, dst.ptr);
}
153
154/** Convert a UYVY422 image to RGB888 using BT709 color space
155 *
156 * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
157 * No offset.
158 *
159 * @param[in] input_ptr Pointer to the source image. Supported Format: U8
160 * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
161 * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
162 * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
163 * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
164 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
165 * @param[out] output_ptr Pointer to the destination image. Supported Format: U8
166 * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
167 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
168 * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
169 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
170 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
171 */
__kernel void UYVY422_to_RGB888_bt709(
    IMAGE_DECLARATION(input),
    IMAGE_DECLARATION(output))
{
    Image src = CONVERT_TO_IMAGE_STRUCT(input);
    Image dst = CONVERT_TO_IMAGE_STRUCT(output);

    // Chroma-to-RGB weights
    const float k_r_cr = 1.5748f;
    const float k_g_cb = -0.1873f;
    const float k_g_cr = -0.4681f;
    const float k_b_cb = 1.8556f;

    // 16 bytes hold 8 pixels laid out as U Y V Y U Y V Y ...
    const uchar16 packed = vload16(0, src.ptr);

    // Luma lives in the odd bytes
    const float8 y_f = convert_float8(packed.s13579bdf);

    // Every chroma sample is shared by two neighbouring pixels; remove the 128 bias
    const char8 cb_s = (char8)(packed.s0, packed.s0, packed.s4, packed.s4, packed.s8, packed.s8, packed.sc, packed.sc) - (char8)(128);
    const char8 cr_s = (char8)(packed.s2, packed.s2, packed.s6, packed.s6, packed.sa, packed.sa, packed.se, packed.se) - (char8)(128);
    const float8 cb_f = convert_float8(cb_s);
    const float8 cr_f = convert_float8(cr_s);

    // Chroma contribution first, luma added afterwards
    float8 red_f   = k_r_cr * cr_f;
    float8 green_f = k_g_cb * cb_f + k_g_cr * cr_f;
    float8 blue_f  = k_b_cb * cb_f;

    red_f += y_f;
    green_f += y_f;
    blue_f += y_f;

    // Truncate towards zero and saturate to [0, 255]
    const uchar8 r = convert_uchar8_sat_rtz(red_f);
    const uchar8 g = convert_uchar8_sat_rtz(green_f);
    const uchar8 b = convert_uchar8_sat_rtz(blue_f);

    // Interleave back to packed RGB, one 3-byte vector per pixel
    const uchar3 px_0 = (uchar3)(r.s0, g.s0, b.s0);
    const uchar3 px_1 = (uchar3)(r.s1, g.s1, b.s1);
    const uchar3 px_2 = (uchar3)(r.s2, g.s2, b.s2);
    const uchar3 px_3 = (uchar3)(r.s3, g.s3, b.s3);
    const uchar3 px_4 = (uchar3)(r.s4, g.s4, b.s4);
    const uchar3 px_6 = (uchar3)(r.s6, g.s6, b.s6);
    const uchar3 px_7 = (uchar3)(r.s7, g.s7, b.s7);

    // 24 output bytes: pixel 5 is split across the 16-byte and the 8-byte store
    vstore16((uchar16)(px_0, px_1, px_2, px_3, px_4, r.s5), 0, dst.ptr);
    vstore8((uchar8)(g.s5, b.s5, px_6, px_7), 0, dst.ptr + 16);
}
211
212/** Convert a UYVY422 image to RGBX8888 using BT709 color space
213 *
214 * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
215 * No offset.
216 *
217 * @param[in] input_ptr Pointer to the source image. Supported Format: U8
218 * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
219 * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
220 * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
221 * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
222 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
223 * @param[out] output_ptr Pointer to the destination image. Supported Format: U8
224 * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
225 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
226 * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
227 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
228 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
229 */
__kernel void UYVY422_to_RGBA8888_bt709(
    IMAGE_DECLARATION(input),
    IMAGE_DECLARATION(output))
{
    Image src = CONVERT_TO_IMAGE_STRUCT(input);
    Image dst = CONVERT_TO_IMAGE_STRUCT(output);

    // Chroma-to-RGB weights and the constant written into every fourth output byte
    const float k_r_cr = 1.5748f;
    const float k_g_cb = -0.1873f;
    const float k_g_cr = -0.4681f;
    const float k_b_cb = 1.8556f;
    const uchar alpha  = 255;

    // 16 bytes hold 8 pixels laid out as U Y V Y U Y V Y ...
    const uchar16 packed = vload16(0, src.ptr);

    // Luma lives in the odd bytes
    const float8 y_f = convert_float8(packed.s13579bdf);

    // Every chroma sample is shared by two neighbouring pixels; remove the 128 bias
    const char8 cb_s = (char8)(packed.s0, packed.s0, packed.s4, packed.s4, packed.s8, packed.s8, packed.sc, packed.sc) - (char8)(128);
    const char8 cr_s = (char8)(packed.s2, packed.s2, packed.s6, packed.s6, packed.sa, packed.sa, packed.se, packed.se) - (char8)(128);
    const float8 cb_f = convert_float8(cb_s);
    const float8 cr_f = convert_float8(cr_s);

    // Chroma contribution first, luma added afterwards
    float8 red_f   = k_r_cr * cr_f;
    float8 green_f = k_g_cb * cb_f + k_g_cr * cr_f;
    float8 blue_f  = k_b_cb * cb_f;

    red_f += y_f;
    green_f += y_f;
    blue_f += y_f;

    // Truncate towards zero and saturate to [0, 255]
    const uchar8 r = convert_uchar8_sat_rtz(red_f);
    const uchar8 g = convert_uchar8_sat_rtz(green_f);
    const uchar8 b = convert_uchar8_sat_rtz(blue_f);

    // First four pixels, each as one R G B A vector
    const uchar16 lower = (uchar16)((uchar4)(r.s0, g.s0, b.s0, alpha),
                                    (uchar4)(r.s1, g.s1, b.s1, alpha),
                                    (uchar4)(r.s2, g.s2, b.s2, alpha),
                                    (uchar4)(r.s3, g.s3, b.s3, alpha));
    // Last four pixels
    const uchar16 upper = (uchar16)((uchar4)(r.s4, g.s4, b.s4, alpha),
                                    (uchar4)(r.s5, g.s5, b.s5, alpha),
                                    (uchar4)(r.s6, g.s6, b.s6, alpha),
                                    (uchar4)(r.s7, g.s7, b.s7, alpha));

    vstore16(lower, 0, dst.ptr);
    vstore16(upper, 1, dst.ptr);
}
270
271/** Convert a YUYV422 image to RGB888 using BT709 color space
272 *
273 * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
274 * No offset.
275 *
276 * @param[in] input_ptr Pointer to the source image. Supported Format: U8
277 * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
278 * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
279 * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
280 * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
281 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
282 * @param[out] output_ptr Pointer to the destination image. Supported Format: U8
283 * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
284 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
285 * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
286 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
287 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
288 */
__kernel void YUYV422_to_RGB888_bt709(
    IMAGE_DECLARATION(input),
    IMAGE_DECLARATION(output))
{
    Image src = CONVERT_TO_IMAGE_STRUCT(input);
    Image dst = CONVERT_TO_IMAGE_STRUCT(output);

    // Chroma-to-RGB weights
    const float k_r_cr = 1.5748f;
    const float k_g_cb = -0.1873f;
    const float k_g_cr = -0.4681f;
    const float k_b_cb = 1.8556f;

    // 16 bytes hold 8 pixels laid out as Y U Y V Y U Y V ...
    const uchar16 yuyv = vload16(0, src.ptr);

    // Luma lives in the even bytes
    const float8 y_f = convert_float8(yuyv.s02468ace);

    // Every chroma sample is shared by two neighbouring pixels; remove the 128 bias
    const char8 cb_s = (char8)(yuyv.s1, yuyv.s1, yuyv.s5, yuyv.s5, yuyv.s9, yuyv.s9, yuyv.sd, yuyv.sd) - (char8)(128);
    const char8 cr_s = (char8)(yuyv.s3, yuyv.s3, yuyv.s7, yuyv.s7, yuyv.sb, yuyv.sb, yuyv.sf, yuyv.sf) - (char8)(128);
    const float8 cb_f = convert_float8(cb_s);
    const float8 cr_f = convert_float8(cr_s);

    // Chroma contribution first, luma added afterwards
    float8 red_f   = k_r_cr * cr_f;
    float8 green_f = k_g_cb * cb_f + k_g_cr * cr_f;
    float8 blue_f  = k_b_cb * cb_f;

    red_f += y_f;
    green_f += y_f;
    blue_f += y_f;

    // Truncate towards zero and saturate to [0, 255]
    const uchar8 r = convert_uchar8_sat_rtz(red_f);
    const uchar8 g = convert_uchar8_sat_rtz(green_f);
    const uchar8 b = convert_uchar8_sat_rtz(blue_f);

    // Interleave back to packed RGB, one 3-byte vector per pixel
    const uchar3 px_0 = (uchar3)(r.s0, g.s0, b.s0);
    const uchar3 px_1 = (uchar3)(r.s1, g.s1, b.s1);
    const uchar3 px_2 = (uchar3)(r.s2, g.s2, b.s2);
    const uchar3 px_3 = (uchar3)(r.s3, g.s3, b.s3);
    const uchar3 px_4 = (uchar3)(r.s4, g.s4, b.s4);
    const uchar3 px_6 = (uchar3)(r.s6, g.s6, b.s6);
    const uchar3 px_7 = (uchar3)(r.s7, g.s7, b.s7);

    // 24 output bytes: pixel 5 is split across the 16-byte and the 8-byte store
    vstore16((uchar16)(px_0, px_1, px_2, px_3, px_4, r.s5), 0, dst.ptr);
    vstore8((uchar8)(g.s5, b.s5, px_6, px_7), 0, dst.ptr + 16);
}
328
329/** Convert a YUYV422 image to RGBX8888 using BT709 color space
330 *
331 * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
332 * No offset.
333 *
334 * @param[in] input_ptr Pointer to the source image. Supported Format: U8
335 * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
336 * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
337 * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
338 * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
339 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
340 * @param[out] output_ptr Pointer to the destination image. Supported Format: U8
341 * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
342 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
343 * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
344 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
345 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
346 */
__kernel void YUYV422_to_RGBA8888_bt709(
    IMAGE_DECLARATION(input),
    IMAGE_DECLARATION(output))
{
    Image src = CONVERT_TO_IMAGE_STRUCT(input);
    Image dst = CONVERT_TO_IMAGE_STRUCT(output);

    // Chroma-to-RGB weights and the constant written into every fourth output byte
    const float k_r_cr = 1.5748f;
    const float k_g_cb = -0.1873f;
    const float k_g_cr = -0.4681f;
    const float k_b_cb = 1.8556f;
    const uchar alpha  = 255;

    // 16 bytes hold 8 pixels laid out as Y U Y V Y U Y V ...
    const uchar16 yuyv = vload16(0, src.ptr);

    // Luma lives in the even bytes
    const float8 y_f = convert_float8(yuyv.s02468ace);

    // Every chroma sample is shared by two neighbouring pixels; remove the 128 bias
    const char8 cb_s = (char8)(yuyv.s1, yuyv.s1, yuyv.s5, yuyv.s5, yuyv.s9, yuyv.s9, yuyv.sd, yuyv.sd) - (char8)(128);
    const char8 cr_s = (char8)(yuyv.s3, yuyv.s3, yuyv.s7, yuyv.s7, yuyv.sb, yuyv.sb, yuyv.sf, yuyv.sf) - (char8)(128);
    const float8 cb_f = convert_float8(cb_s);
    const float8 cr_f = convert_float8(cr_s);

    // Chroma contribution first, luma added afterwards
    float8 red_f   = k_r_cr * cr_f;
    float8 green_f = k_g_cb * cb_f + k_g_cr * cr_f;
    float8 blue_f  = k_b_cb * cb_f;

    red_f += y_f;
    green_f += y_f;
    blue_f += y_f;

    // Truncate towards zero and saturate to [0, 255]
    const uchar8 r = convert_uchar8_sat_rtz(red_f);
    const uchar8 g = convert_uchar8_sat_rtz(green_f);
    const uchar8 b = convert_uchar8_sat_rtz(blue_f);

    // First four pixels, each as one R G B A vector
    const uchar16 lower = (uchar16)((uchar4)(r.s0, g.s0, b.s0, alpha),
                                    (uchar4)(r.s1, g.s1, b.s1, alpha),
                                    (uchar4)(r.s2, g.s2, b.s2, alpha),
                                    (uchar4)(r.s3, g.s3, b.s3, alpha));
    // Last four pixels
    const uchar16 upper = (uchar16)((uchar4)(r.s4, g.s4, b.s4, alpha),
                                    (uchar4)(r.s5, g.s5, b.s5, alpha),
                                    (uchar4)(r.s6, g.s6, b.s6, alpha),
                                    (uchar4)(r.s7, g.s7, b.s7, alpha));

    vstore16(lower, 0, dst.ptr);
    vstore16(upper, 1, dst.ptr);
}
387
388/** Convert a RGB image to NV12 using BT709 color space
389 *
390 * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
391 * No offset.
392 *
393 * @param[in] input_ptr Pointer to the source image. Supported Format: U8
394 * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
395 * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
396 * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
397 * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
398 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
399 * @param[out] luma_ptr Pointer to the destination luma channel. Supported Format: U8
400 * @param[in] luma_stride_x Stride of the destination luma channel in X dimension (in bytes)
401 * @param[in] luma_step_x luma_stride_x * number of elements along X processed per workitem(in bytes)
402 * @param[in] luma_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
403 * @param[in] luma_step_y luma_stride_y * number of elements along Y processed per workitem(in bytes)
404 * @param[in] luma_offset_first_element_in_bytes The offset of the first element in the destination image luma channel
405 * @param[out] uv_ptr Pointer to the destination uv channel. Supported Format: U8
406 * @param[in] uv_stride_x Stride of the destination uv channel in X dimension (in bytes)
407 * @param[in] uv_step_x uv_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] uv_stride_y Stride of the destination image uv channel in Y dimension (in bytes)
409 * @param[in] uv_step_y uv_stride_y * number of elements along Y processed per workitem(in bytes)
410 * @param[in] uv_offset_first_element_in_bytes The offset of the first element in the destination image uv channel
411 *
412 */
__kernel void RGB888_to_NV12_bt709(
    IMAGE_DECLARATION(input),
    IMAGE_DECLARATION(luma),
    IMAGE_DECLARATION(uv))
{
    Image in     = CONVERT_TO_IMAGE_STRUCT(input);
    Image out_y  = CONVERT_TO_IMAGE_STRUCT(luma);
    Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv);

    // Each work-item converts a 2x2 block of RGB pixels into
    // four luma samples and a single averaged (Cb, Cr) pair.

    // First line: two RGB pixels. vload3 reads exactly the 6 bytes that are used,
    // so nothing beyond the second pixel is touched.
    const uchar3 px_00 = vload3(0, in.ptr);
    const uchar3 px_01 = vload3(1, in.ptr);
    uchar2       r_0   = (uchar2)(px_00.s0, px_01.s0);
    uchar2       g_0   = (uchar2)(px_00.s1, px_01.s1);
    uchar2       b_0   = (uchar2)(px_00.s2, px_01.s2);

    float2 f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_0) + (float2)(0.7152f) * convert_float2(g_0) + (float2)(0.0722f) * convert_float2(b_0);
    float2 f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_0) - (float2)(0.3854f) * convert_float2(g_0) + (float2)(0.5000f) * convert_float2(b_0);
    float2 f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_0) - (float2)(0.4542f) * convert_float2(g_0) - (float2)(0.0458f) * convert_float2(b_0);

    // Truncate towards zero; chroma is re-biased by 128 before saturation
    short2 i_y = convert_short2_rtz(f_y);
    short2 i_u = convert_short2_rtz(f_u) + (short2)(128);
    short2 i_v = convert_short2_rtz(f_v) + (short2)(128);

    // Saturating conversion clamps to [0, 255]
    uchar2 luma_0 = convert_uchar2_sat(i_y);
    vstore2(luma_0, 0, out_y.ptr);

    uchar2 cb_0 = convert_uchar2_sat(i_u);
    uchar2 cr_0 = convert_uchar2_sat(i_v);

    // Second line: the two pixels directly below, one input row stride further
    const uchar3 px_10 = vload3(0, in.ptr + input_stride_y);
    const uchar3 px_11 = vload3(1, in.ptr + input_stride_y);
    uchar2       r_1   = (uchar2)(px_10.s0, px_11.s0);
    uchar2       g_1   = (uchar2)(px_10.s1, px_11.s1);
    uchar2       b_1   = (uchar2)(px_10.s2, px_11.s2);

    f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_1) + (float2)(0.7152f) * convert_float2(g_1) + (float2)(0.0722f) * convert_float2(b_1);
    f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_1) - (float2)(0.3854f) * convert_float2(g_1) + (float2)(0.5000f) * convert_float2(b_1);
    f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_1) - (float2)(0.4542f) * convert_float2(g_1) - (float2)(0.0458f) * convert_float2(b_1);

    i_y = convert_short2_rtz(f_y);
    i_u = convert_short2_rtz(f_u) + (short2)(128);
    i_v = convert_short2_rtz(f_v) + (short2)(128);

    uchar2 luma_1 = convert_uchar2_sat(i_y);
    vstore2(luma_1, 0, out_y.ptr + luma_stride_y);

    uchar2 cb_1 = convert_uchar2_sat(i_u);
    uchar2 cr_1 = convert_uchar2_sat(i_v);

    // Average the four chroma samples of the 2x2 block.
    // The uchar operands are promoted to int, so the sum cannot overflow.
    uchar2 cbcr = (uchar2)(((cb_0.s0 + cb_0.s1 + cb_1.s0 + cb_1.s1) / 4),
                           ((cr_0.s0 + cr_0.s1 + cr_1.s0 + cr_1.s1) / 4));

    vstore2(cbcr, 0, out_uv.ptr);
}
467
468/*
469 R'= Y' + 0.0000*U + 1.5748*V
470 G'= Y' - 0.1873*U - 0.4681*V
471 B'= Y' + 1.8556*U + 0.0000*V
472*/
473
474/** Convert an NV12 image to RGB888
475 *
476 * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
477 * No offset.
478 *
479 * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
480 * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
481 * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
482 * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
483 * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
484 * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
485 * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
486 * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
487 * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
488 * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
489 * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
490 * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
491 * @param[out] rgb_output_ptr Pointer to the destination image. Supported Format: U8
492 * @param[in] rgb_output_stride_x Stride of the destination image in X dimension (in bytes)
493 * @param[in] rgb_output_step_x rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
494 * @param[in] rgb_output_stride_y Stride of the destination image in Y dimension (in bytes)
 * @param[in] rgb_output_step_y rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
496 * @param[in] rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
497 */
__kernel void NV12_to_RGB888_bt709(
    IMAGE_DECLARATION(luma_input),
    IMAGE_DECLARATION(uv_input),
    IMAGE_DECLARATION(rgb_output))
{
    Image src_y  = CONVERT_TO_IMAGE_STRUCT(luma_input);
    Image src_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
    Image dst    = CONVERT_TO_IMAGE_STRUCT(rgb_output);

    // A work-item covers a 4x2 block: two rows of four luma samples
    // that share one row of interleaved Cb/Cr samples
    const float4 y_top    = convert_float4(vload4(0, src_y.ptr));
    const float4 y_bottom = convert_float4(vload4(0, src_y.ptr + luma_input_stride_y));
    const uchar4 chroma   = vload4(0, src_uv.ptr);

    // Each Cb/Cr sample serves two horizontally adjacent pixels; remove the 128 bias
    const char4  cb_s = (char4)(chroma.s0, chroma.s0, chroma.s2, chroma.s2) - (char4)(128);
    const char4  cr_s = (char4)(chroma.s1, chroma.s1, chroma.s3, chroma.s3) - (char4)(128);
    const float4 cb_f = convert_float4(cb_s);
    const float4 cr_f = convert_float4(cr_s);

    // Chroma contribution to R, G and B. It is the same for both rows.
    // The zero terms are kept so every line mirrors one row of the conversion matrix.
    const float4 delta_r = 0.0000f + 0.0000f * cb_f + 1.5748f * cr_f;
    const float4 delta_g = 0.0000f - 0.1873f * cb_f - 0.4681f * cr_f;
    const float4 delta_b = 0.0000f + 1.8556f * cb_f + 0.0000f * cr_f;

    // Top row: add luma, truncate towards zero and saturate to [0, 255]
    const uchar4 r_top = convert_uchar4_sat_rtz(y_top + delta_r);
    const uchar4 g_top = convert_uchar4_sat_rtz(y_top + delta_g);
    const uchar4 b_top = convert_uchar4_sat_rtz(y_top + delta_b);

    // 12 output bytes per row: pixel 2 is split across the 8-byte and the 4-byte store
    vstore8((uchar8)((uchar3)(r_top.s0, g_top.s0, b_top.s0), (uchar3)(r_top.s1, g_top.s1, b_top.s1), r_top.s2, g_top.s2), 0, dst.ptr);
    vstore4((uchar4)(b_top.s2, (uchar3)(r_top.s3, g_top.s3, b_top.s3)), 0, dst.ptr + 8);

    // Bottom row: same chroma contribution, different luma
    const uchar4 r_bot = convert_uchar4_sat_rtz(y_bottom + delta_r);
    const uchar4 g_bot = convert_uchar4_sat_rtz(y_bottom + delta_g);
    const uchar4 b_bot = convert_uchar4_sat_rtz(y_bottom + delta_b);

    vstore8((uchar8)((uchar3)(r_bot.s0, g_bot.s0, b_bot.s0), (uchar3)(r_bot.s1, g_bot.s1, b_bot.s1), r_bot.s2, g_bot.s2), 0, dst.ptr + rgb_output_stride_y);
    vstore4((uchar4)(b_bot.s2, (uchar3)(r_bot.s3, g_bot.s3, b_bot.s3)), 0, dst.ptr + rgb_output_stride_y + 8);
}
544
545/** Convert a RGB image to YUV444 using BT709 color space
546 *
547 * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
548 * No offset.
549 *
550 * @param[in] rgb_input_ptr Pointer to the source image. Supported Format: U8
551 * @param[in] rgb_input_stride_x Stride of the source image in X dimension (in bytes)
 * @param[in] rgb_input_step_x rgb_input_stride_x * number of elements along X processed per workitem(in bytes)
553 * @param[in] rgb_input_stride_y Stride of the source image in Y dimension (in bytes)
554 * @param[in] rgb_input_step_y rgb_input_stride_y * number of elements along Y processed per workitem(in bytes)
555 * @param[in] rgb_input_offset_first_element_in_bytes The offset of the first element in the source image
556 * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
557 * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
558 * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
559 * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
560 * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
561 * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
562 * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
563 * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
564 * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
565 * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
566 * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
567 * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
568 * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
569 * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
570 * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
571 * @param[in] v_output_stride_y Stride of the destination image V channel in Y dimension (in bytes)
572 * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
573 * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
574 *
575 */
576__kernel void RGB888_to_YUV444_bt709(
577 IMAGE_DECLARATION(rgb_input),
578 IMAGE_DECLARATION(luma_output),
579 IMAGE_DECLARATION(u_output),
580 IMAGE_DECLARATION(v_output))
581{
582 // handle 4 pixels every time
583 Image in_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_input);
584 Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
585 Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
586 Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
587
588 // Read 4 pixel
589 uchar16 rgb_0 = vload16(0, in_rgb.ptr);
590 uchar4 r_0 = (uchar4)(rgb_0.s0, rgb_0.s3, rgb_0.s6, rgb_0.s9);
591 uchar4 g_0 = (uchar4)(rgb_0.s1, rgb_0.s4, rgb_0.s7, rgb_0.sa);
592 uchar4 b_0 = (uchar4)(rgb_0.s2, rgb_0.s5, rgb_0.s8, rgb_0.sb);
593
594 float4 f_y = (float4)(0.0000f) + (float4)(0.2126f) * convert_float4(r_0) + (float4)(0.7152f) * convert_float4(g_0) + (float4)(0.0722f) * convert_float4(b_0);
595 float4 f_u = (float4)(0.0000f) - (float4)(0.1146f) * convert_float4(r_0) - (float4)(0.3854f) * convert_float4(g_0) + (float4)(0.5000f) * convert_float4(b_0);
596 float4 f_v = (float4)(0.0000f) + (float4)(0.5000f) * convert_float4(r_0) - (float4)(0.4542f) * convert_float4(g_0) - (float4)(0.0458f) * convert_float4(b_0);
597
598 short4 i_y = convert_short4_rtz(f_y);
599 short4 i_u = convert_short4_rtz(f_u) + (short4)(128);
600 short4 i_v = convert_short4_rtz(f_v) + (short4)(128);
601
602 uchar4 luma_0 = convert_uchar4(max((short4)(0), min(i_y, (short4)(255))));
603 vstore4(luma_0, 0, out_y.ptr);
604
605 uchar4 cb_0 = convert_uchar4(max((short4)(0), min(i_u, (short4)(255))));
606 uchar4 cr_0 = convert_uchar4(max((short4)(0), min(i_v, (short4)(255))));
607 vstore4(cb_0, 0, out_u.ptr);
608 vstore4(cr_0, 0, out_v.ptr);
609}
610
/** Convert an RGB888 image to IYUV using the BT709 color space
 *
 * Global Workgroup Size [ DIV_CEIL(width, 2), height ]
 * No offset.
 *
 * Every work-item converts a 2x2 block of pixels: it writes 2 luma samples on each of two rows and a single
 * U and a single V sample, each being the average of the four per-pixel chroma values.
 *
 * @param[in]  rgb_input_ptr                             Pointer to the source image. Supported Format: U8
 * @param[in]  rgb_input_stride_x                        Stride of the source image in X dimension (in bytes)
 * @param[in]  rgb_input_step_x                          rgb_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  rgb_input_stride_y                        Stride of the source image in Y dimension (in bytes)
 * @param[in]  rgb_input_step_y                          rgb_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  rgb_input_offset_first_element_in_bytes   The offset of the first element in the source image
 * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
 * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
 * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
 * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
 * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
 * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
 * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
 * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
 * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
 * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
 * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
 * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
 *
 */
__kernel void RGB888_to_IYUV_bt709(
    IMAGE_DECLARATION(rgb_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(u_output),
    IMAGE_DECLARATION(v_output))
{
    Image src   = CONVERT_TO_IMAGE_STRUCT(rgb_input);
    Image dst_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_u = CONVERT_TO_IMAGE_STRUCT(u_output);
    Image dst_v = CONVERT_TO_IMAGE_STRUCT(v_output);

    const short2 lower = (short2)(0);
    const short2 upper = (short2)(255);

    // ---- First row: 8 bytes are loaded, the first 6 hold the 2 RGB pixels ----
    const uchar8 top       = vload8(0, src.ptr);
    const float2 top_red   = convert_float2(top.s03);
    const float2 top_green = convert_float2(top.s14);
    const float2 top_blue  = convert_float2(top.s25);

    const float2 top_y_f = (float2)(0.0000f) + 0.2126f * top_red + 0.7152f * top_green + 0.0722f * top_blue;
    const float2 top_u_f = (float2)(0.0000f) - 0.1146f * top_red - 0.3854f * top_green + 0.5000f * top_blue;
    const float2 top_v_f = (float2)(0.0000f) + 0.5000f * top_red - 0.4542f * top_green - 0.0458f * top_blue;

    // Truncate toward zero, shift chroma by 128, clamp to [0, 255]
    const short2 top_y_i = convert_short2_rtz(top_y_f);
    const short2 top_u_i = convert_short2_rtz(top_u_f) + (short2)(128);
    const short2 top_v_i = convert_short2_rtz(top_v_f) + (short2)(128);

    vstore2(convert_uchar2(min(max(top_y_i, lower), upper)), 0, dst_y.ptr);

    const uchar2 top_u = convert_uchar2(min(max(top_u_i, lower), upper));
    const uchar2 top_v = convert_uchar2(min(max(top_v_i, lower), upper));

    // ---- Second row: same processing, one source / luma stride further down ----
    const uchar8 bottom       = vload8(0, src.ptr + rgb_input_stride_y);
    const float2 bottom_red   = convert_float2(bottom.s03);
    const float2 bottom_green = convert_float2(bottom.s14);
    const float2 bottom_blue  = convert_float2(bottom.s25);

    const float2 bottom_y_f = (float2)(0.0000f) + 0.2126f * bottom_red + 0.7152f * bottom_green + 0.0722f * bottom_blue;
    const float2 bottom_u_f = (float2)(0.0000f) - 0.1146f * bottom_red - 0.3854f * bottom_green + 0.5000f * bottom_blue;
    const float2 bottom_v_f = (float2)(0.0000f) + 0.5000f * bottom_red - 0.4542f * bottom_green - 0.0458f * bottom_blue;

    const short2 bottom_y_i = convert_short2_rtz(bottom_y_f);
    const short2 bottom_u_i = convert_short2_rtz(bottom_u_f) + (short2)(128);
    const short2 bottom_v_i = convert_short2_rtz(bottom_v_f) + (short2)(128);

    vstore2(convert_uchar2(min(max(bottom_y_i, lower), upper)), 0, dst_y.ptr + luma_output_stride_y);

    const uchar2 bottom_u = convert_uchar2(min(max(bottom_u_i, lower), upper));
    const uchar2 bottom_v = convert_uchar2(min(max(bottom_v_i, lower), upper));

    // ---- Chroma: average of the four samples (the scalar sums are evaluated as int, so they cannot wrap) ----
    *dst_u.ptr = (uchar)((top_u.s0 + top_u.s1 + bottom_u.s0 + bottom_u.s1) / 4);
    *dst_v.ptr = (uchar)((top_v.s0 + top_v.s1 + bottom_v.s0 + bottom_v.s1) / 4);
}
698
/** Convert an RGBA8888 image to planar YUV444 using the BT709 color space
 *
 * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
 * No offset.
 *
 * Every work-item converts 4 consecutive pixels of one row and writes 4 bytes to each of the three output planes.
 * The alpha channel of the source is not used.
 *
 * @param[in]  rgba_input_ptr                            Pointer to the source image. Supported Format: U8
 * @param[in]  rgba_input_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  rgba_input_step_x                         rgba_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  rgba_input_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  rgba_input_step_y                         rgba_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  rgba_input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
 * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
 * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
 * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
 * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
 * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
 * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
 * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
 * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
 * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
 * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  v_output_stride_y                         Stride of the destination image V channel in Y dimension (in bytes)
 * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
 *
 */
__kernel void RGBA8888_to_YUV444_bt709(
    IMAGE_DECLARATION(rgba_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(u_output),
    IMAGE_DECLARATION(v_output))
{
    // handle 4 pixels every time
    Image in_rgba = CONVERT_TO_IMAGE_STRUCT(rgba_input);
    Image out_y   = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image out_u   = CONVERT_TO_IMAGE_STRUCT(u_output);
    Image out_v   = CONVERT_TO_IMAGE_STRUCT(v_output);

    // Read 4 RGBA pixels (16 bytes) and de-interleave R, G and B; alpha (bytes 3, 7, 11, 15) is dropped
    const uchar16 rgba = vload16(0, in_rgba.ptr);
    const float4  r_f  = convert_float4(rgba.s048c);
    const float4  g_f  = convert_float4(rgba.s159d);
    const float4  b_f  = convert_float4(rgba.s26ae);

    // RGB -> YUV matrix
    const float4 f_y = (float4)(0.0000f) + 0.2126f * r_f + 0.7152f * g_f + 0.0722f * b_f;
    const float4 f_u = (float4)(0.0000f) - 0.1146f * r_f - 0.3854f * g_f + 0.5000f * b_f;
    const float4 f_v = (float4)(0.0000f) + 0.5000f * r_f - 0.4542f * g_f - 0.0458f * b_f;

    // Round toward zero explicitly, as the other RGB(A) -> YUV kernels in this file do.
    // (_rtz is also the default for float -> integer conversions, so the results are unchanged.)
    const short4 i_y = convert_short4_rtz(f_y);
    const short4 i_u = convert_short4_rtz(f_u) + (short4)(128);
    const short4 i_v = convert_short4_rtz(f_v) + (short4)(128);

    // The values are clamped to [0, 255] first, so the narrowing conversion needs no saturation
    const uchar4 luma_0 = convert_uchar4(max((short4)(0), min(i_y, (short4)(255))));
    const uchar4 cb_0   = convert_uchar4(max((short4)(0), min(i_u, (short4)(255))));
    const uchar4 cr_0   = convert_uchar4(max((short4)(0), min(i_v, (short4)(255))));

    vstore4(luma_0, 0, out_y.ptr);
    vstore4(cb_0, 0, out_u.ptr);
    vstore4(cr_0, 0, out_v.ptr);
}
764
/** Convert an RGBA8888 image to NV12 using the BT709 color space
 *
 * Global Workgroup Size [ DIV_CEIL(width, 2), height ]
 * No offset.
 *
 * Every work-item converts a 2x2 block of pixels: it writes 2 luma samples on each of two rows and one
 * interleaved (U, V) pair, each component being the average of the four per-pixel chroma values.
 * The alpha channel of the source is not used.
 *
 * @param[in]  input_ptr                                 Pointer to the source image. Supported Format: U8
 * @param[in]  input_stride_x                            Stride of the source image in X dimension (in bytes)
 * @param[in]  input_step_x                              input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                            Stride of the source image in Y dimension (in bytes)
 * @param[in]  input_step_y                              input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes       The offset of the first element in the source image
 * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
 * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
 * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
 * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination image luma channel
 * @param[out] uv_output_ptr                             Pointer to the destination uv channel. Supported Format: U8
 * @param[in]  uv_output_stride_x                        Stride of the destination uv channel in X dimension (in bytes)
 * @param[in]  uv_output_step_x                          uv_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  uv_output_stride_y                        Stride of the destination image uv channel in Y dimension (in bytes)
 * @param[in]  uv_output_step_y                          uv_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  uv_output_offset_first_element_in_bytes   The offset of the first element in the destination image uv channel
 *
 */
__kernel void RGBA8888_to_NV12_bt709(
    IMAGE_DECLARATION(input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(uv_output))
{
    Image src    = CONVERT_TO_IMAGE_STRUCT(input);
    Image dst_y  = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_uv = CONVERT_TO_IMAGE_STRUCT(uv_output);

    const short2 lower = (short2)(0);
    const short2 upper = (short2)(255);

    // ---- First row: 2 RGBA pixels (8 bytes), alpha bytes 3 and 7 are skipped ----
    const uchar8 top       = vload8(0, src.ptr);
    const float2 top_red   = convert_float2(top.s04);
    const float2 top_green = convert_float2(top.s15);
    const float2 top_blue  = convert_float2(top.s26);

    const float2 top_y_f = (float2)(0.0000f) + 0.2126f * top_red + 0.7152f * top_green + 0.0722f * top_blue;
    const float2 top_u_f = (float2)(0.0000f) - 0.1146f * top_red - 0.3854f * top_green + 0.5000f * top_blue;
    const float2 top_v_f = (float2)(0.0000f) + 0.5000f * top_red - 0.4542f * top_green - 0.0458f * top_blue;

    // Truncate toward zero, shift chroma by 128, clamp to [0, 255]
    const short2 top_y_i = convert_short2_rtz(top_y_f);
    const short2 top_u_i = convert_short2_rtz(top_u_f) + (short2)(128);
    const short2 top_v_i = convert_short2_rtz(top_v_f) + (short2)(128);

    vstore2(convert_uchar2(min(max(top_y_i, lower), upper)), 0, dst_y.ptr);

    const uchar2 top_u = convert_uchar2(min(max(top_u_i, lower), upper));
    const uchar2 top_v = convert_uchar2(min(max(top_v_i, lower), upper));

    // ---- Second row: same processing, one source / luma stride further down ----
    const uchar8 bottom       = vload8(0, src.ptr + input_stride_y);
    const float2 bottom_red   = convert_float2(bottom.s04);
    const float2 bottom_green = convert_float2(bottom.s15);
    const float2 bottom_blue  = convert_float2(bottom.s26);

    const float2 bottom_y_f = (float2)(0.0000f) + 0.2126f * bottom_red + 0.7152f * bottom_green + 0.0722f * bottom_blue;
    const float2 bottom_u_f = (float2)(0.0000f) - 0.1146f * bottom_red - 0.3854f * bottom_green + 0.5000f * bottom_blue;
    const float2 bottom_v_f = (float2)(0.0000f) + 0.5000f * bottom_red - 0.4542f * bottom_green - 0.0458f * bottom_blue;

    const short2 bottom_y_i = convert_short2_rtz(bottom_y_f);
    const short2 bottom_u_i = convert_short2_rtz(bottom_u_f) + (short2)(128);
    const short2 bottom_v_i = convert_short2_rtz(bottom_v_f) + (short2)(128);

    vstore2(convert_uchar2(min(max(bottom_y_i, lower), upper)), 0, dst_y.ptr + luma_output_stride_y);

    const uchar2 bottom_u = convert_uchar2(min(max(bottom_u_i, lower), upper));
    const uchar2 bottom_v = convert_uchar2(min(max(bottom_v_i, lower), upper));

    // ---- Chroma: average of the four samples, stored as one interleaved (U, V) pair ----
    // The scalar sums are evaluated as int, so they cannot wrap before the division.
    const uchar2 uv_pair = (uchar2)(((top_u.s0 + top_u.s1 + bottom_u.s0 + bottom_u.s1) / 4),
                                    ((top_v.s0 + top_v.s1 + bottom_v.s0 + bottom_v.s1) / 4));
    vstore2(uv_pair, 0, dst_uv.ptr);
}
842
/** Convert an RGBA8888 image to IYUV using the BT709 color space
 *
 * Global Workgroup Size [ DIV_CEIL(width, 2), height ]
 * No offset.
 *
 * Every work-item converts a 2x2 block of pixels: it writes 2 luma samples on each of two rows and a single
 * U and a single V sample, each being the average of the four per-pixel chroma values.
 * The alpha channel of the source is not used.
 *
 * @param[in]  rgba_input_ptr                            Pointer to the source image. Supported Format: U8
 * @param[in]  rgba_input_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  rgba_input_step_x                         rgba_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  rgba_input_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  rgba_input_step_y                         rgba_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  rgba_input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
 * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
 * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
 * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
 * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
 * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
 * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
 * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
 * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
 * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
 * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
 * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
 *
 */
__kernel void RGBA8888_to_IYUV_bt709(
    IMAGE_DECLARATION(rgba_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(u_output),
    IMAGE_DECLARATION(v_output))
{
    Image src   = CONVERT_TO_IMAGE_STRUCT(rgba_input);
    Image dst_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_u = CONVERT_TO_IMAGE_STRUCT(u_output);
    Image dst_v = CONVERT_TO_IMAGE_STRUCT(v_output);

    const short2 lower = (short2)(0);
    const short2 upper = (short2)(255);

    // ---- First row: 2 RGBA pixels (8 bytes), alpha bytes 3 and 7 are skipped ----
    const uchar8 top       = vload8(0, src.ptr);
    const float2 top_red   = convert_float2(top.s04);
    const float2 top_green = convert_float2(top.s15);
    const float2 top_blue  = convert_float2(top.s26);

    const float2 top_y_f = (float2)(0.0000f) + 0.2126f * top_red + 0.7152f * top_green + 0.0722f * top_blue;
    const float2 top_u_f = (float2)(0.0000f) - 0.1146f * top_red - 0.3854f * top_green + 0.5000f * top_blue;
    const float2 top_v_f = (float2)(0.0000f) + 0.5000f * top_red - 0.4542f * top_green - 0.0458f * top_blue;

    // Truncate toward zero, shift chroma by 128, clamp to [0, 255]
    const short2 top_y_i = convert_short2_rtz(top_y_f);
    const short2 top_u_i = convert_short2_rtz(top_u_f) + (short2)(128);
    const short2 top_v_i = convert_short2_rtz(top_v_f) + (short2)(128);

    vstore2(convert_uchar2(min(max(top_y_i, lower), upper)), 0, dst_y.ptr);

    const uchar2 top_u = convert_uchar2(min(max(top_u_i, lower), upper));
    const uchar2 top_v = convert_uchar2(min(max(top_v_i, lower), upper));

    // ---- Second row: same processing, one source / luma stride further down ----
    const uchar8 bottom       = vload8(0, src.ptr + rgba_input_stride_y);
    const float2 bottom_red   = convert_float2(bottom.s04);
    const float2 bottom_green = convert_float2(bottom.s15);
    const float2 bottom_blue  = convert_float2(bottom.s26);

    const float2 bottom_y_f = (float2)(0.0000f) + 0.2126f * bottom_red + 0.7152f * bottom_green + 0.0722f * bottom_blue;
    const float2 bottom_u_f = (float2)(0.0000f) - 0.1146f * bottom_red - 0.3854f * bottom_green + 0.5000f * bottom_blue;
    const float2 bottom_v_f = (float2)(0.0000f) + 0.5000f * bottom_red - 0.4542f * bottom_green - 0.0458f * bottom_blue;

    const short2 bottom_y_i = convert_short2_rtz(bottom_y_f);
    const short2 bottom_u_i = convert_short2_rtz(bottom_u_f) + (short2)(128);
    const short2 bottom_v_i = convert_short2_rtz(bottom_v_f) + (short2)(128);

    vstore2(convert_uchar2(min(max(bottom_y_i, lower), upper)), 0, dst_y.ptr + luma_output_stride_y);

    const uchar2 bottom_u = convert_uchar2(min(max(bottom_u_i, lower), upper));
    const uchar2 bottom_v = convert_uchar2(min(max(bottom_v_i, lower), upper));

    // ---- Chroma: average of the four samples (the scalar sums are evaluated as int, so they cannot wrap) ----
    *dst_u.ptr = (uchar)((top_u.s0 + top_u.s1 + bottom_u.s0 + bottom_u.s1) / 4);
    *dst_v.ptr = (uchar)((top_v.s0 + top_v.s1 + bottom_v.s0 + bottom_v.s1) / 4);
}
930
/** Convert an NV12 image to RGBA8888 using the BT709 color space
 *
 * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
 * No offset.
 *
 * Every work-item converts a block of 4x2 pixels: 4 luma samples are read from each of two rows together with
 * two interleaved (Cb, Cr) pairs; each pair is shared by two horizontally adjacent pixels on both rows.
 * The alpha channel of every output pixel is set to 255.
 *
 * @param[in]  luma_input_ptr                           Pointer to the source luma channel. Supported Format: U8
 * @param[in]  luma_input_stride_x                      Stride of the luma image in X dimension (in bytes)
 * @param[in]  luma_input_step_x                        luma_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_input_stride_y                      Stride of the source luma channel in Y dimension (in bytes)
 * @param[in]  luma_input_step_y                        luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_input_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in]  uv_input_ptr                             Pointer to the source uv channel. Supported Format: U8
 * @param[in]  uv_input_stride_x                        Stride of the source image uv channel in X dimension (in bytes)
 * @param[in]  uv_input_step_x                          uv_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  uv_input_stride_y                        Stride of the source image in Y dimension (in bytes)
 * @param[in]  uv_input_step_y                          uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  uv_input_offset_first_element_in_bytes   The offset of the first element in the source image
 * @param[out] rgb_output_ptr                           Pointer to the destination image. Supported Format: U8
 * @param[in]  rgb_output_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  rgb_output_step_x                        rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  rgb_output_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  rgb_output_step_y                        rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void NV12_to_RGBA8888_bt709(
    IMAGE_DECLARATION(luma_input),
    IMAGE_DECLARATION(uv_input),
    IMAGE_DECLARATION(rgb_output))
{
    Image in_luma = CONVERT_TO_IMAGE_STRUCT(luma_input);
    Image in_uv   = CONVERT_TO_IMAGE_STRUCT(uv_input);
    Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);

    const uchar4 luma_0 = vload4(0, in_luma.ptr);
    const uchar4 luma_1 = vload4(0, in_luma.ptr + luma_input_stride_y);
    const uchar4 cbcr   = vload4(0, in_uv.ptr);

    // Re-centre the chroma samples around zero in floating point. Doing the subtraction on char vectors
    // would depend on 128 wrapping to -128 and on signed 8-bit overflow wrapping around; the float form
    // produces the same values (every quantity is exactly representable) without that dependency.
    const float4 cb = convert_float4(cbcr.s0022) - 128.0f;
    const float4 cr = convert_float4(cbcr.s1133) - 128.0f;

    // Chroma contribution to each colour channel; it is identical for both rows.
    // Red takes no Cb term and blue takes no Cr term (their coefficients are zero).
    const float4 red_offset   = 1.5748f * cr;
    const float4 green_offset = (float4)(0.0000f) - 0.1873f * cb - 0.4681f * cr;
    const float4 blue_offset  = 1.8556f * cb;

    // First row: add luma, truncate toward zero with saturation to [0, 255]
    const float4 y_0 = convert_float4(luma_0);
    const uchar4 r_0 = convert_uchar4_sat_rtz(y_0 + red_offset);
    const uchar4 g_0 = convert_uchar4_sat_rtz(y_0 + green_offset);
    const uchar4 b_0 = convert_uchar4_sat_rtz(y_0 + blue_offset);

    // Interleave to RGBA with opaque alpha and write the 4 pixels (16 bytes)
    const uchar16 rgba_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255,
                                     r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
    vstore16(rgba_0, 0, out_rgb.ptr);

    // Second row: same chroma, luma from the next input row, written one output stride further down
    const float4 y_1 = convert_float4(luma_1);
    const uchar4 r_1 = convert_uchar4_sat_rtz(y_1 + red_offset);
    const uchar4 g_1 = convert_uchar4_sat_rtz(y_1 + green_offset);
    const uchar4 b_1 = convert_uchar4_sat_rtz(y_1 + blue_offset);

    const uchar16 rgba_1 = (uchar16)(r_1.s0, g_1.s0, b_1.s0, 255, r_1.s1, g_1.s1, b_1.s1, 255,
                                     r_1.s2, g_1.s2, b_1.s2, 255, r_1.s3, g_1.s3, b_1.s3, 255);
    vstore16(rgba_1, 0, out_rgb.ptr + rgb_output_stride_y);
}
1000
/** Convert an NV12 image to IYUV
 *
 * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
 * No offset.
 *
 * Every work-item copies 16 luma samples on each of two rows unchanged, and splits 16 bytes of interleaved
 * chroma into 8 U samples and 8 V samples.
 *
 * @param[in]  luma_input_ptr                            Pointer to the source luma channel. Supported Format: U8
 * @param[in]  luma_input_stride_x                       Stride of the luma image in X dimension (in bytes)
 * @param[in]  luma_input_step_x                         luma_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_input_stride_y                       Stride of the source luma channel in Y dimension (in bytes)
 * @param[in]  luma_input_step_y                         luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[in]  uv_input_ptr                              Pointer to the source uv channel. Supported Format: U8
 * @param[in]  uv_input_stride_x                         Stride of the source image uv channel in X dimension (in bytes)
 * @param[in]  uv_input_step_x                           uv_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  uv_input_stride_y                         Stride of the source image in Y dimension (in bytes)
 * @param[in]  uv_input_step_y                           uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  uv_input_offset_first_element_in_bytes    The offset of the first element in the source image
 * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
 * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
 * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
 * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
 * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
 * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
 * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
 * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
 * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
 * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
 * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
 * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
 */
__kernel void NV12_to_IYUV_bt709(
    IMAGE_DECLARATION(luma_input),
    IMAGE_DECLARATION(uv_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(u_output),
    IMAGE_DECLARATION(v_output))
{
    Image src_y  = CONVERT_TO_IMAGE_STRUCT(luma_input);
    Image src_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
    Image dst_y  = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_u  = CONVERT_TO_IMAGE_STRUCT(u_output);
    Image dst_v  = CONVERT_TO_IMAGE_STRUCT(v_output);

    // All loads are issued before any store: 2 rows of 16 luma bytes plus 16 bytes of interleaved chroma
    const uchar16 y_top       = vload16(0, src_y.ptr);
    const uchar16 y_bottom    = vload16(0, src_y.ptr + luma_input_stride_y);
    const uchar16 interleaved = vload16(0, src_uv.ptr);

    // Luma is copied through untouched
    vstore16(y_top, 0, dst_y.ptr);
    vstore16(y_bottom, 0, dst_y.ptr + luma_output_stride_y);

    // Even bytes of the interleaved chroma go to the U plane, odd bytes to the V plane
    vstore8(interleaved.even, 0, dst_u.ptr);
    vstore8(interleaved.odd, 0, dst_v.ptr);
}
1062
1063/** Convert an NV12 image to YUV444
1064 *
1065 * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
1066 * No offset.
1067 *
1068 * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
1069 * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
1070 * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
1071 * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
1072 * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
1073 * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
1074 * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
1075 * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
1076 * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
1077 * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
1078 * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
1079 * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
1080 * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
1081 * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
1082 * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
1083 * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
1084 * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
1085 * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
1086 * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
1087 * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
1088 * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
1089 * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
1090 * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
1091 * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
1092 * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
1093 * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
1094 * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
1095 * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
1096 * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
1097 * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
1098 */
1099__kernel void NV12_to_YUV444_bt709(
1100 IMAGE_DECLARATION(luma_input),
1101 IMAGE_DECLARATION(uv_input),
1102 IMAGE_DECLARATION(luma_output),
1103 IMAGE_DECLARATION(u_output),
1104 IMAGE_DECLARATION(v_output))
1105{
1106 Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
1107 Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
1108 Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
1109 Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
1110 Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
1111
1112 // handle 32 pixels every time, two lines, each line for 16 pixels
1113 uchar16 luma_0 = vload16(0, in_y.ptr);
1114 uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
1115 uchar16 cbcr = vload16(0, in_uv.ptr);
1116 uchar16 cb = (uchar16)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2, cbcr.s4, cbcr.s4, cbcr.s6, cbcr.s6, cbcr.s8, cbcr.s8,
1117 cbcr.sa, cbcr.sa, cbcr.sc, cbcr.sc, cbcr.se, cbcr.se);
1118 uchar16 cr = (uchar16)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3, cbcr.s5, cbcr.s5, cbcr.s7, cbcr.s7, cbcr.s9, cbcr.s9,
1119 cbcr.sb, cbcr.sb, cbcr.sd, cbcr.sd, cbcr.sf, cbcr.sf);
1120
1121 vstore16(luma_0, 0, out_y.ptr);
1122 vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
1123 vstore16(cb, 0, out_u.ptr);
1124 vstore16(cb, 0, out_u.ptr + u_output_stride_y);
1125 vstore16(cr, 0, out_v.ptr);
1126 vstore16(cr, 0, out_v.ptr + v_output_stride_y);
1127}
1128
1129/** Convert an NV21 image to RGB888
1130 *
1131 * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
1132 * No offset.
1133 *
1134 * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
1135 * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
1136 * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
1137 * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
1138 * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
1139 * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
1140 * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
1141 * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
1142 * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
1143 * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
1144 * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
1145 * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
1146 * @param[out] rgb_output_ptr Pointer to the destination image. Supported Format: U8
1147 * @param[in] rgb_output_stride_x Stride of the destination image in X dimension (in bytes)
1148 * @param[in] rgb_output_step_x rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
1149 * @param[in] rgb_output_stride_y Stride of the destination image in Y dimension (in bytes)
 * @param[in] rgb_output_step_y rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
1151 * @param[in] rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
1152 */
1153__kernel void NV21_to_RGB888_bt709(
1154 IMAGE_DECLARATION(luma_input),
1155 IMAGE_DECLARATION(uv_input),
1156 IMAGE_DECLARATION(rgb_output))
1157{
1158 Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
1159 Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
1160 Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);
1161
1162 // handle 8 pixels every time, two lines, each line for 4 pixels
1163 uchar4 luma_0 = vload4(0, in_y.ptr);
1164 uchar4 luma_1 = vload4(0, in_y.ptr + luma_input_stride_y);
1165 uchar4 cbcr = vload4(0, in_uv.ptr);
1166 char4 cr = (char4)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2) - (char4)(128);
1167 char4 cb = (char4)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3) - (char4)(128);
1168
1169 float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
1170 float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
1171 float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
1172
1173 float4 f_r = convert_float4(luma_0) + temp0;
1174 float4 f_g = convert_float4(luma_0) + temp1;
1175 float4 f_b = convert_float4(luma_0) + temp2;
1176
Pablo Tello96fc1d62018-07-17 17:10:59 +01001177 uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
1178 uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
1179 uchar4 b_0 = convert_uchar4_sat_rtz(f_b);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001180
1181 uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
1182 uchar4 rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
1183 vstore8(rgb_0, 0, out_rgb.ptr);
1184 vstore4(rgb_1, 0, out_rgb.ptr + 8);
1185
1186 f_r = convert_float4(luma_1) + temp0;
1187 f_g = convert_float4(luma_1) + temp1;
1188 f_b = convert_float4(luma_1) + temp2;
1189
Pablo Tello96fc1d62018-07-17 17:10:59 +01001190 r_0 = convert_uchar4_sat_rtz(f_r);
1191 g_0 = convert_uchar4_sat_rtz(f_g);
1192 b_0 = convert_uchar4_sat_rtz(f_b);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001193
1194 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
1195 rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
1196 vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y);
1197 vstore4(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8);
1198}
1199
/** Convert an NV21 image to RGBA8888
1201 *
1202 * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
1203 * No offset.
1204 *
1205 * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
1206 * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
1207 * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
1208 * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
1209 * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
1210 * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
1211 * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
1212 * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
1213 * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
1214 * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
1215 * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
1216 * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
1217 * @param[out] rgba_output_ptr Pointer to the destination image. Supported Format: U8
1218 * @param[in] rgba_output_stride_x Stride of the destination image in X dimension (in bytes)
1219 * @param[in] rgba_output_step_x rgba_output_stride_x * number of elements along X processed per workitem(in bytes)
1220 * @param[in] rgba_output_stride_y Stride of the destination image in Y dimension (in bytes)
1221 * @param[in] rgba_output_step_y rgba_output_stride_y * number of elements along Y processed per workitem(in bytes)
1222 * @param[in] rgba_output_offset_first_element_in_bytes The offset of the first element in the destination image
1223 */
1224__kernel void NV21_to_RGBA8888_bt709(
1225 IMAGE_DECLARATION(luma_input),
1226 IMAGE_DECLARATION(uv_input),
1227 IMAGE_DECLARATION(rgba_output))
1228{
1229 Image in_luma = CONVERT_TO_IMAGE_STRUCT(luma_input);
1230 Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
1231 Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgba_output);
1232
1233 // handle 8 pixels every time, two lines, each line for 4 pixels
1234 uchar4 luma_0 = vload4(0, in_luma.ptr);
1235 uchar4 luma_1 = vload4(0, in_luma.ptr + luma_input_stride_y);
1236 uchar4 cbcr = vload4(0, in_uv.ptr);
1237 char4 cr = (char4)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2) - (char4)(128);
1238 char4 cb = (char4)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3) - (char4)(128);
1239
1240 float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
1241 float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
1242 float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
1243
1244 float4 f_r = convert_float4(luma_0) + temp0;
1245 float4 f_g = convert_float4(luma_0) + temp1;
1246 float4 f_b = convert_float4(luma_0) + temp2;
1247
Pablo Tello96fc1d62018-07-17 17:10:59 +01001248 uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
1249 uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
1250 uchar4 b_0 = convert_uchar4_sat_rtz(f_b);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001251
1252 uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
1253 uchar8 rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
1254 vstore8(rgb_0, 0, out_rgb.ptr);
1255 vstore8(rgb_1, 0, out_rgb.ptr + 8);
1256
1257 f_r = convert_float4(luma_1) + temp0;
1258 f_g = convert_float4(luma_1) + temp1;
1259 f_b = convert_float4(luma_1) + temp2;
1260
Pablo Tello96fc1d62018-07-17 17:10:59 +01001261 r_0 = convert_uchar4_sat_rtz(f_r);
1262 g_0 = convert_uchar4_sat_rtz(f_g);
1263 b_0 = convert_uchar4_sat_rtz(f_b);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001264
1265 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
1266 rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
1267 vstore8(rgb_0, 0, out_rgb.ptr + rgba_output_stride_y);
1268 vstore8(rgb_1, 0, out_rgb.ptr + rgba_output_stride_y + 8);
1269}
1270
1271/** Convert an NV21 image to YUV444
1272 *
1273 * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
1274 * No offset.
1275 *
1276 * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
1277 * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
1278 * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
1279 * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
1280 * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
1281 * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
1282 * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
1283 * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
1284 * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
1285 * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
1286 * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
1287 * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
1288 * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
1289 * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
1290 * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
1291 * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
1292 * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
1293 * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
1294 * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
1295 * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
1296 * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
1297 * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
1298 * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
1299 * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
1300 * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
1301 * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
1302 * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
1303 * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
1304 * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
1305 * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
1306 */
1307__kernel void NV21_to_YUV444_bt709(
1308 IMAGE_DECLARATION(luma_input),
1309 IMAGE_DECLARATION(uv_input),
1310 IMAGE_DECLARATION(luma_output),
1311 IMAGE_DECLARATION(u_output),
1312 IMAGE_DECLARATION(v_output))
1313{
1314 Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
1315 Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
1316 Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
1317 Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
1318 Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
1319
1320 // handle 32 pixels every time, two lines, each line for 16 pixels
1321 uchar16 luma_0 = vload16(0, in_y.ptr);
1322 uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
1323 uchar16 cbcr = vload16(0, in_uv.ptr);
1324 uchar16 cr = (uchar16)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2, cbcr.s4, cbcr.s4, cbcr.s6, cbcr.s6, cbcr.s8, cbcr.s8,
1325 cbcr.sa, cbcr.sa, cbcr.sc, cbcr.sc, cbcr.se, cbcr.se);
1326 uchar16 cb = (uchar16)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3, cbcr.s5, cbcr.s5, cbcr.s7, cbcr.s7, cbcr.s9, cbcr.s9,
1327 cbcr.sb, cbcr.sb, cbcr.sd, cbcr.sd, cbcr.sf, cbcr.sf);
1328
1329 vstore16(luma_0, 0, out_y.ptr);
1330 vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
1331 vstore16(cb, 0, out_u.ptr);
1332 vstore16(cb, 0, out_u.ptr + u_output_stride_y);
1333 vstore16(cr, 0, out_v.ptr);
1334 vstore16(cr, 0, out_v.ptr + v_output_stride_y);
1335}
1336
1337/** Convert an NV21 image to IYUV
1338 *
1339 * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
1340 * No offset.
1341 *
1342 * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
1343 * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
1344 * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
1345 * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
1346 * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
1347 * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
1348 * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
1349 * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
1350 * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
1351 * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
1352 * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
1353 * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
1354 * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
1355 * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
1356 * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
1357 * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
1358 * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
1359 * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
1360 * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
1361 * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
1362 * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
1363 * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
1364 * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
1365 * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
1366 * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
1367 * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
1368 * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
1369 * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
1370 * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
1371 * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
1372 */
1373__kernel void NV21_to_IYUV_bt709(
1374 IMAGE_DECLARATION(luma_input),
1375 IMAGE_DECLARATION(uv_input),
1376 IMAGE_DECLARATION(luma_output),
1377 IMAGE_DECLARATION(u_output),
1378 IMAGE_DECLARATION(v_output))
1379{
1380 Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
1381 Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
1382 Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
1383 Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
1384 Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
1385
1386 uchar16 luma_0 = vload16(0, in_y.ptr);
1387 uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
1388 uchar16 cbcr = vload16(0, in_uv.ptr);
1389 uchar8 cr = (uchar8)(cbcr.s0, cbcr.s2, cbcr.s4, cbcr.s6, cbcr.s8, cbcr.sa, cbcr.sc, cbcr.se);
1390 uchar8 cb = (uchar8)(cbcr.s1, cbcr.s3, cbcr.s5, cbcr.s7, cbcr.s9, cbcr.sb, cbcr.sd, cbcr.sf);
1391
1392 vstore16(luma_0, 0, out_y.ptr);
1393 vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
1394 vstore8(cb, 0, out_u.ptr);
1395 vstore8(cr, 0, out_v.ptr);
1396}
1397
1398/** Convert a UYVY image to IYUV using BT709 color space
1399 *
1400 * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
1401 * No offset.
1402 *
1403 * @param[in] uyvy_input_ptr Pointer to the source image. Supported Format: U8
1404 * @param[in] uyvy_input_stride_x Stride of the source image in X dimension (in bytes)
1405 * @param[in] uyvy_input_step_x uyvy_input_stride_x * number of elements along X processed per workitem(in bytes)
1406 * @param[in] uyvy_input_stride_y Stride of the source image in Y dimension (in bytes)
1407 * @param[in] uyvy_input_step_y uyvy_input_stride_y * number of elements along Y processed per workitem(in bytes)
1408 * @param[in] uyvy_input_offset_first_element_in_bytes The offset of the first element in the source image
1409 * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
1410 * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
1411 * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
1412 * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
1413 * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
1414 * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
1415 * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
1416 * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
1417 * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
1418 * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
1419 * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
1420 * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
1421 * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
1422 * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
1423 * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
1424 * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
1425 * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
1426 * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
1427 *
1428 */
1429__kernel void UYVY422_to_IYUV_bt709(
1430 IMAGE_DECLARATION(uyvy_input),
1431 IMAGE_DECLARATION(luma_output),
1432 IMAGE_DECLARATION(u_output),
1433 IMAGE_DECLARATION(v_output))
1434{
1435 Image in_uyvy = CONVERT_TO_IMAGE_STRUCT(uyvy_input);
1436 Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
1437 Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
1438 Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
1439
1440 // handle 16 pixels every time, each line 8 pixels
1441 uchar16 uyvy = vload16(0, in_uyvy.ptr);
1442 uchar8 luma = (uchar8)(uyvy.s1, uyvy.s3, uyvy.s5, uyvy.s7, uyvy.s9, uyvy.sb, uyvy.sd, uyvy.sf);
1443 ushort4 cb_0 = (ushort4)(uyvy.s0, uyvy.s4, uyvy.s8, uyvy.sc);
1444 ushort4 cr_0 = (ushort4)(uyvy.s2, uyvy.s6, uyvy.sa, uyvy.se);
1445 vstore8(luma, 0, out_y.ptr);
1446
1447 uyvy = vload16(0, in_uyvy.ptr + uyvy_input_stride_y);
1448 luma = (uchar8)(uyvy.s1, uyvy.s3, uyvy.s5, uyvy.s7, uyvy.s9, uyvy.sb, uyvy.sd, uyvy.sf);
1449 ushort4 cb_1 = (ushort4)(uyvy.s0, uyvy.s4, uyvy.s8, uyvy.sc);
1450 ushort4 cr_1 = (ushort4)(uyvy.s2, uyvy.s6, uyvy.sa, uyvy.se);
1451 vstore8(luma, 0, out_y.ptr + luma_output_stride_y);
1452
1453 uchar4 cb = convert_uchar4((cb_0 + cb_1) / (ushort4)(2));
1454 uchar4 cr = convert_uchar4((cr_0 + cr_1) / (ushort4)(2));
1455 vstore4(cb, 0, out_u.ptr);
1456 vstore4(cr, 0, out_v.ptr);
1457}
1458
1459/** Convert a YUYV image to IYUV using BT709 color space
1460 *
1461 * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
1462 * No offset.
1463 *
1464 * @param[in] yuyv_input_ptr Pointer to the source image. Supported Format: U8
1465 * @param[in] yuyv_input_stride_x Stride of the source image in X dimension (in bytes)
1466 * @param[in] yuyv_input_step_x yuyv_input_stride_x * number of elements along X processed per workitem(in bytes)
1467 * @param[in] yuyv_input_stride_y Stride of the source image in Y dimension (in bytes)
1468 * @param[in] yuyv_input_step_y yuyv_input_stride_y * number of elements along Y processed per workitem(in bytes)
1469 * @param[in] yuyv_input_offset_first_element_in_bytes The offset of the first element in the source image
1470 * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
1471 * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
1472 * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
1473 * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
1474 * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
1475 * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
1476 * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
1477 * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
1478 * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
1479 * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
1480 * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
1481 * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
1482 * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
1483 * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
1484 * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
1485 * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
1486 * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
1487 * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
1488 *
1489 */
1490__kernel void YUYV422_to_IYUV_bt709(
1491 IMAGE_DECLARATION(yuyv_input),
1492 IMAGE_DECLARATION(luma_output),
1493 IMAGE_DECLARATION(u_output),
1494 IMAGE_DECLARATION(v_output))
1495{
1496 Image in_yuyv = CONVERT_TO_IMAGE_STRUCT(yuyv_input);
1497 Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
1498 Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
1499 Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
1500
1501 // handle 16 pixels every time, each line 8 pixels
1502 uchar16 yuyv = vload16(0, in_yuyv.ptr);
1503 uchar8 luma = (uchar8)(yuyv.s0, yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se);
1504 ushort4 cb_0 = (ushort4)(yuyv.s1, yuyv.s5, yuyv.s9, yuyv.sd);
1505 ushort4 cr_0 = (ushort4)(yuyv.s3, yuyv.s7, yuyv.sb, yuyv.sf);
1506 vstore8(luma, 0, out_y.ptr);
1507
1508 yuyv = vload16(0, in_yuyv.ptr + yuyv_input_stride_y);
1509 luma = (uchar8)(yuyv.s0, yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se);
1510 ushort4 cb_1 = (ushort4)(yuyv.s1, yuyv.s5, yuyv.s9, yuyv.sd);
1511 ushort4 cr_1 = (ushort4)(yuyv.s3, yuyv.s7, yuyv.sb, yuyv.sf);
1512 vstore8(luma, 0, out_y.ptr + luma_output_stride_y);
1513
1514 uchar4 cb = convert_uchar4((cb_0 + cb_1) / (ushort4)(2));
1515 uchar4 cr = convert_uchar4((cr_0 + cr_1) / (ushort4)(2));
1516 vstore4(cb, 0, out_u.ptr);
1517 vstore4(cr, 0, out_v.ptr);
1518}
1519
1520/** Convert an IYUV image to RGB888
1521 *
1522 * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
1523 * No offset.
1524 *
1525 * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
1526 * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
1527 * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
1528 * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
1529 * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
1530 * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
1531 * @param[in] u_input_ptr Pointer to the source U channel. Supported Format: U8
1532 * @param[in] u_input_stride_x Stride of the source image U channel in X dimension (in bytes)
1533 * @param[in] u_input_step_x u_input_stride_x * number of elements along X processed per workitem(in bytes)
1534 * @param[in] u_input_stride_y Stride of the source image in Y dimension (in bytes)
1535 * @param[in] u_input_step_y u_input_stride_y * number of elements along Y processed per workitem(in bytes)
1536 * @param[in] u_input_offset_first_element_in_bytes The offset of the first element in the source U channel
1537 * @param[in] v_input_ptr Pointer to the source V channel. Supported Format: U8
1538 * @param[in] v_input_stride_x Stride of the source image V channel in X dimension (in bytes)
1539 * @param[in] v_input_step_x v_input_stride_x * number of elements along X processed per workitem(in bytes)
1540 * @param[in] v_input_stride_y Stride of the source image V channel in Y dimension (in bytes)
1541 * @param[in] v_input_step_y v_input_stride_y * number of elements along Y processed per workitem(in bytes)
1542 * @param[in] v_input_offset_first_element_in_bytes The offset of the first element in the source image V channel
1543 * @param[out] rgb_output_ptr Pointer to the destination image. Supported Format: U8
1544 * @param[in] rgb_output_stride_x Stride of the destination image in X dimension (in bytes)
1545 * @param[in] rgb_output_step_x rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
1546 * @param[in] rgb_output_stride_y Stride of the destination image in Y dimension (in bytes)
1547 * @param[in] rgb_output_step_y rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
1548 * @param[in] rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
1549 */
1550__kernel void IYUV_to_RGB888_bt709(
1551 IMAGE_DECLARATION(luma_input),
1552 IMAGE_DECLARATION(u_input),
1553 IMAGE_DECLARATION(v_input),
1554 IMAGE_DECLARATION(rgb_output))
1555{
1556 Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
1557 Image in_u = CONVERT_TO_IMAGE_STRUCT(u_input);
1558 Image in_v = CONVERT_TO_IMAGE_STRUCT(v_input);
1559 Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);
1560
1561 // handle 8 pixels every time, two lines, each line for 4 pixels
1562 uchar4 luma_0 = vload4(0, in_y.ptr);
1563 uchar4 luma_1 = vload4(0, in_y.ptr + luma_input_stride_y);
1564 uchar4 cbcr = (uchar4)(vload2(0, in_u.ptr), vload2(0, in_v.ptr));
1565 char4 cb = (char4)(cbcr.s0, cbcr.s0, cbcr.s1, cbcr.s1) - (char4)(128);
1566 char4 cr = (char4)(cbcr.s2, cbcr.s2, cbcr.s3, cbcr.s3) - (char4)(128);
1567
1568 float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
1569 float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
1570 float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
1571
1572 float4 f_r = convert_float4(luma_0) + temp0;
1573 float4 f_g = convert_float4(luma_0) + temp1;
1574 float4 f_b = convert_float4(luma_0) + temp2;
1575
Pablo Tello96fc1d62018-07-17 17:10:59 +01001576 uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
1577 uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
1578 uchar4 b_0 = convert_uchar4_sat_rtz(f_b);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001579
1580 uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
1581 uchar4 rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
1582 vstore8(rgb_0, 0, out_rgb.ptr);
1583 vstore4(rgb_1, 0, out_rgb.ptr + 8);
1584
1585 f_r = convert_float4(luma_1) + temp0;
1586 f_g = convert_float4(luma_1) + temp1;
1587 f_b = convert_float4(luma_1) + temp2;
1588
Pablo Tello96fc1d62018-07-17 17:10:59 +01001589 r_0 = convert_uchar4_sat_rtz(f_r);
1590 g_0 = convert_uchar4_sat_rtz(f_g);
1591 b_0 = convert_uchar4_sat_rtz(f_b);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001592
1593 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
1594 rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
1595 vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y);
1596 vstore4(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8);
1597}
1598
/** Convert an IYUV image to RGBA8888
1600 *
1601 * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
1602 * No offset.
1603 *
1604 * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
1605 * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
1606 * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
1607 * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
1608 * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
1609 * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
1610 * @param[in] u_input_ptr Pointer to the source U channel. Supported Format: U8
1611 * @param[in] u_input_stride_x Stride of the source image U channel in X dimension (in bytes)
1612 * @param[in] u_input_step_x u_input_stride_x * number of elements along X processed per workitem(in bytes)
1613 * @param[in] u_input_stride_y Stride of the source image in Y dimension (in bytes)
1614 * @param[in] u_input_step_y u_input_stride_y * number of elements along Y processed per workitem(in bytes)
1615 * @param[in] u_input_offset_first_element_in_bytes The offset of the first element in the source U channel
1616 * @param[in] v_input_ptr Pointer to the source V channel. Supported Format: U8
1617 * @param[in] v_input_stride_x Stride of the source image V channel in X dimension (in bytes)
1618 * @param[in] v_input_step_x v_input_stride_x * number of elements along X processed per workitem(in bytes)
1619 * @param[in] v_input_stride_y Stride of the source image V channel in Y dimension (in bytes)
1620 * @param[in] v_input_step_y v_input_stride_y * number of elements along Y processed per workitem(in bytes)
1621 * @param[in] v_input_offset_first_element_in_bytes The offset of the first element in the source image V channel
1622 * @param[out] rgba_output_ptr Pointer to the destination image. Supported Format: U8
1623 * @param[in] rgba_output_stride_x Stride of the destination image in X dimension (in bytes)
1624 * @param[in] rgba_output_step_x rgba_output_stride_x * number of elements along X processed per workitem(in bytes)
1625 * @param[in] rgba_output_stride_y Stride of the destination image in Y dimension (in bytes)
1626 * @param[in] rgba_output_step_y rgba_output_stride_y * number of elements along Y processed per workitem(in bytes)
1627 * @param[in] rgba_output_offset_first_element_in_bytes The offset of the first element in the destination image
1628 */
1629__kernel void IYUV_to_RGBA8888_bt709(
1630 IMAGE_DECLARATION(luma_input),
1631 IMAGE_DECLARATION(u_input),
1632 IMAGE_DECLARATION(v_input),
1633 IMAGE_DECLARATION(rgba_output))
1634{
1635 Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
1636 Image in_u = CONVERT_TO_IMAGE_STRUCT(u_input);
1637 Image in_v = CONVERT_TO_IMAGE_STRUCT(v_input);
1638 Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgba_output);
1639
1640 // handle 8 pixels every time, two lines, each line for 4 pixels
1641 uchar4 luma_0 = vload4(0, in_y.ptr);
1642 uchar4 luma_1 = vload4(0, in_y.ptr + luma_input_stride_y);
1643 uchar4 cbcr = (uchar4)(vload2(0, in_u.ptr), vload2(0, in_v.ptr));
1644 char4 cb = (char4)(cbcr.s0, cbcr.s0, cbcr.s1, cbcr.s1) - (char4)(128);
1645 char4 cr = (char4)(cbcr.s2, cbcr.s2, cbcr.s3, cbcr.s3) - (char4)(128);
1646
1647 float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
1648 float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
1649 float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
1650
1651 float4 f_r = convert_float4(luma_0) + temp0;
1652 float4 f_g = convert_float4(luma_0) + temp1;
1653 float4 f_b = convert_float4(luma_0) + temp2;
1654
Pablo Tello96fc1d62018-07-17 17:10:59 +01001655 uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
1656 uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
1657 uchar4 b_0 = convert_uchar4_sat_rtz(f_b);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001658
1659 uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
1660 uchar8 rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
1661 vstore8(rgb_0, 0, out_rgb.ptr);
1662 vstore8(rgb_1, 0, out_rgb.ptr + 8);
1663
1664 f_r = convert_float4(luma_1) + temp0;
1665 f_g = convert_float4(luma_1) + temp1;
1666 f_b = convert_float4(luma_1) + temp2;
1667
Pablo Tello96fc1d62018-07-17 17:10:59 +01001668 r_0 = convert_uchar4_sat_rtz(f_r);
1669 g_0 = convert_uchar4_sat_rtz(f_g);
1670 b_0 = convert_uchar4_sat_rtz(f_b);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001671
1672 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
1673 rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
1674 vstore8(rgb_0, 0, out_rgb.ptr + rgba_output_stride_y);
1675 vstore8(rgb_1, 0, out_rgb.ptr + rgba_output_stride_y + 8);
1676}
1677
1678/** Convert an IYUV image to YUV444
1679 *
1680 * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
1681 * No offset.
1682 *
1683 * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
1684 * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
1685 * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
1686 * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
1687 * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
1688 * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
1689 * @param[in] u_input_ptr Pointer to the source U channel. Supported Format: U8
1690 * @param[in] u_input_stride_x Stride of the source image U channel in X dimension (in bytes)
1691 * @param[in] u_input_step_x u_input_stride_x * number of elements along X processed per workitem(in bytes)
1692 * @param[in] u_input_stride_y Stride of the source image in Y dimension (in bytes)
1693 * @param[in] u_input_step_y u_input_stride_y * number of elements along Y processed per workitem(in bytes)
1694 * @param[in] u_input_offset_first_element_in_bytes The offset of the first element in the source U channel
1695 * @param[in] v_input_ptr Pointer to the source V channel. Supported Format: U8
1696 * @param[in] v_input_stride_x Stride of the source image V channel in X dimension (in bytes)
1697 * @param[in] v_input_step_x v_input_stride_x * number of elements along X processed per workitem(in bytes)
1698 * @param[in] v_input_stride_y Stride of the source image V channel in Y dimension (in bytes)
1699 * @param[in] v_input_step_y v_input_stride_y * number of elements along Y processed per workitem(in bytes)
1700 * @param[in] v_input_offset_first_element_in_bytes The offset of the first element in the source image V channel
1701 * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
1702 * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
1703 * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
1704 * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
1705 * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
1706 * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
1707 * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
1708 * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
1709 * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
1710 * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
1711 * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
1712 * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
1713 * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
1714 * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
1715 * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
1716 * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
1717 * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
1718 * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
1719 *
1720 */
1721__kernel void IYUV_to_YUV444_bt709(
1722 IMAGE_DECLARATION(luma_input),
1723 IMAGE_DECLARATION(u_input),
1724 IMAGE_DECLARATION(v_input),
1725 IMAGE_DECLARATION(luma_output),
1726 IMAGE_DECLARATION(u_output),
1727 IMAGE_DECLARATION(v_output))
1728{
1729 Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
1730 Image in_u = CONVERT_TO_IMAGE_STRUCT(u_input);
1731 Image in_v = CONVERT_TO_IMAGE_STRUCT(v_input);
1732 Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
1733 Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
1734 Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
1735
1736 // handle 32 pixels every time, two lines, each line for 16 pixels
1737 uchar16 luma_0 = vload16(0, in_y.ptr);
1738 uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
1739 uchar8 cb_src = vload8(0, in_u.ptr);
1740 uchar8 cr_src = vload8(0, in_v.ptr);
1741 uchar16 cb = (uchar16)(cb_src.s0, cb_src.s0, cb_src.s1, cb_src.s1, cb_src.s2, cb_src.s2, cb_src.s3, cb_src.s3,
1742 cb_src.s4, cb_src.s4, cb_src.s5, cb_src.s5, cb_src.s6, cb_src.s6, cb_src.s7, cb_src.s7);
1743 uchar16 cr = (uchar16)(cr_src.s0, cr_src.s0, cr_src.s1, cr_src.s1, cr_src.s2, cr_src.s2, cr_src.s3, cr_src.s3,
1744 cr_src.s4, cr_src.s4, cr_src.s5, cr_src.s5, cr_src.s6, cr_src.s6, cr_src.s7, cr_src.s7);
1745
1746 vstore16(luma_0, 0, out_y.ptr);
1747 vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
1748 vstore16(cb, 0, out_u.ptr);
1749 vstore16(cb, 0, out_u.ptr + u_output_stride_y);
1750 vstore16(cr, 0, out_v.ptr);
1751 vstore16(cr, 0, out_v.ptr + v_output_stride_y);
1752}
1753
1754/** Convert an IYUV image to NV12
1755 *
1756 * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
1757 * No offset.
1758 *
1759 * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
1760 * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
1761 * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
1762 * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
1763 * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
1764 * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
1765 * @param[in] u_input_ptr Pointer to the source U channel. Supported Format: U8
1766 * @param[in] u_input_stride_x Stride of the source image U channel in X dimension (in bytes)
1767 * @param[in] u_input_step_x u_input_stride_x * number of elements along X processed per workitem(in bytes)
1768 * @param[in] u_input_stride_y Stride of the source image in Y dimension (in bytes)
1769 * @param[in] u_input_step_y u_input_stride_y * number of elements along Y processed per workitem(in bytes)
1770 * @param[in] u_input_offset_first_element_in_bytes The offset of the first element in the source U channel
1771 * @param[in] v_input_ptr Pointer to the source V channel. Supported Format: U8
1772 * @param[in] v_input_stride_x Stride of the source image V channel in X dimension (in bytes)
1773 * @param[in] v_input_step_x v_input_stride_x * number of elements along X processed per workitem(in bytes)
1774 * @param[in] v_input_stride_y Stride of the source image V channel in Y dimension (in bytes)
1775 * @param[in] v_input_step_y v_input_stride_y * number of elements along Y processed per workitem(in bytes)
1776 * @param[in] v_input_offset_first_element_in_bytes The offset of the first element in the source image V channel
1777 * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
1778 * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
1779 * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
1780 * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
1781 * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
1782 * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
1783 * @param[out] uv_output_ptr Pointer to the destination UV channel. Supported Format: U8
1784 * @param[in] uv_output_stride_x Stride of the destination UV channel in X dimension (in bytes)
1785 * @param[in] uv_output_step_x uv_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] uv_output_stride_y Stride of the destination image UV channel in Y dimension (in bytes)
1787 * @param[in] uv_output_step_y uv_output_stride_y * number of elements along Y processed per workitem(in bytes)
1788 * @param[in] uv_output_offset_first_element_in_bytes The offset of the first element in the destination UV channel
1789 *
1790 */
1791__kernel void IYUV_to_NV12_bt709(
1792 IMAGE_DECLARATION(luma_input),
1793 IMAGE_DECLARATION(u_input),
1794 IMAGE_DECLARATION(v_input),
1795 IMAGE_DECLARATION(luma_output),
1796 IMAGE_DECLARATION(uv_output))
1797{
1798 Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
1799 Image in_u = CONVERT_TO_IMAGE_STRUCT(u_input);
1800 Image in_v = CONVERT_TO_IMAGE_STRUCT(v_input);
1801 Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
1802 Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv_output);
1803
1804 // handle 32 pixels every time, two lines, each line for 16 pixels
1805 uchar16 luma_0 = vload16(0, in_y.ptr);
1806 uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
1807 uchar8 cb = vload8(0, in_u.ptr);
1808 uchar8 cr = vload8(0, in_v.ptr);
1809 uchar16 cbcr = (uchar16)(cb.s0, cr.s0, cb.s1, cr.s1, cb.s2, cr.s2, cb.s3, cr.s3, cb.s4, cr.s4, cb.s5, cr.s5, cb.s6,
1810 cr.s6, cb.s7, cr.s7);
1811
1812 vstore16(luma_0, 0, out_y.ptr);
1813 vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
1814 vstore16(cbcr, 0, out_uv.ptr);
1815}
1816
1817/** Convert a YUYV image to NV12 using BT709 color space
1818 *
1819 * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
1820 * No offset.
1821 *
1822 * @param[in] yuyv_input_ptr Pointer to the source image. Supported Format: U8
1823 * @param[in] yuyv_input_stride_x Stride of the source image in X dimension (in bytes)
1824 * @param[in] yuyv_input_step_x yuyv_input_stride_x * number of elements along X processed per workitem(in bytes)
1825 * @param[in] yuyv_input_stride_y Stride of the source image in Y dimension (in bytes)
1826 * @param[in] yuyv_input_step_y yuyv_input_stride_y * number of elements along Y processed per workitem(in bytes)
1827 * @param[in] yuyv_input_offset_first_element_in_bytes The offset of the first element in the source image
1828 * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
1829 * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
1830 * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
1831 * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
1832 * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
1833 * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
1834 * @param[out] uv_output_ptr Pointer to the destination UV channel. Supported Format: U8
1835 * @param[in] uv_output_stride_x Stride of the destination UV channel in X dimension (in bytes)
1836 * @param[in] uv_output_step_x uv_output_stride_x * number of elements along X processed per workitem(in bytes)
1837 * @param[in] uv_output_stride_y Stride of the destination image UV channel in Y dimension (in bytes)
1838 * @param[in] uv_output_step_y uv_output_stride_y * number of elements along Y processed per workitem(in bytes)
1839 * @param[in] uv_output_offset_first_element_in_bytes The offset of the first element in the destination UV channel
1840 *
1841 */
1842__kernel void YUYV422_to_NV12_bt709(
1843 IMAGE_DECLARATION(yuyv_input),
1844 IMAGE_DECLARATION(luma_output),
1845 IMAGE_DECLARATION(uv_output))
1846{
1847 Image in_yuyv = CONVERT_TO_IMAGE_STRUCT(yuyv_input);
1848 Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
1849 Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv_output);
1850
1851 // handle 16 pixels every time, each line 8 pixels
1852 uchar16 yuyv = vload16(0, in_yuyv.ptr);
1853 ushort8 cbcr_0 = (ushort8)(yuyv.s1, yuyv.s3, yuyv.s5, yuyv.s7, yuyv.s9, yuyv.sb, yuyv.sd, yuyv.sf);
1854 uchar8 luma = (uchar8)(yuyv.s0, yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se);
1855 vstore8(luma, 0, out_y.ptr);
1856
1857 yuyv = vload16(0, in_yuyv.ptr + yuyv_input_stride_y);
1858 ushort8 cbcr_1 = (ushort8)(yuyv.s1, yuyv.s3, yuyv.s5, yuyv.s7, yuyv.s9, yuyv.sb, yuyv.sd, yuyv.sf);
1859 luma = (uchar8)(yuyv.s0, yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se);
1860 vstore8(luma, 0, out_y.ptr + luma_output_stride_y);
1861
1862 uchar8 cbcr = convert_uchar8((cbcr_0 + cbcr_1) / (ushort8)(2));
1863 vstore8(cbcr, 0, out_uv.ptr);
1864}
1865
1866/** Convert a UYVY image to NV12 using BT709 color space
1867 *
 * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
1869 * No offset.
1870 *
1871 * @param[in] input_uyvy_ptr Pointer to the source image. Supported Format: U8
1872 * @param[in] input_uyvy_stride_x Stride of the source image in X dimension (in bytes)
1873 * @param[in] input_uyvy_step_x input_uyvy_stride_x * number of elements along X processed per workitem(in bytes)
1874 * @param[in] input_uyvy_stride_y Stride of the source image in Y dimension (in bytes)
1875 * @param[in] input_uyvy_step_y input_uyvy_stride_y * number of elements along Y processed per workitem(in bytes)
1876 * @param[in] input_uyvy_offset_first_element_in_bytes The offset of the first element in the source image
1877 * @param[out] luma_ptr Pointer to the destination luma channel. Supported Format: U8
1878 * @param[in] luma_stride_x Stride of the destination luma channel in X dimension (in bytes)
1879 * @param[in] luma_step_x luma_stride_x * number of elements along X processed per workitem(in bytes)
1880 * @param[in] luma_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
1881 * @param[in] luma_step_y luma_stride_y * number of elements along Y processed per workitem(in bytes)
1882 * @param[in] luma_offset_first_element_in_bytes The offset of the first element in the destination image luma channel
1883 * @param[out] uv_ptr Pointer to the destination uv channel. Supported Format: U8
1884 * @param[in] uv_stride_x Stride of the destination uv channel in X dimension (in bytes)
1885 * @param[in] uv_step_x uv_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] uv_stride_y Stride of the destination image uv channel in Y dimension (in bytes)
1887 * @param[in] uv_step_y uv_stride_y * number of elements along Y processed per workitem(in bytes)
1888 * @param[in] uv_offset_first_element_in_bytes The offset of the first element in the destination image uv channel
1889 *
1890 */
1891__kernel void UYVY422_to_NV12_bt709(
1892 IMAGE_DECLARATION(input_uyvy),
1893 IMAGE_DECLARATION(luma),
1894 IMAGE_DECLARATION(uv))
1895{
1896 Image in = CONVERT_TO_IMAGE_STRUCT(input_uyvy);
1897 Image out_y = CONVERT_TO_IMAGE_STRUCT(luma);
1898 Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv);
1899
1900 // handle 16 pixels every time, each line 8 pixels
1901 const uchar16 uyvy_t = vload16(0, in.ptr);
1902 vstore8(uyvy_t.s13579bdf, 0, out_y.ptr);
1903
1904 const uchar16 uyvy_b = vload16(0, in.ptr + input_uyvy_stride_y);
1905 vstore8(uyvy_b.s13579bdf, 0, out_y.ptr + luma_stride_y);
1906
1907 const ushort8 cbcr_t = (ushort8)(uyvy_t.s0, uyvy_t.s2, uyvy_t.s4, uyvy_t.s6, uyvy_t.s8, uyvy_t.sa, uyvy_t.sc, uyvy_t.se);
1908 const ushort8 cbcr_b = (ushort8)(uyvy_b.s0, uyvy_b.s2, uyvy_b.s4, uyvy_b.s6, uyvy_b.s8, uyvy_b.sa, uyvy_b.sc, uyvy_b.se);
1909 const uchar8 cbcr = convert_uchar8((cbcr_t + cbcr_b) / (ushort8)(2));
1910 vstore8(cbcr, 0, out_uv.ptr);
1911}