blob: 72c74930b02b819d5965a00ba608b130b979bb15 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
Pablo Tellod2d5f752018-06-05 13:37:36 +01002 * Copyright (c) 2016-2018 ARM Limited.
Anthony Barbier6ff3b192017-09-04 18:44:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
25
26/** Convert an RGB888 image to RGBX8888
27 *
28 * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
29 * No offset.
30 *
31 * @param[in] input_ptr Pointer to the source image. Supported Format: U8
32 * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
33 * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
34 * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
35 * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
36 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
37 * @param[out] output_ptr Pointer to the destination image. Supported Format: U8
38 * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
39 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
40 * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
41 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
42 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
43 */
__kernel void RGB888_to_RGBA8888_bt709(
    IMAGE_DECLARATION(input),
    IMAGE_DECLARATION(output))
{
    Image src = CONVERT_TO_IMAGE_STRUCT(input);
    Image dst = CONVERT_TO_IMAGE_STRUCT(output);

    // Every output pixel gets this constant as its fourth byte
    const uchar alpha = 255;

    // One work-item expands 16 packed RGB pixels (48 bytes) into 16 RGBA pixels (64 bytes)
    const uchar16 src_lo  = vload16(0, src.ptr);
    const uchar16 src_mid = vload16(1, src.ptr);
    const uchar16 src_hi  = vload16(2, src.ptr);

    // Pixels 0..3 come entirely from the first 16-byte chunk
    const uchar16 px_0_3 = (uchar16)((uchar4)(src_lo.s012, alpha),
                                     (uchar4)(src_lo.s345, alpha),
                                     (uchar4)(src_lo.s678, alpha),
                                     (uchar4)(src_lo.s9ab, alpha));

    // Pixel 5 straddles the first and second chunk
    const uchar16 px_4_7 = (uchar16)((uchar4)(src_lo.scde, alpha),
                                     (uchar4)(src_lo.sf, src_mid.s01, alpha),
                                     (uchar4)(src_mid.s234, alpha),
                                     (uchar4)(src_mid.s567, alpha));

    // Pixel 10 straddles the second and third chunk
    const uchar16 px_8_11 = (uchar16)((uchar4)(src_mid.s89a, alpha),
                                      (uchar4)(src_mid.sbcd, alpha),
                                      (uchar4)(src_mid.sef, src_hi.s0, alpha),
                                      (uchar4)(src_hi.s123, alpha));

    // Pixels 12..15 come entirely from the third chunk
    const uchar16 px_12_15 = (uchar16)((uchar4)(src_hi.s456, alpha),
                                       (uchar4)(src_hi.s789, alpha),
                                       (uchar4)(src_hi.sabc, alpha),
                                       (uchar4)(src_hi.sdef, alpha));

    vstore16(px_0_3, 0, dst.ptr);
    vstore16(px_4_7, 1, dst.ptr);
    vstore16(px_8_11, 2, dst.ptr);
    vstore16(px_12_15, 3, dst.ptr);
}
66
/** Convert an RGBA8888 image to RGB888
68 *
69 * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
70 * No offset.
71 *
72 * @param[in] input_ptr Pointer to the source image. Supported Format: U8
73 * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
74 * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
75 * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
76 * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
77 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
78 * @param[out] output_ptr Pointer to the destination image. Supported Format: U8
79 * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
80 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
81 * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
82 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
83 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
84 */
__kernel void RGBA8888_to_RGB888_bt709(
    IMAGE_DECLARATION(input),
    IMAGE_DECLARATION(output))
{
    Image src = CONVERT_TO_IMAGE_STRUCT(input);
    Image dst = CONVERT_TO_IMAGE_STRUCT(output);

    // One work-item packs 16 RGBA pixels (64 bytes) into 16 RGB pixels (48 bytes);
    // each 16-byte load holds four pixels and every fourth byte is dropped.
    const uchar16 px_0_3   = vload16(0, src.ptr);
    const uchar16 px_4_7   = vload16(1, src.ptr);
    const uchar16 px_8_11  = vload16(2, src.ptr);
    const uchar16 px_12_15 = vload16(3, src.ptr);

    // Output bytes 0..15: pixels 0..4 plus the first channel of pixel 5
    const uchar16 packed_lo = (uchar16)(px_0_3.s012, px_0_3.s456, px_0_3.s89a, px_0_3.scde,
                                        px_4_7.s012, px_4_7.s4);

    // Output bytes 16..31: rest of pixel 5, pixels 6..9 and the first two channels of pixel 10
    const uchar16 packed_mid = (uchar16)(px_4_7.s56, px_4_7.s89a, px_4_7.scde,
                                         px_8_11.s012, px_8_11.s456, px_8_11.s89);

    // Output bytes 32..47: last channel of pixel 10 and pixels 11..15
    const uchar16 packed_hi = (uchar16)(px_8_11.sa, px_8_11.scde,
                                        px_12_15.s012, px_12_15.s456, px_12_15.s89a, px_12_15.scde);

    vstore16(packed_lo, 0, dst.ptr);
    vstore16(packed_mid, 1, dst.ptr);
    vstore16(packed_hi, 2, dst.ptr);
}
105
106/** Convert a UYVY422 image to RGB888 using BT709 color space
107 *
108 * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
109 * No offset.
110 *
111 * @param[in] input_ptr Pointer to the source image. Supported Format: U8
112 * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
113 * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
114 * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
115 * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
116 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
117 * @param[out] output_ptr Pointer to the destination image. Supported Format: U8
118 * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
119 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
120 * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
121 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
122 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
123 */
__kernel void UYVY422_to_RGB888_bt709(
    IMAGE_DECLARATION(input),
    IMAGE_DECLARATION(output))
{
    Image in  = CONVERT_TO_IMAGE_STRUCT(input);
    Image out = CONVERT_TO_IMAGE_STRUCT(output);

    // handle 8 pixels every time: the 16 input bytes hold four U-Y-V-Y groups of two pixels each
    uchar16 uyvy = vload16(0, in.ptr);

    // Luma sits at the odd byte positions. Each U (bytes 0, 4, 8, 12) and V (bytes 2, 6, 10, 14)
    // sample is duplicated so both pixels of a pair share the same chroma. Subtracting 128 moves
    // the chroma samples to a signed range around zero.
    uchar8 luma = (uchar8)(uyvy.s1, uyvy.s3, uyvy.s5, uyvy.s7, uyvy.s9, uyvy.sb, uyvy.sd, uyvy.sf);
    char8  cb   = (char8)(uyvy.s0, uyvy.s0, uyvy.s4, uyvy.s4, uyvy.s8, uyvy.s8, uyvy.sc, uyvy.sc) - (char8)(128);
    char8  cr   = (char8)(uyvy.s2, uyvy.s2, uyvy.s6, uyvy.s6, uyvy.sa, uyvy.sa, uyvy.se, uyvy.se) - (char8)(128);

    // R = Y + 1.5748 * Cr, G = Y - 0.1873 * Cb - 0.4681 * Cr, B = Y + 1.8556 * Cb
    float8 red_coef_bt709    = (float8)(1.5748f);
    float8 green_coef_bt709  = (float8)(-0.1873f);
    float8 green_coef2_bt709 = (float8)(-0.4681f);
    float8 blue_coef_bt709   = (float8)(1.8556f);
    float8 lumav             = convert_float8(luma);

    float8 f_r = red_coef_bt709 * convert_float8(cr);
    float8 f_g = green_coef_bt709 * convert_float8(cb) + green_coef2_bt709 * convert_float8(cr);
    float8 f_b = blue_coef_bt709 * convert_float8(cb);

    f_r += lumav;
    f_g += lumav;
    f_b += lumav;

    // The sums can fall below 0 or above 255 (e.g. Y = 255 with a large Cr). Converting an
    // out-of-range float to uchar without _sat is implementation-defined, so clamp explicitly.
    uchar8 r_0 = convert_uchar8_sat_rtz(f_r);
    uchar8 g_0 = convert_uchar8_sat_rtz(f_g);
    uchar8 b_0 = convert_uchar8_sat_rtz(f_b);

    // Interleave into 24 bytes of packed RGB: the first 16 bytes end with R of pixel 5,
    // the remaining 8 bytes start with G of pixel 5.
    uchar16 rgb_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2, b_0.s2,
                              r_0.s3, g_0.s3, b_0.s3, r_0.s4, g_0.s4, b_0.s4, r_0.s5);
    uchar8 rgb_1 = (uchar8)(g_0.s5, b_0.s5, r_0.s6, g_0.s6, b_0.s6, r_0.s7, g_0.s7, b_0.s7);

    vstore16(rgb_0, 0, out.ptr);
    vstore8(rgb_1, 0, out.ptr + 16);
}
163
164/** Convert a UYVY422 image to RGBX8888 using BT709 color space
165 *
166 * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
167 * No offset.
168 *
169 * @param[in] input_ptr Pointer to the source image. Supported Format: U8
170 * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
171 * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
172 * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
173 * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
174 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
175 * @param[out] output_ptr Pointer to the destination image. Supported Format: U8
176 * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
177 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
178 * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
179 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
180 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
181 */
__kernel void UYVY422_to_RGBA8888_bt709(
    IMAGE_DECLARATION(input),
    IMAGE_DECLARATION(output))
{
    Image in  = CONVERT_TO_IMAGE_STRUCT(input);
    Image out = CONVERT_TO_IMAGE_STRUCT(output);

    // handle 8 pixels every time: the 16 input bytes hold four U-Y-V-Y groups of two pixels each
    uchar16 uyvy = vload16(0, in.ptr);

    // Luma sits at the odd byte positions. Each U (bytes 0, 4, 8, 12) and V (bytes 2, 6, 10, 14)
    // sample is duplicated so both pixels of a pair share the same chroma. Subtracting 128 moves
    // the chroma samples to a signed range around zero.
    uchar8 luma = (uchar8)(uyvy.s1, uyvy.s3, uyvy.s5, uyvy.s7, uyvy.s9, uyvy.sb, uyvy.sd, uyvy.sf);
    char8  cb   = (char8)(uyvy.s0, uyvy.s0, uyvy.s4, uyvy.s4, uyvy.s8, uyvy.s8, uyvy.sc, uyvy.sc) - (char8)(128);
    char8  cr   = (char8)(uyvy.s2, uyvy.s2, uyvy.s6, uyvy.s6, uyvy.sa, uyvy.sa, uyvy.se, uyvy.se) - (char8)(128);

    // R = Y + 1.5748 * Cr, G = Y - 0.1873 * Cb - 0.4681 * Cr, B = Y + 1.8556 * Cb
    float8 red_coef_bt709    = (float8)(1.5748f);
    float8 green_coef_bt709  = (float8)(-0.1873f);
    float8 green_coef2_bt709 = (float8)(-0.4681f);
    float8 blue_coef_bt709   = (float8)(1.8556f);
    float8 lumav             = convert_float8(luma);

    float8 f_r = red_coef_bt709 * convert_float8(cr);
    float8 f_g = green_coef_bt709 * convert_float8(cb) + green_coef2_bt709 * convert_float8(cr);
    float8 f_b = blue_coef_bt709 * convert_float8(cb);

    f_r += lumav;
    f_g += lumav;
    f_b += lumav;

    // The sums can fall below 0 or above 255 (e.g. Y = 255 with a large Cr). Converting an
    // out-of-range float to uchar without _sat is implementation-defined, so clamp explicitly.
    uchar8 r_0 = convert_uchar8_sat_rtz(f_r);
    uchar8 g_0 = convert_uchar8_sat_rtz(f_g);
    uchar8 b_0 = convert_uchar8_sat_rtz(f_b);

    // Interleave into 32 bytes of RGBA with the fourth byte of every pixel fixed at 255
    uchar16 rgba_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255,
                               r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
    uchar16 rgba_1 = (uchar16)(r_0.s4, g_0.s4, b_0.s4, 255, r_0.s5, g_0.s5, b_0.s5, 255,
                               r_0.s6, g_0.s6, b_0.s6, 255, r_0.s7, g_0.s7, b_0.s7, 255);

    vstore16(rgba_0, 0, out.ptr);
    vstore16(rgba_1, 0, out.ptr + 16);
}
222
223/** Convert a YUYV422 image to RGB888 using BT709 color space
224 *
225 * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
226 * No offset.
227 *
228 * @param[in] input_ptr Pointer to the source image. Supported Format: U8
229 * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
230 * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
231 * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
232 * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
233 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
234 * @param[out] output_ptr Pointer to the destination image. Supported Format: U8
235 * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
236 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
237 * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
238 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
239 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
240 */
__kernel void YUYV422_to_RGB888_bt709(
    IMAGE_DECLARATION(input),
    IMAGE_DECLARATION(output))
{
    Image in  = CONVERT_TO_IMAGE_STRUCT(input);
    Image out = CONVERT_TO_IMAGE_STRUCT(output);

    // handle 8 pixels every time: the 16 input bytes hold four Y-U-Y-V groups of two pixels each
    // (the variable keeps its historical name even though the byte order here is YUYV)
    uchar16 uyvy = vload16(0, in.ptr);

    // Luma sits at the even byte positions. Each U (bytes 1, 5, 9, 13) and V (bytes 3, 7, 11, 15)
    // sample is duplicated so both pixels of a pair share the same chroma. Subtracting 128 moves
    // the chroma samples to a signed range around zero.
    uchar8 luma = (uchar8)(uyvy.s0, uyvy.s2, uyvy.s4, uyvy.s6, uyvy.s8, uyvy.sa, uyvy.sc, uyvy.se);
    char8  cb   = (char8)(uyvy.s1, uyvy.s1, uyvy.s5, uyvy.s5, uyvy.s9, uyvy.s9, uyvy.sd, uyvy.sd) - (char8)(128);
    char8  cr   = (char8)(uyvy.s3, uyvy.s3, uyvy.s7, uyvy.s7, uyvy.sb, uyvy.sb, uyvy.sf, uyvy.sf) - (char8)(128);

    // R = Y + 1.5748 * Cr, G = Y - 0.1873 * Cb - 0.4681 * Cr, B = Y + 1.8556 * Cb
    float8 red_coef_bt709    = (float8)(1.5748f);
    float8 green_coef_bt709  = (float8)(-0.1873f);
    float8 green_coef2_bt709 = (float8)(-0.4681f);
    float8 blue_coef_bt709   = (float8)(1.8556f);
    float8 lumav             = convert_float8(luma);

    float8 f_r = red_coef_bt709 * convert_float8(cr);
    float8 f_g = green_coef_bt709 * convert_float8(cb) + green_coef2_bt709 * convert_float8(cr);
    float8 f_b = blue_coef_bt709 * convert_float8(cb);

    f_r += lumav;
    f_g += lumav;
    f_b += lumav;

    // The sums can fall below 0 or above 255 (e.g. Y = 255 with a large Cr). Converting an
    // out-of-range float to uchar without _sat is implementation-defined, so clamp explicitly.
    uchar8 r_0 = convert_uchar8_sat_rtz(f_r);
    uchar8 g_0 = convert_uchar8_sat_rtz(f_g);
    uchar8 b_0 = convert_uchar8_sat_rtz(f_b);

    // Interleave into 24 bytes of packed RGB: the first 16 bytes end with R of pixel 5,
    // the remaining 8 bytes start with G of pixel 5.
    uchar16 rgb_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2, b_0.s2,
                              r_0.s3, g_0.s3, b_0.s3, r_0.s4, g_0.s4, b_0.s4, r_0.s5);
    uchar8 rgb_1 = (uchar8)(g_0.s5, b_0.s5, r_0.s6, g_0.s6, b_0.s6, r_0.s7, g_0.s7, b_0.s7);

    vstore16(rgb_0, 0, out.ptr);
    vstore8(rgb_1, 0, out.ptr + 16);
}
280
281/** Convert a YUYV422 image to RGBX8888 using BT709 color space
282 *
283 * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
284 * No offset.
285 *
286 * @param[in] input_ptr Pointer to the source image. Supported Format: U8
287 * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
288 * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
289 * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
290 * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
291 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
292 * @param[out] output_ptr Pointer to the destination image. Supported Format: U8
293 * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
294 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
295 * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
296 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
297 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
298 */
__kernel void YUYV422_to_RGBA8888_bt709(
    IMAGE_DECLARATION(input),
    IMAGE_DECLARATION(output))
{
    Image in  = CONVERT_TO_IMAGE_STRUCT(input);
    Image out = CONVERT_TO_IMAGE_STRUCT(output);

    // handle 8 pixels every time: the 16 input bytes hold four Y-U-Y-V groups of two pixels each
    // (the variable keeps its historical name even though the byte order here is YUYV)
    uchar16 uyvy = vload16(0, in.ptr);

    // Luma sits at the even byte positions. Each U (bytes 1, 5, 9, 13) and V (bytes 3, 7, 11, 15)
    // sample is duplicated so both pixels of a pair share the same chroma. Subtracting 128 moves
    // the chroma samples to a signed range around zero.
    uchar8 luma = (uchar8)(uyvy.s0, uyvy.s2, uyvy.s4, uyvy.s6, uyvy.s8, uyvy.sa, uyvy.sc, uyvy.se);
    char8  cb   = (char8)(uyvy.s1, uyvy.s1, uyvy.s5, uyvy.s5, uyvy.s9, uyvy.s9, uyvy.sd, uyvy.sd) - (char8)(128);
    char8  cr   = (char8)(uyvy.s3, uyvy.s3, uyvy.s7, uyvy.s7, uyvy.sb, uyvy.sb, uyvy.sf, uyvy.sf) - (char8)(128);

    // R = Y + 1.5748 * Cr, G = Y - 0.1873 * Cb - 0.4681 * Cr, B = Y + 1.8556 * Cb
    float8 red_coef_bt709    = (float8)(1.5748f);
    float8 green_coef_bt709  = (float8)(-0.1873f);
    float8 green_coef2_bt709 = (float8)(-0.4681f);
    float8 blue_coef_bt709   = (float8)(1.8556f);
    float8 lumav             = convert_float8(luma);

    float8 f_r = red_coef_bt709 * convert_float8(cr);
    float8 f_g = green_coef_bt709 * convert_float8(cb) + green_coef2_bt709 * convert_float8(cr);
    float8 f_b = blue_coef_bt709 * convert_float8(cb);

    f_r += lumav;
    f_g += lumav;
    f_b += lumav;

    // The sums can fall below 0 or above 255 (e.g. Y = 255 with a large Cr). Converting an
    // out-of-range float to uchar without _sat is implementation-defined, so clamp explicitly.
    uchar8 r_0 = convert_uchar8_sat_rtz(f_r);
    uchar8 g_0 = convert_uchar8_sat_rtz(f_g);
    uchar8 b_0 = convert_uchar8_sat_rtz(f_b);

    // Interleave into 32 bytes of RGBA with the fourth byte of every pixel fixed at 255
    uchar16 rgba_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255,
                               r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
    uchar16 rgba_1 = (uchar16)(r_0.s4, g_0.s4, b_0.s4, 255, r_0.s5, g_0.s5, b_0.s5, 255,
                               r_0.s6, g_0.s6, b_0.s6, 255, r_0.s7, g_0.s7, b_0.s7, 255);

    vstore16(rgba_0, 0, out.ptr);
    vstore16(rgba_1, 0, out.ptr + 16);
}
339
340/** Convert a RGB image to NV12 using BT709 color space
341 *
342 * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
343 * No offset.
344 *
345 * @param[in] input_ptr Pointer to the source image. Supported Format: U8
346 * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
347 * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
348 * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
349 * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
350 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
351 * @param[out] luma_ptr Pointer to the destination luma channel. Supported Format: U8
352 * @param[in] luma_stride_x Stride of the destination luma channel in X dimension (in bytes)
353 * @param[in] luma_step_x luma_stride_x * number of elements along X processed per workitem(in bytes)
354 * @param[in] luma_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
355 * @param[in] luma_step_y luma_stride_y * number of elements along Y processed per workitem(in bytes)
356 * @param[in] luma_offset_first_element_in_bytes The offset of the first element in the destination image luma channel
357 * @param[out] uv_ptr Pointer to the destination uv channel. Supported Format: U8
358 * @param[in] uv_stride_x Stride of the destination uv channel in X dimension (in bytes)
359 * @param[in] uv_step_x uv_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] uv_stride_y Stride of the destination uv channel in Y dimension (in bytes)
361 * @param[in] uv_step_y uv_stride_y * number of elements along Y processed per workitem(in bytes)
362 * @param[in] uv_offset_first_element_in_bytes The offset of the first element in the destination image uv channel
363 *
364 */
__kernel void RGB888_to_NV12_bt709(
    IMAGE_DECLARATION(input),
    IMAGE_DECLARATION(luma),
    IMAGE_DECLARATION(uv))
{
    Image src      = CONVERT_TO_IMAGE_STRUCT(input);
    Image dst_luma = CONVERT_TO_IMAGE_STRUCT(luma);
    Image dst_uv   = CONVERT_TO_IMAGE_STRUCT(uv);

    // Output range limits and the offset added to the signed chroma values
    const short2 lower_bound = (short2)(0);
    const short2 upper_bound = (short2)(255);
    const short2 chroma_bias = (short2)(128);

    // Each work-item covers a 2x2 pixel square: two luma samples per row and one
    // shared (Cb, Cr) pair obtained by averaging the four per-pixel chroma values.

    // First row: the 8-byte load covers the two RGB pixels used (bytes 0..5)
    const uchar8 row0 = vload8(0, src.ptr);
    const float2 r0   = convert_float2(row0.s03);
    const float2 g0   = convert_float2(row0.s14);
    const float2 b0   = convert_float2(row0.s25);

    const float2 y0_f = (float2)(0.0000f) + (float2)(0.2126f) * r0 + (float2)(0.7152f) * g0 + (float2)(0.0722f) * b0;
    const float2 u0_f = (float2)(0.0000f) - (float2)(0.1146f) * r0 - (float2)(0.3854f) * g0 + (float2)(0.5000f) * b0;
    const float2 v0_f = (float2)(0.0000f) + (float2)(0.5000f) * r0 - (float2)(0.4542f) * g0 - (float2)(0.0458f) * b0;

    const short2 y0_i = convert_short2_rtz(y0_f);
    const short2 u0_i = convert_short2_rtz(u0_f) + chroma_bias;
    const short2 v0_i = convert_short2_rtz(v0_f) + chroma_bias;

    vstore2(convert_uchar2(max(lower_bound, min(y0_i, upper_bound))), 0, dst_luma.ptr);

    const uchar2 cb_row0 = convert_uchar2(max(lower_bound, min(u0_i, upper_bound)));
    const uchar2 cr_row0 = convert_uchar2(max(lower_bound, min(v0_i, upper_bound)));

    // Second row: same computation, one input/luma row stride further down
    const uchar8 row1 = vload8(0, src.ptr + input_stride_y);
    const float2 r1   = convert_float2(row1.s03);
    const float2 g1   = convert_float2(row1.s14);
    const float2 b1   = convert_float2(row1.s25);

    const float2 y1_f = (float2)(0.0000f) + (float2)(0.2126f) * r1 + (float2)(0.7152f) * g1 + (float2)(0.0722f) * b1;
    const float2 u1_f = (float2)(0.0000f) - (float2)(0.1146f) * r1 - (float2)(0.3854f) * g1 + (float2)(0.5000f) * b1;
    const float2 v1_f = (float2)(0.0000f) + (float2)(0.5000f) * r1 - (float2)(0.4542f) * g1 - (float2)(0.0458f) * b1;

    const short2 y1_i = convert_short2_rtz(y1_f);
    const short2 u1_i = convert_short2_rtz(u1_f) + chroma_bias;
    const short2 v1_i = convert_short2_rtz(v1_f) + chroma_bias;

    vstore2(convert_uchar2(max(lower_bound, min(y1_i, upper_bound))), 0, dst_luma.ptr + luma_stride_y);

    const uchar2 cb_row1 = convert_uchar2(max(lower_bound, min(u1_i, upper_bound)));
    const uchar2 cr_row1 = convert_uchar2(max(lower_bound, min(v1_i, upper_bound)));

    // The uchar operands are promoted to int before the addition, so the four-sample sum cannot wrap
    const uchar2 cbcr = (uchar2)(((cb_row0.s0 + cb_row0.s1 + cb_row1.s0 + cb_row1.s1) / 4),
                                 ((cr_row0.s0 + cr_row0.s1 + cr_row1.s0 + cr_row1.s1) / 4));

    vstore2(cbcr, 0, dst_uv.ptr);
}
419
420/*
421 R'= Y' + 0.0000*U + 1.5748*V
422 G'= Y' - 0.1873*U - 0.4681*V
423 B'= Y' + 1.8556*U + 0.0000*V
424*/
425
426/** Convert an NV12 image to RGB888
427 *
428 * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
429 * No offset.
430 *
431 * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
432 * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
433 * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
434 * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
435 * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
436 * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
437 * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
438 * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
439 * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
440 * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
441 * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
442 * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
443 * @param[out] rgb_output_ptr Pointer to the destination image. Supported Format: U8
444 * @param[in] rgb_output_stride_x Stride of the destination image in X dimension (in bytes)
445 * @param[in] rgb_output_step_x rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
446 * @param[in] rgb_output_stride_y Stride of the destination image in Y dimension (in bytes)
 * @param[in] rgb_output_step_y rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
448 * @param[in] rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
449 */
__kernel void NV12_to_RGB888_bt709(
    IMAGE_DECLARATION(luma_input),
    IMAGE_DECLARATION(uv_input),
    IMAGE_DECLARATION(rgb_output))
{
    Image in_luma = CONVERT_TO_IMAGE_STRUCT(luma_input);
    Image in_uv   = CONVERT_TO_IMAGE_STRUCT(uv_input);
    Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);

    // handle 8 pixels every time, two lines, each line for 4 pixels
    uchar4 luma_0 = vload4(0, in_luma.ptr);
    uchar4 luma_1 = vload4(0, in_luma.ptr + luma_input_stride_y);

    // Four interleaved chroma bytes (Cb, Cr, Cb, Cr). Each pair is duplicated across two
    // neighbouring pixels and reused for both rows; subtracting 128 moves it to a signed range.
    uchar4 cbcr = vload4(0, in_uv.ptr);
    char4  cb   = (char4)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2) - (char4)(128);
    char4  cr   = (char4)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3) - (char4)(128);

    // Chroma contribution to R, G and B; it is row-independent, so it is computed once
    float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
    float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
    float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);

    // First row
    float4 f_r = convert_float4(luma_0) + temp0;
    float4 f_g = convert_float4(luma_0) + temp1;
    float4 f_b = convert_float4(luma_0) + temp2;

    // The sums can fall below 0 or above 255. Converting an out-of-range float to uchar
    // without _sat is implementation-defined, so clamp explicitly.
    uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
    uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
    uchar4 b_0 = convert_uchar4_sat_rtz(f_b);

    // 4 pixels -> 12 bytes of packed RGB, written as an 8-byte and a 4-byte store
    uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
    uchar4 rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
    vstore8(rgb_0, 0, out_rgb.ptr);
    vstore4(rgb_1, 0, out_rgb.ptr + 8);

    // Second row: same chroma, luma taken from the next input row
    f_r = convert_float4(luma_1) + temp0;
    f_g = convert_float4(luma_1) + temp1;
    f_b = convert_float4(luma_1) + temp2;

    r_0 = convert_uchar4_sat_rtz(f_r);
    g_0 = convert_uchar4_sat_rtz(f_g);
    b_0 = convert_uchar4_sat_rtz(f_b);

    rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
    rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
    vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y);
    vstore4(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8);
}
496
497/** Convert a RGB image to YUV444 using BT709 color space
498 *
499 * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
500 * No offset.
501 *
502 * @param[in] rgb_input_ptr Pointer to the source image. Supported Format: U8
503 * @param[in] rgb_input_stride_x Stride of the source image in X dimension (in bytes)
 * @param[in] rgb_input_step_x rgb_input_stride_x * number of elements along X processed per workitem(in bytes)
505 * @param[in] rgb_input_stride_y Stride of the source image in Y dimension (in bytes)
506 * @param[in] rgb_input_step_y rgb_input_stride_y * number of elements along Y processed per workitem(in bytes)
507 * @param[in] rgb_input_offset_first_element_in_bytes The offset of the first element in the source image
508 * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
509 * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
510 * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
511 * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
512 * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
513 * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
514 * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
515 * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
516 * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
517 * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
518 * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
519 * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
520 * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
521 * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
522 * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
523 * @param[in] v_output_stride_y Stride of the destination image V channel in Y dimension (in bytes)
524 * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
525 * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
526 *
527 */
__kernel void RGB888_to_YUV444_bt709(
    IMAGE_DECLARATION(rgb_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(u_output),
    IMAGE_DECLARATION(v_output))
{
    Image src   = CONVERT_TO_IMAGE_STRUCT(rgb_input);
    Image dst_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_u = CONVERT_TO_IMAGE_STRUCT(u_output);
    Image dst_v = CONVERT_TO_IMAGE_STRUCT(v_output);

    // Output range limits and the offset added to the signed chroma values
    const short4 lower_bound = (short4)(0);
    const short4 upper_bound = (short4)(255);
    const short4 chroma_bias = (short4)(128);

    // One work-item converts four pixels. The 16-byte load covers the 12 bytes of
    // packed RGB that are used; the last four loaded bytes are ignored.
    const uchar16 packed = vload16(0, src.ptr);

    // De-interleave the channels and widen them to float
    const float4 red   = convert_float4(packed.s0369);
    const float4 green = convert_float4(packed.s147a);
    const float4 blue  = convert_float4(packed.s258b);

    const float4 y_f = (float4)(0.0000f) + (float4)(0.2126f) * red + (float4)(0.7152f) * green + (float4)(0.0722f) * blue;
    const float4 u_f = (float4)(0.0000f) - (float4)(0.1146f) * red - (float4)(0.3854f) * green + (float4)(0.5000f) * blue;
    const float4 v_f = (float4)(0.0000f) + (float4)(0.5000f) * red - (float4)(0.4542f) * green - (float4)(0.0458f) * blue;

    // Truncate towards zero, shift chroma by 128, then clamp to [0, 255] before narrowing
    const short4 y_i = convert_short4_rtz(y_f);
    const short4 u_i = convert_short4_rtz(u_f) + chroma_bias;
    const short4 v_i = convert_short4_rtz(v_f) + chroma_bias;

    const uchar4 y_out = convert_uchar4(max(lower_bound, min(y_i, upper_bound)));
    vstore4(y_out, 0, dst_y.ptr);

    const uchar4 u_out = convert_uchar4(max(lower_bound, min(u_i, upper_bound)));
    const uchar4 v_out = convert_uchar4(max(lower_bound, min(v_i, upper_bound)));
    vstore4(u_out, 0, dst_u.ptr);
    vstore4(v_out, 0, dst_v.ptr);
}
562
563/** Convert a RGB image to IYUV using BT709 color space
564 *
565 * Global Workgroup Size [ DIV_CEIL(width, 2), height ]
566 * No offset.
567 *
568 * @param[in] rgb_input_ptr Pointer to the source image. Supported Format: U8
569 * @param[in] rgb_input_stride_x Stride of the source image in X dimension (in bytes)
 * @param[in] rgb_input_step_x rgb_input_stride_x * number of elements along X processed per workitem(in bytes)
571 * @param[in] rgb_input_stride_y Stride of the source image in Y dimension (in bytes)
572 * @param[in] rgb_input_step_y rgb_input_stride_y * number of elements along Y processed per workitem(in bytes)
573 * @param[in] rgb_input_offset_first_element_in_bytes The offset of the first element in the source image
574 * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
575 * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
576 * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
577 * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
578 * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
579 * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
580 * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
581 * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
582 * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
583 * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
584 * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
585 * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
586 * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
587 * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
588 * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
589 * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
590 * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
591 * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
592 *
593 */
__kernel void RGB888_to_IYUV_bt709(
    IMAGE_DECLARATION(rgb_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(u_output),
    IMAGE_DECLARATION(v_output))
{
    // Each work-item converts a 2x2 quad of packed RGB pixels (2 pixels on each of
    // two consecutive rows) into 4 luma samples and a single averaged U/V sample.
    Image src   = CONVERT_TO_IMAGE_STRUCT(rgb_input);
    Image dst_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_u = CONVERT_TO_IMAGE_STRUCT(u_output);
    Image dst_v = CONVERT_TO_IMAGE_STRUCT(v_output);

    // 8 bytes are loaded per row, but only the first 6 (two 3-byte pixels) are used.
    const uchar8 top = vload8(0, src.ptr);
    const uchar8 bot = vload8(0, src.ptr + rgb_input_stride_y);

    // Lanes 0-1 hold the top-row pixels, lanes 2-3 the bottom-row pixels.
    const float4 red   = convert_float4((uchar4)(top.s03, bot.s03));
    const float4 green = convert_float4((uchar4)(top.s14, bot.s14));
    const float4 blue  = convert_float4((uchar4)(top.s25, bot.s25));

    // RGB -> YUV weighting (coefficients as used by the other *_bt709 kernels in this file)
    const float4 y_f = 0.2126f * red + 0.7152f * green + 0.0722f * blue;
    const float4 u_f = -0.1146f * red - 0.3854f * green + 0.5000f * blue;
    const float4 v_f = 0.5000f * red - 0.4542f * green - 0.0458f * blue;

    const short4 lo = (short4)(0);
    const short4 hi = (short4)(255);

    // Truncate towards zero, re-centre chroma on 128 and clamp everything to [0, 255].
    const uchar4 luma = convert_uchar4(clamp(convert_short4_rtz(y_f), lo, hi));
    const uchar4 cb   = convert_uchar4(clamp(convert_short4_rtz(u_f) + (short4)(128), lo, hi));
    const uchar4 cr   = convert_uchar4(clamp(convert_short4_rtz(v_f) + (short4)(128), lo, hi));

    vstore2(luma.s01, 0, dst_y.ptr);
    vstore2(luma.s23, 0, dst_y.ptr + luma_output_stride_y);

    // Scalar uchar operands are promoted to int, so the 4-sample sum cannot overflow.
    *dst_u.ptr = (uchar)((cb.s0 + cb.s1 + cb.s2 + cb.s3) / 4);
    *dst_v.ptr = (uchar)((cr.s0 + cr.s1 + cr.s2 + cr.s3) / 4);
}
650
651/** Convert a RGBA image to YUV444 using BT709 color space
652 *
653 * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
654 * No offset.
655 *
656 * @param[in] rgba_input_ptr Pointer to the source image. Supported Format: U8
657 * @param[in] rgba_input_stride_x Stride of the source image in X dimension (in bytes)
 * @param[in] rgba_input_step_x rgba_input_stride_x * number of elements along X processed per workitem(in bytes)
659 * @param[in] rgba_input_stride_y Stride of the source image in Y dimension (in bytes)
 * @param[in] rgba_input_step_y rgba_input_stride_y * number of elements along Y processed per workitem(in bytes)
661 * @param[in] rgba_input_offset_first_element_in_bytes The offset of the first element in the source image
662 * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
663 * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
664 * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
665 * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
666 * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
667 * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
668 * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
669 * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
670 * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
671 * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
672 * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
673 * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
674 * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
675 * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
676 * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
677 * @param[in] v_output_stride_y Stride of the destination image V channel in Y dimension (in bytes)
678 * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
679 * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
680 *
681 */
__kernel void RGBA8888_to_YUV444_bt709(
    IMAGE_DECLARATION(rgba_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(u_output),
    IMAGE_DECLARATION(v_output))
{
    // Each work-item converts 4 horizontally adjacent RGBA pixels into 4 Y, 4 U and
    // 4 V samples. The alpha byte of every pixel is loaded but never used.
    Image in_rgba = CONVERT_TO_IMAGE_STRUCT(rgba_input);
    Image out_y   = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image out_u   = CONVERT_TO_IMAGE_STRUCT(u_output);
    Image out_v   = CONVERT_TO_IMAGE_STRUCT(v_output);

    // Read 4 pixels (16 bytes) and de-interleave the colour channels
    uchar16 rgba_0 = vload16(0, in_rgba.ptr);
    uchar4  r_0    = rgba_0.s048c;
    uchar4  g_0    = rgba_0.s159d;
    uchar4  b_0    = rgba_0.s26ae;

    float4 f_y = (float4)(0.0000f) + (float4)(0.2126f) * convert_float4(r_0) + (float4)(0.7152f) * convert_float4(g_0) + (float4)(0.0722f) * convert_float4(b_0);
    float4 f_u = (float4)(0.0000f) - (float4)(0.1146f) * convert_float4(r_0) - (float4)(0.3854f) * convert_float4(g_0) + (float4)(0.5000f) * convert_float4(b_0);
    float4 f_v = (float4)(0.0000f) + (float4)(0.5000f) * convert_float4(r_0) - (float4)(0.4542f) * convert_float4(g_0) - (float4)(0.0458f) * convert_float4(b_0);

    // Round-toward-zero is already the default for float -> integer conversions;
    // it is spelled out here to match the other RGB -> YUV kernels in this file.
    short4 i_y = convert_short4_rtz(f_y);
    short4 i_u = convert_short4_rtz(f_u) + (short4)(128);
    short4 i_v = convert_short4_rtz(f_v) + (short4)(128);

    // The explicit clamp to [0, 255] makes a saturating conversion unnecessary.
    uchar4 luma_0 = convert_uchar4(max((short4)(0), min(i_y, (short4)(255))));
    vstore4(luma_0, 0, out_y.ptr);

    uchar4 cb_0 = convert_uchar4(max((short4)(0), min(i_u, (short4)(255))));
    uchar4 cr_0 = convert_uchar4(max((short4)(0), min(i_v, (short4)(255))));
    vstore4(cb_0, 0, out_u.ptr);
    vstore4(cr_0, 0, out_v.ptr);
}
716
717/** Convert a RGBA image to NV12 using BT709 color space
718 *
719 * Global Workgroup Size [ DIV_CEIL(width, 2), height ]
720 * No offset.
721 *
722 * @param[in] input_ptr Pointer to the source image. Supported Format: U8
723 * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
724 * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
725 * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
726 * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
727 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
728 * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
729 * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
730 * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
731 * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
732 * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
733 * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination image luma channel
734 * @param[out] uv_output_ptr Pointer to the destination uv channel. Supported Format: U8
735 * @param[in] uv_output_stride_x Stride of the destination uv channel in X dimension (in bytes)
736 * @param[in] uv_output_step_x uv_output_stride_x * number of elements along X processed per workitem(in bytes)
737 * @param[in] uv_output_stride_y Stride of the destination image uv channel in Y dimension (in bytes)
738 * @param[in] uv_output_step_y uv_output_stride_y * number of elements along Y processed per workitem(in bytes)
739 * @param[in] uv_output_offset_first_element_in_bytes The offset of the first element in the destination image uv channel
740 *
741 */
__kernel void RGBA8888_to_NV12_bt709(
    IMAGE_DECLARATION(input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(uv_output))
{
    // Each work-item converts a 2x2 quad of RGBA pixels (2 pixels on each of two
    // consecutive rows) into 4 luma samples and one interleaved, averaged U/V pair.
    Image src    = CONVERT_TO_IMAGE_STRUCT(input);
    Image dst_y  = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_uv = CONVERT_TO_IMAGE_STRUCT(uv_output);

    // Two 4-byte pixels per row; the alpha bytes (s3, s7) are never used.
    const uchar8 top = vload8(0, src.ptr);
    const uchar8 bot = vload8(0, src.ptr + input_stride_y);

    // Lanes 0-1 hold the top-row pixels, lanes 2-3 the bottom-row pixels.
    const float4 red   = convert_float4((uchar4)(top.s04, bot.s04));
    const float4 green = convert_float4((uchar4)(top.s15, bot.s15));
    const float4 blue  = convert_float4((uchar4)(top.s26, bot.s26));

    // RGB -> YUV weighting (coefficients as used by the other *_bt709 kernels in this file)
    const float4 y_f = 0.2126f * red + 0.7152f * green + 0.0722f * blue;
    const float4 u_f = -0.1146f * red - 0.3854f * green + 0.5000f * blue;
    const float4 v_f = 0.5000f * red - 0.4542f * green - 0.0458f * blue;

    const short4 lo = (short4)(0);
    const short4 hi = (short4)(255);

    // Truncate towards zero, re-centre chroma on 128 and clamp everything to [0, 255].
    const uchar4 luma = convert_uchar4(clamp(convert_short4_rtz(y_f), lo, hi));
    const uchar4 cb   = convert_uchar4(clamp(convert_short4_rtz(u_f) + (short4)(128), lo, hi));
    const uchar4 cr   = convert_uchar4(clamp(convert_short4_rtz(v_f) + (short4)(128), lo, hi));

    vstore2(luma.s01, 0, dst_y.ptr);
    vstore2(luma.s23, 0, dst_y.ptr + luma_output_stride_y);

    // Scalar uchar operands are promoted to int, so the 4-sample sums cannot overflow.
    const uchar2 uv = (uchar2)(((cb.s0 + cb.s1 + cb.s2 + cb.s3) / 4),
                               ((cr.s0 + cr.s1 + cr.s2 + cr.s3) / 4));
    vstore2(uv, 0, dst_uv.ptr);
}
794
795/** Convert a RGBA image to IYUV using BT709 color space
796 *
797 * Global Workgroup Size [ DIV_CEIL(width, 2), height ]
798 * No offset.
799 *
800 * @param[in] rgba_input_ptr Pointer to the source image. Supported Format: U8
801 * @param[in] rgba_input_stride_x Stride of the source image in X dimension (in bytes)
802 * @param[in] rgba_input_step_x rgba_input_stride_x * number of elements along X processed per workitem(in bytes)
803 * @param[in] rgba_input_stride_y Stride of the source image in Y dimension (in bytes)
804 * @param[in] rgba_input_step_y rgba_input_stride_y * number of elements along Y processed per workitem(in bytes)
805 * @param[in] rgba_input_offset_first_element_in_bytes The offset of the first element in the source image
806 * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
807 * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
808 * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
809 * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
810 * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
811 * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
812 * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
813 * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
814 * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
815 * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
816 * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
817 * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
818 * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
819 * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
820 * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
821 * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
822 * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
823 * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
824 *
825 */
__kernel void RGBA8888_to_IYUV_bt709(
    IMAGE_DECLARATION(rgba_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(u_output),
    IMAGE_DECLARATION(v_output))
{
    // Each work-item converts a 2x2 quad of RGBA pixels (2 pixels on each of two
    // consecutive rows) into 4 luma samples and a single averaged U and V sample.
    Image src   = CONVERT_TO_IMAGE_STRUCT(rgba_input);
    Image dst_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_u = CONVERT_TO_IMAGE_STRUCT(u_output);
    Image dst_v = CONVERT_TO_IMAGE_STRUCT(v_output);

    // Two 4-byte pixels per row; the alpha bytes (s3, s7) are never used.
    const uchar8 row0 = vload8(0, src.ptr);
    const uchar8 row1 = vload8(0, src.ptr + rgba_input_stride_y);

    // Lanes 0-1 hold the first-row pixels, lanes 2-3 the second-row pixels.
    const float4 r = convert_float4((uchar4)(row0.s04, row1.s04));
    const float4 g = convert_float4((uchar4)(row0.s15, row1.s15));
    const float4 b = convert_float4((uchar4)(row0.s26, row1.s26));

    // RGB -> YUV weighting (coefficients as used by the other *_bt709 kernels in this file)
    const float4 y_f = 0.2126f * r + 0.7152f * g + 0.0722f * b;
    const float4 u_f = -0.1146f * r - 0.3854f * g + 0.5000f * b;
    const float4 v_f = 0.5000f * r - 0.4542f * g - 0.0458f * b;

    const short4 zero = (short4)(0);
    const short4 full = (short4)(255);
    const short4 bias = (short4)(128);

    // Truncate towards zero, re-centre chroma on 128 and clamp everything to [0, 255].
    const uchar4 y_q = convert_uchar4(clamp(convert_short4_rtz(y_f), zero, full));
    const uchar4 u_q = convert_uchar4(clamp(convert_short4_rtz(u_f) + bias, zero, full));
    const uchar4 v_q = convert_uchar4(clamp(convert_short4_rtz(v_f) + bias, zero, full));

    vstore2(y_q.s01, 0, dst_y.ptr);
    vstore2(y_q.s23, 0, dst_y.ptr + luma_output_stride_y);

    // Scalar uchar operands are promoted to int, so the 4-sample sums cannot overflow.
    *dst_u.ptr = (uchar)((u_q.s0 + u_q.s1 + u_q.s2 + u_q.s3) / 4);
    *dst_v.ptr = (uchar)((v_q.s0 + v_q.s1 + v_q.s2 + v_q.s3) / 4);
}
882
/** Convert an NV12 image to RGBA8888
884 *
885 * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
886 * No offset.
887 *
888 * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
889 * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
890 * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
891 * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
892 * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
893 * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
894 * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
895 * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
896 * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
897 * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
898 * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
899 * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
900 * @param[out] rgb_output_ptr Pointer to the destination image. Supported Format: U8
901 * @param[in] rgb_output_stride_x Stride of the destination image in X dimension (in bytes)
902 * @param[in] rgb_output_step_x rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
903 * @param[in] rgb_output_stride_y Stride of the destination image in Y dimension (in bytes)
904 * @param[in] rgb_output_step_y rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
905 * @param[in] rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
906 */
__kernel void NV12_to_RGBA8888_bt709(
    IMAGE_DECLARATION(luma_input),
    IMAGE_DECLARATION(uv_input),
    IMAGE_DECLARATION(rgb_output))
{
    // Each work-item converts a 4x2 block: 4 luma samples on each of two consecutive
    // rows share 2 interleaved chroma pairs, producing 8 RGBA pixels with alpha 255.
    Image in_luma = CONVERT_TO_IMAGE_STRUCT(luma_input);
    Image in_uv   = CONVERT_TO_IMAGE_STRUCT(uv_input);
    Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);

    uchar4 luma_0 = vload4(0, in_luma.ptr);
    uchar4 luma_1 = vload4(0, in_luma.ptr + luma_input_stride_y);
    uchar4 cbcr   = vload4(0, in_uv.ptr);

    // Even bytes are treated as Cb, odd bytes as Cr; each chroma sample covers two
    // horizontally adjacent pixels. The 128 bias is removed in float so the result
    // does not depend on signed 8-bit wrap-around.
    float4 cb = convert_float4(cbcr.s0022) - (float4)(128.0f);
    float4 cr = convert_float4(cbcr.s1133) - (float4)(128.0f);

    // Chroma contributions to R, G and B; shared by both rows.
    float4 temp0 = (float4)(1.5748f) * cr;
    float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * cb - (float4)(0.4681f) * cr;
    float4 temp2 = (float4)(1.8556f) * cb;

    // First row.
    // Y + chroma can leave [0, 255]; a float -> uchar conversion without _sat is
    // implementation-defined for such values, so saturate explicitly.
    uchar4 r_0 = convert_uchar4_sat_rtz(convert_float4(luma_0) + temp0);
    uchar4 g_0 = convert_uchar4_sat_rtz(convert_float4(luma_0) + temp1);
    uchar4 b_0 = convert_uchar4_sat_rtz(convert_float4(luma_0) + temp2);

    uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
    uchar8 rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
    vstore8(rgb_0, 0, out_rgb.ptr);
    vstore8(rgb_1, 0, out_rgb.ptr + 8);

    // Second row (same chroma, different luma).
    r_0 = convert_uchar4_sat_rtz(convert_float4(luma_1) + temp0);
    g_0 = convert_uchar4_sat_rtz(convert_float4(luma_1) + temp1);
    b_0 = convert_uchar4_sat_rtz(convert_float4(luma_1) + temp2);

    rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
    rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
    vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y);
    vstore8(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8);
}
952
953/** Convert an NV12 image to IYUV
954 *
955 * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
956 * No offset.
957 *
958 * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
959 * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
960 * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
961 * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
962 * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
963 * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
964 * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
965 * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
966 * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
967 * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
968 * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
969 * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
970 * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
971 * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
972 * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
973 * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
974 * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
975 * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
976 * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
977 * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
978 * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
979 * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
980 * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
981 * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
982 * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
983 * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
984 * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
985 * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
986 * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
987 * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
988 */
__kernel void NV12_to_IYUV_bt709(
    IMAGE_DECLARATION(luma_input),
    IMAGE_DECLARATION(uv_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(u_output),
    IMAGE_DECLARATION(v_output))
{
    // Each work-item copies 16 luma samples from each of two consecutive rows and
    // splits 8 interleaved chroma pairs into the separate U and V outputs.
    Image src_y  = CONVERT_TO_IMAGE_STRUCT(luma_input);
    Image src_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
    Image dst_y  = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_u  = CONVERT_TO_IMAGE_STRUCT(u_output);
    Image dst_v  = CONVERT_TO_IMAGE_STRUCT(v_output);

    const uchar16 y_row0 = vload16(0, src_y.ptr);
    const uchar16 y_row1 = vload16(0, src_y.ptr + luma_input_stride_y);
    const uchar16 uv     = vload16(0, src_uv.ptr);

    // Luma is passed through unchanged.
    vstore16(y_row0, 0, dst_y.ptr);
    vstore16(y_row1, 0, dst_y.ptr + luma_output_stride_y);

    // Even bytes go to the U output, odd bytes to the V output.
    vstore8(uv.even, 0, dst_u.ptr);
    vstore8(uv.odd, 0, dst_v.ptr);
}
1014
1015/** Convert an NV12 image to YUV444
1016 *
1017 * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
1018 * No offset.
1019 *
1020 * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
1021 * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
1022 * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
1023 * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
1024 * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
1025 * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
1026 * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
1027 * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
1028 * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
1029 * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
1030 * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
1031 * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
1032 * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
1033 * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
1034 * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
1035 * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
1036 * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
1037 * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
1038 * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
1039 * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
1040 * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
1041 * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
1042 * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
1043 * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
1044 * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
1045 * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
1046 * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
1047 * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
1048 * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
1049 * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
1050 */
__kernel void NV12_to_YUV444_bt709(
    IMAGE_DECLARATION(luma_input),
    IMAGE_DECLARATION(uv_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(u_output),
    IMAGE_DECLARATION(v_output))
{
    // Each work-item copies 16 luma samples from each of two consecutive rows and
    // expands 8 interleaved chroma pairs to 16 U and 16 V samples on both rows.
    Image src_y  = CONVERT_TO_IMAGE_STRUCT(luma_input);
    Image src_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
    Image dst_y  = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_u  = CONVERT_TO_IMAGE_STRUCT(u_output);
    Image dst_v  = CONVERT_TO_IMAGE_STRUCT(v_output);

    const uchar16 y_row0 = vload16(0, src_y.ptr);
    const uchar16 y_row1 = vload16(0, src_y.ptr + luma_input_stride_y);
    const uchar16 uv     = vload16(0, src_uv.ptr);

    // Duplicate every chroma sample horizontally: even bytes form U, odd bytes form V.
    const uchar16 u_wide = uv.s0022446688aaccee;
    const uchar16 v_wide = uv.s1133557799bbddff;

    // Luma is passed through unchanged.
    vstore16(y_row0, 0, dst_y.ptr);
    vstore16(y_row1, 0, dst_y.ptr + luma_output_stride_y);

    // The same chroma row is written to both output rows (vertical duplication).
    vstore16(u_wide, 0, dst_u.ptr);
    vstore16(u_wide, 0, dst_u.ptr + u_output_stride_y);
    vstore16(v_wide, 0, dst_v.ptr);
    vstore16(v_wide, 0, dst_v.ptr + v_output_stride_y);
}
1080
1081/** Convert an NV21 image to RGB888
1082 *
1083 * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
1084 * No offset.
1085 *
1086 * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
1087 * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
1088 * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
1089 * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
1090 * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
1091 * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
1092 * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
1093 * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
1094 * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
1095 * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
1096 * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
1097 * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
1098 * @param[out] rgb_output_ptr Pointer to the destination image. Supported Format: U8
1099 * @param[in] rgb_output_stride_x Stride of the destination image in X dimension (in bytes)
1100 * @param[in] rgb_output_step_x rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
1101 * @param[in] rgb_output_stride_y Stride of the destination image in Y dimension (in bytes)
 * @param[in] rgb_output_step_y rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
1103 * @param[in] rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
1104 */
__kernel void NV21_to_RGB888_bt709(
    IMAGE_DECLARATION(luma_input),
    IMAGE_DECLARATION(uv_input),
    IMAGE_DECLARATION(rgb_output))
{
    Image in_y    = CONVERT_TO_IMAGE_STRUCT(luma_input);
    Image in_uv   = CONVERT_TO_IMAGE_STRUCT(uv_input);
    Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);

    // One work-item converts 8 pixels: 4 from the current luma row and 4 from the row below.
    // Both rows share the same two chroma pairs.
    const uchar4 luma_0 = vload4(0, in_y.ptr);
    const uchar4 luma_1 = vload4(0, in_y.ptr + luma_input_stride_y);
    const uchar4 crcb   = vload4(0, in_uv.ptr);

    // Even bytes are used as Cr and odd bytes as Cb; each sample covers two horizontal pixels.
    // Centring around 128 is done in float so the result does not depend on signed-char wrap-around.
    const float4 cr = convert_float4(crcb.s0022) - (float4)(128.0f);
    const float4 cb = convert_float4(crcb.s1133) - (float4)(128.0f);

    // Chroma contribution to each colour channel (BT.709 coefficients); reused for both rows.
    const float4 r_offset = (float4)(1.5748f) * cr;
    const float4 g_offset = (float4)(-0.1873f) * cb - (float4)(0.4681f) * cr;
    const float4 b_offset = (float4)(1.8556f) * cb;

    // First row. The sums can fall outside [0, 255], so the conversion must saturate:
    // a non-saturating out-of-range float-to-integer conversion is implementation-defined.
    float4 luma_f = convert_float4(luma_0);
    uchar4 r      = convert_uchar4_sat_rtz(luma_f + r_offset);
    uchar4 g      = convert_uchar4_sat_rtz(luma_f + g_offset);
    uchar4 b      = convert_uchar4_sat_rtz(luma_f + b_offset);

    // 4 RGB pixels are 12 bytes, written as an 8-byte store followed by a 4-byte store.
    vstore8((uchar8)(r.s0, g.s0, b.s0, r.s1, g.s1, b.s1, r.s2, g.s2), 0, out_rgb.ptr);
    vstore4((uchar4)(b.s2, r.s3, g.s3, b.s3), 0, out_rgb.ptr + 8);

    // Second row: same chroma offsets, luma from the row below.
    luma_f = convert_float4(luma_1);
    r      = convert_uchar4_sat_rtz(luma_f + r_offset);
    g      = convert_uchar4_sat_rtz(luma_f + g_offset);
    b      = convert_uchar4_sat_rtz(luma_f + b_offset);

    vstore8((uchar8)(r.s0, g.s0, b.s0, r.s1, g.s1, b.s1, r.s2, g.s2), 0, out_rgb.ptr + rgb_output_stride_y);
    vstore4((uchar4)(b.s2, r.s3, g.s3, b.s3), 0, out_rgb.ptr + rgb_output_stride_y + 8);
}
1151
/** Convert an NV21 image to RGBA8888
1153 *
1154 * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
1155 * No offset.
1156 *
1157 * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
1158 * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
1159 * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
1160 * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
1161 * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
1162 * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
1163 * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
1164 * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
1165 * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
1166 * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
1167 * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
1168 * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
1169 * @param[out] rgba_output_ptr Pointer to the destination image. Supported Format: U8
1170 * @param[in] rgba_output_stride_x Stride of the destination image in X dimension (in bytes)
1171 * @param[in] rgba_output_step_x rgba_output_stride_x * number of elements along X processed per workitem(in bytes)
1172 * @param[in] rgba_output_stride_y Stride of the destination image in Y dimension (in bytes)
1173 * @param[in] rgba_output_step_y rgba_output_stride_y * number of elements along Y processed per workitem(in bytes)
1174 * @param[in] rgba_output_offset_first_element_in_bytes The offset of the first element in the destination image
1175 */
__kernel void NV21_to_RGBA8888_bt709(
    IMAGE_DECLARATION(luma_input),
    IMAGE_DECLARATION(uv_input),
    IMAGE_DECLARATION(rgba_output))
{
    Image in_luma = CONVERT_TO_IMAGE_STRUCT(luma_input);
    Image in_uv   = CONVERT_TO_IMAGE_STRUCT(uv_input);
    Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgba_output);

    // One work-item converts 8 pixels: 4 from the current luma row and 4 from the row below.
    // Both rows share the same two chroma pairs.
    const uchar4 luma_0 = vload4(0, in_luma.ptr);
    const uchar4 luma_1 = vload4(0, in_luma.ptr + luma_input_stride_y);
    const uchar4 crcb   = vload4(0, in_uv.ptr);

    // Even bytes are used as Cr and odd bytes as Cb; each sample covers two horizontal pixels.
    // Centring around 128 is done in float so the result does not depend on signed-char wrap-around.
    const float4 cr = convert_float4(crcb.s0022) - (float4)(128.0f);
    const float4 cb = convert_float4(crcb.s1133) - (float4)(128.0f);

    // Chroma contribution to each colour channel (BT.709 coefficients); reused for both rows.
    const float4 r_offset = (float4)(1.5748f) * cr;
    const float4 g_offset = (float4)(-0.1873f) * cb - (float4)(0.4681f) * cr;
    const float4 b_offset = (float4)(1.8556f) * cb;

    // First row. The sums can fall outside [0, 255], so the conversion must saturate:
    // a non-saturating out-of-range float-to-integer conversion is implementation-defined.
    float4 luma_f = convert_float4(luma_0);
    uchar4 r      = convert_uchar4_sat_rtz(luma_f + r_offset);
    uchar4 g      = convert_uchar4_sat_rtz(luma_f + g_offset);
    uchar4 b      = convert_uchar4_sat_rtz(luma_f + b_offset);

    // 4 RGBA pixels are 16 bytes, written as two 8-byte stores; alpha is always 255.
    vstore8((uchar8)(r.s0, g.s0, b.s0, 255, r.s1, g.s1, b.s1, 255), 0, out_rgb.ptr);
    vstore8((uchar8)(r.s2, g.s2, b.s2, 255, r.s3, g.s3, b.s3, 255), 0, out_rgb.ptr + 8);

    // Second row: same chroma offsets, luma from the row below.
    luma_f = convert_float4(luma_1);
    r      = convert_uchar4_sat_rtz(luma_f + r_offset);
    g      = convert_uchar4_sat_rtz(luma_f + g_offset);
    b      = convert_uchar4_sat_rtz(luma_f + b_offset);

    vstore8((uchar8)(r.s0, g.s0, b.s0, 255, r.s1, g.s1, b.s1, 255), 0, out_rgb.ptr + rgba_output_stride_y);
    vstore8((uchar8)(r.s2, g.s2, b.s2, 255, r.s3, g.s3, b.s3, 255), 0, out_rgb.ptr + rgba_output_stride_y + 8);
}
1222
1223/** Convert an NV21 image to YUV444
1224 *
1225 * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
1226 * No offset.
1227 *
1228 * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
1229 * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
1230 * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
1231 * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
1232 * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
1233 * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
1234 * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
1235 * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
1236 * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
1237 * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
1238 * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
1239 * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
1240 * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
1241 * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
1242 * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
1243 * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
1244 * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
1245 * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
1246 * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
1247 * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
1248 * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
1249 * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
1250 * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
1251 * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
1252 * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
1253 * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
1254 * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
1255 * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
1256 * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
1257 * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
1258 */
__kernel void NV21_to_YUV444_bt709(
    IMAGE_DECLARATION(luma_input),
    IMAGE_DECLARATION(uv_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(u_output),
    IMAGE_DECLARATION(v_output))
{
    Image src_luma = CONVERT_TO_IMAGE_STRUCT(luma_input);
    Image src_vu   = CONVERT_TO_IMAGE_STRUCT(uv_input);
    Image dst_luma = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_u    = CONVERT_TO_IMAGE_STRUCT(u_output);
    Image dst_v    = CONVERT_TO_IMAGE_STRUCT(v_output);

    // One work-item handles 32 pixels: 16 from the current luma row and 16 from the row below.
    const uchar16 luma_row0 = vload16(0, src_luma.ptr);
    const uchar16 luma_row1 = vload16(0, src_luma.ptr + luma_input_stride_y);

    // 16 interleaved chroma bytes: even bytes go to the V plane, odd bytes to the U plane.
    const uchar16 vu = vload16(0, src_vu.ptr);

    // Repeat every chroma sample twice horizontally so each plane gets 16 values per row.
    const uchar16 v_wide = vu.s0022446688aaccee;
    const uchar16 u_wide = vu.s1133557799bbddff;

    // Luma is copied through unchanged.
    vstore16(luma_row0, 0, dst_luma.ptr);
    vstore16(luma_row1, 0, dst_luma.ptr + luma_output_stride_y);

    // The same widened chroma row is written to both output rows.
    vstore16(u_wide, 0, dst_u.ptr);
    vstore16(u_wide, 0, dst_u.ptr + u_output_stride_y);
    vstore16(v_wide, 0, dst_v.ptr);
    vstore16(v_wide, 0, dst_v.ptr + v_output_stride_y);
}
1288
1289/** Convert an NV21 image to IYUV
1290 *
1291 * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
1292 * No offset.
1293 *
1294 * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
1295 * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
1296 * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
1297 * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
1298 * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
1299 * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
1300 * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
1301 * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
1302 * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
1303 * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
1304 * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
1305 * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
1306 * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
1307 * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
1308 * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
1309 * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
1310 * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
1311 * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
1312 * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
1313 * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
1314 * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
1315 * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
1316 * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
1317 * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
1318 * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
1319 * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
1320 * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
1321 * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
1322 * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
1323 * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
1324 */
__kernel void NV21_to_IYUV_bt709(
    IMAGE_DECLARATION(luma_input),
    IMAGE_DECLARATION(uv_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(u_output),
    IMAGE_DECLARATION(v_output))
{
    Image src_luma = CONVERT_TO_IMAGE_STRUCT(luma_input);
    Image src_vu   = CONVERT_TO_IMAGE_STRUCT(uv_input);
    Image dst_luma = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_u    = CONVERT_TO_IMAGE_STRUCT(u_output);
    Image dst_v    = CONVERT_TO_IMAGE_STRUCT(v_output);

    // One work-item handles two luma rows of 16 pixels and one row of 8 chroma pairs.
    const uchar16 luma_row0 = vload16(0, src_luma.ptr);
    const uchar16 luma_row1 = vload16(0, src_luma.ptr + luma_input_stride_y);
    const uchar16 vu        = vload16(0, src_vu.ptr);

    // Luma is copied through unchanged.
    vstore16(luma_row0, 0, dst_luma.ptr);
    vstore16(luma_row1, 0, dst_luma.ptr + luma_output_stride_y);

    // De-interleave the chroma bytes: odd bytes go to the U plane, even bytes to the V plane.
    vstore8(vu.odd, 0, dst_u.ptr);
    vstore8(vu.even, 0, dst_v.ptr);
}
1349
1350/** Convert a UYVY image to IYUV using BT709 color space
1351 *
1352 * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
1353 * No offset.
1354 *
1355 * @param[in] uyvy_input_ptr Pointer to the source image. Supported Format: U8
1356 * @param[in] uyvy_input_stride_x Stride of the source image in X dimension (in bytes)
1357 * @param[in] uyvy_input_step_x uyvy_input_stride_x * number of elements along X processed per workitem(in bytes)
1358 * @param[in] uyvy_input_stride_y Stride of the source image in Y dimension (in bytes)
1359 * @param[in] uyvy_input_step_y uyvy_input_stride_y * number of elements along Y processed per workitem(in bytes)
1360 * @param[in] uyvy_input_offset_first_element_in_bytes The offset of the first element in the source image
1361 * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
1362 * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
1363 * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
1364 * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
1365 * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
1366 * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
1367 * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
1368 * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
1369 * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
1370 * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
1371 * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
1372 * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
1373 * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
1374 * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
1375 * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
1376 * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
1377 * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
1378 * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
1379 *
1380 */
__kernel void UYVY422_to_IYUV_bt709(
    IMAGE_DECLARATION(uyvy_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(u_output),
    IMAGE_DECLARATION(v_output))
{
    Image src      = CONVERT_TO_IMAGE_STRUCT(uyvy_input);
    Image dst_luma = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_u    = CONVERT_TO_IMAGE_STRUCT(u_output);
    Image dst_v    = CONVERT_TO_IMAGE_STRUCT(v_output);

    // One work-item handles 16 pixels: 8 from the current row and 8 from the row below.
    // Each 16-byte load is laid out as U Y V Y ..., so luma sits in the odd bytes.
    const uchar16 row0 = vload16(0, src.ptr);
    vstore8(row0.odd, 0, dst_luma.ptr);

    const uchar16 row1 = vload16(0, src.ptr + uyvy_input_stride_y);
    vstore8(row1.odd, 0, dst_luma.ptr + luma_output_stride_y);

    // Average the chroma of the two rows (U at bytes 0,4,8,12 and V at bytes 2,6,10,14).
    // hadd computes (a + b) >> 1 without intermediate overflow, i.e. the truncating mean.
    vstore4(hadd(row0.s048c, row1.s048c), 0, dst_u.ptr);
    vstore4(hadd(row0.s26ae, row1.s26ae), 0, dst_v.ptr);
}
1410
1411/** Convert a YUYV image to IYUV using BT709 color space
1412 *
1413 * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
1414 * No offset.
1415 *
1416 * @param[in] yuyv_input_ptr Pointer to the source image. Supported Format: U8
1417 * @param[in] yuyv_input_stride_x Stride of the source image in X dimension (in bytes)
1418 * @param[in] yuyv_input_step_x yuyv_input_stride_x * number of elements along X processed per workitem(in bytes)
1419 * @param[in] yuyv_input_stride_y Stride of the source image in Y dimension (in bytes)
1420 * @param[in] yuyv_input_step_y yuyv_input_stride_y * number of elements along Y processed per workitem(in bytes)
1421 * @param[in] yuyv_input_offset_first_element_in_bytes The offset of the first element in the source image
1422 * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
1423 * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
1424 * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
1425 * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
1426 * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
1427 * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
1428 * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
1429 * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
1430 * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
1431 * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
1432 * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
1433 * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
1434 * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
1435 * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
1436 * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
1437 * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
1438 * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
1439 * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
1440 *
1441 */
__kernel void YUYV422_to_IYUV_bt709(
    IMAGE_DECLARATION(yuyv_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(u_output),
    IMAGE_DECLARATION(v_output))
{
    Image src      = CONVERT_TO_IMAGE_STRUCT(yuyv_input);
    Image dst_luma = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_u    = CONVERT_TO_IMAGE_STRUCT(u_output);
    Image dst_v    = CONVERT_TO_IMAGE_STRUCT(v_output);

    // One work-item handles 16 pixels: 8 from the current row and 8 from the row below.
    // Each 16-byte load is laid out as Y U Y V ..., so luma sits in the even bytes.
    const uchar16 row0 = vload16(0, src.ptr);
    vstore8(row0.even, 0, dst_luma.ptr);

    const uchar16 row1 = vload16(0, src.ptr + yuyv_input_stride_y);
    vstore8(row1.even, 0, dst_luma.ptr + luma_output_stride_y);

    // Average the chroma of the two rows (U at bytes 1,5,9,13 and V at bytes 3,7,11,15).
    // hadd computes (a + b) >> 1 without intermediate overflow, i.e. the truncating mean.
    vstore4(hadd(row0.s159d, row1.s159d), 0, dst_u.ptr);
    vstore4(hadd(row0.s37bf, row1.s37bf), 0, dst_v.ptr);
}
1471
1472/** Convert an IYUV image to RGB888
1473 *
1474 * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
1475 * No offset.
1476 *
1477 * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
1478 * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
1479 * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
1480 * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
1481 * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
1482 * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
1483 * @param[in] u_input_ptr Pointer to the source U channel. Supported Format: U8
1484 * @param[in] u_input_stride_x Stride of the source image U channel in X dimension (in bytes)
1485 * @param[in] u_input_step_x u_input_stride_x * number of elements along X processed per workitem(in bytes)
1486 * @param[in] u_input_stride_y Stride of the source image in Y dimension (in bytes)
1487 * @param[in] u_input_step_y u_input_stride_y * number of elements along Y processed per workitem(in bytes)
1488 * @param[in] u_input_offset_first_element_in_bytes The offset of the first element in the source U channel
1489 * @param[in] v_input_ptr Pointer to the source V channel. Supported Format: U8
1490 * @param[in] v_input_stride_x Stride of the source image V channel in X dimension (in bytes)
1491 * @param[in] v_input_step_x v_input_stride_x * number of elements along X processed per workitem(in bytes)
1492 * @param[in] v_input_stride_y Stride of the source image V channel in Y dimension (in bytes)
1493 * @param[in] v_input_step_y v_input_stride_y * number of elements along Y processed per workitem(in bytes)
1494 * @param[in] v_input_offset_first_element_in_bytes The offset of the first element in the source image V channel
1495 * @param[out] rgb_output_ptr Pointer to the destination image. Supported Format: U8
1496 * @param[in] rgb_output_stride_x Stride of the destination image in X dimension (in bytes)
1497 * @param[in] rgb_output_step_x rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
1498 * @param[in] rgb_output_stride_y Stride of the destination image in Y dimension (in bytes)
1499 * @param[in] rgb_output_step_y rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
1500 * @param[in] rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
1501 */
__kernel void IYUV_to_RGB888_bt709(
    IMAGE_DECLARATION(luma_input),
    IMAGE_DECLARATION(u_input),
    IMAGE_DECLARATION(v_input),
    IMAGE_DECLARATION(rgb_output))
{
    Image in_y    = CONVERT_TO_IMAGE_STRUCT(luma_input);
    Image in_u    = CONVERT_TO_IMAGE_STRUCT(u_input);
    Image in_v    = CONVERT_TO_IMAGE_STRUCT(v_input);
    Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);

    // One work-item converts 8 pixels: 4 from the current luma row and 4 from the row below.
    // Both rows share the same two U and two V samples.
    const uchar4 luma_0 = vload4(0, in_y.ptr);
    const uchar4 luma_1 = vload4(0, in_y.ptr + luma_input_stride_y);
    const uchar2 u      = vload2(0, in_u.ptr);
    const uchar2 v      = vload2(0, in_v.ptr);

    // Each chroma sample covers two horizontal pixels.
    // Centring around 128 is done in float so the result does not depend on signed-char wrap-around.
    const float4 cb = convert_float4(u.s0011) - (float4)(128.0f);
    const float4 cr = convert_float4(v.s0011) - (float4)(128.0f);

    // Chroma contribution to each colour channel (BT.709 coefficients); reused for both rows.
    const float4 r_offset = (float4)(1.5748f) * cr;
    const float4 g_offset = (float4)(-0.1873f) * cb - (float4)(0.4681f) * cr;
    const float4 b_offset = (float4)(1.8556f) * cb;

    // First row. The sums can fall outside [0, 255], so the conversion must saturate:
    // a non-saturating out-of-range float-to-integer conversion is implementation-defined.
    float4 luma_f = convert_float4(luma_0);
    uchar4 r      = convert_uchar4_sat_rtz(luma_f + r_offset);
    uchar4 g      = convert_uchar4_sat_rtz(luma_f + g_offset);
    uchar4 b      = convert_uchar4_sat_rtz(luma_f + b_offset);

    // 4 RGB pixels are 12 bytes, written as an 8-byte store followed by a 4-byte store.
    vstore8((uchar8)(r.s0, g.s0, b.s0, r.s1, g.s1, b.s1, r.s2, g.s2), 0, out_rgb.ptr);
    vstore4((uchar4)(b.s2, r.s3, g.s3, b.s3), 0, out_rgb.ptr + 8);

    // Second row: same chroma offsets, luma from the row below.
    luma_f = convert_float4(luma_1);
    r      = convert_uchar4_sat_rtz(luma_f + r_offset);
    g      = convert_uchar4_sat_rtz(luma_f + g_offset);
    b      = convert_uchar4_sat_rtz(luma_f + b_offset);

    vstore8((uchar8)(r.s0, g.s0, b.s0, r.s1, g.s1, b.s1, r.s2, g.s2), 0, out_rgb.ptr + rgb_output_stride_y);
    vstore4((uchar4)(b.s2, r.s3, g.s3, b.s3), 0, out_rgb.ptr + rgb_output_stride_y + 8);
}
1550
/** Convert an IYUV image to RGBA8888
1552 *
1553 * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
1554 * No offset.
1555 *
1556 * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
1557 * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
1558 * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
1559 * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
1560 * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
1561 * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
1562 * @param[in] u_input_ptr Pointer to the source U channel. Supported Format: U8
1563 * @param[in] u_input_stride_x Stride of the source image U channel in X dimension (in bytes)
1564 * @param[in] u_input_step_x u_input_stride_x * number of elements along X processed per workitem(in bytes)
1565 * @param[in] u_input_stride_y Stride of the source image in Y dimension (in bytes)
1566 * @param[in] u_input_step_y u_input_stride_y * number of elements along Y processed per workitem(in bytes)
1567 * @param[in] u_input_offset_first_element_in_bytes The offset of the first element in the source U channel
1568 * @param[in] v_input_ptr Pointer to the source V channel. Supported Format: U8
1569 * @param[in] v_input_stride_x Stride of the source image V channel in X dimension (in bytes)
1570 * @param[in] v_input_step_x v_input_stride_x * number of elements along X processed per workitem(in bytes)
1571 * @param[in] v_input_stride_y Stride of the source image V channel in Y dimension (in bytes)
1572 * @param[in] v_input_step_y v_input_stride_y * number of elements along Y processed per workitem(in bytes)
1573 * @param[in] v_input_offset_first_element_in_bytes The offset of the first element in the source image V channel
1574 * @param[out] rgba_output_ptr Pointer to the destination image. Supported Format: U8
1575 * @param[in] rgba_output_stride_x Stride of the destination image in X dimension (in bytes)
1576 * @param[in] rgba_output_step_x rgba_output_stride_x * number of elements along X processed per workitem(in bytes)
1577 * @param[in] rgba_output_stride_y Stride of the destination image in Y dimension (in bytes)
1578 * @param[in] rgba_output_step_y rgba_output_stride_y * number of elements along Y processed per workitem(in bytes)
1579 * @param[in] rgba_output_offset_first_element_in_bytes The offset of the first element in the destination image
1580 */
1581__kernel void IYUV_to_RGBA8888_bt709(
1582 IMAGE_DECLARATION(luma_input),
1583 IMAGE_DECLARATION(u_input),
1584 IMAGE_DECLARATION(v_input),
1585 IMAGE_DECLARATION(rgba_output))
1586{
1587 Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
1588 Image in_u = CONVERT_TO_IMAGE_STRUCT(u_input);
1589 Image in_v = CONVERT_TO_IMAGE_STRUCT(v_input);
1590 Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgba_output);
1591
1592 // handle 8 pixels every time, two lines, each line for 4 pixels
1593 uchar4 luma_0 = vload4(0, in_y.ptr);
1594 uchar4 luma_1 = vload4(0, in_y.ptr + luma_input_stride_y);
1595 uchar4 cbcr = (uchar4)(vload2(0, in_u.ptr), vload2(0, in_v.ptr));
1596 char4 cb = (char4)(cbcr.s0, cbcr.s0, cbcr.s1, cbcr.s1) - (char4)(128);
1597 char4 cr = (char4)(cbcr.s2, cbcr.s2, cbcr.s3, cbcr.s3) - (char4)(128);
1598
1599 float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
1600 float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
1601 float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
1602
1603 float4 f_r = convert_float4(luma_0) + temp0;
1604 float4 f_g = convert_float4(luma_0) + temp1;
1605 float4 f_b = convert_float4(luma_0) + temp2;
1606
1607 uchar4 r_0 = convert_uchar4_rtz(f_r);
1608 uchar4 g_0 = convert_uchar4_rtz(f_g);
1609 uchar4 b_0 = convert_uchar4_rtz(f_b);
1610
1611 uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
1612 uchar8 rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
1613 vstore8(rgb_0, 0, out_rgb.ptr);
1614 vstore8(rgb_1, 0, out_rgb.ptr + 8);
1615
1616 f_r = convert_float4(luma_1) + temp0;
1617 f_g = convert_float4(luma_1) + temp1;
1618 f_b = convert_float4(luma_1) + temp2;
1619
1620 r_0 = convert_uchar4_rtz(f_r);
1621 g_0 = convert_uchar4_rtz(f_g);
1622 b_0 = convert_uchar4_rtz(f_b);
1623
1624 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
1625 rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
1626 vstore8(rgb_0, 0, out_rgb.ptr + rgba_output_stride_y);
1627 vstore8(rgb_1, 0, out_rgb.ptr + rgba_output_stride_y + 8);
1628}
1629
/** Convert an IYUV image to YUV444
 *
 * Each work-item copies 16 luma samples from each of two consecutive rows and expands
 * 8 U and 8 V samples to 16 by repeating every sample twice horizontally; the expanded
 * chroma row is written to two consecutive rows of the corresponding output plane.
 *
 * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
 * No offset.
 *
 * @param[in]  luma_input_ptr                            Pointer to the source luma channel. Supported Format: U8
 * @param[in]  luma_input_stride_x                       Stride of the luma image in X dimension (in bytes)
 * @param[in]  luma_input_step_x                         luma_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_input_stride_y                       Stride of the source luma channel in Y dimension (in bytes)
 * @param[in]  luma_input_step_y                         luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[in]  u_input_ptr                               Pointer to the source U channel. Supported Format: U8
 * @param[in]  u_input_stride_x                          Stride of the source image U channel in X dimension (in bytes)
 * @param[in]  u_input_step_x                            u_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  u_input_stride_y                          Stride of the source image U channel in Y dimension (in bytes)
 * @param[in]  u_input_step_y                            u_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  u_input_offset_first_element_in_bytes     The offset of the first element in the source U channel
 * @param[in]  v_input_ptr                               Pointer to the source V channel. Supported Format: U8
 * @param[in]  v_input_stride_x                          Stride of the source image V channel in X dimension (in bytes)
 * @param[in]  v_input_step_x                            v_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  v_input_stride_y                          Stride of the source image V channel in Y dimension (in bytes)
 * @param[in]  v_input_step_y                            v_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  v_input_offset_first_element_in_bytes     The offset of the first element in the source image V channel
 * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
 * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
 * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
 * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
 * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
 * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
 * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
 * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
 * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
 * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
 * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
 * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
 *
 */
__kernel void IYUV_to_YUV444_bt709(
    IMAGE_DECLARATION(luma_input),
    IMAGE_DECLARATION(u_input),
    IMAGE_DECLARATION(v_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(u_output),
    IMAGE_DECLARATION(v_output))
{
    Image src_luma = CONVERT_TO_IMAGE_STRUCT(luma_input);
    Image src_u    = CONVERT_TO_IMAGE_STRUCT(u_input);
    Image src_v    = CONVERT_TO_IMAGE_STRUCT(v_input);
    Image dst_luma = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_u    = CONVERT_TO_IMAGE_STRUCT(u_output);
    Image dst_v    = CONVERT_TO_IMAGE_STRUCT(v_output);

    // 32 pixels per work-item: 16 pixels on each of two rows
    const uchar16 luma_top    = vload16(0, src_luma.ptr);
    const uchar16 luma_bottom = vload16(0, src_luma.ptr + luma_input_stride_y);
    const uchar8  u_half      = vload8(0, src_u.ptr);
    const uchar8  v_half      = vload8(0, src_v.ptr);

    // Horizontal upsampling: repeat every chroma sample twice
    const uchar16 u_full = u_half.s0011223344556677;
    const uchar16 v_full = v_half.s0011223344556677;

    // Luma is passed through unchanged
    vstore16(luma_top, 0, dst_luma.ptr);
    vstore16(luma_bottom, 0, dst_luma.ptr + luma_output_stride_y);

    // Vertical upsampling: the same chroma row feeds both output rows
    vstore16(u_full, 0, dst_u.ptr);
    vstore16(u_full, 0, dst_u.ptr + u_output_stride_y);
    vstore16(v_full, 0, dst_v.ptr);
    vstore16(v_full, 0, dst_v.ptr + v_output_stride_y);
}
1705
/** Convert an IYUV image to NV12
 *
 * Each work-item copies 16 luma samples from each of two consecutive rows and merges
 * 8 U and 8 V samples into 16 interleaved bytes (U0 V0 U1 V1 ...) written to a single
 * row of the UV output plane.
 *
 * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
 * No offset.
 *
 * @param[in]  luma_input_ptr                            Pointer to the source luma channel. Supported Format: U8
 * @param[in]  luma_input_stride_x                       Stride of the luma image in X dimension (in bytes)
 * @param[in]  luma_input_step_x                         luma_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_input_stride_y                       Stride of the source luma channel in Y dimension (in bytes)
 * @param[in]  luma_input_step_y                         luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[in]  u_input_ptr                               Pointer to the source U channel. Supported Format: U8
 * @param[in]  u_input_stride_x                          Stride of the source image U channel in X dimension (in bytes)
 * @param[in]  u_input_step_x                            u_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  u_input_stride_y                          Stride of the source image U channel in Y dimension (in bytes)
 * @param[in]  u_input_step_y                            u_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  u_input_offset_first_element_in_bytes     The offset of the first element in the source U channel
 * @param[in]  v_input_ptr                               Pointer to the source V channel. Supported Format: U8
 * @param[in]  v_input_stride_x                          Stride of the source image V channel in X dimension (in bytes)
 * @param[in]  v_input_step_x                            v_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  v_input_stride_y                          Stride of the source image V channel in Y dimension (in bytes)
 * @param[in]  v_input_step_y                            v_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  v_input_offset_first_element_in_bytes     The offset of the first element in the source image V channel
 * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
 * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
 * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
 * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
 * @param[out] uv_output_ptr                             Pointer to the destination UV channel. Supported Format: U8
 * @param[in]  uv_output_stride_x                        Stride of the destination UV channel in X dimension (in bytes)
 * @param[in]  uv_output_step_x                          uv_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  uv_output_stride_y                        Stride of the destination image UV channel in Y dimension (in bytes)
 * @param[in]  uv_output_step_y                          uv_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  uv_output_offset_first_element_in_bytes   The offset of the first element in the destination UV channel
 *
 */
__kernel void IYUV_to_NV12_bt709(
    IMAGE_DECLARATION(luma_input),
    IMAGE_DECLARATION(u_input),
    IMAGE_DECLARATION(v_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(uv_output))
{
    Image src_luma = CONVERT_TO_IMAGE_STRUCT(luma_input);
    Image src_u    = CONVERT_TO_IMAGE_STRUCT(u_input);
    Image src_v    = CONVERT_TO_IMAGE_STRUCT(v_input);
    Image dst_luma = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_uv   = CONVERT_TO_IMAGE_STRUCT(uv_output);

    // 32 pixels per work-item: 16 pixels on each of two rows
    const uchar16 luma_top    = vload16(0, src_luma.ptr);
    const uchar16 luma_bottom = vload16(0, src_luma.ptr + luma_input_stride_y);

    // Lanes 0-7 hold U, lanes 8-f hold V; the swizzle alternates them as U0 V0 U1 V1 ...
    const uchar16 planar_uv = (uchar16)(vload8(0, src_u.ptr), vload8(0, src_v.ptr));
    const uchar16 packed_uv = planar_uv.s08192a3b4c5d6e7f;

    // Luma is passed through unchanged
    vstore16(luma_top, 0, dst_luma.ptr);
    vstore16(luma_bottom, 0, dst_luma.ptr + luma_output_stride_y);
    vstore16(packed_uv, 0, dst_uv.ptr);
}
1768
/** Convert a YUYV image to NV12 using BT709 color space
 *
 * Each work-item reads 16 bytes (8 YUYV pixels) from each of two consecutive rows.
 * The luma bytes (even positions) of each row are copied to the matching luma output
 * row; the chroma bytes (odd positions) of the two rows are averaged, rounding down,
 * and written as 8 interleaved bytes to one row of the UV output plane.
 *
 * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
 * No offset.
 *
 * @param[in]  yuyv_input_ptr                            Pointer to the source image. Supported Format: U8
 * @param[in]  yuyv_input_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  yuyv_input_step_x                         yuyv_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  yuyv_input_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  yuyv_input_step_y                         yuyv_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  yuyv_input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
 * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
 * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
 * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
 * @param[out] uv_output_ptr                             Pointer to the destination UV channel. Supported Format: U8
 * @param[in]  uv_output_stride_x                        Stride of the destination UV channel in X dimension (in bytes)
 * @param[in]  uv_output_step_x                          uv_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  uv_output_stride_y                        Stride of the destination image UV channel in Y dimension (in bytes)
 * @param[in]  uv_output_step_y                          uv_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  uv_output_offset_first_element_in_bytes   The offset of the first element in the destination UV channel
 *
 */
__kernel void YUYV422_to_NV12_bt709(
    IMAGE_DECLARATION(yuyv_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(uv_output))
{
    Image src    = CONVERT_TO_IMAGE_STRUCT(yuyv_input);
    Image dst_y  = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_uv = CONVERT_TO_IMAGE_STRUCT(uv_output);

    // 16 pixels per work-item: 8 pixels on each of two rows.
    // In YUYV the even bytes carry luma and the odd bytes carry chroma (U, V alternating).
    const uchar16 row_top = vload16(0, src.ptr);
    vstore8(row_top.even, 0, dst_y.ptr);

    const uchar16 row_bottom = vload16(0, src.ptr + yuyv_input_stride_y);
    vstore8(row_bottom.even, 0, dst_y.ptr + luma_output_stride_y);

    // hadd() yields (a + b) >> 1 without intermediate overflow, i.e. the truncated
    // mean of the two rows' chroma samples
    const uchar8 chroma_mean = hadd(row_top.odd, row_bottom.odd);
    vstore8(chroma_mean, 0, dst_uv.ptr);
}
1817
/** Convert a UYVY image to NV12 using BT709 color space
 *
 * Each work-item reads 16 bytes (8 UYVY pixels) from each of two consecutive rows.
 * The luma bytes (odd positions) of each row are copied to the matching luma output
 * row; the chroma bytes (even positions) of the two rows are averaged, rounding down,
 * and written as 8 interleaved bytes to one row of the UV output plane.
 *
 * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
 * NOTE(review): the kernel consumes 8 pixels per row per work-item, which suggests
 * DIV_CEIL(width, 8) - confirm against the host-side window configuration.
 * No offset.
 *
 * @param[in]  input_uyvy_ptr                           Pointer to the source image. Supported Format: U8
 * @param[in]  input_uyvy_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in]  input_uyvy_step_x                        input_uyvy_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_uyvy_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in]  input_uyvy_step_y                        input_uyvy_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_uyvy_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[out] luma_ptr                                 Pointer to the destination luma channel. Supported Format: U8
 * @param[in]  luma_stride_x                            Stride of the destination luma channel in X dimension (in bytes)
 * @param[in]  luma_step_x                              luma_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_stride_y                            Stride of the destination image luma channel in Y dimension (in bytes)
 * @param[in]  luma_step_y                              luma_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_offset_first_element_in_bytes       The offset of the first element in the destination image luma channel
 * @param[out] uv_ptr                                   Pointer to the destination uv channel. Supported Format: U8
 * @param[in]  uv_stride_x                              Stride of the destination uv channel in X dimension (in bytes)
 * @param[in]  uv_step_x                                uv_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  uv_stride_y                              Stride of the destination image uv channel in Y dimension (in bytes)
 * @param[in]  uv_step_y                                uv_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  uv_offset_first_element_in_bytes         The offset of the first element in the destination image uv channel
 *
 */
__kernel void UYVY422_to_NV12_bt709(
    IMAGE_DECLARATION(input_uyvy),
    IMAGE_DECLARATION(luma),
    IMAGE_DECLARATION(uv))
{
    Image src    = CONVERT_TO_IMAGE_STRUCT(input_uyvy);
    Image dst_y  = CONVERT_TO_IMAGE_STRUCT(luma);
    Image dst_uv = CONVERT_TO_IMAGE_STRUCT(uv);

    // 16 pixels per work-item: 8 pixels on each of two rows.
    // In UYVY the odd bytes carry luma and the even bytes carry chroma (U, V alternating).
    const uchar16 row_top = vload16(0, src.ptr);
    vstore8(row_top.odd, 0, dst_y.ptr);

    const uchar16 row_bottom = vload16(0, src.ptr + input_uyvy_stride_y);
    vstore8(row_bottom.odd, 0, dst_y.ptr + luma_stride_y);

    // hadd() yields (a + b) >> 1 without intermediate overflow, i.e. the truncated
    // mean of the two rows' chroma samples
    const uchar8 chroma_mean = hadd(row_top.even, row_bottom.even);
    vstore8(chroma_mean, 0, dst_uv.ptr);
}