blob: 01d8b90be95591151751c2155add37fe2de43be4 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
2 * Copyright (c) 2016, 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
25
/** Expand a packed RGB888 image to RGBX8888 by appending a constant 255 byte to every pixel
 *
 * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
 * No offset.
 *
 * @param[in]  input_ptr                            Pointer to the source image. Supported Format: U8
 * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] output_ptr                           Pointer to the destination image. Supported Format: U8
 * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void RGB888_to_RGBA8888_bt709(
    IMAGE_DECLARATION(input),
    IMAGE_DECLARATION(output))
{
    Image src = CONVERT_TO_IMAGE_STRUCT(input);
    Image dst = CONVERT_TO_IMAGE_STRUCT(output);

    // Value written into the fourth byte of every output pixel
    const uchar alpha = 255;

    // 16 pixels per work-item: 48 source bytes, fetched as three consecutive 16-byte vectors
    const uchar16 src_0 = vload16(0, src.ptr);
    const uchar16 src_1 = vload16(1, src.ptr);
    const uchar16 src_2 = vload16(2, src.ptr);

    // Re-pack 3-byte pixels into 4-byte pixels; pixels 5 and 10 straddle two source vectors
    const uchar16 dst_0 = (uchar16)(src_0.s012, alpha, src_0.s345, alpha, src_0.s678, alpha, src_0.s9ab, alpha);
    const uchar16 dst_1 = (uchar16)(src_0.scde, alpha, src_0.sf, src_1.s01, alpha, src_1.s234, alpha, src_1.s567, alpha);
    const uchar16 dst_2 = (uchar16)(src_1.s89a, alpha, src_1.sbcd, alpha, src_1.sef, src_2.s0, alpha, src_2.s123, alpha);
    const uchar16 dst_3 = (uchar16)(src_2.s456, alpha, src_2.s789, alpha, src_2.sabc, alpha, src_2.sdef, alpha);

    // 64 destination bytes, written as four consecutive 16-byte vectors
    vstore16(dst_0, 0, dst.ptr);
    vstore16(dst_1, 1, dst.ptr);
    vstore16(dst_2, 2, dst.ptr);
    vstore16(dst_3, 3, dst.ptr);
}
66
/** Convert an RGBA8888 image to RGB888 by dropping the fourth byte of every pixel
 *
 * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
 * No offset.
 *
 * @param[in]  input_ptr                            Pointer to the source image. Supported Format: U8
 * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] output_ptr                           Pointer to the destination image. Supported Format: U8
 * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void RGBA8888_to_RGB888_bt709(
    IMAGE_DECLARATION(input),
    IMAGE_DECLARATION(output))
{
    Image src = CONVERT_TO_IMAGE_STRUCT(input);
    Image dst = CONVERT_TO_IMAGE_STRUCT(output);

    // 16 pixels per work-item: 64 source bytes, fetched as four consecutive 16-byte vectors
    const uchar16 px_0 = vload16(0, src.ptr);
    const uchar16 px_1 = vload16(1, src.ptr);
    const uchar16 px_2 = vload16(2, src.ptr);
    const uchar16 px_3 = vload16(3, src.ptr);

    // Keep bytes 0-2 of each 4-byte pixel (components 3, 7, b and f are skipped)
    const uchar16 packed_0 = (uchar16)(px_0.s012, px_0.s456, px_0.s89a, px_0.scde, px_1.s012, px_1.s4);
    const uchar16 packed_1 = (uchar16)(px_1.s56, px_1.s89a, px_1.scde, px_2.s012, px_2.s456, px_2.s89);
    const uchar16 packed_2 = (uchar16)(px_2.sa, px_2.scde, px_3.s012, px_3.s456, px_3.s89a, px_3.scde);

    // 48 destination bytes, written as three consecutive 16-byte vectors
    vstore16(packed_0, 0, dst.ptr);
    vstore16(packed_1, 1, dst.ptr);
    vstore16(packed_2, 2, dst.ptr);
}
105
/** Convert a UYVY422 image to RGB888 using BT709 color space
 *
 * Each channel is computed in float, truncated towards zero and saturated to [0, 255].
 *
 * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
 * No offset.
 *
 * @param[in]  input_ptr                            Pointer to the source image. Supported Format: U8
 * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] output_ptr                           Pointer to the destination image. Supported Format: U8
 * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void UYVY422_to_RGB888_bt709(
    IMAGE_DECLARATION(input),
    IMAGE_DECLARATION(output))
{
    Image in  = CONVERT_TO_IMAGE_STRUCT(input);
    Image out = CONVERT_TO_IMAGE_STRUCT(output);

    // handle 8 pixels every time: 16 bytes ordered U Y V Y, one U/V pair per two luma samples
    uchar16 uyvy = vload16(0, in.ptr);

    // De-interleave and duplicate each chroma sample for the two pixels that share it.
    // The 128 bias is removed in float so no signed 8-bit wrap-around is relied upon.
    float8 f_luma = convert_float8(uyvy.s13579bdf);
    float8 f_cb   = convert_float8(uyvy.s004488cc) - (float8)(128.0f);
    float8 f_cr   = convert_float8(uyvy.s2266aaee) - (float8)(128.0f);

    // Terms with a zero coefficient (Cb for red, Cr for blue) are omitted
    float8 f_r = f_luma + (float8)(1.5748f) * f_cr;
    float8 f_g = f_luma - (float8)(0.1873f) * f_cb - (float8)(0.4681f) * f_cr;
    float8 f_b = f_luma + (float8)(1.8556f) * f_cb;

    // Saturate: the sums can leave [0, 255] and a non-saturating conversion of an
    // out-of-range float is implementation-defined
    uchar8 r_0 = convert_uchar8_sat_rtz(f_r);
    uchar8 g_0 = convert_uchar8_sat_rtz(f_g);
    uchar8 b_0 = convert_uchar8_sat_rtz(f_b);

    // Interleave to packed RGB: 8 pixels = 24 bytes, stored as a 16-byte and an 8-byte vector
    uchar16 rgb_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2, b_0.s2,
                              r_0.s3, g_0.s3, b_0.s3, r_0.s4, g_0.s4, b_0.s4, r_0.s5);
    uchar8 rgb_1 = (uchar8)(g_0.s5, b_0.s5, r_0.s6, g_0.s6, b_0.s6, r_0.s7, g_0.s7, b_0.s7);

    vstore16(rgb_0, 0, out.ptr);
    vstore8(rgb_1, 0, out.ptr + 16);
}
153
/** Convert a UYVY422 image to RGBX8888 using BT709 color space
 *
 * Each colour channel is computed in float, truncated towards zero and saturated to [0, 255].
 * The fourth byte of every output pixel is set to 255.
 *
 * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
 * No offset.
 *
 * @param[in]  input_ptr                            Pointer to the source image. Supported Format: U8
 * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] output_ptr                           Pointer to the destination image. Supported Format: U8
 * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void UYVY422_to_RGBA8888_bt709(
    IMAGE_DECLARATION(input),
    IMAGE_DECLARATION(output))
{
    Image in  = CONVERT_TO_IMAGE_STRUCT(input);
    Image out = CONVERT_TO_IMAGE_STRUCT(output);

    // handle 8 pixels every time: 16 bytes ordered U Y V Y, one U/V pair per two luma samples
    uchar16 uyvy = vload16(0, in.ptr);

    // De-interleave and duplicate each chroma sample for the two pixels that share it.
    // The 128 bias is removed in float so no signed 8-bit wrap-around is relied upon.
    float8 f_luma = convert_float8(uyvy.s13579bdf);
    float8 f_cb   = convert_float8(uyvy.s004488cc) - (float8)(128.0f);
    float8 f_cr   = convert_float8(uyvy.s2266aaee) - (float8)(128.0f);

    // Terms with a zero coefficient (Cb for red, Cr for blue) are omitted
    float8 f_r = f_luma + (float8)(1.5748f) * f_cr;
    float8 f_g = f_luma - (float8)(0.1873f) * f_cb - (float8)(0.4681f) * f_cr;
    float8 f_b = f_luma + (float8)(1.8556f) * f_cb;

    // Saturate: the sums can leave [0, 255] and a non-saturating conversion of an
    // out-of-range float is implementation-defined
    uchar8 r_0 = convert_uchar8_sat_rtz(f_r);
    uchar8 g_0 = convert_uchar8_sat_rtz(f_g);
    uchar8 b_0 = convert_uchar8_sat_rtz(f_b);

    // Interleave to 4-byte pixels: 8 pixels = 32 bytes, stored as two 16-byte vectors
    uchar16 rgba_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255,
                               r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
    uchar16 rgba_1 = (uchar16)(r_0.s4, g_0.s4, b_0.s4, 255, r_0.s5, g_0.s5, b_0.s5, 255,
                               r_0.s6, g_0.s6, b_0.s6, 255, r_0.s7, g_0.s7, b_0.s7, 255);

    vstore16(rgba_0, 0, out.ptr);
    vstore16(rgba_1, 0, out.ptr + 16);
}
202
/** Convert a YUYV422 image to RGB888 using BT709 color space
 *
 * Each channel is computed in float, truncated towards zero and saturated to [0, 255].
 *
 * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
 * No offset.
 *
 * @param[in]  input_ptr                            Pointer to the source image. Supported Format: U8
 * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] output_ptr                           Pointer to the destination image. Supported Format: U8
 * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void YUYV422_to_RGB888_bt709(
    IMAGE_DECLARATION(input),
    IMAGE_DECLARATION(output))
{
    Image in  = CONVERT_TO_IMAGE_STRUCT(input);
    Image out = CONVERT_TO_IMAGE_STRUCT(output);

    // handle 8 pixels every time: 16 bytes ordered Y U Y V, one U/V pair per two luma samples
    uchar16 yuyv = vload16(0, in.ptr);

    // De-interleave and duplicate each chroma sample for the two pixels that share it.
    // The 128 bias is removed in float so no signed 8-bit wrap-around is relied upon.
    float8 f_luma = convert_float8(yuyv.s02468ace);
    float8 f_cb   = convert_float8(yuyv.s115599dd) - (float8)(128.0f);
    float8 f_cr   = convert_float8(yuyv.s3377bbff) - (float8)(128.0f);

    // Terms with a zero coefficient (Cb for red, Cr for blue) are omitted
    float8 f_r = f_luma + (float8)(1.5748f) * f_cr;
    float8 f_g = f_luma - (float8)(0.1873f) * f_cb - (float8)(0.4681f) * f_cr;
    float8 f_b = f_luma + (float8)(1.8556f) * f_cb;

    // Saturate: the sums can leave [0, 255] and a non-saturating conversion of an
    // out-of-range float is implementation-defined
    uchar8 r_0 = convert_uchar8_sat_rtz(f_r);
    uchar8 g_0 = convert_uchar8_sat_rtz(f_g);
    uchar8 b_0 = convert_uchar8_sat_rtz(f_b);

    // Interleave to packed RGB: 8 pixels = 24 bytes, stored as a 16-byte and an 8-byte vector
    uchar16 rgb_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2, b_0.s2,
                              r_0.s3, g_0.s3, b_0.s3, r_0.s4, g_0.s4, b_0.s4, r_0.s5);
    uchar8 rgb_1 = (uchar8)(g_0.s5, b_0.s5, r_0.s6, g_0.s6, b_0.s6, r_0.s7, g_0.s7, b_0.s7);

    vstore16(rgb_0, 0, out.ptr);
    vstore8(rgb_1, 0, out.ptr + 16);
}
250
/** Convert a YUYV422 image to RGBX8888 using BT709 color space
 *
 * Each colour channel is computed in float, truncated towards zero and saturated to [0, 255].
 * The fourth byte of every output pixel is set to 255.
 *
 * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
 * No offset.
 *
 * @param[in]  input_ptr                            Pointer to the source image. Supported Format: U8
 * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] output_ptr                           Pointer to the destination image. Supported Format: U8
 * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void YUYV422_to_RGBA8888_bt709(
    IMAGE_DECLARATION(input),
    IMAGE_DECLARATION(output))
{
    Image in  = CONVERT_TO_IMAGE_STRUCT(input);
    Image out = CONVERT_TO_IMAGE_STRUCT(output);

    // handle 8 pixels every time: 16 bytes ordered Y U Y V, one U/V pair per two luma samples
    uchar16 yuyv = vload16(0, in.ptr);

    // De-interleave and duplicate each chroma sample for the two pixels that share it.
    // The 128 bias is removed in float so no signed 8-bit wrap-around is relied upon.
    float8 f_luma = convert_float8(yuyv.s02468ace);
    float8 f_cb   = convert_float8(yuyv.s115599dd) - (float8)(128.0f);
    float8 f_cr   = convert_float8(yuyv.s3377bbff) - (float8)(128.0f);

    // Terms with a zero coefficient (Cb for red, Cr for blue) are omitted
    float8 f_r = f_luma + (float8)(1.5748f) * f_cr;
    float8 f_g = f_luma - (float8)(0.1873f) * f_cb - (float8)(0.4681f) * f_cr;
    float8 f_b = f_luma + (float8)(1.8556f) * f_cb;

    // Saturate: the sums can leave [0, 255] and a non-saturating conversion of an
    // out-of-range float is implementation-defined
    uchar8 r_0 = convert_uchar8_sat_rtz(f_r);
    uchar8 g_0 = convert_uchar8_sat_rtz(f_g);
    uchar8 b_0 = convert_uchar8_sat_rtz(f_b);

    // Interleave to 4-byte pixels: 8 pixels = 32 bytes, stored as two 16-byte vectors
    uchar16 rgba_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255,
                               r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
    uchar16 rgba_1 = (uchar16)(r_0.s4, g_0.s4, b_0.s4, 255, r_0.s5, g_0.s5, b_0.s5, 255,
                               r_0.s6, g_0.s6, b_0.s6, 255, r_0.s7, g_0.s7, b_0.s7, 255);

    vstore16(rgba_0, 0, out.ptr);
    vstore16(rgba_1, 0, out.ptr + 16);
}
299
/** Convert a RGB image to NV12 using BT709 color space
 *
 * Every work-item converts a 2x2 pixel block: it writes two luma samples on each of
 * two rows and a single interleaved Cb/Cr pair holding the average chroma of the block.
 *
 * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
 * No offset.
 *
 * @param[in]  input_ptr                           Pointer to the source image. Supported Format: U8
 * @param[in]  input_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in]  input_step_x                        input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in]  input_step_y                        input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[out] luma_ptr                            Pointer to the destination luma channel. Supported Format: U8
 * @param[in]  luma_stride_x                       Stride of the destination luma channel in X dimension (in bytes)
 * @param[in]  luma_step_x                         luma_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_stride_y                       Stride of the destination image luma channel in Y dimension (in bytes)
 * @param[in]  luma_step_y                         luma_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_offset_first_element_in_bytes  The offset of the first element in the destination image luma channel
 * @param[out] uv_ptr                              Pointer to the destination uv channel. Supported Format: U8
 * @param[in]  uv_stride_x                         Stride of the destination uv channel in X dimension (in bytes)
 * @param[in]  uv_step_x                           uv_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  uv_stride_y                         Stride of the destination image uv channel in Y dimension (in bytes)
 * @param[in]  uv_step_y                           uv_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  uv_offset_first_element_in_bytes    The offset of the first element in the destination image uv channel
 *
 */
__kernel void RGB888_to_NV12_bt709(
    IMAGE_DECLARATION(input),
    IMAGE_DECLARATION(luma),
    IMAGE_DECLARATION(uv))
{
    Image src    = CONVERT_TO_IMAGE_STRUCT(input);
    Image dst_y  = CONVERT_TO_IMAGE_STRUCT(luma);
    Image dst_uv = CONVERT_TO_IMAGE_STRUCT(uv);

    const short2 lo = (short2)(0);
    const short2 hi = (short2)(255);

    // ---- First row: two pixels (only the first 6 of the 8 loaded bytes are used) ----
    const uchar8 row_0 = vload8(0, src.ptr);
    const float2 red_0 = convert_float2(row_0.s03);
    const float2 grn_0 = convert_float2(row_0.s14);
    const float2 blu_0 = convert_float2(row_0.s25);

    float2 y_f = (float2)(0.0000f) + 0.2126f * red_0 + 0.7152f * grn_0 + 0.0722f * blu_0;
    float2 u_f = (float2)(0.0000f) - 0.1146f * red_0 - 0.3854f * grn_0 + 0.5000f * blu_0;
    float2 v_f = (float2)(0.0000f) + 0.5000f * red_0 - 0.4542f * grn_0 - 0.0458f * blu_0;

    // Truncate towards zero, re-centre chroma on 128, then clamp to [0, 255]
    short2 y_i = convert_short2_rtz(y_f);
    short2 u_i = convert_short2_rtz(u_f) + (short2)(128);
    short2 v_i = convert_short2_rtz(v_f) + (short2)(128);

    const uchar2 y_row_0 = convert_uchar2(max(lo, min(y_i, hi)));
    vstore2(y_row_0, 0, dst_y.ptr);

    const uchar2 u_row_0 = convert_uchar2(max(lo, min(u_i, hi)));
    const uchar2 v_row_0 = convert_uchar2(max(lo, min(v_i, hi)));

    // ---- Second row: the two pixels directly below ----
    const uchar8 row_1 = vload8(0, src.ptr + input_stride_y);
    const float2 red_1 = convert_float2(row_1.s03);
    const float2 grn_1 = convert_float2(row_1.s14);
    const float2 blu_1 = convert_float2(row_1.s25);

    y_f = (float2)(0.0000f) + 0.2126f * red_1 + 0.7152f * grn_1 + 0.0722f * blu_1;
    u_f = (float2)(0.0000f) - 0.1146f * red_1 - 0.3854f * grn_1 + 0.5000f * blu_1;
    v_f = (float2)(0.0000f) + 0.5000f * red_1 - 0.4542f * grn_1 - 0.0458f * blu_1;

    y_i = convert_short2_rtz(y_f);
    u_i = convert_short2_rtz(u_f) + (short2)(128);
    v_i = convert_short2_rtz(v_f) + (short2)(128);

    const uchar2 y_row_1 = convert_uchar2(max(lo, min(y_i, hi)));
    vstore2(y_row_1, 0, dst_y.ptr + luma_stride_y);

    const uchar2 u_row_1 = convert_uchar2(max(lo, min(u_i, hi)));
    const uchar2 v_row_1 = convert_uchar2(max(lo, min(v_i, hi)));

    // Average the four chroma samples of the 2x2 block (scalar operands are summed as int)
    const uchar2 uv_avg = (uchar2)(((u_row_0.s0 + u_row_0.s1 + u_row_1.s0 + u_row_1.s1) / 4),
                                   ((v_row_0.s0 + v_row_0.s1 + v_row_1.s0 + v_row_1.s1) / 4));

    vstore2(uv_avg, 0, dst_uv.ptr);
}
379
380/*
381 R'= Y' + 0.0000*U + 1.5748*V
382 G'= Y' - 0.1873*U - 0.4681*V
383 B'= Y' + 1.8556*U + 0.0000*V
384*/
385
/** Convert an NV12 image to RGB888
 *
 * Every work-item converts a 4x2 pixel block: four luma samples on each of two rows share
 * two interleaved Cb/Cr pairs. Each channel is computed in float, truncated towards zero
 * and saturated to [0, 255].
 *
 * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
 * No offset.
 *
 * @param[in]  luma_input_ptr                           Pointer to the source luma channel. Supported Format: U8
 * @param[in]  luma_input_stride_x                      Stride of the luma image in X dimension (in bytes)
 * @param[in]  luma_input_step_x                        luma_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_input_stride_y                      Stride of the source luma channel in Y dimension (in bytes)
 * @param[in]  luma_input_step_y                        luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_input_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in]  uv_input_ptr                             Pointer to the source uv channel. Supported Format: U8
 * @param[in]  uv_input_stride_x                        Stride of the source image uv channel in X dimension (in bytes)
 * @param[in]  uv_input_step_x                          uv_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  uv_input_stride_y                        Stride of the source image in Y dimension (in bytes)
 * @param[in]  uv_input_step_y                          uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  uv_input_offset_first_element_in_bytes   The offset of the first element in the source image
 * @param[out] rgb_output_ptr                           Pointer to the destination image. Supported Format: U8
 * @param[in]  rgb_output_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  rgb_output_step_x                        rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  rgb_output_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  rgb_output_step_y                        rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void NV12_to_RGB888_bt709(
    IMAGE_DECLARATION(luma_input),
    IMAGE_DECLARATION(uv_input),
    IMAGE_DECLARATION(rgb_output))
{
    Image in_luma = CONVERT_TO_IMAGE_STRUCT(luma_input);
    Image in_uv   = CONVERT_TO_IMAGE_STRUCT(uv_input);
    Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);

    // handle 8 pixels every time, two lines, each line for 4 pixels
    uchar4 luma_0 = vload4(0, in_luma.ptr);
    uchar4 luma_1 = vload4(0, in_luma.ptr + luma_input_stride_y);
    uchar4 cbcr   = vload4(0, in_uv.ptr);

    // Duplicate each Cb/Cr sample for the two horizontal pixels that share it.
    // The 128 bias is removed in float so no signed 8-bit wrap-around is relied upon.
    float4 f_cb = convert_float4(cbcr.s0022) - (float4)(128.0f);
    float4 f_cr = convert_float4(cbcr.s1133) - (float4)(128.0f);

    // Chroma contribution to each channel, reused by both rows.
    // Terms with a zero coefficient (Cb for red, Cr for blue) are omitted.
    float4 temp0 = (float4)(1.5748f) * f_cr;
    float4 temp1 = -(float4)(0.1873f) * f_cb - (float4)(0.4681f) * f_cr;
    float4 temp2 = (float4)(1.8556f) * f_cb;

    // First row
    float4 f_r = convert_float4(luma_0) + temp0;
    float4 f_g = convert_float4(luma_0) + temp1;
    float4 f_b = convert_float4(luma_0) + temp2;

    // Saturate: the sums can leave [0, 255] and a non-saturating conversion of an
    // out-of-range float is implementation-defined
    uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
    uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
    uchar4 b_0 = convert_uchar4_sat_rtz(f_b);

    // 4 pixels = 12 bytes, stored as an 8-byte and a 4-byte vector
    uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
    uchar4 rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
    vstore8(rgb_0, 0, out_rgb.ptr);
    vstore4(rgb_1, 0, out_rgb.ptr + 8);

    // Second row: same chroma, luma from the line below
    f_r = convert_float4(luma_1) + temp0;
    f_g = convert_float4(luma_1) + temp1;
    f_b = convert_float4(luma_1) + temp2;

    r_0 = convert_uchar4_sat_rtz(f_r);
    g_0 = convert_uchar4_sat_rtz(f_g);
    b_0 = convert_uchar4_sat_rtz(f_b);

    rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
    rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
    vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y);
    vstore4(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8);
}
456
/** Convert a RGB image to YUV444 using BT709 color space
 *
 * Every work-item converts four horizontally adjacent pixels and writes four samples
 * to each of the three output planes.
 *
 * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
 * No offset.
 *
 * @param[in]  rgb_input_ptr                             Pointer to the source image. Supported Format: U8
 * @param[in]  rgb_input_stride_x                        Stride of the source image in X dimension (in bytes)
 * @param[in]  rgb_input_step_x                          rgb_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  rgb_input_stride_y                        Stride of the source image in Y dimension (in bytes)
 * @param[in]  rgb_input_step_y                          rgb_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  rgb_input_offset_first_element_in_bytes   The offset of the first element in the source image
 * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
 * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
 * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
 * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
 * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
 * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
 * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
 * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
 * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
 * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
 * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  v_output_stride_y                         Stride of the destination image V channel in Y dimension (in bytes)
 * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
 *
 */
__kernel void RGB888_to_YUV444_bt709(
    IMAGE_DECLARATION(rgb_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(u_output),
    IMAGE_DECLARATION(v_output))
{
    Image src   = CONVERT_TO_IMAGE_STRUCT(rgb_input);
    Image dst_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_u = CONVERT_TO_IMAGE_STRUCT(u_output);
    Image dst_v = CONVERT_TO_IMAGE_STRUCT(v_output);

    const short4 lo = (short4)(0);
    const short4 hi = (short4)(255);

    // Four packed RGB pixels (only the first 12 of the 16 loaded bytes are used)
    const uchar16 pixels = vload16(0, src.ptr);
    const float4  red    = convert_float4(pixels.s0369);
    const float4  grn    = convert_float4(pixels.s147a);
    const float4  blu    = convert_float4(pixels.s258b);

    const float4 y_f = (float4)(0.0000f) + 0.2126f * red + 0.7152f * grn + 0.0722f * blu;
    const float4 u_f = (float4)(0.0000f) - 0.1146f * red - 0.3854f * grn + 0.5000f * blu;
    const float4 v_f = (float4)(0.0000f) + 0.5000f * red - 0.4542f * grn - 0.0458f * blu;

    // Truncate towards zero, re-centre chroma on 128, then clamp to [0, 255]
    const short4 y_i = convert_short4_rtz(y_f);
    const short4 u_i = convert_short4_rtz(u_f) + (short4)(128);
    const short4 v_i = convert_short4_rtz(v_f) + (short4)(128);

    const uchar4 y_out = convert_uchar4(max(lo, min(y_i, hi)));
    vstore4(y_out, 0, dst_y.ptr);

    const uchar4 u_out = convert_uchar4(max(lo, min(u_i, hi)));
    const uchar4 v_out = convert_uchar4(max(lo, min(v_i, hi)));
    vstore4(u_out, 0, dst_u.ptr);
    vstore4(v_out, 0, dst_v.ptr);
}
522
/** Convert an RGB888 image to IYUV (separate Y, U and V planes) using the BT709 color space
 *
 * Global Workgroup Size [ DIV_CEIL(width, 2), height ]
 * No offset.
 *
 * Every work-item reads two pixels from each of two consecutive rows, writes the four
 * resulting luma values and writes a single U and a single V value obtained by averaging
 * the chroma of those four pixels.
 *
 * @param[in]  rgb_input_ptr                             Pointer to the source image. Supported Format: U8
 * @param[in]  rgb_input_stride_x                        Stride of the source image in X dimension (in bytes)
 * @param[in]  rgb_input_step_x                          rgb_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  rgb_input_stride_y                        Stride of the source image in Y dimension (in bytes)
 * @param[in]  rgb_input_step_y                          rgb_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  rgb_input_offset_first_element_in_bytes   The offset of the first element in the source image
 * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
 * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
 * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
 * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
 * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
 * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
 * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
 * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
 * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
 * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
 * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
 * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
 *
 */
__kernel void RGB888_to_IYUV_bt709(
    IMAGE_DECLARATION(rgb_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(u_output),
    IMAGE_DECLARATION(v_output))
{
    Image src   = CONVERT_TO_IMAGE_STRUCT(rgb_input);
    Image dst_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_u = CONVERT_TO_IMAGE_STRUCT(u_output);
    Image dst_v = CONVERT_TO_IMAGE_STRUCT(v_output);

    const short2 lowest  = (short2)(0);
    const short2 highest = (short2)(255);
    const short2 centre  = (short2)(128);

    // Top row: vload8 fetches 8 bytes, of which only the first 6 (two packed RGB pixels) are used
    const uchar8 top   = vload8(0, src.ptr);
    float2       red   = convert_float2(top.s03);
    float2       green = convert_float2(top.s14);
    float2       blue  = convert_float2(top.s25);

    // Truncate towards zero first, then move chroma into the unsigned range
    short2 y = convert_short2_rtz(0.2126f * red + 0.7152f * green + 0.0722f * blue);
    short2 u = convert_short2_rtz(-0.1146f * red - 0.3854f * green + 0.5000f * blue) + centre;
    short2 v = convert_short2_rtz(0.5000f * red - 0.4542f * green - 0.0458f * blue) + centre;

    vstore2(convert_uchar2(min(max(y, lowest), highest)), 0, dst_y.ptr);

    const uchar2 u_top = convert_uchar2(min(max(u, lowest), highest));
    const uchar2 v_top = convert_uchar2(min(max(v, lowest), highest));

    // Bottom row: same layout, one row stride further into the source
    const uchar8 bottom = vload8(0, src.ptr + rgb_input_stride_y);
    red                 = convert_float2(bottom.s03);
    green               = convert_float2(bottom.s14);
    blue                = convert_float2(bottom.s25);

    y = convert_short2_rtz(0.2126f * red + 0.7152f * green + 0.0722f * blue);
    u = convert_short2_rtz(-0.1146f * red - 0.3854f * green + 0.5000f * blue) + centre;
    v = convert_short2_rtz(0.5000f * red - 0.4542f * green - 0.0458f * blue) + centre;

    vstore2(convert_uchar2(min(max(y, lowest), highest)), 0, dst_y.ptr + luma_output_stride_y);

    const uchar2 u_bottom = convert_uchar2(min(max(u, lowest), highest));
    const uchar2 v_bottom = convert_uchar2(min(max(v, lowest), highest));

    // One chroma sample per 2x2 block: scalar uchar operands are summed as int, the division truncates
    *dst_u.ptr = (uchar)((u_top.s0 + u_top.s1 + u_bottom.s0 + u_bottom.s1) / 4);
    *dst_v.ptr = (uchar)((v_top.s0 + v_top.s1 + v_bottom.s0 + v_bottom.s1) / 4);
}
610
/** Convert an RGBA8888 image to YUV444 (separate full-resolution Y, U and V planes) using the BT709 color space
 *
 * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
 * No offset.
 *
 * Every work-item converts four horizontally adjacent pixels. The fourth byte of each
 * source pixel is read but does not contribute to the output.
 *
 * @param[in]  rgba_input_ptr                            Pointer to the source image. Supported Format: U8
 * @param[in]  rgba_input_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  rgba_input_step_x                         rgba_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  rgba_input_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  rgba_input_step_y                         rgba_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  rgba_input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
 * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
 * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
 * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
 * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
 * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
 * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
 * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
 * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
 * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
 * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  v_output_stride_y                         Stride of the destination image V channel in Y dimension (in bytes)
 * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
 *
 */
__kernel void RGBA8888_to_YUV444_bt709(
    IMAGE_DECLARATION(rgba_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(u_output),
    IMAGE_DECLARATION(v_output))
{
    Image src   = CONVERT_TO_IMAGE_STRUCT(rgba_input);
    Image dst_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_u = CONVERT_TO_IMAGE_STRUCT(u_output);
    Image dst_v = CONVERT_TO_IMAGE_STRUCT(v_output);

    const short4 lowest  = (short4)(0);
    const short4 highest = (short4)(255);
    const short4 centre  = (short4)(128);

    // 16 bytes hold four 4-byte pixels; gather the R, G and B bytes of each
    const uchar16 pixels = vload16(0, src.ptr);
    const float4  red    = convert_float4(pixels.s048c);
    const float4  green  = convert_float4(pixels.s159d);
    const float4  blue   = convert_float4(pixels.s26ae);

    // Convert to integers before re-centring chroma around 128
    const short4 y = convert_short4(0.2126f * red + 0.7152f * green + 0.0722f * blue);
    const short4 u = convert_short4(-0.1146f * red - 0.3854f * green + 0.5000f * blue) + centre;
    const short4 v = convert_short4(0.5000f * red - 0.4542f * green - 0.0458f * blue) + centre;

    // Values are limited to [0, 255] before the narrowing conversion
    vstore4(convert_uchar4_sat(min(max(y, lowest), highest)), 0, dst_y.ptr);
    vstore4(convert_uchar4_sat(min(max(u, lowest), highest)), 0, dst_u.ptr);
    vstore4(convert_uchar4_sat(min(max(v, lowest), highest)), 0, dst_v.ptr);
}
676
/** Convert an RGBA8888 image to NV12 (luma plane plus one interleaved UV plane) using the BT709 color space
 *
 * Global Workgroup Size [ DIV_CEIL(width, 2), height ]
 * No offset.
 *
 * Every work-item reads two pixels from each of two consecutive rows, writes the four
 * resulting luma values and writes one (U, V) pair obtained by averaging the chroma of
 * those four pixels.
 *
 * @param[in]  input_ptr                                 Pointer to the source image. Supported Format: U8
 * @param[in]  input_stride_x                            Stride of the source image in X dimension (in bytes)
 * @param[in]  input_step_x                              input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                            Stride of the source image in Y dimension (in bytes)
 * @param[in]  input_step_y                              input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes       The offset of the first element in the source image
 * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
 * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
 * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
 * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination image luma channel
 * @param[out] uv_output_ptr                             Pointer to the destination uv channel. Supported Format: U8
 * @param[in]  uv_output_stride_x                        Stride of the destination uv channel in X dimension (in bytes)
 * @param[in]  uv_output_step_x                          uv_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  uv_output_stride_y                        Stride of the destination image uv channel in Y dimension (in bytes)
 * @param[in]  uv_output_step_y                          uv_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  uv_output_offset_first_element_in_bytes   The offset of the first element in the destination image uv channel
 *
 */
__kernel void RGBA8888_to_NV12_bt709(
    IMAGE_DECLARATION(input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(uv_output))
{
    Image src    = CONVERT_TO_IMAGE_STRUCT(input);
    Image dst_y  = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_uv = CONVERT_TO_IMAGE_STRUCT(uv_output);

    const short2 lowest  = (short2)(0);
    const short2 highest = (short2)(255);
    const short2 centre  = (short2)(128);

    // Top row: 8 bytes hold two 4-byte pixels; the fourth byte of each pixel is not used
    const uchar8 top   = vload8(0, src.ptr);
    float2       red   = convert_float2(top.s04);
    float2       green = convert_float2(top.s15);
    float2       blue  = convert_float2(top.s26);

    // Truncate towards zero first, then move chroma into the unsigned range
    short2 y = convert_short2_rtz(0.2126f * red + 0.7152f * green + 0.0722f * blue);
    short2 u = convert_short2_rtz(-0.1146f * red - 0.3854f * green + 0.5000f * blue) + centre;
    short2 v = convert_short2_rtz(0.5000f * red - 0.4542f * green - 0.0458f * blue) + centre;

    vstore2(convert_uchar2(min(max(y, lowest), highest)), 0, dst_y.ptr);

    const uchar2 u_top = convert_uchar2(min(max(u, lowest), highest));
    const uchar2 v_top = convert_uchar2(min(max(v, lowest), highest));

    // Bottom row: same layout, one row stride further into the source
    const uchar8 bottom = vload8(0, src.ptr + input_stride_y);
    red                 = convert_float2(bottom.s04);
    green               = convert_float2(bottom.s15);
    blue                = convert_float2(bottom.s26);

    y = convert_short2_rtz(0.2126f * red + 0.7152f * green + 0.0722f * blue);
    u = convert_short2_rtz(-0.1146f * red - 0.3854f * green + 0.5000f * blue) + centre;
    v = convert_short2_rtz(0.5000f * red - 0.4542f * green - 0.0458f * blue) + centre;

    vstore2(convert_uchar2(min(max(y, lowest), highest)), 0, dst_y.ptr + luma_output_stride_y);

    const uchar2 u_bottom = convert_uchar2(min(max(u, lowest), highest));
    const uchar2 v_bottom = convert_uchar2(min(max(v, lowest), highest));

    // One interleaved (U, V) pair per 2x2 block: scalar uchar operands are summed as int, the division truncates
    const uchar2 uv = (uchar2)((u_top.s0 + u_top.s1 + u_bottom.s0 + u_bottom.s1) / 4,
                               (v_top.s0 + v_top.s1 + v_bottom.s0 + v_bottom.s1) / 4);
    vstore2(uv, 0, dst_uv.ptr);
}
754
/** Convert an RGBA8888 image to IYUV (separate Y, U and V planes) using the BT709 color space
 *
 * Global Workgroup Size [ DIV_CEIL(width, 2), height ]
 * No offset.
 *
 * Every work-item reads two pixels from each of two consecutive rows, writes the four
 * resulting luma values and writes a single U and a single V value obtained by averaging
 * the chroma of those four pixels.
 *
 * @param[in]  rgba_input_ptr                            Pointer to the source image. Supported Format: U8
 * @param[in]  rgba_input_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  rgba_input_step_x                         rgba_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  rgba_input_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  rgba_input_step_y                         rgba_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  rgba_input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
 * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
 * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
 * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
 * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
 * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
 * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
 * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
 * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
 * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
 * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
 * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
 *
 */
__kernel void RGBA8888_to_IYUV_bt709(
    IMAGE_DECLARATION(rgba_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(u_output),
    IMAGE_DECLARATION(v_output))
{
    Image src   = CONVERT_TO_IMAGE_STRUCT(rgba_input);
    Image dst_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_u = CONVERT_TO_IMAGE_STRUCT(u_output);
    Image dst_v = CONVERT_TO_IMAGE_STRUCT(v_output);

    const short2 lowest  = (short2)(0);
    const short2 highest = (short2)(255);
    const short2 centre  = (short2)(128);

    // Top row: 8 bytes hold two 4-byte pixels; the fourth byte of each pixel is not used
    const uchar8 top   = vload8(0, src.ptr);
    float2       red   = convert_float2(top.s04);
    float2       green = convert_float2(top.s15);
    float2       blue  = convert_float2(top.s26);

    // Truncate towards zero first, then move chroma into the unsigned range
    short2 y = convert_short2_rtz(0.2126f * red + 0.7152f * green + 0.0722f * blue);
    short2 u = convert_short2_rtz(-0.1146f * red - 0.3854f * green + 0.5000f * blue) + centre;
    short2 v = convert_short2_rtz(0.5000f * red - 0.4542f * green - 0.0458f * blue) + centre;

    vstore2(convert_uchar2(min(max(y, lowest), highest)), 0, dst_y.ptr);

    const uchar2 u_top = convert_uchar2(min(max(u, lowest), highest));
    const uchar2 v_top = convert_uchar2(min(max(v, lowest), highest));

    // Bottom row: same layout, one row stride further into the source
    const uchar8 bottom = vload8(0, src.ptr + rgba_input_stride_y);
    red                 = convert_float2(bottom.s04);
    green               = convert_float2(bottom.s15);
    blue                = convert_float2(bottom.s26);

    y = convert_short2_rtz(0.2126f * red + 0.7152f * green + 0.0722f * blue);
    u = convert_short2_rtz(-0.1146f * red - 0.3854f * green + 0.5000f * blue) + centre;
    v = convert_short2_rtz(0.5000f * red - 0.4542f * green - 0.0458f * blue) + centre;

    vstore2(convert_uchar2(min(max(y, lowest), highest)), 0, dst_y.ptr + luma_output_stride_y);

    const uchar2 u_bottom = convert_uchar2(min(max(u, lowest), highest));
    const uchar2 v_bottom = convert_uchar2(min(max(v, lowest), highest));

    // One chroma sample per 2x2 block: scalar uchar operands are summed as int, the division truncates
    *dst_u.ptr = (uchar)((u_top.s0 + u_top.s1 + u_bottom.s0 + u_bottom.s1) / 4);
    *dst_v.ptr = (uchar)((v_top.s0 + v_top.s1 + v_bottom.s0 + v_bottom.s1) / 4);
}
842
/** Convert an NV12 image to RGBA8888 using the BT709 color space
 *
 * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
 * No offset.
 *
 * Every work-item reads four luma samples from each of two consecutive rows together with
 * the two interleaved (Cb, Cr) pairs they share, and writes eight RGBA pixels (four per
 * row). The alpha byte of every output pixel is set to 255. Colour components that fall
 * outside [0, 255] after the conversion are saturated.
 *
 * @param[in]  luma_input_ptr                            Pointer to the source luma channel. Supported Format: U8
 * @param[in]  luma_input_stride_x                       Stride of the luma image in X dimension (in bytes)
 * @param[in]  luma_input_step_x                         luma_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_input_stride_y                       Stride of the source luma channel in Y dimension (in bytes)
 * @param[in]  luma_input_step_y                         luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[in]  uv_input_ptr                              Pointer to the source uv channel. Supported Format: U8
 * @param[in]  uv_input_stride_x                         Stride of the source image uv channel in X dimension (in bytes)
 * @param[in]  uv_input_step_x                           uv_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  uv_input_stride_y                         Stride of the source image in Y dimension (in bytes)
 * @param[in]  uv_input_step_y                           uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  uv_input_offset_first_element_in_bytes    The offset of the first element in the source image
 * @param[out] rgb_output_ptr                            Pointer to the destination image. Supported Format: U8
 * @param[in]  rgb_output_stride_x                       Stride of the destination image in X dimension (in bytes)
 * @param[in]  rgb_output_step_x                         rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  rgb_output_stride_y                       Stride of the destination image in Y dimension (in bytes)
 * @param[in]  rgb_output_step_y                         rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  rgb_output_offset_first_element_in_bytes  The offset of the first element in the destination image
 */
__kernel void NV12_to_RGBA8888_bt709(
    IMAGE_DECLARATION(luma_input),
    IMAGE_DECLARATION(uv_input),
    IMAGE_DECLARATION(rgb_output))
{
    Image in_luma = CONVERT_TO_IMAGE_STRUCT(luma_input);
    Image in_uv   = CONVERT_TO_IMAGE_STRUCT(uv_input);
    Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);

    const float4 luma_0 = convert_float4(vload4(0, in_luma.ptr));
    const float4 luma_1 = convert_float4(vload4(0, in_luma.ptr + luma_input_stride_y));
    const uchar4 cbcr   = vload4(0, in_uv.ptr);

    // Replicate each chroma sample over the two pixels it covers and centre it around zero.
    // Doing the subtraction in float gives value - 128 directly, without going through a
    // narrowing uchar -> char conversion.
    const float4 cb = convert_float4(cbcr.s0022) - (float4)(128.0f);
    const float4 cr = convert_float4(cbcr.s1133) - (float4)(128.0f);

    // Chroma contribution to each colour component; shared by both rows
    const float4 chroma_r = (float4)(1.5748f) * cr;
    const float4 chroma_g = (float4)(0.0000f) - (float4)(0.1873f) * cb - (float4)(0.4681f) * cr;
    const float4 chroma_b = (float4)(1.8556f) * cb;

    // First row. luma + chroma can leave [0, 255] (for instance 255 + 1.5748 * 127), so the
    // conversion must saturate: a non-saturating float -> uchar conversion of an
    // out-of-range value has an implementation-defined result.
    uchar4 r = convert_uchar4_sat_rtz(luma_0 + chroma_r);
    uchar4 g = convert_uchar4_sat_rtz(luma_0 + chroma_g);
    uchar4 b = convert_uchar4_sat_rtz(luma_0 + chroma_b);

    vstore8((uchar8)(r.s0, g.s0, b.s0, 255, r.s1, g.s1, b.s1, 255), 0, out_rgb.ptr);
    vstore8((uchar8)(r.s2, g.s2, b.s2, 255, r.s3, g.s3, b.s3, 255), 0, out_rgb.ptr + 8);

    // Second row: same chroma, next line of luma
    r = convert_uchar4_sat_rtz(luma_1 + chroma_r);
    g = convert_uchar4_sat_rtz(luma_1 + chroma_g);
    b = convert_uchar4_sat_rtz(luma_1 + chroma_b);

    vstore8((uchar8)(r.s0, g.s0, b.s0, 255, r.s1, g.s1, b.s1, 255), 0, out_rgb.ptr + rgb_output_stride_y);
    vstore8((uchar8)(r.s2, g.s2, b.s2, 255, r.s3, g.s3, b.s3, 255), 0, out_rgb.ptr + rgb_output_stride_y + 8);
}
912
/** Convert an NV12 image to IYUV
 *
 * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
 * No offset.
 *
 * Every work-item copies 16 luma samples from each of two consecutive rows unchanged and
 * splits 16 interleaved chroma bytes into 8 bytes for the U plane and 8 bytes for the V plane.
 *
 * @param[in]  luma_input_ptr                            Pointer to the source luma channel. Supported Format: U8
 * @param[in]  luma_input_stride_x                       Stride of the luma image in X dimension (in bytes)
 * @param[in]  luma_input_step_x                         luma_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_input_stride_y                       Stride of the source luma channel in Y dimension (in bytes)
 * @param[in]  luma_input_step_y                         luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[in]  uv_input_ptr                              Pointer to the source uv channel. Supported Format: U8
 * @param[in]  uv_input_stride_x                         Stride of the source image uv channel in X dimension (in bytes)
 * @param[in]  uv_input_step_x                           uv_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  uv_input_stride_y                         Stride of the source image in Y dimension (in bytes)
 * @param[in]  uv_input_step_y                           uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  uv_input_offset_first_element_in_bytes    The offset of the first element in the source image
 * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
 * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
 * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
 * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
 * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
 * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
 * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
 * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
 * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
 * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
 * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
 * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
 */
__kernel void NV12_to_IYUV_bt709(
    IMAGE_DECLARATION(luma_input),
    IMAGE_DECLARATION(uv_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(u_output),
    IMAGE_DECLARATION(v_output))
{
    Image src_y  = CONVERT_TO_IMAGE_STRUCT(luma_input);
    Image src_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
    Image dst_y  = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_u  = CONVERT_TO_IMAGE_STRUCT(u_output);
    Image dst_v  = CONVERT_TO_IMAGE_STRUCT(v_output);

    // 32 pixels per work-item: 16 from each of two consecutive rows, plus their 16 chroma bytes
    const uchar16 row_0  = vload16(0, src_y.ptr);
    const uchar16 row_1  = vload16(0, src_y.ptr + luma_input_stride_y);
    const uchar16 chroma = vload16(0, src_uv.ptr);

    // Luma is copied through untouched
    vstore16(row_0, 0, dst_y.ptr);
    vstore16(row_1, 0, dst_y.ptr + luma_output_stride_y);

    // De-interleave chroma: even bytes go to the U plane, odd bytes to the V plane
    vstore8(chroma.even, 0, dst_u.ptr);
    vstore8(chroma.odd, 0, dst_v.ptr);
}
974
/** Convert an NV12 image to YUV444
 *
 * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
 * No offset.
 *
 * Every work-item copies 16 luma samples from each of two consecutive rows unchanged and
 * expands 8 interleaved chroma pairs to full resolution: each U and each V value is written
 * twice horizontally and to both output rows.
 *
 * @param[in]  luma_input_ptr                            Pointer to the source luma channel. Supported Format: U8
 * @param[in]  luma_input_stride_x                       Stride of the luma image in X dimension (in bytes)
 * @param[in]  luma_input_step_x                         luma_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_input_stride_y                       Stride of the source luma channel in Y dimension (in bytes)
 * @param[in]  luma_input_step_y                         luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[in]  uv_input_ptr                              Pointer to the source uv channel. Supported Format: U8
 * @param[in]  uv_input_stride_x                         Stride of the source image uv channel in X dimension (in bytes)
 * @param[in]  uv_input_step_x                           uv_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  uv_input_stride_y                         Stride of the source image in Y dimension (in bytes)
 * @param[in]  uv_input_step_y                           uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  uv_input_offset_first_element_in_bytes    The offset of the first element in the source image
 * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
 * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
 * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_output_stride_y                      Stride of the destination image luma channel in Y dimension (in bytes)
 * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
 * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
 * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
 * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  u_output_stride_y                         Stride of the destination image U channel in Y dimension (in bytes)
 * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
 * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
 * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
 * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
 * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
 */
__kernel void NV12_to_YUV444_bt709(
    IMAGE_DECLARATION(luma_input),
    IMAGE_DECLARATION(uv_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(u_output),
    IMAGE_DECLARATION(v_output))
{
    Image src_y  = CONVERT_TO_IMAGE_STRUCT(luma_input);
    Image src_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
    Image dst_y  = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_u  = CONVERT_TO_IMAGE_STRUCT(u_output);
    Image dst_v  = CONVERT_TO_IMAGE_STRUCT(v_output);

    // 32 pixels per work-item: 16 from each of two consecutive rows, plus their 16 chroma bytes
    const uchar16 row_0  = vload16(0, src_y.ptr);
    const uchar16 row_1  = vload16(0, src_y.ptr + luma_input_stride_y);
    const uchar16 chroma = vload16(0, src_uv.ptr);

    // Duplicate every even byte (U) and every odd byte (V) so each covers two horizontal pixels
    const uchar16 u_wide = chroma.s0022446688aaccee;
    const uchar16 v_wide = chroma.s1133557799bbddff;

    // Luma is copied through untouched
    vstore16(row_0, 0, dst_y.ptr);
    vstore16(row_1, 0, dst_y.ptr + luma_output_stride_y);

    // The same chroma row is written to both output rows
    vstore16(u_wide, 0, dst_u.ptr);
    vstore16(u_wide, 0, dst_u.ptr + u_output_stride_y);
    vstore16(v_wide, 0, dst_v.ptr);
    vstore16(v_wide, 0, dst_v.ptr + v_output_stride_y);
}
1040
1041/** Convert an NV21 image to RGB888
1042 *
1043 * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
1044 * No offset.
1045 *
1046 * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
1047 * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
1048 * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
1049 * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
1050 * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
1051 * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
1052 * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
1053 * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
1054 * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
1055 * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
1056 * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
1057 * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
1058 * @param[out] rgb_output_ptr Pointer to the destination image. Supported Format: U8
1059 * @param[in] rgb_output_stride_x Stride of the destination image in X dimension (in bytes)
1060 * @param[in] rgb_output_step_x rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
1061 * @param[in] rgb_output_stride_y Stride of the destination image in Y dimension (in bytes)
 * @param[in]  rgb_output_step_y                         rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
1063 * @param[in] rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
1064 */
__kernel void NV21_to_RGB888_bt709(
    IMAGE_DECLARATION(luma_input),
    IMAGE_DECLARATION(uv_input),
    IMAGE_DECLARATION(rgb_output))
{
    Image in_y    = CONVERT_TO_IMAGE_STRUCT(luma_input);
    Image in_uv   = CONVERT_TO_IMAGE_STRUCT(uv_input);
    Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);

    // handle 8 pixels every time, two lines, each line for 4 pixels
    uchar4 luma_0 = vload4(0, in_y.ptr);
    uchar4 luma_1 = vload4(0, in_y.ptr + luma_input_stride_y);
    uchar4 cbcr   = vload4(0, in_uv.ptr);

    // Even bytes are read as Cr and odd bytes as Cb; each chroma sample is shared by two
    // horizontally adjacent pixels. The 128 bias is removed in float so the result does not
    // depend on signed 8-bit wrap-around.
    float4 cr = convert_float4(cbcr.s0022) - (float4)(128.0f);
    float4 cb = convert_float4(cbcr.s1133) - (float4)(128.0f);

    // Chroma contribution to R, G and B; it is the same for both rows of the tile.
    float4 temp0 = (float4)(1.5748f) * cr;
    float4 temp1 = -(float4)(0.1873f) * cb - (float4)(0.4681f) * cr;
    float4 temp2 = (float4)(1.8556f) * cb;

    // First row
    float4 f_r = convert_float4(luma_0) + temp0;
    float4 f_g = convert_float4(luma_0) + temp1;
    float4 f_b = convert_float4(luma_0) + temp2;

    // luma + chroma can fall outside [0, 255]; the _sat variant clamps instead of leaving the
    // out-of-range conversion result unspecified.
    uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
    uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
    uchar4 b_0 = convert_uchar4_sat_rtz(f_b);

    // 4 pixels * 3 channels = 12 bytes, written as an 8-byte store followed by a 4-byte store.
    uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
    uchar4 rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
    vstore8(rgb_0, 0, out_rgb.ptr);
    vstore4(rgb_1, 0, out_rgb.ptr + 8);

    // Second row
    f_r = convert_float4(luma_1) + temp0;
    f_g = convert_float4(luma_1) + temp1;
    f_b = convert_float4(luma_1) + temp2;

    r_0 = convert_uchar4_sat_rtz(f_r);
    g_0 = convert_uchar4_sat_rtz(f_g);
    b_0 = convert_uchar4_sat_rtz(f_b);

    rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
    rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
    vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y);
    vstore4(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8);
}
1111
/** Convert an NV21 image to RGBA8888
1113 *
1114 * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
1115 * No offset.
1116 *
1117 * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
1118 * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
1119 * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
1120 * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
1121 * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
1122 * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
1123 * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
1124 * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
1125 * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
1126 * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
1127 * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
1128 * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
1129 * @param[out] rgba_output_ptr Pointer to the destination image. Supported Format: U8
1130 * @param[in] rgba_output_stride_x Stride of the destination image in X dimension (in bytes)
1131 * @param[in] rgba_output_step_x rgba_output_stride_x * number of elements along X processed per workitem(in bytes)
1132 * @param[in] rgba_output_stride_y Stride of the destination image in Y dimension (in bytes)
1133 * @param[in] rgba_output_step_y rgba_output_stride_y * number of elements along Y processed per workitem(in bytes)
1134 * @param[in] rgba_output_offset_first_element_in_bytes The offset of the first element in the destination image
1135 */
__kernel void NV21_to_RGBA8888_bt709(
    IMAGE_DECLARATION(luma_input),
    IMAGE_DECLARATION(uv_input),
    IMAGE_DECLARATION(rgba_output))
{
    Image in_luma = CONVERT_TO_IMAGE_STRUCT(luma_input);
    Image in_uv   = CONVERT_TO_IMAGE_STRUCT(uv_input);
    Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgba_output);

    // handle 8 pixels every time, two lines, each line for 4 pixels
    uchar4 luma_0 = vload4(0, in_luma.ptr);
    uchar4 luma_1 = vload4(0, in_luma.ptr + luma_input_stride_y);
    uchar4 cbcr   = vload4(0, in_uv.ptr);

    // Even bytes are read as Cr and odd bytes as Cb; each chroma sample is shared by two
    // horizontally adjacent pixels. The 128 bias is removed in float so the result does not
    // depend on signed 8-bit wrap-around.
    float4 cr = convert_float4(cbcr.s0022) - (float4)(128.0f);
    float4 cb = convert_float4(cbcr.s1133) - (float4)(128.0f);

    // Chroma contribution to R, G and B; it is the same for both rows of the tile.
    float4 temp0 = (float4)(1.5748f) * cr;
    float4 temp1 = -(float4)(0.1873f) * cb - (float4)(0.4681f) * cr;
    float4 temp2 = (float4)(1.8556f) * cb;

    // First row
    float4 f_r = convert_float4(luma_0) + temp0;
    float4 f_g = convert_float4(luma_0) + temp1;
    float4 f_b = convert_float4(luma_0) + temp2;

    // luma + chroma can fall outside [0, 255]; the _sat variant clamps instead of leaving the
    // out-of-range conversion result unspecified.
    uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
    uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
    uchar4 b_0 = convert_uchar4_sat_rtz(f_b);

    // 4 pixels * 4 channels = 16 bytes, written as two 8-byte stores; alpha is fixed at 255.
    uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
    uchar8 rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
    vstore8(rgb_0, 0, out_rgb.ptr);
    vstore8(rgb_1, 0, out_rgb.ptr + 8);

    // Second row
    f_r = convert_float4(luma_1) + temp0;
    f_g = convert_float4(luma_1) + temp1;
    f_b = convert_float4(luma_1) + temp2;

    r_0 = convert_uchar4_sat_rtz(f_r);
    g_0 = convert_uchar4_sat_rtz(f_g);
    b_0 = convert_uchar4_sat_rtz(f_b);

    rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
    rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
    vstore8(rgb_0, 0, out_rgb.ptr + rgba_output_stride_y);
    vstore8(rgb_1, 0, out_rgb.ptr + rgba_output_stride_y + 8);
}
1182
1183/** Convert an NV21 image to YUV444
1184 *
1185 * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
1186 * No offset.
1187 *
1188 * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
1189 * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
1190 * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
1191 * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
1192 * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
1193 * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
1194 * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
1195 * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
1196 * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
1197 * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
1198 * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
1199 * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
1200 * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
1201 * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
1202 * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
1203 * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
1204 * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
1205 * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
1206 * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
1207 * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
1208 * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
1209 * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
1210 * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
1211 * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
1212 * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
1213 * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
1214 * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
1215 * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
1216 * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
1217 * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
1218 */
__kernel void NV21_to_YUV444_bt709(
    IMAGE_DECLARATION(luma_input),
    IMAGE_DECLARATION(uv_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(u_output),
    IMAGE_DECLARATION(v_output))
{
    Image src_luma = CONVERT_TO_IMAGE_STRUCT(luma_input);
    Image src_vu   = CONVERT_TO_IMAGE_STRUCT(uv_input);
    Image dst_luma = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_u    = CONVERT_TO_IMAGE_STRUCT(u_output);
    Image dst_v    = CONVERT_TO_IMAGE_STRUCT(v_output);

    // One work-item covers a 16x2 tile: two luma rows plus the 16 interleaved chroma bytes they share.
    const uchar16 row0 = vload16(0, src_luma.ptr);
    const uchar16 row1 = vload16(0, src_luma.ptr + luma_input_stride_y);
    const uchar16 vu   = vload16(0, src_vu.ptr);

    // Even bytes are taken as Cr and odd bytes as Cb; every sample is repeated for two adjacent pixels.
    const uchar16 v_row = vu.s0022446688aaccee;
    const uchar16 u_row = vu.s1133557799bbddff;

    // Luma is copied unchanged.
    vstore16(row0, 0, dst_luma.ptr);
    vstore16(row1, 0, dst_luma.ptr + luma_output_stride_y);

    // The same expanded chroma row is written to both output rows.
    vstore16(u_row, 0, dst_u.ptr);
    vstore16(u_row, 0, dst_u.ptr + u_output_stride_y);
    vstore16(v_row, 0, dst_v.ptr);
    vstore16(v_row, 0, dst_v.ptr + v_output_stride_y);
}
1248
1249/** Convert an NV21 image to IYUV
1250 *
1251 * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
1252 * No offset.
1253 *
1254 * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
1255 * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
1256 * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
1257 * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
1258 * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
1259 * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
1260 * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
1261 * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
1262 * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
1263 * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
1264 * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
1265 * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
1266 * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
1267 * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
1268 * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
1269 * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
1270 * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
1271 * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
1272 * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
1273 * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
1274 * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
1275 * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
1276 * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
1277 * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
1278 * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
1279 * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
1280 * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
1281 * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
1282 * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
1283 * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
1284 */
__kernel void NV21_to_IYUV_bt709(
    IMAGE_DECLARATION(luma_input),
    IMAGE_DECLARATION(uv_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(u_output),
    IMAGE_DECLARATION(v_output))
{
    Image src_luma = CONVERT_TO_IMAGE_STRUCT(luma_input);
    Image src_vu   = CONVERT_TO_IMAGE_STRUCT(uv_input);
    Image dst_luma = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_u    = CONVERT_TO_IMAGE_STRUCT(u_output);
    Image dst_v    = CONVERT_TO_IMAGE_STRUCT(v_output);

    // Two rows of 16 luma samples and one row of 16 interleaved chroma bytes per work-item.
    const uchar16 row0 = vload16(0, src_luma.ptr);
    const uchar16 row1 = vload16(0, src_luma.ptr + luma_input_stride_y);
    const uchar16 vu   = vload16(0, src_vu.ptr);

    // Luma is copied unchanged.
    vstore16(row0, 0, dst_luma.ptr);
    vstore16(row1, 0, dst_luma.ptr + luma_output_stride_y);

    // De-interleave chroma: odd bytes go to the U plane, even bytes to the V plane.
    vstore8(vu.odd, 0, dst_u.ptr);
    vstore8(vu.even, 0, dst_v.ptr);
}
1309
1310/** Convert a UYVY image to IYUV using BT709 color space
1311 *
1312 * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
1313 * No offset.
1314 *
1315 * @param[in] uyvy_input_ptr Pointer to the source image. Supported Format: U8
1316 * @param[in] uyvy_input_stride_x Stride of the source image in X dimension (in bytes)
1317 * @param[in] uyvy_input_step_x uyvy_input_stride_x * number of elements along X processed per workitem(in bytes)
1318 * @param[in] uyvy_input_stride_y Stride of the source image in Y dimension (in bytes)
1319 * @param[in] uyvy_input_step_y uyvy_input_stride_y * number of elements along Y processed per workitem(in bytes)
1320 * @param[in] uyvy_input_offset_first_element_in_bytes The offset of the first element in the source image
1321 * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
1322 * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
1323 * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
1324 * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
1325 * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
1326 * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
1327 * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
1328 * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
1329 * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
1330 * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
1331 * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
1332 * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
1333 * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
1334 * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
1335 * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
1336 * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
1337 * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
1338 * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
1339 *
1340 */
__kernel void UYVY422_to_IYUV_bt709(
    IMAGE_DECLARATION(uyvy_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(u_output),
    IMAGE_DECLARATION(v_output))
{
    Image src      = CONVERT_TO_IMAGE_STRUCT(uyvy_input);
    Image dst_luma = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_u    = CONVERT_TO_IMAGE_STRUCT(u_output);
    Image dst_v    = CONVERT_TO_IMAGE_STRUCT(v_output);

    // Each work-item reads 16 bytes (8 pixels) from each of two consecutive rows.
    // Byte layout per 4 bytes: Cb, Y, Cr, Y -> luma sits on the odd bytes.
    const uchar16 row0 = vload16(0, src.ptr);
    vstore8(row0.odd, 0, dst_luma.ptr);

    const uchar16 row1 = vload16(0, src.ptr + uyvy_input_stride_y);
    vstore8(row1.odd, 0, dst_luma.ptr + luma_output_stride_y);

    // Sum the vertically adjacent chroma samples in 16 bit so the addition cannot overflow.
    const ushort4 cb_sum = convert_ushort4(row0.s048c) + convert_ushort4(row1.s048c);
    const ushort4 cr_sum = convert_ushort4(row0.s26ae) + convert_ushort4(row1.s26ae);

    // Truncating average of the two rows, narrowed back to 8 bit.
    vstore4(convert_uchar4(cb_sum >> (ushort4)(1)), 0, dst_u.ptr);
    vstore4(convert_uchar4(cr_sum >> (ushort4)(1)), 0, dst_v.ptr);
}
1370
1371/** Convert a YUYV image to IYUV using BT709 color space
1372 *
1373 * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
1374 * No offset.
1375 *
1376 * @param[in] yuyv_input_ptr Pointer to the source image. Supported Format: U8
1377 * @param[in] yuyv_input_stride_x Stride of the source image in X dimension (in bytes)
1378 * @param[in] yuyv_input_step_x yuyv_input_stride_x * number of elements along X processed per workitem(in bytes)
1379 * @param[in] yuyv_input_stride_y Stride of the source image in Y dimension (in bytes)
1380 * @param[in] yuyv_input_step_y yuyv_input_stride_y * number of elements along Y processed per workitem(in bytes)
1381 * @param[in] yuyv_input_offset_first_element_in_bytes The offset of the first element in the source image
1382 * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
1383 * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
1384 * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
1385 * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
1386 * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
1387 * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
1388 * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
1389 * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
1390 * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
1391 * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
1392 * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
1393 * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
1394 * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
1395 * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
1396 * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
1397 * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
1398 * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
1399 * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
1400 *
1401 */
__kernel void YUYV422_to_IYUV_bt709(
    IMAGE_DECLARATION(yuyv_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(u_output),
    IMAGE_DECLARATION(v_output))
{
    Image src      = CONVERT_TO_IMAGE_STRUCT(yuyv_input);
    Image dst_luma = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_u    = CONVERT_TO_IMAGE_STRUCT(u_output);
    Image dst_v    = CONVERT_TO_IMAGE_STRUCT(v_output);

    // Each work-item reads 16 bytes (8 pixels) from each of two consecutive rows.
    // Byte layout per 4 bytes: Y, Cb, Y, Cr -> luma sits on the even bytes.
    const uchar16 row0 = vload16(0, src.ptr);
    vstore8(row0.even, 0, dst_luma.ptr);

    const uchar16 row1 = vload16(0, src.ptr + yuyv_input_stride_y);
    vstore8(row1.even, 0, dst_luma.ptr + luma_output_stride_y);

    // Sum the vertically adjacent chroma samples in 16 bit so the addition cannot overflow.
    const ushort4 cb_sum = convert_ushort4(row0.s159d) + convert_ushort4(row1.s159d);
    const ushort4 cr_sum = convert_ushort4(row0.s37bf) + convert_ushort4(row1.s37bf);

    // Truncating average of the two rows, narrowed back to 8 bit.
    vstore4(convert_uchar4(cb_sum >> (ushort4)(1)), 0, dst_u.ptr);
    vstore4(convert_uchar4(cr_sum >> (ushort4)(1)), 0, dst_v.ptr);
}
1431
1432/** Convert an IYUV image to RGB888
1433 *
1434 * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
1435 * No offset.
1436 *
1437 * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
1438 * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
1439 * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
1440 * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
1441 * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
1442 * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
1443 * @param[in] u_input_ptr Pointer to the source U channel. Supported Format: U8
1444 * @param[in] u_input_stride_x Stride of the source image U channel in X dimension (in bytes)
1445 * @param[in] u_input_step_x u_input_stride_x * number of elements along X processed per workitem(in bytes)
1446 * @param[in] u_input_stride_y Stride of the source image in Y dimension (in bytes)
1447 * @param[in] u_input_step_y u_input_stride_y * number of elements along Y processed per workitem(in bytes)
1448 * @param[in] u_input_offset_first_element_in_bytes The offset of the first element in the source U channel
1449 * @param[in] v_input_ptr Pointer to the source V channel. Supported Format: U8
1450 * @param[in] v_input_stride_x Stride of the source image V channel in X dimension (in bytes)
1451 * @param[in] v_input_step_x v_input_stride_x * number of elements along X processed per workitem(in bytes)
1452 * @param[in] v_input_stride_y Stride of the source image V channel in Y dimension (in bytes)
1453 * @param[in] v_input_step_y v_input_stride_y * number of elements along Y processed per workitem(in bytes)
1454 * @param[in] v_input_offset_first_element_in_bytes The offset of the first element in the source image V channel
1455 * @param[out] rgb_output_ptr Pointer to the destination image. Supported Format: U8
1456 * @param[in] rgb_output_stride_x Stride of the destination image in X dimension (in bytes)
1457 * @param[in] rgb_output_step_x rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
1458 * @param[in] rgb_output_stride_y Stride of the destination image in Y dimension (in bytes)
1459 * @param[in] rgb_output_step_y rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
1460 * @param[in] rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
1461 */
__kernel void IYUV_to_RGB888_bt709(
    IMAGE_DECLARATION(luma_input),
    IMAGE_DECLARATION(u_input),
    IMAGE_DECLARATION(v_input),
    IMAGE_DECLARATION(rgb_output))
{
    Image in_y    = CONVERT_TO_IMAGE_STRUCT(luma_input);
    Image in_u    = CONVERT_TO_IMAGE_STRUCT(u_input);
    Image in_v    = CONVERT_TO_IMAGE_STRUCT(v_input);
    Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);

    // handle 8 pixels every time, two lines, each line for 4 pixels
    uchar4 luma_0 = vload4(0, in_y.ptr);
    uchar4 luma_1 = vload4(0, in_y.ptr + luma_input_stride_y);
    // Two Cb samples followed by two Cr samples, gathered from the separate U and V planes.
    uchar4 cbcr = (uchar4)(vload2(0, in_u.ptr), vload2(0, in_v.ptr));

    // Each chroma sample is shared by two horizontally adjacent pixels. The 128 bias is removed
    // in float so the result does not depend on signed 8-bit wrap-around.
    float4 cb = convert_float4(cbcr.s0011) - (float4)(128.0f);
    float4 cr = convert_float4(cbcr.s2233) - (float4)(128.0f);

    // Chroma contribution to R, G and B; it is the same for both rows of the tile.
    float4 temp0 = (float4)(1.5748f) * cr;
    float4 temp1 = -(float4)(0.1873f) * cb - (float4)(0.4681f) * cr;
    float4 temp2 = (float4)(1.8556f) * cb;

    // First row
    float4 f_r = convert_float4(luma_0) + temp0;
    float4 f_g = convert_float4(luma_0) + temp1;
    float4 f_b = convert_float4(luma_0) + temp2;

    // luma + chroma can fall outside [0, 255]; the _sat variant clamps instead of leaving the
    // out-of-range conversion result unspecified.
    uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
    uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
    uchar4 b_0 = convert_uchar4_sat_rtz(f_b);

    // 4 pixels * 3 channels = 12 bytes, written as an 8-byte store followed by a 4-byte store.
    uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
    uchar4 rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
    vstore8(rgb_0, 0, out_rgb.ptr);
    vstore4(rgb_1, 0, out_rgb.ptr + 8);

    // Second row
    f_r = convert_float4(luma_1) + temp0;
    f_g = convert_float4(luma_1) + temp1;
    f_b = convert_float4(luma_1) + temp2;

    r_0 = convert_uchar4_sat_rtz(f_r);
    g_0 = convert_uchar4_sat_rtz(f_g);
    b_0 = convert_uchar4_sat_rtz(f_b);

    rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
    rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
    vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y);
    vstore4(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8);
}
1510
/** Convert an IYUV image to RGBA8888
1512 *
1513 * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
1514 * No offset.
1515 *
1516 * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
1517 * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
1518 * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
1519 * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
1520 * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
1521 * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
1522 * @param[in] u_input_ptr Pointer to the source U channel. Supported Format: U8
1523 * @param[in] u_input_stride_x Stride of the source image U channel in X dimension (in bytes)
1524 * @param[in] u_input_step_x u_input_stride_x * number of elements along X processed per workitem(in bytes)
1525 * @param[in] u_input_stride_y Stride of the source image in Y dimension (in bytes)
1526 * @param[in] u_input_step_y u_input_stride_y * number of elements along Y processed per workitem(in bytes)
1527 * @param[in] u_input_offset_first_element_in_bytes The offset of the first element in the source U channel
1528 * @param[in] v_input_ptr Pointer to the source V channel. Supported Format: U8
1529 * @param[in] v_input_stride_x Stride of the source image V channel in X dimension (in bytes)
1530 * @param[in] v_input_step_x v_input_stride_x * number of elements along X processed per workitem(in bytes)
1531 * @param[in] v_input_stride_y Stride of the source image V channel in Y dimension (in bytes)
1532 * @param[in] v_input_step_y v_input_stride_y * number of elements along Y processed per workitem(in bytes)
1533 * @param[in] v_input_offset_first_element_in_bytes The offset of the first element in the source image V channel
1534 * @param[out] rgba_output_ptr Pointer to the destination image. Supported Format: U8
1535 * @param[in] rgba_output_stride_x Stride of the destination image in X dimension (in bytes)
1536 * @param[in] rgba_output_step_x rgba_output_stride_x * number of elements along X processed per workitem(in bytes)
1537 * @param[in] rgba_output_stride_y Stride of the destination image in Y dimension (in bytes)
1538 * @param[in] rgba_output_step_y rgba_output_stride_y * number of elements along Y processed per workitem(in bytes)
1539 * @param[in] rgba_output_offset_first_element_in_bytes The offset of the first element in the destination image
1540 */
__kernel void IYUV_to_RGBA8888_bt709(
    IMAGE_DECLARATION(luma_input),
    IMAGE_DECLARATION(u_input),
    IMAGE_DECLARATION(v_input),
    IMAGE_DECLARATION(rgba_output))
{
    Image in_y    = CONVERT_TO_IMAGE_STRUCT(luma_input);
    Image in_u    = CONVERT_TO_IMAGE_STRUCT(u_input);
    Image in_v    = CONVERT_TO_IMAGE_STRUCT(v_input);
    Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgba_output);

    // Each work-item converts 8 pixels: 4 pixels on each of two consecutive lines.
    // Both lines share the same two U and two V samples.
    uchar4 luma_0 = vload4(0, in_y.ptr);
    uchar4 luma_1 = vload4(0, in_y.ptr + luma_input_stride_y);
    uchar4 cbcr   = (uchar4)(vload2(0, in_u.ptr), vload2(0, in_v.ptr));

    // Duplicate every chroma sample horizontally and centre it around zero.
    // The subtraction is done in float so that the result does not depend on
    // 8-bit signed wrap-around (the unsigned samples do not fit in a char).
    float4 cb = convert_float4(cbcr.s0011) - (float4)(128.0f);
    float4 cr = convert_float4(cbcr.s2233) - (float4)(128.0f);

    // Chroma contribution to R, G and B; identical for both lines.
    float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * cb + (float4)(1.5748f) * cr;
    float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * cb - (float4)(0.4681f) * cr;
    float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * cb + (float4)(0.0000f) * cr;

    // First line
    float4 f_r = convert_float4(luma_0) + temp0;
    float4 f_g = convert_float4(luma_0) + temp1;
    float4 f_b = convert_float4(luma_0) + temp2;

    // luma + chroma can leave [0, 255]; a non-saturating float -> uchar conversion
    // of an out-of-range value is implementation-defined, so clamp explicitly.
    uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
    uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
    uchar4 b_0 = convert_uchar4_sat_rtz(f_b);

    // Pack as R, G, B, A with a constant alpha of 255; 2 pixels per 8-byte store.
    uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
    uchar8 rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
    vstore8(rgb_0, 0, out_rgb.ptr);
    vstore8(rgb_1, 0, out_rgb.ptr + 8);

    // Second line
    f_r = convert_float4(luma_1) + temp0;
    f_g = convert_float4(luma_1) + temp1;
    f_b = convert_float4(luma_1) + temp2;

    r_0 = convert_uchar4_sat_rtz(f_r);
    g_0 = convert_uchar4_sat_rtz(f_g);
    b_0 = convert_uchar4_sat_rtz(f_b);

    rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
    rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
    vstore8(rgb_0, 0, out_rgb.ptr + rgba_output_stride_y);
    vstore8(rgb_1, 0, out_rgb.ptr + rgba_output_stride_y + 8);
}
1589
/** Convert an IYUV image to YUV444
 *
 * Each work-item copies two lines of 16 luma samples unchanged and expands 8 U and 8 V
 * samples to 16 by horizontal duplication, writing the expanded chroma to two output lines.
 *
 * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
 * No offset.
 *
 * @param[in]  luma_input_ptr                            Pointer to the source luma channel. Supported Format: U8
 * @param[in]  luma_input_stride_x                       Stride of the source luma channel in X dimension (in bytes)
 * @param[in]  luma_input_step_x                         luma_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_input_stride_y                       Stride of the source luma channel in Y dimension (in bytes)
 * @param[in]  luma_input_step_y                         luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_input_offset_first_element_in_bytes  The offset of the first element in the source luma channel
 * @param[in]  u_input_ptr                               Pointer to the source U channel. Supported Format: U8
 * @param[in]  u_input_stride_x                          Stride of the source U channel in X dimension (in bytes)
 * @param[in]  u_input_step_x                            u_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  u_input_stride_y                          Stride of the source U channel in Y dimension (in bytes)
 * @param[in]  u_input_step_y                            u_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  u_input_offset_first_element_in_bytes     The offset of the first element in the source U channel
 * @param[in]  v_input_ptr                               Pointer to the source V channel. Supported Format: U8
 * @param[in]  v_input_stride_x                          Stride of the source V channel in X dimension (in bytes)
 * @param[in]  v_input_step_x                            v_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  v_input_stride_y                          Stride of the source V channel in Y dimension (in bytes)
 * @param[in]  v_input_step_y                            v_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  v_input_offset_first_element_in_bytes     The offset of the first element in the source V channel
 * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
 * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
 * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_output_stride_y                      Stride of the destination luma channel in Y dimension (in bytes)
 * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
 * @param[out] u_output_ptr                              Pointer to the destination U channel. Supported Format: U8
 * @param[in]  u_output_stride_x                         Stride of the destination U channel in X dimension (in bytes)
 * @param[in]  u_output_step_x                           u_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  u_output_stride_y                         Stride of the destination U channel in Y dimension (in bytes)
 * @param[in]  u_output_step_y                           u_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  u_output_offset_first_element_in_bytes    The offset of the first element in the destination U channel
 * @param[out] v_output_ptr                              Pointer to the destination V channel. Supported Format: U8
 * @param[in]  v_output_stride_x                         Stride of the destination V channel in X dimension (in bytes)
 * @param[in]  v_output_step_x                           v_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  v_output_stride_y                         Stride of the destination V channel in Y dimension (in bytes)
 * @param[in]  v_output_step_y                           v_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  v_output_offset_first_element_in_bytes    The offset of the first element in the destination V channel
 *
 */
__kernel void IYUV_to_YUV444_bt709(
    IMAGE_DECLARATION(luma_input),
    IMAGE_DECLARATION(u_input),
    IMAGE_DECLARATION(v_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(u_output),
    IMAGE_DECLARATION(v_output))
{
    Image src_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
    Image src_u = CONVERT_TO_IMAGE_STRUCT(u_input);
    Image src_v = CONVERT_TO_IMAGE_STRUCT(v_input);
    Image dst_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_u = CONVERT_TO_IMAGE_STRUCT(u_output);
    Image dst_v = CONVERT_TO_IMAGE_STRUCT(v_output);

    // 32 pixels per work-item: 16 pixels on each of two consecutive lines.
    const uchar16 y_top    = vload16(0, src_y.ptr);
    const uchar16 y_bottom = vload16(0, src_y.ptr + luma_input_stride_y);
    const uchar8  u_half   = vload8(0, src_u.ptr);
    const uchar8  v_half   = vload8(0, src_v.ptr);

    // Widen chroma from 8 to 16 samples by repeating every sample once.
    const uchar16 u_full = u_half.s0011223344556677;
    const uchar16 v_full = v_half.s0011223344556677;

    // Luma is copied as is; the widened chroma line is written to both output lines.
    vstore16(y_top, 0, dst_y.ptr);
    vstore16(y_bottom, 0, dst_y.ptr + luma_output_stride_y);
    vstore16(u_full, 0, dst_u.ptr);
    vstore16(u_full, 0, dst_u.ptr + u_output_stride_y);
    vstore16(v_full, 0, dst_v.ptr);
    vstore16(v_full, 0, dst_v.ptr + v_output_stride_y);
}
1665
/** Convert an IYUV image to NV12
 *
 * Each work-item copies two lines of 16 luma samples unchanged and interleaves 8 U and 8 V
 * samples into a single line of 16 bytes (U0, V0, U1, V1, ...).
 *
 * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
 * No offset.
 *
 * @param[in]  luma_input_ptr                            Pointer to the source luma channel. Supported Format: U8
 * @param[in]  luma_input_stride_x                       Stride of the source luma channel in X dimension (in bytes)
 * @param[in]  luma_input_step_x                         luma_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_input_stride_y                       Stride of the source luma channel in Y dimension (in bytes)
 * @param[in]  luma_input_step_y                         luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_input_offset_first_element_in_bytes  The offset of the first element in the source luma channel
 * @param[in]  u_input_ptr                               Pointer to the source U channel. Supported Format: U8
 * @param[in]  u_input_stride_x                          Stride of the source U channel in X dimension (in bytes)
 * @param[in]  u_input_step_x                            u_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  u_input_stride_y                          Stride of the source U channel in Y dimension (in bytes)
 * @param[in]  u_input_step_y                            u_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  u_input_offset_first_element_in_bytes     The offset of the first element in the source U channel
 * @param[in]  v_input_ptr                               Pointer to the source V channel. Supported Format: U8
 * @param[in]  v_input_stride_x                          Stride of the source V channel in X dimension (in bytes)
 * @param[in]  v_input_step_x                            v_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  v_input_stride_y                          Stride of the source V channel in Y dimension (in bytes)
 * @param[in]  v_input_step_y                            v_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  v_input_offset_first_element_in_bytes     The offset of the first element in the source V channel
 * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
 * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
 * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_output_stride_y                      Stride of the destination luma channel in Y dimension (in bytes)
 * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
 * @param[out] uv_output_ptr                             Pointer to the destination UV channel. Supported Format: U8
 * @param[in]  uv_output_stride_x                        Stride of the destination UV channel in X dimension (in bytes)
 * @param[in]  uv_output_step_x                          uv_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  uv_output_stride_y                        Stride of the destination UV channel in Y dimension (in bytes)
 * @param[in]  uv_output_step_y                          uv_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  uv_output_offset_first_element_in_bytes   The offset of the first element in the destination UV channel
 *
 */
__kernel void IYUV_to_NV12_bt709(
    IMAGE_DECLARATION(luma_input),
    IMAGE_DECLARATION(u_input),
    IMAGE_DECLARATION(v_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(uv_output))
{
    Image src_y  = CONVERT_TO_IMAGE_STRUCT(luma_input);
    Image src_u  = CONVERT_TO_IMAGE_STRUCT(u_input);
    Image src_v  = CONVERT_TO_IMAGE_STRUCT(v_input);
    Image dst_y  = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_uv = CONVERT_TO_IMAGE_STRUCT(uv_output);

    // 32 pixels per work-item: 16 pixels on each of two consecutive lines.
    const uchar16 y_top    = vload16(0, src_y.ptr);
    const uchar16 y_bottom = vload16(0, src_y.ptr + luma_input_stride_y);
    const uchar8  u_row    = vload8(0, src_u.ptr);
    const uchar8  v_row    = vload8(0, src_v.ptr);

    // Place U in lanes 0-7 and V in lanes 8-f, then shuffle into U0 V0 U1 V1 ... U7 V7.
    const uchar16 planar      = (uchar16)(u_row, v_row);
    const uchar16 interleaved = planar.s08192a3b4c5d6e7f;

    vstore16(y_top, 0, dst_y.ptr);
    vstore16(y_bottom, 0, dst_y.ptr + luma_output_stride_y);
    vstore16(interleaved, 0, dst_uv.ptr);
}
1728
/** Convert a YUYV image to NV12 using BT709 color space
 *
 * Each work-item reads 16 bytes (8 pixels) from each of two consecutive lines. The even bytes
 * (luma) of each line are written to the luma plane; the odd bytes (interleaved chroma) of the
 * two lines are averaged, rounding down, and written as one line of the UV plane.
 *
 * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
 * No offset.
 *
 * @param[in]  yuyv_input_ptr                            Pointer to the source image. Supported Format: U8
 * @param[in]  yuyv_input_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  yuyv_input_step_x                         yuyv_input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  yuyv_input_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  yuyv_input_step_y                         yuyv_input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  yuyv_input_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] luma_output_ptr                           Pointer to the destination luma channel. Supported Format: U8
 * @param[in]  luma_output_stride_x                      Stride of the destination luma channel in X dimension (in bytes)
 * @param[in]  luma_output_step_x                        luma_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_output_stride_y                      Stride of the destination luma channel in Y dimension (in bytes)
 * @param[in]  luma_output_step_y                        luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
 * @param[out] uv_output_ptr                             Pointer to the destination UV channel. Supported Format: U8
 * @param[in]  uv_output_stride_x                        Stride of the destination UV channel in X dimension (in bytes)
 * @param[in]  uv_output_step_x                          uv_output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  uv_output_stride_y                        Stride of the destination UV channel in Y dimension (in bytes)
 * @param[in]  uv_output_step_y                          uv_output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  uv_output_offset_first_element_in_bytes   The offset of the first element in the destination UV channel
 *
 */
__kernel void YUYV422_to_NV12_bt709(
    IMAGE_DECLARATION(yuyv_input),
    IMAGE_DECLARATION(luma_output),
    IMAGE_DECLARATION(uv_output))
{
    Image src    = CONVERT_TO_IMAGE_STRUCT(yuyv_input);
    Image dst_y  = CONVERT_TO_IMAGE_STRUCT(luma_output);
    Image dst_uv = CONVERT_TO_IMAGE_STRUCT(uv_output);

    // 16 pixels per work-item: 8 pixels on each of two consecutive lines.
    // Top line: even bytes are luma, odd bytes are chroma.
    const uchar16 row_top = vload16(0, src.ptr);
    vstore8(row_top.s02468ace, 0, dst_y.ptr);

    // Bottom line
    const uchar16 row_bottom = vload16(0, src.ptr + yuyv_input_stride_y);
    vstore8(row_bottom.s02468ace, 0, dst_y.ptr + luma_output_stride_y);

    // Widen chroma to 16 bit so that the sum of the two lines cannot overflow,
    // then average the two lines (rounding down) and narrow back to 8 bit.
    const ushort8 chroma_top    = convert_ushort8(row_top.s13579bdf);
    const ushort8 chroma_bottom = convert_ushort8(row_bottom.s13579bdf);
    const uchar8  chroma_avg    = convert_uchar8((chroma_top + chroma_bottom) / (ushort8)(2));
    vstore8(chroma_avg, 0, dst_uv.ptr);
}
1777
/** Convert a UYVY image to NV12 using BT709 color space
 *
 * Each work-item reads 16 bytes (8 pixels) from each of two consecutive lines. The odd bytes
 * (luma) of each line are written to the luma plane; the even bytes (interleaved chroma) of the
 * two lines are averaged, rounding down, and written as one line of the UV plane.
 *
 * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
 * No offset.
 *
 * @param[in]  input_uyvy_ptr                           Pointer to the source image. Supported Format: U8
 * @param[in]  input_uyvy_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in]  input_uyvy_step_x                        input_uyvy_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_uyvy_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in]  input_uyvy_step_y                        input_uyvy_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_uyvy_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[out] luma_ptr                                 Pointer to the destination luma channel. Supported Format: U8
 * @param[in]  luma_stride_x                            Stride of the destination luma channel in X dimension (in bytes)
 * @param[in]  luma_step_x                              luma_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  luma_stride_y                            Stride of the destination luma channel in Y dimension (in bytes)
 * @param[in]  luma_step_y                              luma_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  luma_offset_first_element_in_bytes       The offset of the first element in the destination luma channel
 * @param[out] uv_ptr                                   Pointer to the destination UV channel. Supported Format: U8
 * @param[in]  uv_stride_x                              Stride of the destination UV channel in X dimension (in bytes)
 * @param[in]  uv_step_x                                uv_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  uv_stride_y                              Stride of the destination UV channel in Y dimension (in bytes)
 * @param[in]  uv_step_y                                uv_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  uv_offset_first_element_in_bytes         The offset of the first element in the destination UV channel
 *
 */
__kernel void UYVY422_to_NV12_bt709(
    IMAGE_DECLARATION(input_uyvy),
    IMAGE_DECLARATION(luma),
    IMAGE_DECLARATION(uv))
{
    Image src    = CONVERT_TO_IMAGE_STRUCT(input_uyvy);
    Image dst_y  = CONVERT_TO_IMAGE_STRUCT(luma);
    Image dst_uv = CONVERT_TO_IMAGE_STRUCT(uv);

    // 16 pixels per work-item: 8 pixels on each of two consecutive lines.
    // Top line: odd bytes are luma, even bytes are chroma.
    const uchar16 row_top    = vload16(0, src.ptr);
    const uchar8  luma_top   = row_top.s13579bdf;
    const uchar8  chroma_top = row_top.s02468ace;
    vstore8(luma_top, 0, dst_y.ptr);

    // Bottom line
    const uchar16 row_bottom    = vload16(0, src.ptr + input_uyvy_stride_y);
    const uchar8  luma_bottom   = row_bottom.s13579bdf;
    const uchar8  chroma_bottom = row_bottom.s02468ace;
    vstore8(luma_bottom, 0, dst_y.ptr + luma_stride_y);

    // hadd returns (x + y) >> 1 without overflowing the intermediate sum, i.e. the
    // average of the two lines rounded down.
    const uchar8 chroma_avg = hadd(chroma_top, chroma_bottom);
    vstore8(chroma_avg, 0, dst_uv.ptr);
}