Blame - arm_compute/core/NEON/NEColorConvertHelper.inl - ml/ComputeLibrary

blob: 9a9caefaab2a8410ff39e9d70a8ddf5d3f1763c4 [file] [log] [blame]

Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1	/*
				2	* Copyright (c) 2016, 2017 ARM Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "arm_compute/core/Error.h"
				25	#include "arm_compute/core/Helpers.h"
				26	#include "arm_compute/core/IMultiImage.h"
				27	#include "arm_compute/core/Utils.h"
				28
				29	#include <arm_neon.h>
				30
				31	namespace
				32	{
				// BT.709 YUV -> RGB coefficients, applied to the chroma samples after the
				// 128 bias has been removed (see yuyv_to_rgb_calculation):
				//   red   = 2 * (1 - K_r)                  (multiplies V)
				//   green = -K_b * 2 * (1 - K_b) / K_g     (multiplies U)
				//   green2= -K_r * 2 * (1 - K_r) / K_g     (multiplies V)
				//   blue  = 2 * (1 - K_b)                  (multiplies U)
				constexpr float red_coef_bt709    = 1.5748f;
				constexpr float green_coef_bt709  = -0.1873f;
				constexpr float green_coef2_bt709 = -0.4681f;
				constexpr float blue_coef_bt709   = 1.8556f;

				// BT.709 RGB -> YUV luma weights
				constexpr float rgb2yuv_bt709_kr = 0.2126f;
				constexpr float rgb2yuv_bt709_kb = 0.0722f;
				// K_g = 1 - K_r - K_b
				constexpr float rgb2yuv_bt709_kg = 0.7152f;
				// C_u = 1 / (2 * (1 - K_b))
				constexpr float rgb2yuv_bt709_cu = 0.5389f;
				// C_v = 1 / (2 * (1 - K_r))
				constexpr float rgb2yuv_bt709_cv = 0.6350f;
				46
				47	inline void convert_uint8x16_to_float32x4x4(const uint8x16_t &in, float32x4x4_t &out)
				48	{
				49	const auto tmp1 = vmovl_u8(vget_low_u8(in));
				50	out.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(tmp1)));
				51	out.val[1] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(tmp1)));
				52	const auto tmp2 = vmovl_u8(vget_high_u8(in));
				53	out.val[2] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(tmp2)));
				54	out.val[3] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(tmp2)));
				55	}
				56
				57	inline void convert_float32x4x3_to_uint8x8x3(const float32x4x3_t &in1, const float32x4x3_t &in2, uint8x8x3_t &out)
				58	{
				59	out.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[0])),
				60	vqmovn_u32(vcvtq_u32_f32(in2.val[0]))));
				61	out.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[1])),
				62	vqmovn_u32(vcvtq_u32_f32(in2.val[1]))));
				63	out.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[2])),
				64	vqmovn_u32(vcvtq_u32_f32(in2.val[2]))));
				65	}
				66
				67	inline void convert_float32x4x4_to_unit8x16(const float32x4x4_t &in, uint8x16_t &out)
				68	{
				69	const auto low = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[0])),
				70	vqmovn_u32(vcvtq_u32_f32(in.val[1])));
				71	const auto high = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[2])),
				72	vqmovn_u32(vcvtq_u32_f32(in.val[3])));
				73	out = vcombine_u8(vqmovn_u16(low), vqmovn_u16(high));
				74	}
				75
				76	inline void rgb_to_yuv_calculation(const float32x4_t &rvec, const float32x4_t &gvec, const float32x4_t &bvec,
				77	float32x4_t &yvec, float32x4_t &uvec, float32x4_t &vvec)
				78	{
				79	/*
				80	Y'= 0.2126R' + 0.7152G' + 0.0722*B'
				81	U'=-0.1146R' - 0.3854G' + 0.5000*B'
				82	V'= 0.5000R' - 0.4542G' - 0.0458*B'
				83	*/
				84	const auto c128 = vdupq_n_f32(128.f);
				85
				86	// Y = R * K_r + G * (1 - K_r - K_b) * B * K_b
				87	yvec = vmulq_n_f32(rvec, rgb2yuv_bt709_kr);
				88	yvec = vmlaq_n_f32(yvec, gvec, rgb2yuv_bt709_kg);
				89	yvec = vmlaq_n_f32(yvec, bvec, rgb2yuv_bt709_kb);
				90
				91	// U = (B - Y) / (2 * (1 - K_b))
				92	uvec = vsubq_f32(bvec, yvec);
				93	uvec = vmlaq_n_f32(c128, uvec, rgb2yuv_bt709_cu);
				94
				95	// V = (R - Y) / (2 * (1 - K_r))
				96	vvec = vsubq_f32(rvec, yvec);
				97	vvec = vmlaq_n_f32(c128, vvec, rgb2yuv_bt709_cv);
				98	}
				99
				100	inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, float32x4_t uvec_val, const float32x4_t &yyvec_val,
				101	float32x4_t vvec_val, unsigned char *output_ptr, const bool alpha)
				102	{
				103	float32x4x3_t rgb1, rgb2;
				104
				105	// Compute: cb - 128 and cr - 128;
				106	const auto c128 = vdupq_n_f32(128.f);
				107	uvec_val = vsubq_f32(uvec_val, c128);
				108	vvec_val = vsubq_f32(vvec_val, c128);
				109
				110	// Compute:
				111	// r = 0.0000ff_u + 1.5748ff_v;
				112	// g = 0.1873ff_u - 0.4681ff_v;
				113	// b = 1.8556ff_u + 0.0000ff_v;
				114	const auto red = vmulq_n_f32(vvec_val, red_coef_bt709);
				115	const auto blue = vmulq_n_f32(uvec_val, blue_coef_bt709);
				116	const auto green = vaddq_f32(vmulq_n_f32(uvec_val, green_coef_bt709),
				117	vmulq_n_f32(vvec_val, green_coef2_bt709));
				118
				119	// Compute the final r,g,b values using y1 for the first texel and y2 for the second one.
				120	// the result is stored in two float32x4x3_t which then are converted to one uint8x8x3_t
				121	// and written back to memory using vst3 instruction
				122
				123	rgb1.val[0] = vaddq_f32(yvec_val, red);
				124	rgb1.val[1] = vaddq_f32(yvec_val, green);
				125	rgb1.val[2] = vaddq_f32(yvec_val, blue);
				126
				127	rgb2.val[0] = vaddq_f32(yyvec_val, red);
				128	rgb2.val[1] = vaddq_f32(yyvec_val, green);
				129	rgb2.val[2] = vaddq_f32(yyvec_val, blue);
				130
				131	uint8x8x3_t u8_rgb;
				132	convert_float32x4x3_to_uint8x8x3(rgb1, rgb2, u8_rgb);
				133
				134	if(!alpha)
				135	{
				136	vst3_lane_u8(&output_ptr[0], u8_rgb, 0);
				137	vst3_lane_u8(&output_ptr[3], u8_rgb, 4);
				138	vst3_lane_u8(&output_ptr[6], u8_rgb, 1);
				139	vst3_lane_u8(&output_ptr[9], u8_rgb, 5);
				140	vst3_lane_u8(&output_ptr[12], u8_rgb, 2);
				141	vst3_lane_u8(&output_ptr[15], u8_rgb, 6);
				142	vst3_lane_u8(&output_ptr[18], u8_rgb, 3);
				143	vst3_lane_u8(&output_ptr[21], u8_rgb, 7);
				144	}
				145	else
				146	{
				147	uint8x8x4_t u8_rgba;
				148	u8_rgba.val[0] = u8_rgb.val[0];
				149	u8_rgba.val[1] = u8_rgb.val[1];
				150	u8_rgba.val[2] = u8_rgb.val[2];
				151	u8_rgba.val[3] = vdup_n_u8(255);
				152	vst4_lane_u8(&output_ptr[0], u8_rgba, 0);
				153	vst4_lane_u8(&output_ptr[4], u8_rgba, 4);
				154	vst4_lane_u8(&output_ptr[8], u8_rgba, 1);
				155	vst4_lane_u8(&output_ptr[12], u8_rgba, 5);
				156	vst4_lane_u8(&output_ptr[16], u8_rgba, 2);
				157	vst4_lane_u8(&output_ptr[20], u8_rgba, 6);
				158	vst4_lane_u8(&output_ptr[24], u8_rgba, 3);
				159	vst4_lane_u8(&output_ptr[28], u8_rgba, 7);
				160	}
				161	}
				162
				163	inline uint8x16x3_t load_rgb(const unsigned char *const ptr, const bool alpha)
				164	{
				165	uint8x16x3_t rgb;
				166
				167	if(alpha)
				168	{
				169	const auto tmp = vld4q_u8(ptr);
				170	rgb.val[0] = tmp.val[0];
				171	rgb.val[1] = tmp.val[1];
				172	rgb.val[2] = tmp.val[2];
				173	}
				174	else
				175	{
				176	rgb = vld3q_u8(ptr);
				177	}
				178
				179	return rgb;
				180	}
				181
				182	inline void rgb_to_yuv_conversion(uint8x16x3_t &vec_top, uint8x16x3_t &vec_bottom)
				183	{
				184	// Convert the uint8x16_t to float32x4x4_t
				185	float32x4x4_t frvec_top, fgvec_top, fbvec_top;
				186	convert_uint8x16_to_float32x4x4(vec_top.val[0], frvec_top);
				187	convert_uint8x16_to_float32x4x4(vec_top.val[1], fgvec_top);
				188	convert_uint8x16_to_float32x4x4(vec_top.val[2], fbvec_top);
				189
				190	float32x4x4_t frvec_bottom, fgvec_bottom, fbvec_bottom;
				191	convert_uint8x16_to_float32x4x4(vec_bottom.val[0], frvec_bottom);
				192	convert_uint8x16_to_float32x4x4(vec_bottom.val[1], fgvec_bottom);
				193	convert_uint8x16_to_float32x4x4(vec_bottom.val[2], fbvec_bottom);
				194
				195	float32x4x4_t fyvec_top, fuvec_top, fvvec_top;
				196	float32x4x4_t fyvec_bottom, fuvec_bottom, fvvec_bottom;
				197
				198	for(auto i = 0; i < 4; ++i)
				199	{
				200	rgb_to_yuv_calculation(frvec_top.val[i], fgvec_top.val[i], fbvec_top.val[i],
				201	fyvec_top.val[i], fuvec_top.val[i], fvvec_top.val[i]);
				202	rgb_to_yuv_calculation(frvec_bottom.val[i], fgvec_bottom.val[i], fbvec_bottom.val[i],
				203	fyvec_bottom.val[i], fuvec_bottom.val[i], fvvec_bottom.val[i]);
				204	}
				205
				206	convert_float32x4x4_to_unit8x16(fyvec_top, vec_top.val[0]);
				207	convert_float32x4x4_to_unit8x16(fuvec_top, vec_top.val[1]);
				208	convert_float32x4x4_to_unit8x16(fvvec_top, vec_top.val[2]);
				209	convert_float32x4x4_to_unit8x16(fyvec_bottom, vec_bottom.val[0]);
				210	convert_float32x4x4_to_unit8x16(fuvec_bottom, vec_bottom.val[1]);
				211	convert_float32x4x4_to_unit8x16(fvvec_bottom, vec_bottom.val[2]);
				212	}
				213
				214	inline void store_rgb_to_nv12(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top,
				215	const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom,
				216	unsigned char const __restrict out_y_top, unsigned char const __restrict out_y_bottom,
				217	unsigned char *const __restrict out_uv)
				218	{
				219	uint8x16x3_t vec_top, vec_bottom;
				220	vec_top.val[0] = rvec_top;
				221	vec_top.val[1] = gvec_top;
				222	vec_top.val[2] = bvec_top;
				223	vec_bottom.val[0] = rvec_bottom;
				224	vec_bottom.val[1] = gvec_bottom;
				225	vec_bottom.val[2] = bvec_bottom;
				226
				227	rgb_to_yuv_conversion(vec_top, vec_bottom);
				228
				229	vst1q_u8(out_y_top, vec_top.val[0]);
				230	vst1q_u8(out_y_bottom, vec_bottom.val[0]);
				231
				232	const auto uvec = vuzpq_u8(vec_top.val[1], vec_bottom.val[1]);
				233	const auto vvec = vuzpq_u8(vec_top.val[2], vec_bottom.val[2]);
				234	const auto utmp = vrhaddq_u8(uvec.val[0], uvec.val[1]);
				235	const auto vtmp = vrhaddq_u8(vvec.val[0], vvec.val[1]);
				236
				237	uint8x8x2_t uvvec;
				238	uvvec.val[0] = vhadd_u8(vget_low_u8(utmp), vget_high_u8(utmp));
				239	uvvec.val[1] = vhadd_u8(vget_low_u8(vtmp), vget_high_u8(vtmp));
				240
				241	vst2_u8(out_uv, uvvec);
				242	}
				243
				244	inline void store_rgb_to_iyuv(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top,
				245	const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom,
				246	unsigned char const __restrict out_y_top, unsigned char const __restrict out_y_bottom,
				247	unsigned char *const __restrict out_u,
				248	unsigned char *const __restrict out_v)
				249	{
				250	uint8x16x3_t vec_top, vec_bottom;
				251	vec_top.val[0] = rvec_top;
				252	vec_top.val[1] = gvec_top;
				253	vec_top.val[2] = bvec_top;
				254	vec_bottom.val[0] = rvec_bottom;
				255	vec_bottom.val[1] = gvec_bottom;
				256	vec_bottom.val[2] = bvec_bottom;
				257
				258	rgb_to_yuv_conversion(vec_top, vec_bottom);
				259
				260	vst1q_u8(out_y_top, vec_top.val[0]);
				261	vst1q_u8(out_y_bottom, vec_bottom.val[0]);
				262
				263	const auto uvvec_top = vuzpq_u8(vec_top.val[1], vec_top.val[2]);
				264	const auto uvvec_bottom = vuzpq_u8(vec_bottom.val[1], vec_bottom.val[2]);
				265	const auto uvvec = vhaddq_u8(vrhaddq_u8(uvvec_top.val[0], uvvec_top.val[1]),
				266	vrhaddq_u8(uvvec_bottom.val[0], uvvec_bottom.val[1]));
				267
				268	vst1_u8(out_u, vget_low_u8(uvvec));
				269	vst1_u8(out_v, vget_high_u8(uvvec));
				270	}
				271
				272	inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, const uint8x16_t &bvec,
				273	unsigned char *const __restrict out_y,
				274	unsigned char *const __restrict out_u,
				275	unsigned char *const __restrict out_v)
				276	{
				277	// Convert the uint8x16_t to float32x4x4_t
				278	float32x4x4_t frvec, fgvec, fbvec;
				279	convert_uint8x16_to_float32x4x4(rvec, frvec);
				280	convert_uint8x16_to_float32x4x4(gvec, fgvec);
				281	convert_uint8x16_to_float32x4x4(bvec, fbvec);
				282
				283	float32x4x4_t fyvec, fuvec, fvvec;
				284	for(auto i = 0; i < 4; ++i)
				285	{
				286	rgb_to_yuv_calculation(frvec.val[i], fgvec.val[i], fbvec.val[i],
				287	fyvec.val[i], fuvec.val[i], fvvec.val[i]);
				288	}
				289
				290	uint8x16_t yvec, uvec, vvec;
				291	convert_float32x4x4_to_unit8x16(fyvec, yvec);
				292	convert_float32x4x4_to_unit8x16(fuvec, uvec);
				293	convert_float32x4x4_to_unit8x16(fvvec, vvec);
				294
				295	vst1q_u8(out_y, yvec);
				296	vst1q_u8(out_u, uvec);
				297	vst1q_u8(out_v, vvec);
				298	}
				299	}
				300
				301	namespace arm_compute
				302	{
				303	void colorconvert_rgb_to_rgbx(const void __restrict input, void __restrict output, const Window &win)
				304	{
				305	ARM_COMPUTE_ERROR_ON(nullptr == input);
				306	ARM_COMPUTE_ERROR_ON(nullptr == output);
				307
				308	const auto input_ptr = static_cast<const IImage *__restrict>(input);
				309	const auto output_ptr = static_cast<IImage *__restrict>(output);
				310
				311	Iterator in(input_ptr, win);
				312	Iterator out(output_ptr, win);
				313
				314	execute_window_loop(win, [&](const Coordinates & id)
				315	{
				316	const auto ta1 = vld3q_u8(in.ptr());
				317	uint8x16x4_t ta2;
				318	ta2.val[0] = ta1.val[0];
				319	ta2.val[1] = ta1.val[1];
				320	ta2.val[2] = ta1.val[2];
				321	ta2.val[3] = vdupq_n_u8(255);
				322	vst4q_u8(out.ptr(), ta2);
				323	},
				324	in, out);
				325	}
				326
				327	void colorconvert_rgbx_to_rgb(const void input, void output, const Window &win)
				328	{
				329	ARM_COMPUTE_ERROR_ON(nullptr == input);
				330	ARM_COMPUTE_ERROR_ON(nullptr == output);
				331
				332	const auto input_ptr = static_cast<const IImage *__restrict>(input);
				333	const auto output_ptr = static_cast<IImage *__restrict>(output);
				334
				335	Iterator in(input_ptr, win);
				336	Iterator out(output_ptr, win);
				337
				338	execute_window_loop(win, [&](const Coordinates & id)
				339	{
				340	const auto ta1 = vld4q_u8(in.ptr());
				341	uint8x16x3_t ta2;
				342	ta2.val[0] = ta1.val[0];
				343	ta2.val[1] = ta1.val[1];
				344	ta2.val[2] = ta1.val[2];
				345	vst3q_u8(out.ptr(), ta2);
				346	},
				347	in, out);
				348	}
				349
				350	template <bool yuyv, bool alpha>
				351	void colorconvert_yuyv_to_rgb(const void __restrict input, void __restrict output, const Window &win)
				352	{
				353	ARM_COMPUTE_ERROR_ON(nullptr == input);
				354	ARM_COMPUTE_ERROR_ON(nullptr == output);
				355
				356	const auto input_ptr = static_cast<const IImage *__restrict>(input);
				357	const auto output_ptr = static_cast<IImage *__restrict>(output);
				358
				359	constexpr auto element_size = alpha ? 32 : 24;
				360	constexpr auto shift = yuyv ? 0 : 1;
				361
				362	Iterator in(input_ptr, win);
				363	Iterator out(output_ptr, win);
				364
				365	execute_window_loop(win, [&](const Coordinates & id)
				366	{
				367	float32x4x4_t uvec, yvec, vvec, yyvec;
				368	const auto ta = vld4q_u8(in.ptr());
				369	//ta.val[0] = Y0 Y2 Y4 Y6 ...
				370	//ta.val[1] = U0 U2 U4 U6 ...
				371	//ta.val[2] = Y1 Y3 Y5 Y7 ...
				372	//ta.val[3] = V0 V2 V4 V7 ...
				373
				374	// Convert the uint8x16x4_t to float32x4x4_t
				375	convert_uint8x16_to_float32x4x4(ta.val[0 + shift], yvec);
				376	convert_uint8x16_to_float32x4x4(ta.val[1 - shift], uvec);
				377	convert_uint8x16_to_float32x4x4(ta.val[2 + shift], yyvec);
				378	convert_uint8x16_to_float32x4x4(ta.val[3 - shift], vvec);
				379
				380	yuyv_to_rgb_calculation(yvec.val[0], uvec.val[0], yyvec.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
				381	yuyv_to_rgb_calculation(yvec.val[1], uvec.val[1], yyvec.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
				382	yuyv_to_rgb_calculation(yvec.val[2], uvec.val[2], yyvec.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
				383	yuyv_to_rgb_calculation(yvec.val[3], uvec.val[3], yyvec.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
				384	},
				385	in, out);
				386	}
				387
				388	template <bool uv, bool alpha>
				389	void colorconvert_nv12_to_rgb(const void __restrict input, void __restrict output, const Window &win)
				390	{
				391	ARM_COMPUTE_ERROR_ON(nullptr == input);
				392	ARM_COMPUTE_ERROR_ON(nullptr == output);
				393	win.validate();
				394
				395	const auto input_ptr = static_cast<const IMultiImage *__restrict>(input);
				396	const auto output_ptr = static_cast<IImage *__restrict>(output);
				397
				398	constexpr auto element_size = alpha ? 32 : 24;
				399	const auto out_stride = output_ptr->info()->strides_in_bytes().y();
				400	constexpr auto shift = uv ? 0 : 1;
				401
				402	// UV's width and height are subsampled
				403	Window win_uv(win);
				404	win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win.x().step() / 2));
				405	win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
				406	win_uv.validate();
				407
				408	Iterator in_y(input_ptr->plane(0), win);
				409	Iterator in_uv(input_ptr->plane(1), win_uv);
				410	Iterator out(output_ptr, win);
				411
				412	execute_window_loop(win, [&](const Coordinates & id)
				413	{
				414	const auto ta_y_top = vld2q_u8(in_y.ptr());
				415	const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
				416	const auto ta_uv = vld2q_u8(in_uv.ptr());
				417	//ta_y.val[0] = Y0 Y2 Y4 Y6 ...
				418	//ta_y.val[1] = Y1 Y3 Y5 Y7 ...
				419	//ta_uv.val[0] = U0 U2 U4 U6 ...
				420	//ta_uv.val[1] = V0 V2 V4 V6 ...
				421
				422	// Convert the uint8x16x4_t to float32x4x4_t
				423	float32x4x4_t yvec_top, yyvec_top, yvec_bottom, yyvec_bottom, uvec, vvec;
				424	convert_uint8x16_to_float32x4x4(ta_y_top.val[0], yvec_top);
				425	convert_uint8x16_to_float32x4x4(ta_y_top.val[1], yyvec_top);
				426	convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0], yvec_bottom);
				427	convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1], yyvec_bottom);
				428	convert_uint8x16_to_float32x4x4(ta_uv.val[0 + shift], uvec);
				429	convert_uint8x16_to_float32x4x4(ta_uv.val[1 - shift], vvec);
				430
				431	yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
				432	yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
				433	yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
				434	yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
				435
				436	yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha);
				437	yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha);
				438	yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha);
				439	yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha);
				440	},
				441	in_y, in_uv, out);
				442	}
				443
				444	template <bool alpha>
				445	void colorconvert_iyuv_to_rgb(const void __restrict input, void __restrict output, const Window &win)
				446	{
				447	ARM_COMPUTE_ERROR_ON(nullptr == input);
				448	ARM_COMPUTE_ERROR_ON(nullptr == output);
				449	win.validate();
				450
				451	const auto input_ptr = static_cast<const IMultiImage *__restrict>(input);
				452	const auto output_ptr = static_cast<IImage *__restrict>(output);
				453
				454	constexpr auto element_size = alpha ? 32 : 24;
				455	const auto out_stride = output_ptr->info()->strides_in_bytes().y();
				456
				457	// UV's width and height are subsampled
				458	Window win_uv(win);
				459	win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
				460	win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
				461	win_uv.validate();
				462
				463	Iterator in_y(input_ptr->plane(0), win);
				464	Iterator in_u(input_ptr->plane(1), win_uv);
				465	Iterator in_v(input_ptr->plane(2), win_uv);
				466	Iterator out(output_ptr, win);
				467
				468	execute_window_loop(win, [&](const Coordinates & id)
				469	{
				470	const auto ta_y_top = vld2q_u8(in_y.ptr());
				471	const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
				472	const auto ta_u = vld1q_u8(in_u.ptr());
				473	const auto ta_v = vld1q_u8(in_v.ptr());
				474	//ta_y.val[0] = Y0 Y2 Y4 Y6 ...
				475	//ta_y.val[1] = Y1 Y3 Y5 Y7 ...
				476	//ta_u.val[0] = U0 U2 U4 U6 ...
				477	//ta_v.val[0] = V0 V2 V4 V6 ...
				478
				479	// Convert the uint8x16x4_t to float32x4x4_t
				480	float32x4x4_t yvec_top, yyvec_top, yvec_bottom, yyvec_bottom, uvec, vvec;
				481	convert_uint8x16_to_float32x4x4(ta_y_top.val[0], yvec_top);
				482	convert_uint8x16_to_float32x4x4(ta_y_top.val[1], yyvec_top);
				483	convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0], yvec_bottom);
				484	convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1], yyvec_bottom);
				485	convert_uint8x16_to_float32x4x4(ta_u, uvec);
				486	convert_uint8x16_to_float32x4x4(ta_v, vvec);
				487
				488	yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
				489	yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
				490	yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
				491	yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
				492
				493	yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha);
				494	yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha);
				495	yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha);
				496	yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha);
				497	},
				498	in_y, in_u, in_v, out);
				499	}
				500
				501	template <bool yuyv>
				502	void colorconvert_yuyv_to_nv12(const void __restrict input, void __restrict output, const Window &win)
				503	{
				504	ARM_COMPUTE_ERROR_ON(nullptr == input);
				505	ARM_COMPUTE_ERROR_ON(nullptr == output);
				506	win.validate();
				507
				508	const auto input_ptr = static_cast<const IImage *__restrict>(input);
				509	const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
				510
				511	constexpr auto shift = yuyv ? 0 : 1;
				512
				513	// NV12's UV's width and height are subsampled
				514	Window win_uv(win);
				515	win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
				516	win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
				517	win_uv.validate();
				518
				519	Iterator in(input_ptr, win);
				520	Iterator out_y(output_ptr->plane(0), win);
				521	Iterator out_uv(output_ptr->plane(1), win_uv);
				522
				523	execute_window_loop(win, [&](const Coordinates & id)
				524	{
				525	const auto ta_top = vld4q_u8(in.ptr());
				526	const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
				527	//ta.val[0] = Y0 Y2 Y4 Y6 ...
				528	//ta.val[1] = U0 U2 U4 U6 ...
				529	//ta.val[2] = Y1 Y3 Y5 Y7 ...
				530	//ta.val[3] = V0 V2 V4 V7 ...
				531
				532	uint8x16x2_t yvec;
				533	yvec.val[0] = ta_top.val[0 + shift];
				534	yvec.val[1] = ta_top.val[2 + shift];
				535	vst2q_u8(out_y.ptr(), yvec);
				536
				537	uint8x16x2_t yyvec;
				538	yyvec.val[0] = ta_bottom.val[0 + shift];
				539	yyvec.val[1] = ta_bottom.val[2 + shift];
				540	vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec);
				541
				542	uint8x16x2_t uvvec;
				543	uvvec.val[0] = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]);
				544	uvvec.val[1] = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]);
				545	vst2q_u8(out_uv.ptr(), uvvec);
				546	},
				547	in, out_y, out_uv);
				548	}
				549
				550	void colorconvert_iyuv_to_nv12(const void __restrict input, void __restrict output, const Window &win)
				551	{
				552	ARM_COMPUTE_ERROR_ON(nullptr == input);
				553	ARM_COMPUTE_ERROR_ON(nullptr == output);
				554	win.validate();
				555
				556	const auto input_ptr = static_cast<const IMultiImage *__restrict>(input);
				557	const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
				558
				559	// UV's width and height are subsampled
				560	Window win_uv(win);
				561	win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
				562	win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
				563	win_uv.validate();
				564
				565	Iterator in_y(input_ptr->plane(0), win);
				566	Iterator in_u(input_ptr->plane(1), win_uv);
				567	Iterator in_v(input_ptr->plane(2), win_uv);
				568	Iterator out_y(output_ptr->plane(0), win);
				569	Iterator out_uv(output_ptr->plane(1), win_uv);
				570
				571	execute_window_loop(win, [&](const Coordinates & id)
				572	{
				573	const auto ta_y_top = vld2q_u8(in_y.ptr());
				574	const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
				575	uint8x16x2_t ta_uv;
				576	ta_uv.val[0] = vld1q_u8(in_u.ptr());
				577	ta_uv.val[1] = vld1q_u8(in_v.ptr());
				578	//ta_y.val[0] = Y0 Y2 Y4 Y6 ...
				579	//ta_y.val[1] = Y1 Y3 Y5 Y7 ...
				580	//ta_uv.val[0] = U0 U2 U4 U6 ...
				581	//ta_uv.val[1] = V0 V2 V4 V6 ...
				582
				583	vst2q_u8(out_y.ptr(), ta_y_top);
				584	vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
				585	vst2q_u8(out_uv.ptr(), ta_uv);
				586	},
				587	in_y, in_u, in_v, out_y, out_uv);
				588	}
				589
				590	template <bool uv>
				591	void colorconvert_nv12_to_iyuv(const void __restrict input, void __restrict output, const Window &win)
				592	{
				593	ARM_COMPUTE_ERROR_ON(nullptr == input);
				594	ARM_COMPUTE_ERROR_ON(nullptr == output);
				595	win.validate();
				596
				597	const auto input_ptr = static_cast<const IMultiImage *__restrict>(input);
				598	const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
				599
				600	constexpr auto shift = uv ? 0 : 1;
				601
				602	// UV's width and height are subsampled
				603	Window win_uv(win);
				604	win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
				605	win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
				606	win_uv.validate();
				607
				608	Iterator in_y(input_ptr->plane(0), win);
				609	Iterator in_uv(input_ptr->plane(1), win_uv);
				610	Iterator out_y(output_ptr->plane(0), win);
				611	Iterator out_u(output_ptr->plane(1), win_uv);
				612	Iterator out_v(output_ptr->plane(2), win_uv);
				613
				614	execute_window_loop(win, [&](const Coordinates & id)
				615	{
				616	const auto ta_y_top = vld2q_u8(in_y.ptr());
				617	const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
				618	const auto ta_uv = vld2q_u8(in_uv.ptr());
				619	//ta_y.val[0] = Y0 Y2 Y4 Y6 ...
				620	//ta_y.val[1] = Y1 Y3 Y5 Y7 ...
				621	//ta_uv.val[0] = U0 U2 U4 U6 ...
				622	//ta_uv.val[1] = V0 V2 V4 V6 ...
				623
				624	vst2q_u8(out_y.ptr(), ta_y_top);
				625	vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
				626	vst1q_u8(out_u.ptr(), ta_uv.val[0 + shift]);
				627	vst1q_u8(out_v.ptr(), ta_uv.val[1 - shift]);
				628	},
				629	in_y, in_uv, out_y, out_u, out_v);
				630	}
				631
				632	template <bool yuyv>
				633	void colorconvert_yuyv_to_iyuv(const void __restrict input, void __restrict output, const Window &win)
				634	{
				635	ARM_COMPUTE_ERROR_ON(nullptr == input);
				636	ARM_COMPUTE_ERROR_ON(nullptr == output);
				637	win.validate();
				638
				639	const auto input_ptr = static_cast<const IImage *__restrict>(input);
				640	const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
				641
				642	constexpr auto shift = yuyv ? 0 : 1;
				643
				644	// Destination's UV's width and height are subsampled
				645	Window win_uv(win);
				646	win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
				647	win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
				648	win_uv.validate();
				649
				650	Iterator in(input_ptr, win);
				651	Iterator out_y(output_ptr->plane(0), win);
				652	Iterator out_u(output_ptr->plane(1), win_uv);
				653	Iterator out_v(output_ptr->plane(2), win_uv);
				654
				655	execute_window_loop(win, [&](const Coordinates & id)
				656	{
				657	const auto ta_top = vld4q_u8(in.ptr());
				658	const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
				659	//ta.val[0] = Y0 Y2 Y4 Y6 ...
				660	//ta.val[1] = U0 U2 U4 U6 ...
				661	//ta.val[2] = Y1 Y3 Y5 Y7 ...
				662	//ta.val[3] = V0 V2 V4 V7 ...
				663
				664	uint8x16x2_t yvec;
				665	yvec.val[0] = ta_top.val[0 + shift];
				666	yvec.val[1] = ta_top.val[2 + shift];
				667	vst2q_u8(out_y.ptr(), yvec);
				668
				669	uint8x16x2_t yyvec;
				670	yyvec.val[0] = ta_bottom.val[0 + shift];
				671	yyvec.val[1] = ta_bottom.val[2 + shift];
				672	vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec);
				673
				674	uint8x16_t uvec;
				675	uvec = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]);
				676	vst1q_u8(out_u.ptr(), uvec);
				677
				678	uint8x16_t vvec;
				679	vvec = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]);
				680	vst1q_u8(out_v.ptr(), vvec);
				681	},
				682	in, out_y, out_u, out_v);
				683	}
				684
				685	template <bool uv>
				686	void colorconvert_nv12_to_yuv4(const void __restrict input, void __restrict output, const Window &win)
				687	{
				688	ARM_COMPUTE_ERROR_ON(nullptr == input);
				689	ARM_COMPUTE_ERROR_ON(nullptr == output);
				690	win.validate();
				691
				692	const auto input_ptr = static_cast<const IMultiImage *__restrict>(input);
				693	const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
				694
				695	constexpr auto shift = uv ? 0 : 1;
				696
				697	// UV's width and height are subsampled
				698	Window win_uv(win);
				699	win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
				700	win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
				701	win_uv.validate();
				702
				703	Iterator in_y(input_ptr->plane(0), win);
				704	Iterator in_uv(input_ptr->plane(1), win_uv);
				705	Iterator out_y(output_ptr->plane(0), win);
				706	Iterator out_u(output_ptr->plane(1), win);
				707	Iterator out_v(output_ptr->plane(2), win);
				708
				709	execute_window_loop(win, [&](const Coordinates & id)
				710	{
				711	const auto ta_y_top = vld2q_u8(in_y.ptr());
				712	const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
				713	const auto ta_uv = vld2q_u8(in_uv.ptr());
				714	//ta_y.val[0] = Y0 Y2 Y4 Y6 ...
				715	//ta_y.val[1] = Y1 Y3 Y5 Y7 ...
				716	//ta_uv.val[0] = U0 U2 U4 U6 ...
				717	//ta_uv.val[1] = V0 V2 V4 V6 ...
				718
				719	vst2q_u8(out_y.ptr(), ta_y_top);
				720	vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
				721
				722	uint8x16x2_t uvec;
				723	uvec.val[0] = ta_uv.val[0 + shift];
				724	uvec.val[1] = ta_uv.val[0 + shift];
				725	vst2q_u8(out_u.ptr(), uvec);
				726	vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec);
				727
				728	uint8x16x2_t vvec;
				729	vvec.val[0] = ta_uv.val[1 - shift];
				730	vvec.val[1] = ta_uv.val[1 - shift];
				731	vst2q_u8(out_v.ptr(), vvec);
				732	vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec);
				733	},
				734	in_y, in_uv, out_y, out_u, out_v);
				735	}
				736
				737	void colorconvert_iyuv_to_yuv4(const void __restrict input, void __restrict output, const Window &win)
				738	{
				739	ARM_COMPUTE_ERROR_ON(nullptr == input);
				740	ARM_COMPUTE_ERROR_ON(nullptr == output);
				741	win.validate();
				742
				743	const auto input_ptr = static_cast<const IMultiImage *__restrict>(input);
				744	const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
				745
				746	// UV's width and height are subsampled
				747	Window win_uv(win);
				748	win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
				749	win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
				750	win_uv.validate();
				751
				752	Iterator in_y(input_ptr->plane(0), win);
				753	Iterator in_u(input_ptr->plane(1), win_uv);
				754	Iterator in_v(input_ptr->plane(2), win_uv);
				755	Iterator out_y(output_ptr->plane(0), win);
				756	Iterator out_u(output_ptr->plane(1), win);
				757	Iterator out_v(output_ptr->plane(2), win);
				758
				759	execute_window_loop(win, [&](const Coordinates & id)
				760	{
				761	const auto ta_y_top = vld2q_u8(in_y.ptr());
				762	const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
				763	const auto ta_u = vld1q_u8(in_u.ptr());
				764	const auto ta_v = vld1q_u8(in_v.ptr());
				765	//ta_y.val[0] = Y0 Y2 Y4 Y6 ...
				766	//ta_y.val[1] = Y1 Y3 Y5 Y7 ...
				767	//ta_u = U0 U2 U4 U6 ...
				768	//ta_v = V0 V2 V4 V6 ...
				769
				770	vst2q_u8(out_y.ptr(), ta_y_top);
				771	vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
				772
				773	uint8x16x2_t uvec;
				774	uvec.val[0] = ta_u;
				775	uvec.val[1] = ta_u;
				776	vst2q_u8(out_u.ptr(), uvec);
				777	vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec);
				778
				779	uint8x16x2_t vvec;
				780	vvec.val[0] = ta_v;
				781	vvec.val[1] = ta_v;
				782	vst2q_u8(out_v.ptr(), vvec);
				783	vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec);
				784	},
				785	in_y, in_u, in_v, out_y, out_u, out_v);
				786	}
				787
				788	template <bool alpha>
				789	void colorconvert_rgb_to_nv12(const void __restrict input, void __restrict output, const Window &win)
				790	{
				791	ARM_COMPUTE_ERROR_ON(nullptr == input);
				792	ARM_COMPUTE_ERROR_ON(nullptr == output);
				793	win.validate();
				794
				795	const auto input_ptr = static_cast<const IImage *__restrict>(input);
				796	const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
				797
				798	// UV's width and height are subsampled
				799	Window win_uv(win);
				800	win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
				801	win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
				802	win_uv.validate();
				803
				804	Iterator in(input_ptr, win);
				805	Iterator out_y(output_ptr->plane(0), win);
				806	Iterator out_uv(output_ptr->plane(1), win_uv);
				807
				808	execute_window_loop(win, [&](const Coordinates & id)
				809	{
				810	const auto ta_rgb_top = load_rgb(in.ptr(), alpha);
				811	const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha);
				812	//ta_rgb.val[0] = R0 R1 R2 R3 ...
				813	//ta_rgb.val[1] = G0 G1 G2 G3 ...
				814	//ta_rgb.val[2] = B0 B1 B2 B3 ...
				815
				816	store_rgb_to_nv12(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2],
				817	ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2],
				818	out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(),
				819	out_uv.ptr());
				820	},
				821	in, out_y, out_uv);
				822	}
				823
				824	template <bool alpha>
				825	void colorconvert_rgb_to_iyuv(const void __restrict input, void __restrict output, const Window &win)
				826	{
				827	ARM_COMPUTE_ERROR_ON(nullptr == input);
				828	ARM_COMPUTE_ERROR_ON(nullptr == output);
				829	win.validate();
				830
				831	const auto input_ptr = static_cast<const IImage *__restrict>(input);
				832	const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
				833
				834	// UV's width and height are subsampled
				835	Window win_uv(win);
				836	win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
				837	win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
				838	win_uv.validate();
				839
				840	Iterator in(input_ptr, win);
				841	Iterator out_y(output_ptr->plane(0), win);
				842	Iterator out_u(output_ptr->plane(1), win_uv);
				843	Iterator out_v(output_ptr->plane(2), win_uv);
				844
				845	execute_window_loop(win, [&](const Coordinates & id)
				846	{
				847	const auto ta_rgb_top = load_rgb(in.ptr(), alpha);
				848	const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha);
				849	//ta_rgb.val[0] = R0 R1 R2 R3 ...
				850	//ta_rgb.val[1] = G0 G1 G2 G3 ...
				851	//ta_rgb.val[2] = B0 B1 B2 B3 ...
				852
				853	store_rgb_to_iyuv(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2],
				854	ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2],
				855	out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(),
				856	out_u.ptr(), out_v.ptr());
				857	},
				858	in, out_y, out_u, out_v);
				859	}
				860
				861	template <bool alpha>
				862	void colorconvert_rgb_to_yuv4(const void __restrict input, void __restrict output, const Window &win)
				863	{
				864	ARM_COMPUTE_ERROR_ON(nullptr == input);
				865	ARM_COMPUTE_ERROR_ON(nullptr == output);
				866	win.validate();
				867
				868	const auto input_ptr = static_cast<const IImage *__restrict>(input);
				869	const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
				870
				871	Iterator in(input_ptr, win);
				872	Iterator out_y(output_ptr->plane(0), win);
				873	Iterator out_u(output_ptr->plane(1), win);
				874	Iterator out_v(output_ptr->plane(2), win);
				875
				876	execute_window_loop(win, [&](const Coordinates & id)
				877	{
				878	const auto ta_rgb = load_rgb(in.ptr(), alpha);
				879	//ta_rgb.val[0] = R0 R1 R2 R3 ...
				880	//ta_rgb.val[1] = G0 G1 G2 G3 ...
				881	//ta_rgb.val[2] = B0 B1 B2 B3 ...
				882
				883	store_rgb_to_yuv4(ta_rgb.val[0], ta_rgb.val[1], ta_rgb.val[2],
				884	out_y.ptr(), out_u.ptr(), out_v.ptr());
				885	},
				886	in, out_y, out_u, out_v);
				887	}
Gian Marco Iodice	356f643	2017-09-22 11:32:21 +0100	[diff] [blame^]	888	} // namespace arm_compute