Blame - arm_compute/core/NEON/NEColorConvertHelper.inl - ml/ComputeLibrary

blob: 7540d338307d22fe64cee94a2b43531341a086ff [file] [log] [blame]

Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1	/*
Alex Gilday	c357c47	2018-03-21 13:54:09 +0000	[diff] [blame]	2	* Copyright (c) 2016-2018 ARM Limited.
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "arm_compute/core/Error.h"
				25	#include "arm_compute/core/Helpers.h"
				26	#include "arm_compute/core/IMultiImage.h"
				27	#include "arm_compute/core/Utils.h"
				28
				29	#include <arm_neon.h>
				30
				31	namespace
				32	{
Alex Gilday	c357c47	2018-03-21 13:54:09 +0000	[diff] [blame]	33	#ifndef DOXYGEN_SKIP_THIS
// YUV -> RGB weights (BT.709), applied to the chroma samples once the 128 offset is removed
constexpr float red_coef_bt709    = 1.5748f;  // scales V for the red channel
constexpr float green_coef_bt709  = -0.1873f; // scales U for the green channel
constexpr float green_coef2_bt709 = -0.4681f; // scales V for the green channel
constexpr float blue_coef_bt709   = 1.8556f;  // scales U for the blue channel

// RGB -> YUV weights (BT.709)
constexpr float rgb2yuv_bt709_kr = 0.2126f;
constexpr float rgb2yuv_bt709_kb = 0.0722f;
// K_g = 1 - K_r - K_b
constexpr float rgb2yuv_bt709_kg = 0.7152f;
// C_u = 1 / (2 * (1 - K_b))
constexpr float rgb2yuv_bt709_cu = 0.5389f;
// C_v = 1 / (2 * (1 - K_r))
constexpr float rgb2yuv_bt709_cv = 0.6350f;

// RGB -> U8 (greyscale) weights; numerically the same as K_r, K_g and K_b
constexpr float rgb2u8_red_coef   = 0.2126f;
constexpr float rgb2u8_green_coef = 0.7152f;
constexpr float rgb2u8_blue_coef  = 0.0722f;
				51
				52	inline float32x4x4_t convert_uint8x16_to_float32x4x4(const uint8x16_t &in)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	53	{
Manuel Bottini	4284bfa	2018-09-26 15:33:15 +0100	[diff] [blame^]	54	float32x4x4_t out;
				55	const auto tmp1 = vmovl_u8(vget_low_u8(in));
				56	out.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(tmp1)));
				57	out.val[1] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(tmp1)));
				58	const auto tmp2 = vmovl_u8(vget_high_u8(in));
				59	out.val[2] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(tmp2)));
				60	out.val[3] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(tmp2)));
				61	return out;
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	62	}
				63
				64	inline void convert_float32x4x3_to_uint8x8x3(const float32x4x3_t &in1, const float32x4x3_t &in2, uint8x8x3_t &out)
				65	{
				66	out.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[0])),
				67	vqmovn_u32(vcvtq_u32_f32(in2.val[0]))));
				68	out.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[1])),
				69	vqmovn_u32(vcvtq_u32_f32(in2.val[1]))));
				70	out.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[2])),
				71	vqmovn_u32(vcvtq_u32_f32(in2.val[2]))));
				72	}
				73
				74	inline void convert_float32x4x4_to_unit8x16(const float32x4x4_t &in, uint8x16_t &out)
				75	{
				76	const auto low = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[0])),
				77	vqmovn_u32(vcvtq_u32_f32(in.val[1])));
				78	const auto high = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[2])),
				79	vqmovn_u32(vcvtq_u32_f32(in.val[3])));
				80	out = vcombine_u8(vqmovn_u16(low), vqmovn_u16(high));
				81	}
				82
Manuel Bottini	4284bfa	2018-09-26 15:33:15 +0100	[diff] [blame^]	83	inline float32x4_t rgb_to_greyscale_calculation(const float32x4_t &rcolor,const float32x4_t &gcolor, const float32x4_t &bcolor,
				84	const float rcoef, const float gcoef, const float bcoef)
				85	{
				86	float32x4_t greyscale = vmulq_n_f32(rcolor, rcoef);
				87	greyscale = vmlaq_n_f32(greyscale, gcolor, gcoef);
				88	greyscale = vmlaq_n_f32(greyscale, bcolor, bcoef);
				89	return greyscale;
				90	}
				91
				92	inline void rgb_to_u8_conversion(const uint8x16x3_t &in, uint8x16_t &out)
				93	{
				94	float32x4x4_t out_float32;
				95
				96	//Conversion from 3(RGB) 4 uint8s to 3(RGB) 4 floats
				97	const float32x4x4_t r_float32 = convert_uint8x16_to_float32x4x4(in.val[0]);
				98	const float32x4x4_t g_float32 = convert_uint8x16_to_float32x4x4(in.val[1]);
				99	const float32x4x4_t b_float32 = convert_uint8x16_to_float32x4x4(in.val[2]);
				100
				101	//New grayscale image = ( (RED_COEFF * R) + (GREEN_COEFF * G) + (BLUE_COEFF * B) )
				102	//Computation of 1(Greyscale) 4 uint8 using 3(RGB) 4 uint8s float
				103	out_float32.val[0] = rgb_to_greyscale_calculation(r_float32.val[0], g_float32.val[0], b_float32.val[0],
				104	rgb2u8_red_coef, rgb2u8_green_coef, rgb2u8_blue_coef);
				105
				106	out_float32.val[1] = rgb_to_greyscale_calculation(r_float32.val[1], g_float32.val[1], b_float32.val[1],
				107	rgb2u8_red_coef, rgb2u8_green_coef, rgb2u8_blue_coef);
				108
				109	out_float32.val[2] = rgb_to_greyscale_calculation(r_float32.val[2], g_float32.val[2], b_float32.val[2],
				110	rgb2u8_red_coef, rgb2u8_green_coef, rgb2u8_blue_coef);
				111
				112	out_float32.val[3] = rgb_to_greyscale_calculation(r_float32.val[3], g_float32.val[3], b_float32.val[3],
				113	rgb2u8_red_coef, rgb2u8_green_coef, rgb2u8_blue_coef);
				114
				115	//Conversion from 1(Greyscale) 4 floats to 1(Greyscale) 4 uint8s
				116	convert_float32x4x4_to_unit8x16(out_float32, out);
				117	}
				118
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	119	inline void rgb_to_yuv_calculation(const float32x4_t &rvec, const float32x4_t &gvec, const float32x4_t &bvec,
				120	float32x4_t &yvec, float32x4_t &uvec, float32x4_t &vvec)
				121	{
				122	/*
				123	Y'= 0.2126R' + 0.7152G' + 0.0722*B'
				124	U'=-0.1146R' - 0.3854G' + 0.5000*B'
				125	V'= 0.5000R' - 0.4542G' - 0.0458*B'
				126	*/
				127	const auto c128 = vdupq_n_f32(128.f);
				128
				129	// Y = R * K_r + G * (1 - K_r - K_b) * B * K_b
				130	yvec = vmulq_n_f32(rvec, rgb2yuv_bt709_kr);
				131	yvec = vmlaq_n_f32(yvec, gvec, rgb2yuv_bt709_kg);
				132	yvec = vmlaq_n_f32(yvec, bvec, rgb2yuv_bt709_kb);
				133
				134	// U = (B - Y) / (2 * (1 - K_b))
				135	uvec = vsubq_f32(bvec, yvec);
				136	uvec = vmlaq_n_f32(c128, uvec, rgb2yuv_bt709_cu);
				137
				138	// V = (R - Y) / (2 * (1 - K_r))
				139	vvec = vsubq_f32(rvec, yvec);
				140	vvec = vmlaq_n_f32(c128, vvec, rgb2yuv_bt709_cv);
				141	}
				142
				143	inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, float32x4_t uvec_val, const float32x4_t &yyvec_val,
				144	float32x4_t vvec_val, unsigned char *output_ptr, const bool alpha)
				145	{
				146	float32x4x3_t rgb1, rgb2;
				147
				148	// Compute: cb - 128 and cr - 128;
				149	const auto c128 = vdupq_n_f32(128.f);
				150	uvec_val = vsubq_f32(uvec_val, c128);
				151	vvec_val = vsubq_f32(vvec_val, c128);
				152
				153	// Compute:
				154	// r = 0.0000ff_u + 1.5748ff_v;
				155	// g = 0.1873ff_u - 0.4681ff_v;
				156	// b = 1.8556ff_u + 0.0000ff_v;
				157	const auto red = vmulq_n_f32(vvec_val, red_coef_bt709);
				158	const auto blue = vmulq_n_f32(uvec_val, blue_coef_bt709);
				159	const auto green = vaddq_f32(vmulq_n_f32(uvec_val, green_coef_bt709),
				160	vmulq_n_f32(vvec_val, green_coef2_bt709));
				161
				162	// Compute the final r,g,b values using y1 for the first texel and y2 for the second one.
				163	// the result is stored in two float32x4x3_t which then are converted to one uint8x8x3_t
				164	// and written back to memory using vst3 instruction
				165
				166	rgb1.val[0] = vaddq_f32(yvec_val, red);
				167	rgb1.val[1] = vaddq_f32(yvec_val, green);
				168	rgb1.val[2] = vaddq_f32(yvec_val, blue);
				169
				170	rgb2.val[0] = vaddq_f32(yyvec_val, red);
				171	rgb2.val[1] = vaddq_f32(yyvec_val, green);
				172	rgb2.val[2] = vaddq_f32(yyvec_val, blue);
				173
				174	uint8x8x3_t u8_rgb;
				175	convert_float32x4x3_to_uint8x8x3(rgb1, rgb2, u8_rgb);
				176
				177	if(!alpha)
				178	{
				179	vst3_lane_u8(&output_ptr[0], u8_rgb, 0);
				180	vst3_lane_u8(&output_ptr[3], u8_rgb, 4);
				181	vst3_lane_u8(&output_ptr[6], u8_rgb, 1);
				182	vst3_lane_u8(&output_ptr[9], u8_rgb, 5);
				183	vst3_lane_u8(&output_ptr[12], u8_rgb, 2);
				184	vst3_lane_u8(&output_ptr[15], u8_rgb, 6);
				185	vst3_lane_u8(&output_ptr[18], u8_rgb, 3);
				186	vst3_lane_u8(&output_ptr[21], u8_rgb, 7);
				187	}
				188	else
				189	{
				190	uint8x8x4_t u8_rgba;
				191	u8_rgba.val[0] = u8_rgb.val[0];
				192	u8_rgba.val[1] = u8_rgb.val[1];
				193	u8_rgba.val[2] = u8_rgb.val[2];
				194	u8_rgba.val[3] = vdup_n_u8(255);
				195	vst4_lane_u8(&output_ptr[0], u8_rgba, 0);
				196	vst4_lane_u8(&output_ptr[4], u8_rgba, 4);
				197	vst4_lane_u8(&output_ptr[8], u8_rgba, 1);
				198	vst4_lane_u8(&output_ptr[12], u8_rgba, 5);
				199	vst4_lane_u8(&output_ptr[16], u8_rgba, 2);
				200	vst4_lane_u8(&output_ptr[20], u8_rgba, 6);
				201	vst4_lane_u8(&output_ptr[24], u8_rgba, 3);
				202	vst4_lane_u8(&output_ptr[28], u8_rgba, 7);
				203	}
				204	}
				205
				206	inline uint8x16x3_t load_rgb(const unsigned char *const ptr, const bool alpha)
				207	{
				208	uint8x16x3_t rgb;
				209
				210	if(alpha)
				211	{
				212	const auto tmp = vld4q_u8(ptr);
				213	rgb.val[0] = tmp.val[0];
				214	rgb.val[1] = tmp.val[1];
				215	rgb.val[2] = tmp.val[2];
				216	}
				217	else
				218	{
				219	rgb = vld3q_u8(ptr);
				220	}
				221
				222	return rgb;
				223	}
				224
				225	inline void rgb_to_yuv_conversion(uint8x16x3_t &vec_top, uint8x16x3_t &vec_bottom)
				226	{
				227	// Convert the uint8x16_t to float32x4x4_t
Manuel Bottini	4284bfa	2018-09-26 15:33:15 +0100	[diff] [blame^]	228	const float32x4x4_t frvec_top = convert_uint8x16_to_float32x4x4(vec_top.val[0]);
				229	const float32x4x4_t fgvec_top = convert_uint8x16_to_float32x4x4(vec_top.val[1]);
				230	const float32x4x4_t fbvec_top = convert_uint8x16_to_float32x4x4(vec_top.val[2]);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	231
Manuel Bottini	4284bfa	2018-09-26 15:33:15 +0100	[diff] [blame^]	232	const float32x4x4_t frvec_bottom = convert_uint8x16_to_float32x4x4(vec_bottom.val[0]);
				233	const float32x4x4_t fgvec_bottom = convert_uint8x16_to_float32x4x4(vec_bottom.val[1]);
				234	const float32x4x4_t fbvec_bottom = convert_uint8x16_to_float32x4x4(vec_bottom.val[2]);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	235
				236	float32x4x4_t fyvec_top, fuvec_top, fvvec_top;
				237	float32x4x4_t fyvec_bottom, fuvec_bottom, fvvec_bottom;
				238
				239	for(auto i = 0; i < 4; ++i)
				240	{
				241	rgb_to_yuv_calculation(frvec_top.val[i], fgvec_top.val[i], fbvec_top.val[i],
				242	fyvec_top.val[i], fuvec_top.val[i], fvvec_top.val[i]);
				243	rgb_to_yuv_calculation(frvec_bottom.val[i], fgvec_bottom.val[i], fbvec_bottom.val[i],
				244	fyvec_bottom.val[i], fuvec_bottom.val[i], fvvec_bottom.val[i]);
				245	}
				246
				247	convert_float32x4x4_to_unit8x16(fyvec_top, vec_top.val[0]);
				248	convert_float32x4x4_to_unit8x16(fuvec_top, vec_top.val[1]);
				249	convert_float32x4x4_to_unit8x16(fvvec_top, vec_top.val[2]);
				250	convert_float32x4x4_to_unit8x16(fyvec_bottom, vec_bottom.val[0]);
				251	convert_float32x4x4_to_unit8x16(fuvec_bottom, vec_bottom.val[1]);
				252	convert_float32x4x4_to_unit8x16(fvvec_bottom, vec_bottom.val[2]);
				253	}
				254
				255	inline void store_rgb_to_nv12(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top,
				256	const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom,
				257	unsigned char const __restrict out_y_top, unsigned char const __restrict out_y_bottom,
				258	unsigned char *const __restrict out_uv)
				259	{
				260	uint8x16x3_t vec_top, vec_bottom;
				261	vec_top.val[0] = rvec_top;
				262	vec_top.val[1] = gvec_top;
				263	vec_top.val[2] = bvec_top;
				264	vec_bottom.val[0] = rvec_bottom;
				265	vec_bottom.val[1] = gvec_bottom;
				266	vec_bottom.val[2] = bvec_bottom;
				267
				268	rgb_to_yuv_conversion(vec_top, vec_bottom);
				269
				270	vst1q_u8(out_y_top, vec_top.val[0]);
				271	vst1q_u8(out_y_bottom, vec_bottom.val[0]);
				272
				273	const auto uvec = vuzpq_u8(vec_top.val[1], vec_bottom.val[1]);
				274	const auto vvec = vuzpq_u8(vec_top.val[2], vec_bottom.val[2]);
				275	const auto utmp = vrhaddq_u8(uvec.val[0], uvec.val[1]);
				276	const auto vtmp = vrhaddq_u8(vvec.val[0], vvec.val[1]);
				277
				278	uint8x8x2_t uvvec;
				279	uvvec.val[0] = vhadd_u8(vget_low_u8(utmp), vget_high_u8(utmp));
				280	uvvec.val[1] = vhadd_u8(vget_low_u8(vtmp), vget_high_u8(vtmp));
				281
				282	vst2_u8(out_uv, uvvec);
				283	}
				284
				285	inline void store_rgb_to_iyuv(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top,
				286	const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom,
				287	unsigned char const __restrict out_y_top, unsigned char const __restrict out_y_bottom,
				288	unsigned char *const __restrict out_u,
				289	unsigned char *const __restrict out_v)
				290	{
				291	uint8x16x3_t vec_top, vec_bottom;
				292	vec_top.val[0] = rvec_top;
				293	vec_top.val[1] = gvec_top;
				294	vec_top.val[2] = bvec_top;
				295	vec_bottom.val[0] = rvec_bottom;
				296	vec_bottom.val[1] = gvec_bottom;
				297	vec_bottom.val[2] = bvec_bottom;
				298
				299	rgb_to_yuv_conversion(vec_top, vec_bottom);
				300
				301	vst1q_u8(out_y_top, vec_top.val[0]);
				302	vst1q_u8(out_y_bottom, vec_bottom.val[0]);
				303
				304	const auto uvvec_top = vuzpq_u8(vec_top.val[1], vec_top.val[2]);
				305	const auto uvvec_bottom = vuzpq_u8(vec_bottom.val[1], vec_bottom.val[2]);
				306	const auto uvvec = vhaddq_u8(vrhaddq_u8(uvvec_top.val[0], uvvec_top.val[1]),
				307	vrhaddq_u8(uvvec_bottom.val[0], uvvec_bottom.val[1]));
				308
				309	vst1_u8(out_u, vget_low_u8(uvvec));
				310	vst1_u8(out_v, vget_high_u8(uvvec));
				311	}
				312
				313	inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, const uint8x16_t &bvec,
				314	unsigned char *const __restrict out_y,
				315	unsigned char *const __restrict out_u,
				316	unsigned char *const __restrict out_v)
				317	{
				318	// Convert the uint8x16_t to float32x4x4_t
Manuel Bottini	4284bfa	2018-09-26 15:33:15 +0100	[diff] [blame^]	319	const float32x4x4_t frvec = convert_uint8x16_to_float32x4x4(rvec);
				320	const float32x4x4_t fgvec = convert_uint8x16_to_float32x4x4(gvec);
				321	const float32x4x4_t fbvec = convert_uint8x16_to_float32x4x4(bvec);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	322
				323	float32x4x4_t fyvec, fuvec, fvvec;
				324	for(auto i = 0; i < 4; ++i)
				325	{
				326	rgb_to_yuv_calculation(frvec.val[i], fgvec.val[i], fbvec.val[i],
				327	fyvec.val[i], fuvec.val[i], fvvec.val[i]);
				328	}
				329
				330	uint8x16_t yvec, uvec, vvec;
				331	convert_float32x4x4_to_unit8x16(fyvec, yvec);
				332	convert_float32x4x4_to_unit8x16(fuvec, uvec);
				333	convert_float32x4x4_to_unit8x16(fvvec, vvec);
				334
				335	vst1q_u8(out_y, yvec);
				336	vst1q_u8(out_u, uvec);
				337	vst1q_u8(out_v, vvec);
				338	}
Alex Gilday	c357c47	2018-03-21 13:54:09 +0000	[diff] [blame]	339	#endif /* DOXYGEN_SKIP_THIS */
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	340	}
				341
				342	namespace arm_compute
				343	{
Alex Gilday	c357c47	2018-03-21 13:54:09 +0000	[diff] [blame]	344	/** Convert RGB to RGBX.
				345	*
				346	* @param[in] input Input RGB data buffer.
				347	* @param[out] output Output RGBX buffer.
				348	* @param[in] win Window for iterating the buffers.
				349	*
				350	*/
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	351	void colorconvert_rgb_to_rgbx(const void __restrict input, void __restrict output, const Window &win)
				352	{
				353	ARM_COMPUTE_ERROR_ON(nullptr == input);
				354	ARM_COMPUTE_ERROR_ON(nullptr == output);
				355
				356	const auto input_ptr = static_cast<const IImage *__restrict>(input);
				357	const auto output_ptr = static_cast<IImage *__restrict>(output);
				358
				359	Iterator in(input_ptr, win);
				360	Iterator out(output_ptr, win);
				361
				362	execute_window_loop(win, [&](const Coordinates & id)
				363	{
				364	const auto ta1 = vld3q_u8(in.ptr());
				365	uint8x16x4_t ta2;
				366	ta2.val[0] = ta1.val[0];
				367	ta2.val[1] = ta1.val[1];
				368	ta2.val[2] = ta1.val[2];
				369	ta2.val[3] = vdupq_n_u8(255);
				370	vst4q_u8(out.ptr(), ta2);
				371	},
				372	in, out);
				373	}
				374
Manuel Bottini	4284bfa	2018-09-26 15:33:15 +0100	[diff] [blame^]	375	/** Convert RGB to U8.
				376	*
				377	* @param[in] input Input RGB data buffer.
				378	* @param[out] output Output U8 buffer.
				379	* @param[in] win Window for iterating the buffers.
				380	*
				381	*/
				382	void colorconvert_rgb_to_u8(const void __restrict input, void __restrict output, const Window &win)
				383	{
				384	ARM_COMPUTE_ERROR_ON(nullptr == input);
				385	ARM_COMPUTE_ERROR_ON(nullptr == output);
				386
				387	const auto input_ptr = static_cast<const IImage *__restrict>(input);
				388	const auto output_ptr = static_cast<IImage *__restrict>(output);
				389
				390	Iterator in(input_ptr, win);
				391	Iterator out(output_ptr, win);
				392
				393	execute_window_loop(win, [&](const Coordinates & id)
				394	{
				395	const auto ta1 = vld3q_u8(in.ptr());
				396	uint8x16_t ta2;
				397	rgb_to_u8_conversion(ta1, ta2);
				398	vst1q_u8(out.ptr(), ta2);
				399	},
				400	in, out);
				401	}
				402
Alex Gilday	c357c47	2018-03-21 13:54:09 +0000	[diff] [blame]	403	/** Convert RGBX to RGB.
				404	*
				405	* @param[in] input Input RGBX data buffer.
				406	* @param[out] output Output RGB buffer.
				407	* @param[in] win Window for iterating the buffers.
				408	*
				409	*/
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	410	void colorconvert_rgbx_to_rgb(const void input, void output, const Window &win)
				411	{
				412	ARM_COMPUTE_ERROR_ON(nullptr == input);
				413	ARM_COMPUTE_ERROR_ON(nullptr == output);
				414
				415	const auto input_ptr = static_cast<const IImage *__restrict>(input);
				416	const auto output_ptr = static_cast<IImage *__restrict>(output);
				417
				418	Iterator in(input_ptr, win);
				419	Iterator out(output_ptr, win);
				420
				421	execute_window_loop(win, [&](const Coordinates & id)
				422	{
				423	const auto ta1 = vld4q_u8(in.ptr());
				424	uint8x16x3_t ta2;
				425	ta2.val[0] = ta1.val[0];
				426	ta2.val[1] = ta1.val[1];
				427	ta2.val[2] = ta1.val[2];
				428	vst3q_u8(out.ptr(), ta2);
				429	},
				430	in, out);
				431	}
				432
Alex Gilday	c357c47	2018-03-21 13:54:09 +0000	[diff] [blame]	433	/** Convert YUYV to RGB.
				434	*
				435	* @param[in] input Input YUYV data buffer.
				436	* @param[out] output Output RGB buffer.
				437	* @param[in] win Window for iterating the buffers.
				438	*
				439	*/
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	440	template <bool yuyv, bool alpha>
				441	void colorconvert_yuyv_to_rgb(const void __restrict input, void __restrict output, const Window &win)
				442	{
				443	ARM_COMPUTE_ERROR_ON(nullptr == input);
				444	ARM_COMPUTE_ERROR_ON(nullptr == output);
				445
				446	const auto input_ptr = static_cast<const IImage *__restrict>(input);
				447	const auto output_ptr = static_cast<IImage *__restrict>(output);
				448
				449	constexpr auto element_size = alpha ? 32 : 24;
				450	constexpr auto shift = yuyv ? 0 : 1;
				451
				452	Iterator in(input_ptr, win);
				453	Iterator out(output_ptr, win);
				454
				455	execute_window_loop(win, [&](const Coordinates & id)
				456	{
Manuel Bottini	4284bfa	2018-09-26 15:33:15 +0100	[diff] [blame^]	457	const auto ta = vld4q_u8(in.ptr());
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	458	//ta.val[0] = Y0 Y2 Y4 Y6 ...
				459	//ta.val[1] = U0 U2 U4 U6 ...
				460	//ta.val[2] = Y1 Y3 Y5 Y7 ...
				461	//ta.val[3] = V0 V2 V4 V7 ...
				462
				463	// Convert the uint8x16x4_t to float32x4x4_t
Manuel Bottini	4284bfa	2018-09-26 15:33:15 +0100	[diff] [blame^]	464	const float32x4x4_t yvec = convert_uint8x16_to_float32x4x4(ta.val[0 + shift]);
				465	const float32x4x4_t uvec = convert_uint8x16_to_float32x4x4(ta.val[1 - shift]);
				466	const float32x4x4_t yyvec = convert_uint8x16_to_float32x4x4(ta.val[2 + shift]);
				467	const float32x4x4_t vvec = convert_uint8x16_to_float32x4x4(ta.val[3 - shift]);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	468
				469	yuyv_to_rgb_calculation(yvec.val[0], uvec.val[0], yyvec.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
				470	yuyv_to_rgb_calculation(yvec.val[1], uvec.val[1], yyvec.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
				471	yuyv_to_rgb_calculation(yvec.val[2], uvec.val[2], yyvec.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
				472	yuyv_to_rgb_calculation(yvec.val[3], uvec.val[3], yyvec.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
				473	},
				474	in, out);
				475	}
				476
Alex Gilday	c357c47	2018-03-21 13:54:09 +0000	[diff] [blame]	477	/** Convert NV12 to RGB.
				478	*
				479	* @param[in] input Input NV12 data buffer.
				480	* @param[out] output Output RGB buffer.
				481	* @param[in] win Window for iterating the buffers.
				482	*
				483	*/
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	484	template <bool uv, bool alpha>
				485	void colorconvert_nv12_to_rgb(const void __restrict input, void __restrict output, const Window &win)
				486	{
				487	ARM_COMPUTE_ERROR_ON(nullptr == input);
				488	ARM_COMPUTE_ERROR_ON(nullptr == output);
				489	win.validate();
				490
				491	const auto input_ptr = static_cast<const IMultiImage *__restrict>(input);
				492	const auto output_ptr = static_cast<IImage *__restrict>(output);
				493
				494	constexpr auto element_size = alpha ? 32 : 24;
				495	const auto out_stride = output_ptr->info()->strides_in_bytes().y();
				496	constexpr auto shift = uv ? 0 : 1;
				497
				498	// UV's width and height are subsampled
				499	Window win_uv(win);
				500	win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win.x().step() / 2));
				501	win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
				502	win_uv.validate();
				503
				504	Iterator in_y(input_ptr->plane(0), win);
				505	Iterator in_uv(input_ptr->plane(1), win_uv);
				506	Iterator out(output_ptr, win);
				507
				508	execute_window_loop(win, [&](const Coordinates & id)
				509	{
				510	const auto ta_y_top = vld2q_u8(in_y.ptr());
				511	const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
				512	const auto ta_uv = vld2q_u8(in_uv.ptr());
				513	//ta_y.val[0] = Y0 Y2 Y4 Y6 ...
				514	//ta_y.val[1] = Y1 Y3 Y5 Y7 ...
				515	//ta_uv.val[0] = U0 U2 U4 U6 ...
				516	//ta_uv.val[1] = V0 V2 V4 V6 ...
				517
				518	// Convert the uint8x16x4_t to float32x4x4_t
Manuel Bottini	4284bfa	2018-09-26 15:33:15 +0100	[diff] [blame^]	519	float32x4x4_t yvec_top = convert_uint8x16_to_float32x4x4(ta_y_top.val[0]);
				520	float32x4x4_t yyvec_top = convert_uint8x16_to_float32x4x4(ta_y_top.val[1]);
				521	float32x4x4_t yvec_bottom = convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]);
				522	float32x4x4_t yyvec_bottom = convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]);
				523	float32x4x4_t uvec = convert_uint8x16_to_float32x4x4(ta_uv.val[0 + shift]);
				524	float32x4x4_t vvec = convert_uint8x16_to_float32x4x4(ta_uv.val[1 - shift]);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	525
				526	yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
				527	yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
				528	yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
				529	yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
				530
				531	yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha);
				532	yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha);
				533	yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha);
				534	yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha);
				535	},
				536	in_y, in_uv, out);
				537	}
				538
Alex Gilday	c357c47	2018-03-21 13:54:09 +0000	[diff] [blame]	539	/** Convert IYUV to RGB.
				540	*
				541	* @param[in] input Input IYUV data buffer.
				542	* @param[out] output Output RGB buffer.
				543	* @param[in] win Window for iterating the buffers.
				544	*
				545	*/
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	546	template <bool alpha>
				547	void colorconvert_iyuv_to_rgb(const void __restrict input, void __restrict output, const Window &win)
				548	{
				549	ARM_COMPUTE_ERROR_ON(nullptr == input);
				550	ARM_COMPUTE_ERROR_ON(nullptr == output);
				551	win.validate();
				552
				553	const auto input_ptr = static_cast<const IMultiImage *__restrict>(input);
				554	const auto output_ptr = static_cast<IImage *__restrict>(output);
				555
				556	constexpr auto element_size = alpha ? 32 : 24;
				557	const auto out_stride = output_ptr->info()->strides_in_bytes().y();
				558
				559	// UV's width and height are subsampled
				560	Window win_uv(win);
				561	win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
				562	win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
				563	win_uv.validate();
				564
				565	Iterator in_y(input_ptr->plane(0), win);
				566	Iterator in_u(input_ptr->plane(1), win_uv);
				567	Iterator in_v(input_ptr->plane(2), win_uv);
				568	Iterator out(output_ptr, win);
				569
				570	execute_window_loop(win, [&](const Coordinates & id)
				571	{
				572	const auto ta_y_top = vld2q_u8(in_y.ptr());
				573	const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
				574	const auto ta_u = vld1q_u8(in_u.ptr());
				575	const auto ta_v = vld1q_u8(in_v.ptr());
				576	//ta_y.val[0] = Y0 Y2 Y4 Y6 ...
				577	//ta_y.val[1] = Y1 Y3 Y5 Y7 ...
				578	//ta_u.val[0] = U0 U2 U4 U6 ...
				579	//ta_v.val[0] = V0 V2 V4 V6 ...
				580
				581	// Convert the uint8x16x4_t to float32x4x4_t
Manuel Bottini	4284bfa	2018-09-26 15:33:15 +0100	[diff] [blame^]	582	float32x4x4_t yvec_top = convert_uint8x16_to_float32x4x4(ta_y_top.val[0]);
				583	float32x4x4_t yyvec_top = convert_uint8x16_to_float32x4x4(ta_y_top.val[1]);
				584	float32x4x4_t yvec_bottom = convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]);
				585	float32x4x4_t yyvec_bottom = convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]);
				586	float32x4x4_t uvec = convert_uint8x16_to_float32x4x4(ta_u);
				587	float32x4x4_t vvec = convert_uint8x16_to_float32x4x4(ta_v);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	588
				589	yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
				590	yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
				591	yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
				592	yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
				593
				594	yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha);
				595	yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha);
				596	yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha);
				597	yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha);
				598	},
				599	in_y, in_u, in_v, out);
				600	}
				601
Alex Gilday	c357c47	2018-03-21 13:54:09 +0000	[diff] [blame]	602	/** Convert YUYV to NV12.
				603	*
				604	* @param[in] input Input YUYV data buffer.
				605	* @param[out] output Output NV12 buffer.
				606	* @param[in] win Window for iterating the buffers.
				607	*
				608	*/
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	609	template <bool yuyv>
				610	void colorconvert_yuyv_to_nv12(const void __restrict input, void __restrict output, const Window &win)
				611	{
				612	ARM_COMPUTE_ERROR_ON(nullptr == input);
				613	ARM_COMPUTE_ERROR_ON(nullptr == output);
				614	win.validate();
				615
				616	const auto input_ptr = static_cast<const IImage *__restrict>(input);
				617	const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
				618
				619	constexpr auto shift = yuyv ? 0 : 1;
				620
				621	// NV12's UV's width and height are subsampled
				622	Window win_uv(win);
				623	win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
				624	win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
				625	win_uv.validate();
				626
				627	Iterator in(input_ptr, win);
				628	Iterator out_y(output_ptr->plane(0), win);
				629	Iterator out_uv(output_ptr->plane(1), win_uv);
				630
				631	execute_window_loop(win, [&](const Coordinates & id)
				632	{
				633	const auto ta_top = vld4q_u8(in.ptr());
				634	const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
				635	//ta.val[0] = Y0 Y2 Y4 Y6 ...
				636	//ta.val[1] = U0 U2 U4 U6 ...
				637	//ta.val[2] = Y1 Y3 Y5 Y7 ...
				638	//ta.val[3] = V0 V2 V4 V7 ...
				639
				640	uint8x16x2_t yvec;
				641	yvec.val[0] = ta_top.val[0 + shift];
				642	yvec.val[1] = ta_top.val[2 + shift];
				643	vst2q_u8(out_y.ptr(), yvec);
				644
				645	uint8x16x2_t yyvec;
				646	yyvec.val[0] = ta_bottom.val[0 + shift];
				647	yyvec.val[1] = ta_bottom.val[2 + shift];
				648	vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec);
				649
				650	uint8x16x2_t uvvec;
				651	uvvec.val[0] = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]);
				652	uvvec.val[1] = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]);
				653	vst2q_u8(out_uv.ptr(), uvvec);
				654	},
				655	in, out_y, out_uv);
				656	}
				657
Alex Gilday	c357c47	2018-03-21 13:54:09 +0000	[diff] [blame]	658	/** Convert IYUV to NV12.
				659	*
				660	* @param[in] input Input IYUV data buffer.
				661	* @param[out] output Output NV12 buffer.
				662	* @param[in] win Window for iterating the buffers.
				663	*
				664	*/
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	665	void colorconvert_iyuv_to_nv12(const void __restrict input, void __restrict output, const Window &win)
				666	{
				667	ARM_COMPUTE_ERROR_ON(nullptr == input);
				668	ARM_COMPUTE_ERROR_ON(nullptr == output);
				669	win.validate();
				670
				671	const auto input_ptr = static_cast<const IMultiImage *__restrict>(input);
				672	const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
				673
				674	// UV's width and height are subsampled
				675	Window win_uv(win);
				676	win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
				677	win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
				678	win_uv.validate();
				679
				680	Iterator in_y(input_ptr->plane(0), win);
				681	Iterator in_u(input_ptr->plane(1), win_uv);
				682	Iterator in_v(input_ptr->plane(2), win_uv);
				683	Iterator out_y(output_ptr->plane(0), win);
				684	Iterator out_uv(output_ptr->plane(1), win_uv);
				685
				686	execute_window_loop(win, [&](const Coordinates & id)
				687	{
				688	const auto ta_y_top = vld2q_u8(in_y.ptr());
				689	const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
				690	uint8x16x2_t ta_uv;
				691	ta_uv.val[0] = vld1q_u8(in_u.ptr());
				692	ta_uv.val[1] = vld1q_u8(in_v.ptr());
				693	//ta_y.val[0] = Y0 Y2 Y4 Y6 ...
				694	//ta_y.val[1] = Y1 Y3 Y5 Y7 ...
				695	//ta_uv.val[0] = U0 U2 U4 U6 ...
				696	//ta_uv.val[1] = V0 V2 V4 V6 ...
				697
				698	vst2q_u8(out_y.ptr(), ta_y_top);
				699	vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
				700	vst2q_u8(out_uv.ptr(), ta_uv);
				701	},
				702	in_y, in_u, in_v, out_y, out_uv);
				703	}
				704
Alex Gilday	c357c47	2018-03-21 13:54:09 +0000	[diff] [blame]	705	/** Convert NV12 to IYUV.
				706	*
				707	* @param[in] input Input NV12 data buffer.
				708	* @param[out] output Output IYUV buffer.
				709	* @param[in] win Window for iterating the buffers.
				710	*
				711	*/
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	712	template <bool uv>
				713	void colorconvert_nv12_to_iyuv(const void __restrict input, void __restrict output, const Window &win)
				714	{
				715	ARM_COMPUTE_ERROR_ON(nullptr == input);
				716	ARM_COMPUTE_ERROR_ON(nullptr == output);
				717	win.validate();
				718
				719	const auto input_ptr = static_cast<const IMultiImage *__restrict>(input);
				720	const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
				721
				722	constexpr auto shift = uv ? 0 : 1;
				723
				724	// UV's width and height are subsampled
				725	Window win_uv(win);
				726	win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
				727	win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
				728	win_uv.validate();
				729
				730	Iterator in_y(input_ptr->plane(0), win);
				731	Iterator in_uv(input_ptr->plane(1), win_uv);
				732	Iterator out_y(output_ptr->plane(0), win);
				733	Iterator out_u(output_ptr->plane(1), win_uv);
				734	Iterator out_v(output_ptr->plane(2), win_uv);
				735
				736	execute_window_loop(win, [&](const Coordinates & id)
				737	{
				738	const auto ta_y_top = vld2q_u8(in_y.ptr());
				739	const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
				740	const auto ta_uv = vld2q_u8(in_uv.ptr());
				741	//ta_y.val[0] = Y0 Y2 Y4 Y6 ...
				742	//ta_y.val[1] = Y1 Y3 Y5 Y7 ...
				743	//ta_uv.val[0] = U0 U2 U4 U6 ...
				744	//ta_uv.val[1] = V0 V2 V4 V6 ...
				745
				746	vst2q_u8(out_y.ptr(), ta_y_top);
				747	vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
				748	vst1q_u8(out_u.ptr(), ta_uv.val[0 + shift]);
				749	vst1q_u8(out_v.ptr(), ta_uv.val[1 - shift]);
				750	},
				751	in_y, in_uv, out_y, out_u, out_v);
				752	}
				753
Alex Gilday	c357c47	2018-03-21 13:54:09 +0000	[diff] [blame]	754	/** Convert YUYV to IYUV.
				755	*
				756	* @param[in] input Input YUYV data buffer.
				757	* @param[out] output Output IYUV buffer.
				758	* @param[in] win Window for iterating the buffers.
				759	*
				760	*/
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	761	template <bool yuyv>
				762	void colorconvert_yuyv_to_iyuv(const void __restrict input, void __restrict output, const Window &win)
				763	{
				764	ARM_COMPUTE_ERROR_ON(nullptr == input);
				765	ARM_COMPUTE_ERROR_ON(nullptr == output);
				766	win.validate();
				767
				768	const auto input_ptr = static_cast<const IImage *__restrict>(input);
				769	const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
				770
				771	constexpr auto shift = yuyv ? 0 : 1;
				772
				773	// Destination's UV's width and height are subsampled
				774	Window win_uv(win);
				775	win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
				776	win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
				777	win_uv.validate();
				778
				779	Iterator in(input_ptr, win);
				780	Iterator out_y(output_ptr->plane(0), win);
				781	Iterator out_u(output_ptr->plane(1), win_uv);
				782	Iterator out_v(output_ptr->plane(2), win_uv);
				783
				784	execute_window_loop(win, [&](const Coordinates & id)
				785	{
				786	const auto ta_top = vld4q_u8(in.ptr());
				787	const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
				788	//ta.val[0] = Y0 Y2 Y4 Y6 ...
				789	//ta.val[1] = U0 U2 U4 U6 ...
				790	//ta.val[2] = Y1 Y3 Y5 Y7 ...
				791	//ta.val[3] = V0 V2 V4 V7 ...
				792
				793	uint8x16x2_t yvec;
				794	yvec.val[0] = ta_top.val[0 + shift];
				795	yvec.val[1] = ta_top.val[2 + shift];
				796	vst2q_u8(out_y.ptr(), yvec);
				797
				798	uint8x16x2_t yyvec;
				799	yyvec.val[0] = ta_bottom.val[0 + shift];
				800	yyvec.val[1] = ta_bottom.val[2 + shift];
				801	vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec);
				802
				803	uint8x16_t uvec;
				804	uvec = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]);
				805	vst1q_u8(out_u.ptr(), uvec);
				806
				807	uint8x16_t vvec;
				808	vvec = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]);
				809	vst1q_u8(out_v.ptr(), vvec);
				810	},
				811	in, out_y, out_u, out_v);
				812	}
				813
Alex Gilday	c357c47	2018-03-21 13:54:09 +0000	[diff] [blame]	814	/** Convert NV12 to YUV4.
				815	*
				816	* @param[in] input Input NV12 data buffer.
				817	* @param[out] output Output YUV4 buffer.
				818	* @param[in] win Window for iterating the buffers.
				819	*
				820	*/
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	821	template <bool uv>
				822	void colorconvert_nv12_to_yuv4(const void __restrict input, void __restrict output, const Window &win)
				823	{
				824	ARM_COMPUTE_ERROR_ON(nullptr == input);
				825	ARM_COMPUTE_ERROR_ON(nullptr == output);
				826	win.validate();
				827
				828	const auto input_ptr = static_cast<const IMultiImage *__restrict>(input);
				829	const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
				830
				831	constexpr auto shift = uv ? 0 : 1;
				832
				833	// UV's width and height are subsampled
				834	Window win_uv(win);
				835	win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
				836	win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
				837	win_uv.validate();
				838
				839	Iterator in_y(input_ptr->plane(0), win);
				840	Iterator in_uv(input_ptr->plane(1), win_uv);
				841	Iterator out_y(output_ptr->plane(0), win);
				842	Iterator out_u(output_ptr->plane(1), win);
				843	Iterator out_v(output_ptr->plane(2), win);
				844
				845	execute_window_loop(win, [&](const Coordinates & id)
				846	{
				847	const auto ta_y_top = vld2q_u8(in_y.ptr());
				848	const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
				849	const auto ta_uv = vld2q_u8(in_uv.ptr());
				850	//ta_y.val[0] = Y0 Y2 Y4 Y6 ...
				851	//ta_y.val[1] = Y1 Y3 Y5 Y7 ...
				852	//ta_uv.val[0] = U0 U2 U4 U6 ...
				853	//ta_uv.val[1] = V0 V2 V4 V6 ...
				854
				855	vst2q_u8(out_y.ptr(), ta_y_top);
				856	vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
				857
				858	uint8x16x2_t uvec;
				859	uvec.val[0] = ta_uv.val[0 + shift];
				860	uvec.val[1] = ta_uv.val[0 + shift];
				861	vst2q_u8(out_u.ptr(), uvec);
				862	vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec);
				863
				864	uint8x16x2_t vvec;
				865	vvec.val[0] = ta_uv.val[1 - shift];
				866	vvec.val[1] = ta_uv.val[1 - shift];
				867	vst2q_u8(out_v.ptr(), vvec);
				868	vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec);
				869	},
				870	in_y, in_uv, out_y, out_u, out_v);
				871	}
				872
Alex Gilday	c357c47	2018-03-21 13:54:09 +0000	[diff] [blame]	873	/** Convert IYUV to YUV4.
				874	*
				875	* @param[in] input Input IYUV data buffer.
				876	* @param[out] output Output YUV4 buffer.
				877	* @param[in] win Window for iterating the buffers.
				878	*
				879	*/
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	880	void colorconvert_iyuv_to_yuv4(const void __restrict input, void __restrict output, const Window &win)
				881	{
				882	ARM_COMPUTE_ERROR_ON(nullptr == input);
				883	ARM_COMPUTE_ERROR_ON(nullptr == output);
				884	win.validate();
				885
				886	const auto input_ptr = static_cast<const IMultiImage *__restrict>(input);
				887	const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
				888
				889	// UV's width and height are subsampled
				890	Window win_uv(win);
				891	win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
				892	win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
				893	win_uv.validate();
				894
				895	Iterator in_y(input_ptr->plane(0), win);
				896	Iterator in_u(input_ptr->plane(1), win_uv);
				897	Iterator in_v(input_ptr->plane(2), win_uv);
				898	Iterator out_y(output_ptr->plane(0), win);
				899	Iterator out_u(output_ptr->plane(1), win);
				900	Iterator out_v(output_ptr->plane(2), win);
				901
				902	execute_window_loop(win, [&](const Coordinates & id)
				903	{
				904	const auto ta_y_top = vld2q_u8(in_y.ptr());
				905	const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
				906	const auto ta_u = vld1q_u8(in_u.ptr());
				907	const auto ta_v = vld1q_u8(in_v.ptr());
				908	//ta_y.val[0] = Y0 Y2 Y4 Y6 ...
				909	//ta_y.val[1] = Y1 Y3 Y5 Y7 ...
				910	//ta_u = U0 U2 U4 U6 ...
				911	//ta_v = V0 V2 V4 V6 ...
				912
				913	vst2q_u8(out_y.ptr(), ta_y_top);
				914	vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
				915
				916	uint8x16x2_t uvec;
				917	uvec.val[0] = ta_u;
				918	uvec.val[1] = ta_u;
				919	vst2q_u8(out_u.ptr(), uvec);
				920	vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec);
				921
				922	uint8x16x2_t vvec;
				923	vvec.val[0] = ta_v;
				924	vvec.val[1] = ta_v;
				925	vst2q_u8(out_v.ptr(), vvec);
				926	vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec);
				927	},
				928	in_y, in_u, in_v, out_y, out_u, out_v);
				929	}
				930
Alex Gilday	c357c47	2018-03-21 13:54:09 +0000	[diff] [blame]	931	/** Convert RGB to NV12.
				932	*
				933	* @param[in] input Input RGB data buffer.
				934	* @param[out] output Output NV12 buffer.
				935	* @param[in] win Window for iterating the buffers.
				936	*
				937	*/
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	938	template <bool alpha>
				939	void colorconvert_rgb_to_nv12(const void __restrict input, void __restrict output, const Window &win)
				940	{
				941	ARM_COMPUTE_ERROR_ON(nullptr == input);
				942	ARM_COMPUTE_ERROR_ON(nullptr == output);
				943	win.validate();
				944
				945	const auto input_ptr = static_cast<const IImage *__restrict>(input);
				946	const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
				947
				948	// UV's width and height are subsampled
				949	Window win_uv(win);
				950	win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
				951	win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
				952	win_uv.validate();
				953
				954	Iterator in(input_ptr, win);
				955	Iterator out_y(output_ptr->plane(0), win);
				956	Iterator out_uv(output_ptr->plane(1), win_uv);
				957
				958	execute_window_loop(win, [&](const Coordinates & id)
				959	{
				960	const auto ta_rgb_top = load_rgb(in.ptr(), alpha);
				961	const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha);
				962	//ta_rgb.val[0] = R0 R1 R2 R3 ...
				963	//ta_rgb.val[1] = G0 G1 G2 G3 ...
				964	//ta_rgb.val[2] = B0 B1 B2 B3 ...
				965
				966	store_rgb_to_nv12(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2],
				967	ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2],
				968	out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(),
				969	out_uv.ptr());
				970	},
				971	in, out_y, out_uv);
				972	}
				973
Alex Gilday	c357c47	2018-03-21 13:54:09 +0000	[diff] [blame]	974	/** Convert RGB to IYUV.
				975	*
				976	* @param[in] input Input RGB data buffer.
				977	* @param[out] output Output IYUV buffer.
				978	* @param[in] win Window for iterating the buffers.
				979	*
				980	*/
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	981	template <bool alpha>
				982	void colorconvert_rgb_to_iyuv(const void __restrict input, void __restrict output, const Window &win)
				983	{
				984	ARM_COMPUTE_ERROR_ON(nullptr == input);
				985	ARM_COMPUTE_ERROR_ON(nullptr == output);
				986	win.validate();
				987
				988	const auto input_ptr = static_cast<const IImage *__restrict>(input);
				989	const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
				990
				991	// UV's width and height are subsampled
				992	Window win_uv(win);
				993	win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
				994	win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
				995	win_uv.validate();
				996
				997	Iterator in(input_ptr, win);
				998	Iterator out_y(output_ptr->plane(0), win);
				999	Iterator out_u(output_ptr->plane(1), win_uv);
				1000	Iterator out_v(output_ptr->plane(2), win_uv);
				1001
				1002	execute_window_loop(win, [&](const Coordinates & id)
				1003	{
				1004	const auto ta_rgb_top = load_rgb(in.ptr(), alpha);
				1005	const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha);
				1006	//ta_rgb.val[0] = R0 R1 R2 R3 ...
				1007	//ta_rgb.val[1] = G0 G1 G2 G3 ...
				1008	//ta_rgb.val[2] = B0 B1 B2 B3 ...
				1009
				1010	store_rgb_to_iyuv(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2],
				1011	ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2],
				1012	out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(),
				1013	out_u.ptr(), out_v.ptr());
				1014	},
				1015	in, out_y, out_u, out_v);
				1016	}
				1017
Alex Gilday	c357c47	2018-03-21 13:54:09 +0000	[diff] [blame]	1018	/** Convert RGB to YUV4.
				1019	*
				1020	* @param[in] input Input RGB data buffer.
				1021	* @param[out] output Output YUV4 buffer.
				1022	* @param[in] win Window for iterating the buffers.
				1023	*
				1024	*/
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1025	template <bool alpha>
				1026	void colorconvert_rgb_to_yuv4(const void __restrict input, void __restrict output, const Window &win)
				1027	{
				1028	ARM_COMPUTE_ERROR_ON(nullptr == input);
				1029	ARM_COMPUTE_ERROR_ON(nullptr == output);
				1030	win.validate();
				1031
				1032	const auto input_ptr = static_cast<const IImage *__restrict>(input);
				1033	const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
				1034
				1035	Iterator in(input_ptr, win);
				1036	Iterator out_y(output_ptr->plane(0), win);
				1037	Iterator out_u(output_ptr->plane(1), win);
				1038	Iterator out_v(output_ptr->plane(2), win);
				1039
				1040	execute_window_loop(win, [&](const Coordinates & id)
				1041	{
				1042	const auto ta_rgb = load_rgb(in.ptr(), alpha);
				1043	//ta_rgb.val[0] = R0 R1 R2 R3 ...
				1044	//ta_rgb.val[1] = G0 G1 G2 G3 ...
				1045	//ta_rgb.val[2] = B0 B1 B2 B3 ...
				1046
				1047	store_rgb_to_yuv4(ta_rgb.val[0], ta_rgb.val[1], ta_rgb.val[2],
				1048	out_y.ptr(), out_u.ptr(), out_v.ptr());
				1049	},
				1050	in, out_y, out_u, out_v);
				1051	}
Gian Marco Iodice	356f643	2017-09-22 11:32:21 +0100	[diff] [blame]	1052	} // namespace arm_compute