Blame - src/core/CL/cl_kernels/concatenate.cl - ml/ComputeLibrary

blob: d2e65408dcf6161975f83f49fd7860375b8b403f [file] [log] [blame]

Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1	/*
Michele Di Giorgio	d9eaf61	2020-07-08 11:12:57 +0100	[diff] [blame]	2	* Copyright (c) 2017-2020 Arm Limited.
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "helpers.h"
				25
Giorgio Arena	c90fcfe	2020-11-25 11:51:30 +0000	[diff] [blame]	26	#if defined(VEC_SIZE)
				27	#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
				28
Pablo Tello	eb6c88a	2019-02-07 15:53:19 +0000	[diff] [blame]	29	#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
				30	#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE) // float vector with VEC_SIZE lanes
Manuel Bottini	8481d83	2019-12-10 15:28:40 +0000	[diff] [blame]	31	#define VEC_QUANT VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) // DATA_TYPE vector with VEC_SIZE lanes
Pablo Tello	eb6c88a	2019-02-07 15:53:19 +0000	[diff] [blame]	32	#define CONVERT_RTE(x, type) (convert_##type##_rte((x))) // pastes `type` into the name convert_<type>_rte (round-to-nearest-even conversion)
				33	#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type) // extra macro level: a macro passed as `type` is expanded here before CONVERT_RTE pastes it with ##
Manuel Bottini	8481d83	2019-12-10 15:28:40 +0000	[diff] [blame]	34	inline VEC_QUANT requantize(VEC_QUANT input, float in_offset, float out_offset, float in_scale, float out_scale) // maps each lane from the (in_offset, in_scale) representation to the (out_offset, out_scale) one
Pablo Tello	eb6c88a	2019-02-07 15:53:19 +0000	[diff] [blame]	35	{
				36	const VEC_FLOAT in_f32 = (CONVERT(input, VEC_FLOAT) - (VEC_FLOAT)((float)in_offset)) * (VEC_FLOAT)((float)in_scale); // to float: (q - in_offset) * in_scale
				37	const VEC_FLOAT out_f32 = in_f32 / ((VEC_FLOAT)(float)out_scale) + ((VEC_FLOAT)((float)out_offset)); // back to the output scale: f / out_scale + out_offset
Manuel Bottini	8481d83	2019-12-10 15:28:40 +0000	[diff] [blame]	38	const VEC_QUANT res_q8 = CONVERT_SAT(CONVERT_DOWN(out_f32, VEC_INT), VEC_QUANT); // float -> int lanes via the _rte conversion, then CONVERT_SAT to DATA_TYPE (presumably saturating; macro lives in helpers.h - confirm)
				39	return res_q8;
Pablo Tello	eb6c88a	2019-02-07 15:53:19 +0000	[diff] [blame]	40	}
				41	#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
Michele Di Giorgio	e6dbde0	2018-10-19 15:46:19 +0100	[diff] [blame]	42
Giorgio Arena	c90fcfe	2020-11-25 11:51:30 +0000	[diff] [blame]	43	#if defined(DATA_TYPE)
				44	#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Michele Di Giorgio	27400b9	2018-11-01 13:44:05 +0000	[diff] [blame]	45
Giorgio Arena	c90fcfe	2020-11-25 11:51:30 +0000	[diff] [blame]	46	#if defined(DEPTH) && defined(ELEMENT_SIZE)
Michele Di Giorgio	27400b9	2018-11-01 13:44:05 +0000	[diff] [blame]	47	#if defined(INPUT1_WIDTH)
				48
Giorgio Arena	c90fcfe	2020-11-25 11:51:30 +0000	[diff] [blame]	49	#define SELECT_TYPE SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
				50	#define SEQ VEC_OFFS(int, VEC_SIZE)
Pablo Tello	eb6c88a	2019-02-07 15:53:19 +0000	[diff] [blame]	51
Michele Di Giorgio	27400b9	2018-11-01 13:44:05 +0000	[diff] [blame]	52	/** This kernel concatenates two input tensors into the output tensor along the first dimension
				53	*
				54	* @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
				55	* @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
Sheri Zhang	7292362	2020-10-27 10:19:41 +0000	[diff] [blame]	56	* @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
Michele Di Giorgio	27400b9	2018-11-01 13:44:05 +0000	[diff] [blame]	57	* @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
				58	* @note First input tensor width should be given as a preprocessor argument using -DINPUT1_WIDTH=width. e.g. -DINPUT1_WIDTH=8
				59	*
Michele Di Giorgio	f6f7876	2020-07-06 11:27:21 +0100	[diff] [blame]	60	* @param[in] src1_ptr Pointer to the source tensor. Supported data types: All.
Michele Di Giorgio	27400b9	2018-11-01 13:44:05 +0000	[diff] [blame]	61	* @param[in] src1_stride_x Stride of the source tensor in X dimension (in bytes)
				62	* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				63	* @param[in] src1_stride_y Stride of the source tensor in Y dimension (in bytes)
				64	* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
				65	* @param[in] src1_stride_z Stride of the source tensor in Z dimension (in bytes)
				66	* @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
				67	* @param[in] src1_stride_w Stride of the first source tensor in W dimension (in bytes)
				68	* @param[in] src1_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
				69	* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source tensor
				70	* @param[in] src2_ptr Pointer to the source tensor. Supported data types: same as @p src1_ptr
				71	* @param[in] src2_stride_x Stride of the source tensor in X dimension (in bytes)
				72	* @param[in] src2_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				73	* @param[in] src2_stride_y Stride of the source tensor in Y dimension (in bytes)
				74	* @param[in] src2_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
				75	* @param[in] src2_stride_z Stride of the source tensor in Z dimension (in bytes)
				76	* @param[in] src2_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
				77	* @param[in] src2_stride_w Stride of the second source tensor in W dimension (in bytes)
				78	* @param[in] src2_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
				79	* @param[in] src2_offset_first_element_in_bytes The offset of the first element in the source tensor
				80	* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src1_ptr
				81	* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
				82	* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
				83	* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
				84	* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
				85	* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
				86	* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
				87	* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
				88	* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
				89	* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
				90	*/
				91	__kernel void concatenate_width_x2(
				92	TENSOR4D_DECLARATION(src1),
				93	TENSOR4D_DECLARATION(src2),
Sheri Zhang	7292362	2020-10-27 10:19:41 +0000	[diff] [blame]	94	TENSOR4D_DECLARATION(dst))
Michele Di Giorgio	27400b9	2018-11-01 13:44:05 +0000	[diff] [blame]	95	{
Michele Di Giorgio	27400b9	2018-11-01 13:44:05 +0000	[diff] [blame]	96	// Calculate input indices
Sheri Zhang	7292362	2020-10-27 10:19:41 +0000	[diff] [blame]	97	const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); // work-item 0 starts at 0; later ones are shifted back by (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE elements
Michele Di Giorgio	27400b9	2018-11-01 13:44:05 +0000	[diff] [blame]	98	const int y = get_global_id(1);
				99	const int z = get_global_id(2) % (int)DEPTH; // global id 2 packs both z and w: z = id % DEPTH
				100	const int w = get_global_id(2) / (int)DEPTH; // ... and w = id / DEPTH
Sheri Zhang	7292362	2020-10-27 10:19:41 +0000	[diff] [blame]	101	const int x1 = min(x, (int)INPUT1_WIDTH - (int)VEC_SIZE); // src1 load start, capped at INPUT1_WIDTH - VEC_SIZE
				102	const int x2 = max(x - (int)INPUT1_WIDTH, 0); // x relative to the start of src2, floored at 0
Michele Di Giorgio	27400b9	2018-11-01 13:44:05 +0000	[diff] [blame]	103
				104	// Calculate inputs and output addresses
Sheri Zhang	7292362	2020-10-27 10:19:41 +0000	[diff] [blame]	105	const __global uchar *dst_addr = dst_ptr + (int)dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * (int)dst_stride_y + z * (int)dst_stride_z + w * (int)dst_stride_w;
				106	const __global uchar *src1_addr = src1_ptr + (int)src1_offset_first_element_in_bytes + x1 * sizeof(DATA_TYPE) + y * (int)src1_stride_y + z * (int)src1_stride_z + w * (int)src1_stride_w;
				107	const __global uchar *src2_addr = src2_ptr + (int)src2_offset_first_element_in_bytes + x2 * sizeof(DATA_TYPE) + y * (int)src2_stride_y + z * (int)src2_stride_z + w * (int)src2_stride_w;
Michele Di Giorgio	27400b9	2018-11-01 13:44:05 +0000	[diff] [blame]	108
Giorgio Arena	c90fcfe	2020-11-25 11:51:30 +0000	[diff] [blame]	109	VEC_TYPE src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src1_addr);
				110	VEC_TYPE src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src2_addr);
Michele Di Giorgio	27400b9	2018-11-01 13:44:05 +0000	[diff] [blame]	111
Georgios Pinitas	6631ac2	2019-04-17 12:12:56 +0100	[diff] [blame]	112	#if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT)
Pablo Tello	eb6c88a	2019-02-07 15:53:19 +0000	[diff] [blame]	113	src1_values = requantize(src1_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
				114	src2_values = requantize(src2_values, OFFSET_IN2, OFFSET_OUT, SCALE_IN2, SCALE_OUT);
Georgios Pinitas	6631ac2	2019-04-17 12:12:56 +0100	[diff] [blame]	115	#endif /* defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) */
Giorgio Arena	c90fcfe	2020-11-25 11:51:30 +0000	[diff] [blame]	116	const VEC_INT x_coords = SEQ + (VEC_INT)(x); // per-lane output x coordinate
Michele Di Giorgio	27400b9	2018-11-01 13:44:05 +0000	[diff] [blame]	117
Sheri Zhang	7292362	2020-10-27 10:19:41 +0000	[diff] [blame]	118	// Rotate src1/2_values, if values0 is a combination of src1_values and src2_values.
Giorgio Arena	c90fcfe	2020-11-25 11:51:30 +0000	[diff] [blame]	119	SELECT_TYPE cond = CONVERT(((VEC_INT)x < (VEC_INT)INPUT1_WIDTH) && ((VEC_INT)x > (VEC_INT)(INPUT1_WIDTH - VEC_SIZE)), SELECT_TYPE); // set when INPUT1_WIDTH - VEC_SIZE < x < INPUT1_WIDTH, i.e. this vector spans both inputs
				120	src1_values = select(src1_values, ROTATE(src1_values, VEC_SIZE, INPUT1_ROTATE_N), cond);
				121	src2_values = select(src2_values, ROTATE(src2_values, VEC_SIZE, INPUT1_ROTATE_N), cond);
Sheri Zhang	7292362	2020-10-27 10:19:41 +0000	[diff] [blame]	122
Giorgio Arena	c90fcfe	2020-11-25 11:51:30 +0000	[diff] [blame]	123	cond = CONVERT(x_coords < (VEC_INT)(INPUT1_WIDTH), SELECT_TYPE);
				124	const VEC_TYPE values0 = select(src2_values, src1_values, cond); // lanes with coordinate < INPUT1_WIDTH take src1, the rest take src2
Sheri Zhang	7292362	2020-10-27 10:19:41 +0000	[diff] [blame]	125
				126	STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) // NOTE(review): `values` looks like a base name the macro suffixes to reach values0; macro is in helpers.h - confirm
Michele Di Giorgio	27400b9	2018-11-01 13:44:05 +0000	[diff] [blame]	127	}
				128
				129	#if defined(INPUT2_WIDTH) && defined(INPUT3_WIDTH)
				130	/** This kernel concatenates four input tensors into the output tensor along the first dimension
				131	*
				132	* @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
				133	* @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
Sheri Zhang	7292362	2020-10-27 10:19:41 +0000	[diff] [blame]	134	* @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
Michele Di Giorgio	27400b9	2018-11-01 13:44:05 +0000	[diff] [blame]	135	* @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
				136	* @note First input tensor width should be given as a preprocessor argument using -DINPUT1_WIDTH=width. e.g. -DINPUT1_WIDTH=8
				137	* @note Second input tensor width should be given as a preprocessor argument using -DINPUT2_WIDTH=width. e.g. -DINPUT2_WIDTH=8
				138	* @note Third input tensor width should be given as a preprocessor argument using -DINPUT3_WIDTH=width. e.g. -DINPUT3_WIDTH=8
				139	*
Michele Di Giorgio	f6f7876	2020-07-06 11:27:21 +0100	[diff] [blame]	140	* @param[in] src1_ptr Pointer to the source tensor. Supported data types: All
Michele Di Giorgio	27400b9	2018-11-01 13:44:05 +0000	[diff] [blame]	141	* @param[in] src1_stride_x Stride of the source tensor in X dimension (in bytes)
				142	* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				143	* @param[in] src1_stride_y Stride of the source tensor in Y dimension (in bytes)
				144	* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
				145	* @param[in] src1_stride_z Stride of the source tensor in Z dimension (in bytes)
				146	* @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
				147	* @param[in] src1_stride_w Stride of the first source tensor in W dimension (in bytes)
				148	* @param[in] src1_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
				149	* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source tensor
				150	* @param[in] src2_ptr Pointer to the source tensor. Supported data types: same as @p src1_ptr
				151	* @param[in] src2_stride_x Stride of the source tensor in X dimension (in bytes)
				152	* @param[in] src2_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				153	* @param[in] src2_stride_y Stride of the source tensor in Y dimension (in bytes)
				154	* @param[in] src2_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
				155	* @param[in] src2_stride_z Stride of the source tensor in Z dimension (in bytes)
				156	* @param[in] src2_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
				157	* @param[in] src2_stride_w Stride of the second source tensor in W dimension (in bytes)
				158	* @param[in] src2_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
				159	* @param[in] src2_offset_first_element_in_bytes The offset of the first element in the source tensor
				160	* @param[in] src3_ptr Pointer to the source tensor. Supported data types: same as @p src1_ptr
				161	* @param[in] src3_stride_x Stride of the source tensor in X dimension (in bytes)
				162	* @param[in] src3_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				163	* @param[in] src3_stride_y Stride of the source tensor in Y dimension (in bytes)
				164	* @param[in] src3_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
				165	* @param[in] src3_stride_z Stride of the source tensor in Z dimension (in bytes)
				166	* @param[in] src3_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
				167	* @param[in] src3_stride_w Stride of the third source tensor in W dimension (in bytes)
				168	* @param[in] src3_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
				169	* @param[in] src3_offset_first_element_in_bytes The offset of the first element in the source tensor
				170	* @param[in] src4_ptr Pointer to the source tensor. Supported data types: same as @p src1_ptr
				171	* @param[in] src4_stride_x Stride of the source tensor in X dimension (in bytes)
				172	* @param[in] src4_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				173	* @param[in] src4_stride_y Stride of the source tensor in Y dimension (in bytes)
				174	* @param[in] src4_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
				175	* @param[in] src4_stride_z Stride of the source tensor in Z dimension (in bytes)
				176	* @param[in] src4_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
				177	* @param[in] src4_stride_w Stride of the fourth source tensor in W dimension (in bytes)
				178	* @param[in] src4_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
				179	* @param[in] src4_offset_first_element_in_bytes The offset of the first element in the source tensor
				180	* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src1_ptr
				181	* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
				182	* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
				183	* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
				184	* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
				185	* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
				186	* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
				187	* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
				188	* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
				189	* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
				190	*/
				191	__kernel void concatenate_width_x4(
				192	TENSOR4D_DECLARATION(src1),
				193	TENSOR4D_DECLARATION(src2),
				194	TENSOR4D_DECLARATION(src3),
				195	TENSOR4D_DECLARATION(src4),
Sheri Zhang	7292362	2020-10-27 10:19:41 +0000	[diff] [blame]	196	TENSOR4D_DECLARATION(dst))
Michele Di Giorgio	27400b9	2018-11-01 13:44:05 +0000	[diff] [blame]	197	{
Michele Di Giorgio	27400b9	2018-11-01 13:44:05 +0000	[diff] [blame]	198	// Calculate input indices
Sheri Zhang	7292362	2020-10-27 10:19:41 +0000	[diff] [blame]	199	const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); // work-item 0 starts at 0; later ones are shifted back by (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE elements
Michele Di Giorgio	27400b9	2018-11-01 13:44:05 +0000	[diff] [blame]	200	const int y = get_global_id(1);
				201	const int z = get_global_id(2) % (int)DEPTH; // global id 2 packs both z and w: z = id % DEPTH
				202	const int w = get_global_id(2) / (int)DEPTH; // ... and w = id / DEPTH
				203
Sheri Zhang	7292362	2020-10-27 10:19:41 +0000	[diff] [blame]	204	const int x1 = min(x, (int)INPUT1_WIDTH - (int)VEC_SIZE); // per-input load start: x rebased to that input, floored at 0 and capped at width - VEC_SIZE
				205	const int x2 = min(max(x - (int)INPUT1_WIDTH, 0), (int)INPUT2_WIDTH - (int)VEC_SIZE);
				206	const int x3 = min(max(x - (int)INPUT1_WIDTH - (int)INPUT2_WIDTH, 0), (int)INPUT3_WIDTH - (int)VEC_SIZE);
				207	const int x4 = max(x - (int)INPUT1_WIDTH - (int)INPUT2_WIDTH - (int)INPUT3_WIDTH, 0); // last input: only floored at 0
Michele Di Giorgio	27400b9	2018-11-01 13:44:05 +0000	[diff] [blame]	208
				209	// Calculate inputs and output addresses
Sheri Zhang	7292362	2020-10-27 10:19:41 +0000	[diff] [blame]	210	const __global uchar *dst_addr = dst_ptr + (int)dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * (int)dst_stride_y + z * (int)dst_stride_z + w * (int)dst_stride_w;
				211	const __global uchar *src1_addr = src1_ptr + (int)src1_offset_first_element_in_bytes + x1 * sizeof(DATA_TYPE) + y * (int)src1_stride_y + z * (int)src1_stride_z + w * (int)src1_stride_w;
				212	const __global uchar *src2_addr = src2_ptr + (int)src2_offset_first_element_in_bytes + x2 * sizeof(DATA_TYPE) + y * (int)src2_stride_y + z * (int)src2_stride_z + w * (int)src2_stride_w;
				213	const __global uchar *src3_addr = src3_ptr + (int)src3_offset_first_element_in_bytes + x3 * sizeof(DATA_TYPE) + y * (int)src3_stride_y + z * (int)src3_stride_z + w * (int)src3_stride_w;
				214	const __global uchar *src4_addr = src4_ptr + (int)src4_offset_first_element_in_bytes + x4 * sizeof(DATA_TYPE) + y * (int)src4_stride_y + z * (int)src4_stride_z + w * (int)src4_stride_w;
Michele Di Giorgio	27400b9	2018-11-01 13:44:05 +0000	[diff] [blame]	215
Giorgio Arena	c90fcfe	2020-11-25 11:51:30 +0000	[diff] [blame]	216	VEC_TYPE src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src1_addr);
				217	VEC_TYPE src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src2_addr);
				218	VEC_TYPE src3_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src3_addr);
				219	VEC_TYPE src4_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src4_addr);
Pablo Tello	eb6c88a	2019-02-07 15:53:19 +0000	[diff] [blame]	220
				221	#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2) && defined(OFFSET_IN3) && defined(SCALE_IN3) && defined(OFFSET_IN4) && defined(SCALE_IN4)
				222	src1_values = requantize(src1_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
				223	src2_values = requantize(src2_values, OFFSET_IN2, OFFSET_OUT, SCALE_IN2, SCALE_OUT);
				224	src3_values = requantize(src3_values, OFFSET_IN3, OFFSET_OUT, SCALE_IN3, SCALE_OUT);
				225	src4_values = requantize(src4_values, OFFSET_IN4, OFFSET_OUT, SCALE_IN4, SCALE_OUT);
				226	#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2) && defined(OFFSET_IN3) && defined(SCALE_IN3) && defined(OFFSET_IN4) && defined(SCALE_IN4) */
Michele Di Giorgio	27400b9	2018-11-01 13:44:05 +0000	[diff] [blame]	227
Giorgio Arena	c90fcfe	2020-11-25 11:51:30 +0000	[diff] [blame]	228	const VEC_INT x_coords = SEQ + (VEC_INT)(x); // per-lane output x coordinate
Michele Di Giorgio	27400b9	2018-11-01 13:44:05 +0000	[diff] [blame]	229
Giorgio Arena	c90fcfe	2020-11-25 11:51:30 +0000	[diff] [blame]	230	SELECT_TYPE cond_in2 = CONVERT(((VEC_INT)x < (VEC_INT)INPUT1_WIDTH && (VEC_INT)x > (VEC_INT)(INPUT1_WIDTH - VEC_SIZE)), SELECT_TYPE); // each cond_inN is set when x lies strictly inside the last VEC_SIZE elements before that boundary, i.e. the vector spans it
				231	SELECT_TYPE cond_in3 = CONVERT(((VEC_INT)x < (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH) && (VEC_INT)x > (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH - VEC_SIZE)), SELECT_TYPE);
				232	SELECT_TYPE cond_in4 = CONVERT(((VEC_INT)x < (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH + INPUT3_WIDTH) && (VEC_INT)x > (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH + INPUT3_WIDTH - VEC_SIZE)), SELECT_TYPE);
Michele Di Giorgio	27400b9	2018-11-01 13:44:05 +0000	[diff] [blame]	233
Sheri Zhang	7292362	2020-10-27 10:19:41 +0000	[diff] [blame]	234	// Rotate src1/2_values, if values0 is a combination of src1_values and src2_values.
Giorgio Arena	c90fcfe	2020-11-25 11:51:30 +0000	[diff] [blame]	235	src1_values = select(src1_values, ROTATE(src1_values, VEC_SIZE, INPUT1_ROTATE_N), cond_in2);
				236	src2_values = select(src2_values, ROTATE(src2_values, VEC_SIZE, INPUT1_ROTATE_N), cond_in2);
Sheri Zhang	7292362	2020-10-27 10:19:41 +0000	[diff] [blame]	237	// Rotate src2/3_values, if values0 is a combination of src2_values and src3_values.
Giorgio Arena	c90fcfe	2020-11-25 11:51:30 +0000	[diff] [blame]	238	src2_values = select(src2_values, ROTATE(src2_values, VEC_SIZE, INPUT2_ROTATE_N), cond_in3);
				239	src3_values = select(src3_values, ROTATE(src3_values, VEC_SIZE, INPUT2_ROTATE_N), cond_in3);
Sheri Zhang	7292362	2020-10-27 10:19:41 +0000	[diff] [blame]	240	// Rotate src3/4_values, if values0 is a combination of src3_values and src4_values.
Giorgio Arena	c90fcfe	2020-11-25 11:51:30 +0000	[diff] [blame]	241	src3_values = select(src3_values, ROTATE(src3_values, VEC_SIZE, INPUT3_ROTATE_N), cond_in4);
				242	src4_values = select(src4_values, ROTATE(src4_values, VEC_SIZE, INPUT3_ROTATE_N), cond_in4);
Michele Di Giorgio	27400b9	2018-11-01 13:44:05 +0000	[diff] [blame]	243
Giorgio Arena	c90fcfe	2020-11-25 11:51:30 +0000	[diff] [blame]	244	cond_in2 = CONVERT(x_coords < (VEC_INT)(INPUT1_WIDTH), SELECT_TYPE); // conditions are reused: now per-lane "coordinate is before this boundary" masks
				245	cond_in3 = CONVERT(x_coords < (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH), SELECT_TYPE);
				246	cond_in4 = CONVERT(x_coords < (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH + INPUT3_WIDTH), SELECT_TYPE);
				247
				248	VEC_TYPE values0 = select(src2_values, src1_values, cond_in2); // lanes before boundary 1 take src1, otherwise src2
				249	values0 = select(src3_values, values0, cond_in3); // lanes at or past boundary 2 are replaced by src3
				250	values0 = select(src4_values, values0, cond_in4); // lanes at or past boundary 3 are replaced by src4
Sheri Zhang	7292362	2020-10-27 10:19:41 +0000	[diff] [blame]	251
				252	STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) // NOTE(review): `values` looks like a base name the macro suffixes to reach values0; macro is in helpers.h - confirm
Michele Di Giorgio	27400b9	2018-11-01 13:44:05 +0000	[diff] [blame]	253	}
				254	#endif /* defined(INPUT2_WIDTH) && defined(INPUT3_WIDTH) */
				255	#endif /* defined(INPUT1_WIDTH) */
				256	#endif /* defined(DEPTH) && defined(ELEMENT_SIZE) */
				257
Sheri Zhang	1b50bd4	2020-10-27 00:24:07 +0000	[diff] [blame]	258	#if defined(WIDTH_OFFSET) && defined(DEPTH) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
Michalis Spyrou	55b3d12	2018-05-09 09:59:23 +0100	[diff] [blame]	259	/** This kernel concatenates the input tensor into the output tensor along the first dimension
				260	*
Gian Marco Iodice	1d1f32c	2018-08-10 09:34:11 +0100	[diff] [blame]	261	* @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
Michele Di Giorgio	e6dbde0	2018-10-19 15:46:19 +0100	[diff] [blame]	262	* @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
Sheri Zhang	1b50bd4	2020-10-27 00:24:07 +0000	[diff] [blame]	263	* @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
Gian Marco Iodice	1d1f32c	2018-08-10 09:34:11 +0100	[diff] [blame]	264	* @note The offset for the first spatial dimension has to be passed at compile time using -DWIDTH_OFFSET. i.e. -DWIDTH_OFFSET=128
Michele Di Giorgio	27400b9	2018-11-01 13:44:05 +0000	[diff] [blame]	265	* @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
Gian Marco Iodice	1d1f32c	2018-08-10 09:34:11 +0100	[diff] [blame]	266	*
				267	* @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
Michalis Spyrou	55b3d12	2018-05-09 09:59:23 +0100	[diff] [blame]	268	* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
				269	* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				270	* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
				271	* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
				272	* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
				273	* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Michele Di Giorgio	e6dbde0	2018-10-19 15:46:19 +0100	[diff] [blame]	274	* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
				275	* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
Michalis Spyrou	55b3d12	2018-05-09 09:59:23 +0100	[diff] [blame]	276	* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
				277	* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
				278	* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
				279	* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
				280	* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
				281	* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
				282	* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
				283	* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
Michele Di Giorgio	e6dbde0	2018-10-19 15:46:19 +0100	[diff] [blame]	284	* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
				285	* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Michalis Spyrou	55b3d12	2018-05-09 09:59:23 +0100	[diff] [blame]	286	* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Michalis Spyrou	55b3d12	2018-05-09 09:59:23 +0100	[diff] [blame]	287	*/
Pablo Tello	eb6c88a	2019-02-07 15:53:19 +0000	[diff] [blame]	288
Michalis Spyrou	55b3d12	2018-05-09 09:59:23 +0100	[diff] [blame]	289	__kernel void concatenate_width(
Michele Di Giorgio	e6dbde0	2018-10-19 15:46:19 +0100	[diff] [blame]	290	TENSOR4D_DECLARATION(src),
				291	TENSOR4D_DECLARATION(dst))
Michalis Spyrou	55b3d12	2018-05-09 09:59:23 +0100	[diff] [blame]	292	{
Sheri Zhang	1b50bd4	2020-10-27 00:24:07 +0000	[diff] [blame]	293	// Calculate input indices
				294	const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); // work-item 0 starts at 0; later ones are shifted back by (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE elements
				295	const int y = get_global_id(1);
				296	const int z = get_global_id(2) % (int)DEPTH; // global id 2 packs both z and w: z = id % DEPTH
				297	const int w = get_global_id(2) / (int)DEPTH; // ... and w = id / DEPTH
				298
				299	__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * src_stride_y + z * src_stride_z + w * src_stride_w;
				300	__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z + w * dst_stride_w; // same (x, y, z, w) in dst; the WIDTH_OFFSET shift is added at the store below
Michalis Spyrou	55b3d12	2018-05-09 09:59:23 +0100	[diff] [blame]	301
Giorgio Arena	c90fcfe	2020-11-25 11:51:30 +0000	[diff] [blame]	302	VEC_TYPE source_values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
Michalis Spyrou	55b3d12	2018-05-09 09:59:23 +0100	[diff] [blame]	303
Pablo Tello	eb6c88a	2019-02-07 15:53:19 +0000	[diff] [blame]	304	#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
Sheri Zhang	1b50bd4	2020-10-27 00:24:07 +0000	[diff] [blame]	305	const VEC_QUANT out0 = requantize(source_values0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT); // quantized build: requantize from input to output offset/scale before storing
				306	STORE_VECTOR_SELECT(out, DATA_TYPE, dst_addr + WIDTH_OFFSET * sizeof(DATA_TYPE), VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) // store target is dst_addr advanced by WIDTH_OFFSET elements
Pablo Tello	eb6c88a	2019-02-07 15:53:19 +0000	[diff] [blame]	307	#else /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
Sheri Zhang	1b50bd4	2020-10-27 00:24:07 +0000	[diff] [blame]	308	STORE_VECTOR_SELECT(source_values, DATA_TYPE, dst_addr + WIDTH_OFFSET * sizeof(DATA_TYPE), VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) // plain copy: loaded values go out unchanged, WIDTH_OFFSET elements into dst
Pablo Tello	eb6c88a	2019-02-07 15:53:19 +0000	[diff] [blame]	309	#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
Michalis Spyrou	55b3d12	2018-05-09 09:59:23 +0100	[diff] [blame]	310	}
Pablo Tello	eb6c88a	2019-02-07 15:53:19 +0000	[diff] [blame]	311
Sheri Zhang	1b50bd4	2020-10-27 00:24:07 +0000	[diff] [blame]	312	#endif /* defined(WIDTH_OFFSET) && defined(DEPTH) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)*/
Michalis Spyrou	55b3d12	2018-05-09 09:59:23 +0100	[diff] [blame]	313
Giorgio Arena	4112eed	2020-10-23 14:24:26 +0100	[diff] [blame]	314	#if defined(VEC_SIZE_LEFTOVER)
				315
Pablo Tello	6a14adb	2019-03-05 17:33:08 +0000	[diff] [blame]	316	#if defined(HEIGHT_OFFSET) && defined(DEPTH) && defined(VEC_SIZE)
				317	/** This kernel concatenates the input tensor into the output tensor along the second dimension
				318	*
				319	* @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
				320	* @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
				321	* @note Vector sizes supported are 2,4,8 and 16.
				322	* @note The offset for the second spatial dimension has to be passed at compile time using -DHEIGHT_OFFSET. i.e. -DHEIGHT_OFFSET=128
				323	* @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
Giorgio Arena	4112eed	2020-10-23 14:24:26 +0100	[diff] [blame]	324	* @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
Pablo Tello	6a14adb	2019-03-05 17:33:08 +0000	[diff] [blame]	325	*
				326	* @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
				327	* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
				328	* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				329	* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
				330	* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
				331	* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
				332	* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
				333	* @param[in] src_stride_w Stride of the first source tensor in Z dimension (in bytes)
				334	* @param[in] src_step_w src_stride_z * number of elements along Z processed per workitem(in bytes)
				335	* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
				336	* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
				337	* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
				338	* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
				339	* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
				340	* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
				341	* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
				342	* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
				343	* @param[in] dst_stride_w Stride of the destination tensor in Z dimension (in bytes)
				344	* @param[in] dst_step_w output_stride_z * number of elements along Z processed per workitem(in bytes)
				345	* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
				346	*/
				347
				348	__kernel void concatenate_height(
				349	TENSOR4D_DECLARATION(src),
				350	TENSOR4D_DECLARATION(dst))
				351	{
Giorgio Arena	4112eed	2020-10-23 14:24:26 +0100	[diff] [blame]	352	const int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0) * sizeof(DATA_TYPE);
				353
				354	__global uchar src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) src_stride_y + (get_global_id(2) % DEPTH) * src_stride_z + (get_global_id(
				355	2) / DEPTH) * src_stride_w;
				356	__global uchar dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) dst_stride_y + (get_global_id(2) % DEPTH) * dst_stride_z + (get_global_id(
				357	2) / DEPTH) * dst_stride_w;
Pablo Tello	6a14adb	2019-03-05 17:33:08 +0000	[diff] [blame]	358
Giorgio Arena	c90fcfe	2020-11-25 11:51:30 +0000	[diff] [blame]	359	VEC_TYPE source_values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
Pablo Tello	6a14adb	2019-03-05 17:33:08 +0000	[diff] [blame]	360
				361	#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
Giorgio Arena	4112eed	2020-10-23 14:24:26 +0100	[diff] [blame]	362	const VEC_QUANT out0 = requantize(source_values0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
				363	STORE_VECTOR_SELECT(out, DATA_TYPE, dst_addr + HEIGHT_OFFSET * dst_stride_y, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
Pablo Tello	6a14adb	2019-03-05 17:33:08 +0000	[diff] [blame]	364	#else /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
Giorgio Arena	4112eed	2020-10-23 14:24:26 +0100	[diff] [blame]	365	STORE_VECTOR_SELECT(source_values, DATA_TYPE, dst_addr + HEIGHT_OFFSET * dst_stride_y, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
Pablo Tello	6a14adb	2019-03-05 17:33:08 +0000	[diff] [blame]	366	#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
				367	}
				368
				369	#endif /* defined(HEIGHT_OFFSET) && defined(DEPTH) && defined(VEC_SIZE) */
				370
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	371	/** This kernel concatenates the input tensor into the output tensor along the third dimension
				372	*
Michele Di Giorgio	e6dbde0	2018-10-19 15:46:19 +0100	[diff] [blame]	373	* @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
				374	* @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
Giorgio Arena	5304884	2020-10-07 16:03:43 +0100	[diff] [blame]	375	* @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
Michele Di Giorgio	e6dbde0	2018-10-19 15:46:19 +0100	[diff] [blame]	376	*
Michele Di Giorgio	f6f7876	2020-07-06 11:27:21 +0100	[diff] [blame]	377	* @param[in] src_ptr Pointer to the source tensor. Supported data types: All
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	378	* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
				379	* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				380	* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
				381	* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas	ac4e873	2017-07-05 17:02:25 +0100	[diff] [blame]	382	* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
				383	* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	384	* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Georgios Pinitas	ac4e873	2017-07-05 17:02:25 +0100	[diff] [blame]	385	* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	386	* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
				387	* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
				388	* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
				389	* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas	ac4e873	2017-07-05 17:02:25 +0100	[diff] [blame]	390	* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
				391	* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	392	* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitas	ac4e873	2017-07-05 17:02:25 +0100	[diff] [blame]	393	* @param[in] offsets The offsets to the first valid element of the output tensor in bytes
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	394	*/
Vidhya Sudhan Loganathan	338595b	2019-06-28 14:09:53 +0100	[diff] [blame]	395	__kernel void concatenate(
Georgios Pinitas	ac4e873	2017-07-05 17:02:25 +0100	[diff] [blame]	396	TENSOR3D_DECLARATION(src),
				397	TENSOR3D_DECLARATION(dst),
Michalis Spyrou	a9c4472	2019-04-05 17:18:36 +0100	[diff] [blame]	398	int offset)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	399	{
Giorgio Arena	5304884	2020-10-07 16:03:43 +0100	[diff] [blame]	400	uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
				401
				402	__global uchar src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) src_stride_y + get_global_id(2) * src_stride_z;
				403	__global uchar dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) dst_stride_y + get_global_id(2) * dst_stride_z;
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	404
Giorgio Arena	c90fcfe	2020-11-25 11:51:30 +0000	[diff] [blame]	405	VEC_TYPE source_values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	406
Pablo Tello	eb6c88a	2019-02-07 15:53:19 +0000	[diff] [blame]	407	#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
Giorgio Arena	5304884	2020-10-07 16:03:43 +0100	[diff] [blame]	408	source_values0 = requantize(source_values0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
Pablo Tello	eb6c88a	2019-02-07 15:53:19 +0000	[diff] [blame]	409	#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
				410
Giorgio Arena	5304884	2020-10-07 16:03:43 +0100	[diff] [blame]	411	STORE_VECTOR_SELECT(source_values, DATA_TYPE, dst_addr + offset, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	412	}
Giorgio Arena	5304884	2020-10-07 16:03:43 +0100	[diff] [blame]	413	#endif /* defined(VEC_SIZE_LEFTOVER) */
Giorgio Arena	c90fcfe	2020-11-25 11:51:30 +0000	[diff] [blame]	414	#endif /* defined(DATA_TYPE) */
				415	#endif /* defined(VEC_SIZE) */