Blame - src/core/CL/cl_kernels/direct_convolution1x1.cl - ml/ComputeLibrary

blob: 66c618e03346903f58f51facc83d6c9427342488 [file] [log] [blame]

SiCong Li	c51b72f	2017-07-28 14:46:20 +0100	[diff] [blame]	1	/*
				2	* Copyright (c) 2016, 2017 ARM Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "helpers.h"
				25
/*
 * Arithmetic helpers used by the kernel in this file.
 *
 * ADD_OP(a, b)      - accumulate two 8-wide vectors.
 * MUL_OP(a, b)      - multiply two 8-wide vectors.
 * CONVERT_SAT(a, b) - convert the accumulator `a` to the vector type `b` before storing.
 *
 * When FIXED_POINT_POSITION is defined the operations are forwarded to the
 * ADD_SAT_OP_EXPAND / MUL_SAT_OP_EXPAND helpers (expected to come from
 * fixed_point.h) operating on DATA_TYPE_PROMOTED, and CONVERT_SAT is whatever
 * the included headers provide. Otherwise plain `+` and `*` are used and
 * CONVERT_SAT is the identity on its first argument.
 */
#if defined(FIXED_POINT_POSITION)
#include "fixed_point.h"

#define ADD_OP(a, b) ADD_SAT_OP_EXPAND((a), (b), DATA_TYPE_PROMOTED, 8)
#define MUL_OP(a, b) MUL_SAT_OP_EXPAND(CONVERT((a), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)), CONVERT((b), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)), DATA_TYPE_PROMOTED, 8, FIXED_POINT_POSITION)

// There is no need to have a larger intermediate type for qs32 because all the arguments are already promoted
MULQ_SAT_IMPL(qs32x8, qs32x8)

#else /* FIXED_POINT_POSITION */

/* helpers.h may already provide a CONVERT_SAT macro (NOTE(review): confirm);
 * drop any earlier definition so the identity version below does not clash
 * with it. #undef on an undefined name is a no-op. */
#undef CONVERT_SAT

#define ADD_OP(a, b) ((a) + (b))
#define MUL_OP(a, b) ((a) * (b))
/* No promotion happens on this path, so the second argument (target type) is ignored. */
#define CONVERT_SAT(a, b) ((a))

#endif /* FIXED_POINT_POSITION */
				42
SiCong Li	c51b72f	2017-07-28 14:46:20 +0100	[diff] [blame]	43	#if STRIDE_X == 3
				44	#define INPUT_PIXEL_STR(data_size) extract_input_stride3_##data_size
				45	#define INPUT_PIXEL(data_size) INPUT_PIXEL_STR(data_size)
				46	#elif STRIDE_X == 2
				47	#define INPUT_PIXEL(data_size) extract_input_stride2
				48	#elif STRIDE_X == 1
				49	#define INPUT_PIXEL(data_size) extract_input_stride1
				50	#else /* STRIDE_X not equals 1, 2 or 3 */
				51	#error "Only support strides 1, 2 and 3"
				52	#endif /* STRIDE_X == 3 */
				53
				54	/** Extracts a 1D horizontal vector from the input tensor with stride as 1.
				55	*
				56	* @param[in] input_pixel Pointer to the first pixel.
				57	*
				58	* @return extracted input pixels.
				59	*/
				60	inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride1(__global const DATA_TYPE *input_pixel)
				61	{
				62	return vload8(0, input_pixel);
				63	}
				64
				65	/** Extracts a 1D horizontal vector from the input tensor with stride as 2.
				66	*
				67	* @param[in] input_pixel Pointer to the first pixel.
				68	*
				69	* @return extracted input pixels.
				70	*/
				71	inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride2(__global const DATA_TYPE *input_pixel)
				72	{
				73	VEC_DATA_TYPE(DATA_TYPE, 16)
				74	temp = vload16(0, input_pixel);
				75	return temp.s02468ace;
				76	}
				77
				78	/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 32-bit data size.
				79	*
				80	* @param[in] input_pixel Pointer to the first pixel.
				81	*
				82	* @return extracted input pixels.
				83	*/
				84	inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_32(__global const DATA_TYPE *input_pixel)
				85	{
				86	VEC_DATA_TYPE(DATA_TYPE, 4)
				87	temp1 = vload4(0, input_pixel);
				88	VEC_DATA_TYPE(DATA_TYPE, 4)
				89	temp2 = vload4(0, input_pixel + 6);
				90	VEC_DATA_TYPE(DATA_TYPE, 4)
				91	temp3 = vload4(0, input_pixel + 12);
				92	VEC_DATA_TYPE(DATA_TYPE, 4)
				93	temp4 = vload4(0, input_pixel + 18);
				94	return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s03, temp2.s03, temp3.s03, temp4.s03);
				95	}
				96
				97	/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 16-bit data size.
				98	*
				99	* @param[in] input_pixel Pointer to the first pixel.
				100	*
				101	* @return extracted input pixels.
				102	*/
				103	inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_16(__global const DATA_TYPE *input_pixel)
				104	{
				105	VEC_DATA_TYPE(DATA_TYPE, 8)
				106	temp1 = vload8(0, input_pixel);
				107	VEC_DATA_TYPE(DATA_TYPE, 8)
				108	temp2 = vload8(0, input_pixel + 8);
				109	VEC_DATA_TYPE(DATA_TYPE, 8)
				110	temp3 = vload8(0, input_pixel + 16);
				111	return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s036, temp2.s147, temp3.s25);
				112	}
				113
				114	/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 8-bit data size.
				115	*
				116	* @param[in] input_pixel Pointer to the first pixel.
				117	*
				118	* @return extracted input pixels.
				119	*/
				120	inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_8(__global const DATA_TYPE *input_pixel)
				121	{
				122	VEC_DATA_TYPE(DATA_TYPE, 16)
				123	temp1 = vload16(0, input_pixel);
				124	VEC_DATA_TYPE(DATA_TYPE, 16)
				125	temp2 = vload16(0, input_pixel + 12);
				126	return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s0369, temp2.s0369);
				127	}
				128
				129	/** This kernel performs a direct convolution to convolve the low three dimensions.
				130	*
				131	* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
				132	* @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
Gian Marco Iodice	5cb4d6a	2017-08-08 10:53:00 +0100	[diff] [blame]	133	* @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1
				134	* @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
SiCong Li	c51b72f	2017-07-28 14:46:20 +0100	[diff] [blame]	135	* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
				136	*
Gian Marco Iodice	5cb4d6a	2017-08-08 10:53:00 +0100	[diff] [blame]	137	* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
SiCong Li	c51b72f	2017-07-28 14:46:20 +0100	[diff] [blame]	138	* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
				139	* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				140	* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
				141	* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
				142	* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
				143	* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
				144	* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
				145	* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
				146	* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
				147	* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
				148	* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
				149	* @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
				150	* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
				151	* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
				152	* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
				153	* @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p weights_ptr
				154	* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
				155	* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
				156	* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
				157	* @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
				158	* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
				159	* @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
				160	* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
				161	* @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
				162	* @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
				163	* @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
				164	* @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
Gian Marco Iodice	5cb4d6a	2017-08-08 10:53:00 +0100	[diff] [blame]	165	* @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
SiCong Li	c51b72f	2017-07-28 14:46:20 +0100	[diff] [blame]	166	*/
Gian Marco Iodice	5cb4d6a	2017-08-08 10:53:00 +0100	[diff] [blame]	167	#if defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
SiCong Li	c51b72f	2017-07-28 14:46:20 +0100	[diff] [blame]	168	__kernel void direct_convolution1x1(
				169	TENSOR3D_DECLARATION(src),
				170	TENSOR3D_DECLARATION(dst),
				171	TENSOR3D_DECLARATION(weights),
				172	#ifdef HAS_BIAS
				173	VECTOR_DECLARATION(biases),
				174	#endif /* defined(HAS_BIAS) */
Gian Marco Iodice	5cb4d6a	2017-08-08 10:53:00 +0100	[diff] [blame]	175	unsigned int weights_stride_w)
SiCong Li	c51b72f	2017-07-28 14:46:20 +0100	[diff] [blame]	176	{
				177	Image src = CONVERT_TO_IMAGE_STRUCT(src);
				178	Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
				179	Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
				180
				181	#ifdef HAS_BIAS
				182	Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
				183	#endif /* defined(HAS_BIAS) */
				184
Michalis Spyrou	def665a	2017-08-14 11:26:37 +0100	[diff] [blame^]	185	VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)
SiCong Li	c51b72f	2017-07-28 14:46:20 +0100	[diff] [blame]	186	pixels = 0;
				187
				188	const uint z_index = get_global_id(2);
				189
				190	weights.ptr += z_index * weights_stride_w;
				191
Gian Marco Iodice	5cb4d6a	2017-08-08 10:53:00 +0100	[diff] [blame]	192	for(int d = 0; d < WEIGHTS_DEPTH; ++d)
SiCong Li	c51b72f	2017-07-28 14:46:20 +0100	[diff] [blame]	193	{
				194	DATA_TYPE weight = (__global DATA_TYPE )weights.ptr;
				195	VEC_DATA_TYPE(DATA_TYPE, 8)
				196	input_pixel = INPUT_PIXEL(DATA_SIZE)((__global DATA_TYPE *)src.ptr);
Michalis Spyrou	def665a	2017-08-14 11:26:37 +0100	[diff] [blame^]	197	pixels = ADD_OP(pixels, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))weight, input_pixel));
SiCong Li	c51b72f	2017-07-28 14:46:20 +0100	[diff] [blame]	198	src.ptr += src_stride_z;
				199	weights.ptr += weights_stride_z;
				200	}
				201
				202	#ifdef HAS_BIAS
Michalis Spyrou	def665a	2017-08-14 11:26:37 +0100	[diff] [blame^]	203	pixels = ADD_OP(pixels, (VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, z_index))));
SiCong Li	c51b72f	2017-07-28 14:46:20 +0100	[diff] [blame]	204	#endif /* defined(HAS_BIAS) */
				205
Michalis Spyrou	def665a	2017-08-14 11:26:37 +0100	[diff] [blame^]	206	vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE, 8)), 0, (__global DATA_TYPE *)dst.ptr);
SiCong Li	c51b72f	2017-07-28 14:46:20 +0100	[diff] [blame]	207	}
Gian Marco Iodice	5cb4d6a	2017-08-08 10:53:00 +0100	[diff] [blame]	208	#endif // defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)