Blame - src/core/CL/cl_kernels/direct_convolution1x1.cl - ml/ComputeLibrary

blob: ec0551b018151730fed9aafdbee8ed199ec12a12 [file] [log] [blame]

SiCong Li	c51b72f	2017-07-28 14:46:20 +0100	[diff] [blame]	1	/*
				2	* Copyright (c) 2016, 2017 ARM Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "helpers.h"
				25
				26	#if STRIDE_X == 3
				27	#define INPUT_PIXEL_STR(data_size) extract_input_stride3_##data_size
				28	#define INPUT_PIXEL(data_size) INPUT_PIXEL_STR(data_size)
				29	#elif STRIDE_X == 2
				30	#define INPUT_PIXEL(data_size) extract_input_stride2
				31	#elif STRIDE_X == 1
				32	#define INPUT_PIXEL(data_size) extract_input_stride1
				33	#else /* STRIDE_X not equals 1, 2 or 3 */
				34	#error "Only support strides 1, 2 and 3"
				35	#endif /* STRIDE_X == 3 */
				36
				37	/** Extracts a 1D horizontal vector from the input tensor with stride as 1.
				38	*
				39	* @param[in] input_pixel Pointer to the first pixel.
				40	*
				41	* @return extracted input pixels.
				42	*/
				43	inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride1(__global const DATA_TYPE *input_pixel)
				44	{
				45	return vload8(0, input_pixel);
				46	}
				47
				48	/** Extracts a 1D horizontal vector from the input tensor with stride as 2.
				49	*
				50	* @param[in] input_pixel Pointer to the first pixel.
				51	*
				52	* @return extracted input pixels.
				53	*/
				54	inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride2(__global const DATA_TYPE *input_pixel)
				55	{
				56	VEC_DATA_TYPE(DATA_TYPE, 16)
				57	temp = vload16(0, input_pixel);
				58	return temp.s02468ace;
				59	}
				60
				61	/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 32-bit data size.
				62	*
				63	* @param[in] input_pixel Pointer to the first pixel.
				64	*
				65	* @return extracted input pixels.
				66	*/
				67	inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_32(__global const DATA_TYPE *input_pixel)
				68	{
				69	VEC_DATA_TYPE(DATA_TYPE, 4)
				70	temp1 = vload4(0, input_pixel);
				71	VEC_DATA_TYPE(DATA_TYPE, 4)
				72	temp2 = vload4(0, input_pixel + 6);
				73	VEC_DATA_TYPE(DATA_TYPE, 4)
				74	temp3 = vload4(0, input_pixel + 12);
				75	VEC_DATA_TYPE(DATA_TYPE, 4)
				76	temp4 = vload4(0, input_pixel + 18);
				77	return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s03, temp2.s03, temp3.s03, temp4.s03);
				78	}
				79
				80	/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 16-bit data size.
				81	*
				82	* @param[in] input_pixel Pointer to the first pixel.
				83	*
				84	* @return extracted input pixels.
				85	*/
				86	inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_16(__global const DATA_TYPE *input_pixel)
				87	{
				88	VEC_DATA_TYPE(DATA_TYPE, 8)
				89	temp1 = vload8(0, input_pixel);
				90	VEC_DATA_TYPE(DATA_TYPE, 8)
				91	temp2 = vload8(0, input_pixel + 8);
				92	VEC_DATA_TYPE(DATA_TYPE, 8)
				93	temp3 = vload8(0, input_pixel + 16);
				94	return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s036, temp2.s147, temp3.s25);
				95	}
				96
				97	/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 8-bit data size.
				98	*
				99	* @param[in] input_pixel Pointer to the first pixel.
				100	*
				101	* @return extracted input pixels.
				102	*/
				103	inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_8(__global const DATA_TYPE *input_pixel)
				104	{
				105	VEC_DATA_TYPE(DATA_TYPE, 16)
				106	temp1 = vload16(0, input_pixel);
				107	VEC_DATA_TYPE(DATA_TYPE, 16)
				108	temp2 = vload16(0, input_pixel + 12);
				109	return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s0369, temp2.s0369);
				110	}
				111
				112	/** This kernel performs a direct convolution to convolve the low three dimensions.
				113	*
				114	* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
				115	* @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
Gian Marco Iodice	5cb4d6a	2017-08-08 10:53:00 +0100	[diff] [blame^]	116	* @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1
				117	* @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
SiCong Li	c51b72f	2017-07-28 14:46:20 +0100	[diff] [blame]	118	* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
				119	*
Gian Marco Iodice	5cb4d6a	2017-08-08 10:53:00 +0100	[diff] [blame^]	120	* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
SiCong Li	c51b72f	2017-07-28 14:46:20 +0100	[diff] [blame]	121	* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
				122	* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				123	* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
				124	* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
				125	* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
				126	* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
				127	* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
				128	* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
				129	* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
				130	* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
				131	* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
				132	* @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
				133	* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
				134	* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
				135	* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
				136	* @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p weights_ptr
				137	* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
				138	* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
				139	* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
				140	* @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
				141	* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
				142	* @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
				143	* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
				144	* @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
				145	* @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
				146	* @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
				147	* @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
Gian Marco Iodice	5cb4d6a	2017-08-08 10:53:00 +0100	[diff] [blame^]	148	* @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
SiCong Li	c51b72f	2017-07-28 14:46:20 +0100	[diff] [blame]	149	*/
Gian Marco Iodice	5cb4d6a	2017-08-08 10:53:00 +0100	[diff] [blame^]	150	#if defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
SiCong Li	c51b72f	2017-07-28 14:46:20 +0100	[diff] [blame]	151	__kernel void direct_convolution1x1(
				152	TENSOR3D_DECLARATION(src),
				153	TENSOR3D_DECLARATION(dst),
				154	TENSOR3D_DECLARATION(weights),
				155	#ifdef HAS_BIAS
				156	VECTOR_DECLARATION(biases),
				157	#endif /* defined(HAS_BIAS) */
Gian Marco Iodice	5cb4d6a	2017-08-08 10:53:00 +0100	[diff] [blame^]	158	unsigned int weights_stride_w)
SiCong Li	c51b72f	2017-07-28 14:46:20 +0100	[diff] [blame]	159	{
				160	Image src = CONVERT_TO_IMAGE_STRUCT(src);
				161	Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
				162	Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
				163
				164	#ifdef HAS_BIAS
				165	Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
				166	#endif /* defined(HAS_BIAS) */
				167
				168	VEC_DATA_TYPE(DATA_TYPE, 8)
				169	pixels = 0;
				170
				171	const uint z_index = get_global_id(2);
				172
				173	weights.ptr += z_index * weights_stride_w;
				174
Gian Marco Iodice	5cb4d6a	2017-08-08 10:53:00 +0100	[diff] [blame^]	175	for(int d = 0; d < WEIGHTS_DEPTH; ++d)
SiCong Li	c51b72f	2017-07-28 14:46:20 +0100	[diff] [blame]	176	{
				177	DATA_TYPE weight = (__global DATA_TYPE )weights.ptr;
				178	VEC_DATA_TYPE(DATA_TYPE, 8)
				179	input_pixel = INPUT_PIXEL(DATA_SIZE)((__global DATA_TYPE *)src.ptr);
				180	pixels += weight * input_pixel;
				181	src.ptr += src_stride_z;
				182	weights.ptr += weights_stride_z;
				183	}
				184
				185	#ifdef HAS_BIAS
				186	pixels += (VEC_DATA_TYPE(DATA_TYPE, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, z_index)));
				187	#endif /* defined(HAS_BIAS) */
				188
				189	vstore8(pixels, 0, (__global DATA_TYPE *)dst.ptr);
				190	}
Gian Marco Iodice	5cb4d6a	2017-08-08 10:53:00 +0100	[diff] [blame^]	191	#endif // defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)