Blame - src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs - ml/ComputeLibrary

blob: a36bd438ff38e6b739c3c9861c364f8159028db2 [file] [log] [blame]

Anthony Barbier	7068f99	2017-10-26 15:23:08 +0100	[diff] [blame]	1	/*
				2	* Copyright (c) 2017 ARM Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24
				25	layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
				26
				27	#include "helpers.h"
				28
ASIAPAC\steli01	23ac91b	2017-11-07 16:14:44 +0800	[diff] [blame^]	29	#ifdef DATA_TYPE_FP32
				30
				31	precision highp float;
				32
				33	/** This kernel performs a direct convolution to convolve the low three dimensions
				34	*
				35	* @note This OpenGL ES shader works with stride_x = 1 and 2
				36	* @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
				37	* @note If biases are used then "#define BIAS" has to be passed at compile time
				38	*
				39	* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
				40	* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
				41	* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				42	* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
				43	* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
				44	* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
				45	* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
				46	* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
				47	* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
				48	* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
				49	* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
				50	* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
				51	* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
				52	* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
				53	* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
				54	* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
				55	* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
				56	* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
				57	* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
				58	* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
				59	* @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
				60	* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
				61	* @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
				62	* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
				63	* @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
				64	* @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
				65	* @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
				66	* @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
				67	* @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
				68	* @param[in] weights_depth The third dimensions of the weights tensors
				69	*/
				70
/* Kernel arguments (FP32 variant): stride/step/offset descriptors for each
 * tensor, followed by the two convolution-specific scalars. */
layout(std140) uniform shader_params
{
    TENSOR3D_PARAM_DECLARATION(src);
    TENSOR3D_PARAM_DECLARATION(dst);
    TENSOR3D_PARAM_DECLARATION(weights);
#ifdef BIAS
    /* Only present when the shader is built with BIAS defined. */
    VECTOR_PARAM_DECLARATION(biases);
#endif /* BIAS */
    uint weights_stride_w; /* stride of the weights tensor in the 4th dimension */
    uint weights_depth;    /* third dimension of the weights tensor */
};

/* Tensor storage, declared with float elements in this variant.
 * NOTE(review): the second macro argument looks like the binding slot - confirm in helpers.h. */
BUFFER_DECLARATION(src, 1, float, readonly);
BUFFER_DECLARATION(dst, 2, float, writeonly);
BUFFER_DECLARATION(weights, 3, float, readonly);
#ifdef BIAS
BUFFER_DECLARATION(biases, 4, float, readonly);
#endif /* BIAS */
				89
				90	#define LOAD20(r, name, offset) \
				91	r[0] = LOAD4(name, offset); \
				92	r[1] = LOAD4(name, offset + uint(1)); \
				93	r[2] = LOAD4(name, offset + uint(2)); \
				94	r[3] = LOAD4(name, offset + uint(3)); \
				95	r[4] = LOAD4(name, offset + uint(4))
				96
				97	/** This kernel performs a direct convolution to convolve the low three dimensions.
				98	*
				99	* @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
				100	* @note If biases are used then "#define BIAS" has to be passed at compile time
				101	*
				102	* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
				103	* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
				104	* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				105	* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
				106	* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
				107	* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
				108	* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
				109	* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
				110	* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
				111	* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
				112	* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
				113	* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
				114	* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
				115	* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
				116	* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
				117	* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Joel Liang	f1f3ebd	2017-11-10 09:59:19 +0800	[diff] [blame]	118	* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
Anthony Barbier	7068f99	2017-10-26 15:23:08 +0100	[diff] [blame]	119	* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
				120	* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
				121	* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
				122	* @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
				123	* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
				124	* @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
				125	* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
				126	* @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
				127	* @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
				128	* @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
				129	* @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
				130	* @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
				131	* @param[in] weights_depth The third dimensions of the weights tensors
				132	*/
				133	void main()
				134	{
				135	Image src = CONVERT_TO_IMAGE_STRUCT(src);
				136	Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
				137	Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
				138
				139	#ifdef BIAS
				140	Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
				141	#endif /* BIAS */
				142
				143	float pixels = CONVERT(0, float);
				144	uint z_index = gl_GlobalInvocationID.z;
				145	weights.current_offset += z_index * weights_stride_w >> 2;
				146	float temp[5];
				147	float temp_weight[5];
				148
				149	for(int d = 0; d < int(weights_depth); ++d)
				150	{
				151	LOAD20(temp, src, offset(src, 0, 0));
				152	LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 0, 0));
				153	pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4];
				154
				155	LOAD20(temp, src, offset(src, 0, 1));
				156	LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 1, 0));
				157	pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4];
				158
				159	LOAD20(temp, src, offset(src, 0, 2));
				160	LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 2, 0));
				161	pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4];
				162
				163	LOAD20(temp, src, offset(src, 0, 3));
				164	LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 3, 0));
				165	pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4];
				166
				167	LOAD20(temp, src, offset(src, 0, 4));
				168	LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 4, 0));
				169	pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4];
				170
				171	src.current_offset += (src_stride_z >> 2);
				172	weights.current_offset += (weights_stride_z >> 2);
				173	}
				174
				175	#ifdef BIAS
				176	pixels += LOAD4(biases, vector_offset(biases, int(z_index)));
				177	#endif /* BIAS */
				178
				179	STORE4(dst, CURRENT_OFFSET(dst), pixels);
				180	}
				181
				182	#elif defined(DATA_TYPE_FP16)
				183
				184	precision mediump float;
				185
ASIAPAC\steli01	23ac91b	2017-11-07 16:14:44 +0800	[diff] [blame^]	186	#if defined(PROCESS_4X_1Y_1Z)
				187
				188	/** This kernel performs a direct convolution to convolve the low three dimensions
				189	*
				190	* @note This OpenGL ES shader works with stride_x = 1 and 2
				191	* @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
				192	* @note If biases are used then "#define BIAS" has to be passed at compile time
				193	*
				194	* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
				195	* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
				196	* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				197	* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
				198	* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
				199	* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
				200	* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
				201	* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
				202	* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
				203	* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
				204	* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
				205	* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
				206	* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
				207	* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
				208	* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
				209	* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
				210	* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
				211	* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
				212	* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
				213	* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
				214	* @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
				215	* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
				216	* @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
				217	* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
				218	* @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
				219	* @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
				220	* @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
				221	* @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
				222	* @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
				223	* @param[in] weights_depth The third dimensions of the weights tensors
				224	*/
				225
				226	layout(std140) uniform shader_params
				227	{
				228	TENSOR3D_PARAM_DECLARATION(src);
				229	TENSOR3D_PARAM_DECLARATION(dst);
				230	TENSOR3D_PARAM_DECLARATION(weights);
				231	#ifdef BIAS
				232	VECTOR_PARAM_DECLARATION(biases);
				233	#endif /* BIAS */
				234	uint weights_stride_w;
				235	uint weights_depth;
				236	};
				237
Anthony Barbier	7068f99	2017-10-26 15:23:08 +0100	[diff] [blame]	238	BUFFER_DECLARATION(src, 1, uvec2, readonly);
				239	BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
				240	BUFFER_DECLARATION(weights, 3, uint, readonly);
				241	#ifdef BIAS
				242	BUFFER_DECLARATION(biases, 4, uint, readonly);
				243	#endif /* BIAS */
				244
				245	#if STRIDE_X == 1
				246	#define LOAD_SRC(src, row) load_src_stride1(src, row)
				247	#define CONVOLVE1x5(src, weight) convolve1x5_stride1(src, weight)
				248	#elif STRIDE_X == 2 /* STRIDE_X == 1 */
				249	#define LOAD_SRC(src, row) load_src_stride2(src, row)
				250	#define CONVOLVE1x5(src, weight) convolve1x5_stride2(src, weight)
				251	#else /* STRDIDE_X == 1 */
				252	#error STRIDE_X larger than 2 is not supported
				253	#endif /* STRIDE_X == 1 */
				254
				255	vec4[2] load_src_stride1(Image src, int row)
				256	{
				257	uvec2 packed[2];
				258	vec4 ret[2];
				259
				260	GC_LOAD2_2D_OFFSET(packed, src, 0, row);
				261
				262	ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y));
				263	ret[1] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y));
				264
				265	return ret;
				266	}
				267
				268	vec4[3] load_src_stride2(Image src, int row)
				269	{
				270	uvec2 packed[3];
				271	vec4 ret[3];
				272
				273	GC_LOAD3_2D_OFFSET(packed, src, 0, row);
				274
				275	ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y));
				276	ret[1] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y));
				277	ret[2] = vec4(unpackHalf2x16(packed[2].x), unpackHalf2x16(packed[2].y));
				278
				279	return ret;
				280	}
				281
				282	vec2[3] load_weight(Tensor3D weights, int row)
				283	{
				284	uvec3 packed_w;
				285	vec2 ret[3];
				286
				287	GC_LOAD3_3D_OFFSET(packed_w, weights, 0, row, 0);
				288
				289	ret[0] = vec2(unpackHalf2x16(packed_w[0]));
				290	ret[1] = vec2(unpackHalf2x16(packed_w[1]));
				291	ret[2] = vec2(unpackHalf2x16(packed_w[2]));
				292
				293	return ret;
				294	}
				295
/* Applies a 5-tap row filter at four adjacent positions (horizontal stride 1).
 *
 * Treating tmp[0], tmp[1] as eight consecutive values v0..v7, output i
 * (i = 0..3) is the sum over k = 0..4 of v(i + k) * tap k, where the taps are
 * w[0].x, w[0].y, w[1].x, w[1].y, w[2].x.
 *
 * @param[in] tmp Eight input values
 * @param[in] w   Weights of one kernel row (w[2].y is ignored)
 *
 * @return The four partial sums
 */
vec4 convolve1x5_stride1(vec4 tmp[2], vec2 w[3])
{
    // One shifted 4-wide window per tap.
    vec4 win0 = tmp[0];
    vec4 win1 = vec4(tmp[0].y, tmp[0].z, tmp[0].w, tmp[1].x);
    vec4 win2 = vec4(tmp[0].z, tmp[0].w, tmp[1].x, tmp[1].y);
    vec4 win3 = vec4(tmp[0].w, tmp[1].x, tmp[1].y, tmp[1].z);
    vec4 win4 = tmp[1];

    return win0 * w[0].x + win1 * w[0].y + win2 * w[1].x + win3 * w[1].y + win4 * w[2].x;
}
				307
				308	vec4 convolve1x5_stride2(vec4 tmp[3], vec2 w[3])
				309	{
				310	vec4 src0 = vec4(tmp[0].xz, tmp[1].xz);
				311	vec4 src1 = vec4(tmp[0].yw, tmp[1].yw);
				312	vec4 src2 = vec4(tmp[0].z, tmp[1].xz, tmp[2].x);
				313	vec4 src3 = vec4(tmp[0].w, tmp[1].yw, tmp[2].y);
				314	vec4 src4 = vec4(tmp[1].x, tmp[1].z, tmp[2].xz);
				315	vec4 ret = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x;
				316
				317	return ret;
				318	}
				319
				320	/** This kernel performs a direct convolution to convolve the low three dimensions.
				321	*
				322	* @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
				323	* @note If biases are used then "#define BIAS" has to be passed at compile time
				324	*
				325	* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
				326	* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
				327	* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				328	* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
				329	* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
				330	* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
				331	* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
				332	* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
				333	* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
				334	* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
				335	* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
				336	* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
				337	* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
				338	* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
				339	* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
				340	* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Joel Liang	f1f3ebd	2017-11-10 09:59:19 +0800	[diff] [blame]	341	* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
Anthony Barbier	7068f99	2017-10-26 15:23:08 +0100	[diff] [blame]	342	* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
				343	* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
				344	* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
				345	* @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
				346	* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
				347	* @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
				348	* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
				349	* @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
				350	* @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
				351	* @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
				352	* @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
				353	* @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
				354	* @param[in] weights_depth The third dimensions of the weights tensors
				355	*/
				356	void main()
				357	{
				358	Image src = GC_CONVERT_TO_IMAGE_STRUCT(src);
				359	Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
				360	Tensor3D dst = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);
				361
				362	#ifdef BIAS
				363	Vector biases = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
				364	#endif /* BIAS */
				365
				366	vec4 res = vec4(0);
				367	vec2 w[3];
				368	vec4 s[STRIDE_X + 1];
				369	uvec2 packed_d;
				370	uint z_index = gl_GlobalInvocationID.z;
				371
				372	weights.current_offset += z_index * weights_stride_w;
				373
				374	for(int d = 0; d < int(weights_depth); ++d)
				375	{
				376	for(int row = 0; row < 5; row++)
				377	{
				378	w = load_weight(weights, row);
				379	s = LOAD_SRC(src, row);
				380	res += CONVOLVE1x5(s, w);
				381	}
				382
				383	src.current_offset += src_stride_z;
				384	weights.current_offset += weights_stride_z;
				385	}
				386
				387	#ifdef BIAS
				388	uint packed_b;
				389	float b;
				390
				391	GC_LOAD1_1D_OFFSET(packed_b, biases, z_index);
				392	b = (z_index % uint(2) == uint(0)) ? unpackHalf2x16(packed_b).x : unpackHalf2x16(packed_b).y;
				393	res += vec4(b);
				394	#endif /* BIAS */
				395
				396	packed_d = uvec2(packHalf2x16(res.xy), packHalf2x16(res.zw));
				397	GC_STORE1_3D_OFFSET(packed_d, dst, 0, 0, 0);
				398	}
				399
ASIAPAC\steli01	23ac91b	2017-11-07 16:14:44 +0800	[diff] [blame^]	400	#elif defined(PROCESS_4X_3Y_1Z)
				401
				402	/** An optimized direct convolution 5x5 OpenGL ES compute shader that processes 3 elements along Y at once
				403	*
				404	* @note This OpenGL ES shader works with stride_x = 1 and 2
				405	* @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
				406	* @note If biases are used then "#define BIAS" has to be passed at compile time
				407	*
				408	* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
				409	* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
				410	* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				411	* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
				412	* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
				413	* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
				414	* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
				415	* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
				416	* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
				417	* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
				418	* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
				419	* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
				420	* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
				421	* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
				422	* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
				423	* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
				424	* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
				425	* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
				426	* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
				427	* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
				428	* @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
				429	* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
				430	* @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
				431	* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
				432	* @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
				433	* @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
				434	* @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
				435	* @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
				436	* @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
				437	* @param[in] weights_depth The third dimensions of the weights tensors
				438	*/
				439
				440	layout(std140) uniform shader_params
				441	{
				442	TENSOR3D_PARAM_DECLARATION(src);
				443	TENSOR3D_PARAM_DECLARATION(dst);
				444	TENSOR3D_PARAM_DECLARATION(weights);
				445	#ifdef BIAS
				446	VECTOR_PARAM_DECLARATION(biases);
				447	#endif /* BIAS */
				448	uint weights_stride_w;
				449	uint weights_depth;
				450	};
				451
				452	BUFFER_DECLARATION(src, 1, uvec2, readonly);
				453	BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
				454	BUFFER_DECLARATION(weights, 3, uint, readonly);
				455	#ifdef BIAS
				456	BUFFER_DECLARATION(bias, 4, uint, readonly);
				457	#endif /* BIAS */
				458
				459	#if STRIDE_X == 1
				460	#define LOAD_SRC(src, row) load_src_stride1(src, row)
				461	#define CONVOLVE1x5(src, weight) convolve1x5_stride1(src, weight)
				462	#elif STRIDE_X == 2 /* STRIDE_X == 1 */
				463	#define LOAD_SRC(src, row) load_src_stride2(src, row)
				464	#define CONVOLVE1x5(src, weight) convolve1x5_stride2(src, weight)
				465	#else /* STRDIDE_X == 1 */
				466	#error STRIDE_X larger than 2 is not supported
				467	#endif /* STRIDE_X == 1 */
				468
				469	vec4[2] load_src_stride1(Image src, int row)
				470	{
				471	uvec2 packed[2];
				472	vec4 ret[2];
				473
				474	GC_LOAD2_2D_OFFSET(packed, src, 0, row);
				475
				476	ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y));
				477	ret[1] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y));
				478
				479	return ret;
				480	}
				481
				482	vec4[3] load_src_stride2(Image src, int row)
				483	{
				484	uvec2 packed[3];
				485	vec4 ret[3];
				486
				487	GC_LOAD3_2D_OFFSET(packed, src, 0, row);
				488
				489	ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y));
				490	ret[1] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y));
				491	ret[2] = vec4(unpackHalf2x16(packed[2].x), unpackHalf2x16(packed[2].y));
				492
				493	return ret;
				494	}
				495
				496	vec2[3] load_weight(Tensor3D weights, int row)
				497	{
				498	uvec3 packed_w;
				499	vec2 ret[3];
				500
				501	GC_LOAD3_3D_OFFSET(packed_w, weights, 0, row, 0);
				502
				503	ret[0] = vec2(unpackHalf2x16(packed_w[0]));
				504	ret[1] = vec2(unpackHalf2x16(packed_w[1]));
				505	ret[2] = vec2(unpackHalf2x16(packed_w[2]));
				506
				507	return ret;
				508	}
				509
				510	vec4 convolve1x5_stride1(vec4 tmp[2], vec2 w[3])
				511	{
				512	vec4 src0 = tmp[0];
				513	vec4 src1 = vec4(tmp[0].yzw, tmp[1].x);
				514	vec4 src2 = vec4(tmp[0].zw, tmp[1].xy);
				515	vec4 src3 = vec4(tmp[0].w, tmp[1].xyz);
				516	vec4 src4 = tmp[1];
				517	vec4 ret = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x;
				518
				519	return ret;
				520	}
				521
				522	vec4 convolve1x5_stride2(vec4 tmp[3], vec2 w[3])
				523	{
				524	vec4 src0 = vec4(tmp[0].xz, tmp[1].xz);
				525	vec4 src1 = vec4(tmp[0].yw, tmp[1].yw);
				526	vec4 src2 = vec4(tmp[0].z, tmp[1].xz, tmp[2].x);
				527	vec4 src3 = vec4(tmp[0].w, tmp[1].yw, tmp[2].y);
				528	vec4 src4 = vec4(tmp[1].x, tmp[1].z, tmp[2].xz);
				529	vec4 ret = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x;
				530
				531	return ret;
				532	}
				533
				/** Shader entry point: accumulates three output rows of four values each for one output plane.
				 *
				 * Seven consecutive source rows are read per depth slice; every row feeds up to three
				 * accumulators, each paired with the weight row matching its vertical offset.
				 * Results are packed to half precision before being stored.
				 */
				void main()
				{
				    Image    src     = GC_CONVERT_TO_IMAGE_STRUCT(src);
				    Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
				    Tensor3D dst     = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);

				#ifdef BIAS
				    Vector bias = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
				#endif /* BIAS */

				    // One accumulator per output row, all starting at zero
				    vec4 acc[3] = vec4[3](vec4(0), vec4(0), vec4(0));
				    vec2 taps[5][3];
				    vec4 in_row[STRIDE_X + 1];
				    uint out_plane = gl_GlobalInvocationID.z;

				    // Jump to the weights belonging to this invocation's Z index
				    weights.current_offset += out_plane * weights_stride_w;

				    for(int depth = 0; depth < int(weights_depth); ++depth)
				    {
				        // Fetch the five weight rows of the current depth slice
				        for(int k = 0; k < 5; ++k)
				        {
				            taps[k] = load_weight(weights, k);
				        }

				        // Source row 0: only output row 0 sees it
				        in_row = LOAD_SRC(src, 0);
				        acc[0] += CONVOLVE1x5(in_row, taps[0]);

				        // Source row 1
				        in_row = LOAD_SRC(src, 1);
				        acc[0] += CONVOLVE1x5(in_row, taps[1]);
				        acc[1] += CONVOLVE1x5(in_row, taps[0]);

				        // Source row 2
				        in_row = LOAD_SRC(src, 2);
				        acc[0] += CONVOLVE1x5(in_row, taps[2]);
				        acc[1] += CONVOLVE1x5(in_row, taps[1]);
				        acc[2] += CONVOLVE1x5(in_row, taps[0]);

				        // Source row 3
				        in_row = LOAD_SRC(src, 3);
				        acc[0] += CONVOLVE1x5(in_row, taps[3]);
				        acc[1] += CONVOLVE1x5(in_row, taps[2]);
				        acc[2] += CONVOLVE1x5(in_row, taps[1]);

				        // Source row 4: last row contributing to output row 0
				        in_row = LOAD_SRC(src, 4);
				        acc[0] += CONVOLVE1x5(in_row, taps[4]);
				        acc[1] += CONVOLVE1x5(in_row, taps[3]);
				        acc[2] += CONVOLVE1x5(in_row, taps[2]);

				        // Source row 5
				        in_row = LOAD_SRC(src, 5);
				        acc[1] += CONVOLVE1x5(in_row, taps[4]);
				        acc[2] += CONVOLVE1x5(in_row, taps[3]);

				        // Source row 6: only output row 2 sees it
				        in_row = LOAD_SRC(src, 6);
				        acc[2] += CONVOLVE1x5(in_row, taps[4]);

				        // Advance both tensors to the next depth slice
				        src.current_offset += src_stride_z;
				        weights.current_offset += weights_stride_z;
				    }

				#ifdef BIAS
				    uint bias_word;

				    GC_LOAD1_1D_OFFSET(bias_word, bias, out_plane);

				    // One 32-bit word carries two half biases; even indices take .x, odd take .y
				    vec2  bias_pair = unpackHalf2x16(bias_word);
				    float bias_val  = (out_plane % uint(2) == uint(0)) ? bias_pair.x : bias_pair.y;

				    acc[0] += vec4(bias_val);
				    acc[1] += vec4(bias_val);
				    acc[2] += vec4(bias_val);
				#endif /* BIAS */

				    // Pack each row of four floats into two 32-bit words and store it at Y offset y
				    for(int y = 0; y < 3; ++y)
				    {
				        uvec2 packed_out = uvec2(packHalf2x16(acc[y].xy), packHalf2x16(acc[y].zw));
				        GC_STORE1_3D_OFFSET(packed_out, dst, 0, y, 0);
				    }
				}
				624
				625	#elif defined(PROCESS_4X_3Y_2Z)
				626
				627	/** An optimized direct convolution 5x5 OpenGL ES compute shader that processes 3 elements @ Y and 2 elements @ Z at once
				628	*
				629	* @note This OpenGL ES shader works with stride_x = 1 and 2
				630	* @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
				631	* @note If biases are used then "#define BIAS" has to be passed at compile time
				632	*
				633	* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
				634	* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
				635	* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				636	* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
				637	* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
				638	* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
				639	* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
				640	* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
				641	* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
				642	* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
				643	* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
				644	* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
				645	* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
				646	* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
				647	* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
				648	* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
				649	* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
				650	* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
				651	* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
				652	* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
				653	* @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
				654	* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
				655	* @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
				656	* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
				657	* @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
				658	* @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
				659	* @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
				660	* @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
				661	* @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
				662	* @param[in] weights_depth The third dimensions of the weights tensors
				663	*/
				664
				665	layout(std140) uniform shader_params // Per-dispatch tensor descriptors and weight-tensor extents
				666	{
				667	TENSOR3D_PARAM_DECLARATION(src);
				668	TENSOR3D_PARAM_DECLARATION(dst);
				669	TENSOR3D_PARAM_DECLARATION(weights);
				670	#ifdef BIAS
				671	VECTOR_PARAM_DECLARATION(biases); // NOTE(review): declared as "biases" while the buffer and the Vector in main() are named "bias" - confirm the helper macros do not need matching names
				672	#endif /* BIAS */
				673	uint weights_stride_w; // multiplied by the Z invocation index in main() to select a set of weights
				674	uint weights_depth; // trip count of the depth loop in main()
				675	};
				676
				677	BUFFER_DECLARATION(src, 1, uvec2, readonly); // each uvec2 is unpacked into four floats by the load_src_* helpers
				678	BUFFER_DECLARATION(dst, 2, uvec2, writeonly); // main() stores four half-packed results per uvec2
				679	BUFFER_DECLARATION(weights, 3, uint, readonly); // each uint is unpacked into two floats by load_weight()
				680	#ifdef BIAS
				681	BUFFER_DECLARATION(bias, 4, uint, readonly); // each uint holds two half biases; main() picks one by index parity
				682	#endif /* BIAS */
				683
				684	#if STRIDE_X == 1
				685	#define LOAD_SRC(src, row) load_src_stride1(src, row)
				686	#define CONVOLVE1x5(src, weight) convolve1x5_stride1(src, weight)
				687	#elif STRIDE_X == 2 /* STRIDE_X == 1 */
				688	#define LOAD_SRC(src, row) load_src_stride2(src, row)
				689	#define CONVOLVE1x5(src, weight) convolve1x5_stride2(src, weight)
				690	#else /* STRIDE_X == 1 */
				691	#error STRIDE_X larger than 2 is not supported
				692	#endif /* STRIDE_X == 1 */
				693
				/** Load two packed uvec2 words of one source row and expand them to eight floats.
				 *
				 * @param[in] src Source image read through GC_LOAD2_2D_OFFSET
				 * @param[in] row Row offset handed to the load macro
				 *
				 * @return Two vec4, each unpacked from one uvec2 holding four half-precision values
				 */
				vec4[2] load_src_stride1(Image src, int row)
				{
				    uvec2 raw[2];

				    GC_LOAD2_2D_OFFSET(raw, src, 0, row);

				    return vec4[2](vec4(unpackHalf2x16(raw[0].x), unpackHalf2x16(raw[0].y)),
				                   vec4(unpackHalf2x16(raw[1].x), unpackHalf2x16(raw[1].y)));
				}
				706
				/** Load three packed uvec2 words of one source row and expand them to twelve floats.
				 *
				 * @param[in] src Source image read through GC_LOAD3_2D_OFFSET
				 * @param[in] row Row offset handed to the load macro
				 *
				 * @return Three vec4, each unpacked from one uvec2 holding four half-precision values
				 */
				vec4[3] load_src_stride2(Image src, int row)
				{
				    uvec2 raw[3];

				    GC_LOAD3_2D_OFFSET(raw, src, 0, row);

				    return vec4[3](vec4(unpackHalf2x16(raw[0].x), unpackHalf2x16(raw[0].y)),
				                   vec4(unpackHalf2x16(raw[1].x), unpackHalf2x16(raw[1].y)),
				                   vec4(unpackHalf2x16(raw[2].x), unpackHalf2x16(raw[2].y)));
				}
				720
				/** Load one row of weights as three packed words and unpack them to float pairs.
				 *
				 * @param[in] weights Weights tensor read through GC_LOAD3_3D_OFFSET
				 * @param[in] row     Row offset handed to the load macro
				 *
				 * @return Three vec2 (six floats); the convolve1x5 helpers read the first five
				 */
				vec2[3] load_weight(Tensor3D weights, int row)
				{
				    uvec3 raw;

				    GC_LOAD3_3D_OFFSET(raw, weights, 0, row, 0);

				    return vec2[3](unpackHalf2x16(raw.x), unpackHalf2x16(raw.y), unpackHalf2x16(raw.z));
				}
				734
				/** Apply a five-tap horizontal filter at four adjacent positions (unit step).
				 *
				 * @param[in] tmp Eight consecutive source values (two vec4)
				 * @param[in] w   Five taps stored as w[0].x, w[0].y, w[1].x, w[1].y, w[2].x
				 *
				 * @return Four filtered values; output i combines source values i .. i + 4
				 */
				vec4 convolve1x5_stride1(vec4 tmp[2], vec2 w[3])
				{
				    vec4 lo = tmp[0];
				    vec4 hi = tmp[1];

				    // Taps are added in ascending order, one shifted window per tap
				    vec4 acc = lo * w[0].x;
				    acc += vec4(lo.yzw, hi.x) * w[0].y;
				    acc += vec4(lo.zw, hi.xy) * w[1].x;
				    acc += vec4(lo.w, hi.xyz) * w[1].y;
				    acc += hi * w[2].x;

				    return acc;
				}
				746
				/** Apply a five-tap horizontal filter at four positions spaced two source values apart.
				 *
				 * @param[in] tmp Twelve consecutive source values (three vec4)
				 * @param[in] w   Five taps stored as w[0].x, w[0].y, w[1].x, w[1].y, w[2].x
				 *
				 * @return Four filtered values; output i combines source values 2 * i .. 2 * i + 4
				 */
				vec4 convolve1x5_stride2(vec4 tmp[3], vec2 w[3])
				{
				    // Each tap reads every second source value, starting at the tap's own index
				    vec4 acc = vec4(tmp[0].xz, tmp[1].xz) * w[0].x;
				    acc += vec4(tmp[0].yw, tmp[1].yw) * w[0].y;
				    acc += vec4(tmp[0].z, tmp[1].xz, tmp[2].x) * w[1].x;
				    acc += vec4(tmp[0].w, tmp[1].yw, tmp[2].y) * w[1].y;
				    acc += vec4(tmp[1].xz, tmp[2].xz) * w[2].x;

				    return acc;
				}
				758
				/** Shader entry point: computes three output rows of four values for two consecutive output planes.
				 *
				 * For each of the two planes the source is rewound to its starting offset, the weights are
				 * positioned on that plane's kernel, and seven source rows per depth slice are accumulated
				 * into three output rows. Results are packed to half precision before being stored.
				 *
				 * NOTE(review): the first plane index is taken directly from gl_GlobalInvocationID.z. If the
				 * destination step in Z covers two planes per invocation, the weights/bias index would need
				 * to be 2 * gl_GlobalInvocationID.z - confirm against the host-side window configuration.
				 */
				void main()
				{
				    Image    src     = GC_CONVERT_TO_IMAGE_STRUCT(src);
				    Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
				    Tensor3D dst     = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);

				#ifdef BIAS
				    Vector bias = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
				#endif /* BIAS */

				    vec4  res[3];
				    vec2  w[5][3];
				    vec4  s[STRIDE_X + 1];
				    uvec2 packed_d;
				    uint  z_base   = gl_GlobalInvocationID.z;
				    uint  s_offset = src.current_offset;
				    uint  w_offset = weights.current_offset;
				    int   i, z;

				    for(z = 0; z < 2; z++)
				    {
				        // Index of the output plane handled in this pass
				        uint z_index = z_base + uint(z);

				        // Every plane reads the same source window from its first depth slice
				        src.current_offset = s_offset;

				        // Position the weights on this plane's kernel. Recomputed from the base on every
				        // pass so the result does not depend on weights_depth * weights_stride_z being
				        // equal to weights_stride_w.
				        weights.current_offset = w_offset + z_index * weights_stride_w;

				        for(i = 0; i < 3; i++)
				        {
				            res[i] = vec4(0);
				        }

				        for(int d = 0; d < int(weights_depth); ++d)
				        {
				            // load weights once
				            for(int row = 0; row < 5; row++)
				            {
				                w[row] = load_weight(weights, row);
				            }

				            // 1st line
				            s = LOAD_SRC(src, 0);
				            res[0] += CONVOLVE1x5(s, w[0]);

				            // 2nd line
				            s = LOAD_SRC(src, 1);
				            res[0] += CONVOLVE1x5(s, w[1]);
				            res[1] += CONVOLVE1x5(s, w[0]);

				            // 3rd line
				            s = LOAD_SRC(src, 2);
				            res[0] += CONVOLVE1x5(s, w[2]);
				            res[1] += CONVOLVE1x5(s, w[1]);
				            res[2] += CONVOLVE1x5(s, w[0]);

				            // 4th line
				            s = LOAD_SRC(src, 3);
				            res[0] += CONVOLVE1x5(s, w[3]);
				            res[1] += CONVOLVE1x5(s, w[2]);
				            res[2] += CONVOLVE1x5(s, w[1]);

				            // 5th line
				            s = LOAD_SRC(src, 4);
				            res[0] += CONVOLVE1x5(s, w[4]);
				            res[1] += CONVOLVE1x5(s, w[3]);
				            res[2] += CONVOLVE1x5(s, w[2]);

				            // 6th line
				            s = LOAD_SRC(src, 5);
				            res[1] += CONVOLVE1x5(s, w[4]);
				            res[2] += CONVOLVE1x5(s, w[3]);

				            // 7th line
				            s = LOAD_SRC(src, 6);
				            res[2] += CONVOLVE1x5(s, w[4]);

				            // Advance both tensors to the next depth slice
				            src.current_offset += src_stride_z;
				            weights.current_offset += weights_stride_z;
				        }

				#ifdef BIAS
				        uint  packed_b;
				        float b;

				        GC_LOAD1_1D_OFFSET(packed_b, bias, z_index);
				        // One 32-bit word carries two half biases; even indices take .x, odd take .y
				        b = (z_index % uint(2) == uint(0)) ? unpackHalf2x16(packed_b).x : unpackHalf2x16(packed_b).y;
				        for(i = 0; i < 3; i++)
				        {
				            res[i] += vec4(b);
				        }
				#endif /* BIAS */

				        // Pack each row of four floats into two 32-bit words and store it at Y offset i
				        for(i = 0; i < 3; i++)
				        {
				            packed_d = uvec2(packHalf2x16(res[i].xy), packHalf2x16(res[i].zw));
				            GC_STORE1_3D_OFFSET(packed_d, dst, 0, i, 0);
				        }

				        // Move on to the next output plane
				        dst.current_offset += dst_stride_z;
				    }
				}
				858
				859	#elif defined(PROCESS_8X_1Y_1Z)
				860
				861	/** An optimized direct convolution 5x5 OpenGL ES compute shader that processes 8 elements @ X at once
				862	*
				863	* @note This OpenGL ES shader works with stride_x = 1
				864	* @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
				865	* @note If biases are used then "#define BIAS" has to be passed at compile time
				866	*
				867	* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
				868	* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
				869	* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
				870	* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
				871	* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
				872	* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
				873	* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
				874	* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
				875	* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
				876	* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
				877	* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
				878	* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
				879	* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
				880	* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
				881	* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
				882	* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
				883	* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
				884	* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
				885	* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
				886	* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
				887	* @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
				888	* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
				889	* @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
				890	* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
				891	* @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
				892	* @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
				893	* @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
				894	* @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
				895	* @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
				896	* @param[in] weights_depth The third dimensions of the weights tensors
				897	*/
				898
				899	layout(std140) uniform shader_params // Per-dispatch tensor descriptors and weight-tensor extents
				900	{
				901	TENSOR3D_PARAM_DECLARATION(src);
				902	TENSOR3D_PARAM_DECLARATION(dst);
				903	TENSOR3D_PARAM_DECLARATION(weights);
				904	#ifdef BIAS
				905	VECTOR_PARAM_DECLARATION(biases); // NOTE(review): declared as "biases" while the buffer and the Vector in main() are named "bias" - confirm the helper macros do not need matching names
				906	#endif /* BIAS */
				907	uint weights_stride_w; // multiplied by the Z invocation index in main() to select a set of weights
				908	uint weights_depth; // trip count of the depth loop in main()
				909	};
				910
				911	BUFFER_DECLARATION(src, 1, uvec4, readonly); // load_src_stride1() unpacks the components of each uvec4 into float pairs
				912	BUFFER_DECLARATION(dst, 2, uvec4, writeonly); // main() stores eight half-packed results per uvec4
				913	BUFFER_DECLARATION(weights, 3, uint, readonly); // each uint is unpacked into two floats by load_weight()
				914	#ifdef BIAS
				915	BUFFER_DECLARATION(bias, 4, uint, readonly); // each uint holds two half biases; main() picks one by index parity
				916	#endif /* BIAS */
				917
				918	#if STRIDE_X == 1
				919	#define LOAD_SRC(src, row) load_src_stride1(src, row)
				920	#define CONVOLVE1x5(src, weight) convolve1x5_stride1(src, weight)
				921	#elif STRIDE_X == 2 /* STRIDE_X == 1 */
				922	#error stride == 2 for PROCESS_8X_1Y not implemented
				923	#else /* STRIDE_X == 1 */
				924	#error STRIDE_X larger than 2 is not supported
				925	#endif /* STRIDE_X == 1 */
				926
				/** Load two packed uvec4 words of one source row and expand the first twelve halves to floats.
				 *
				 * @param[in] src Source image read through GC_LOAD2_2D_OFFSET
				 * @param[in] row Row offset handed to the load macro
				 *
				 * @return Three vec4: all four components of the first word plus the x/y components of the second
				 */
				vec4[3] load_src_stride1(Image src, int row)
				{
				    uvec4 raw[2];

				    GC_LOAD2_2D_OFFSET(raw, src, 0, row);

				    // The z/w components of the second word are loaded but not needed
				    return vec4[3](vec4(unpackHalf2x16(raw[0].x), unpackHalf2x16(raw[0].y)),
				                   vec4(unpackHalf2x16(raw[0].z), unpackHalf2x16(raw[0].w)),
				                   vec4(unpackHalf2x16(raw[1].x), unpackHalf2x16(raw[1].y)));
				}
				940
				/** Load one row of weights as three packed words and unpack them to float pairs.
				 *
				 * @param[in] weights Weights tensor read through GC_LOAD3_3D_OFFSET
				 * @param[in] row     Row offset handed to the load macro
				 *
				 * @return Three vec2 (six floats); convolve1x5_stride1 reads the first five
				 */
				vec2[3] load_weight(Tensor3D weights, int row)
				{
				    uvec3 raw;

				    GC_LOAD3_3D_OFFSET(raw, weights, 0, row, 0);

				    return vec2[3](unpackHalf2x16(raw.x), unpackHalf2x16(raw.y), unpackHalf2x16(raw.z));
				}
				954
				/** Apply a five-tap horizontal filter at eight adjacent positions (unit step).
				 *
				 * @param[in] tmp Twelve consecutive source values (three vec4)
				 * @param[in] w   Five taps stored as w[0].x, w[0].y, w[1].x, w[1].y, w[2].x
				 *
				 * @return Two vec4: element 0 holds outputs 0..3, element 1 holds outputs 4..7
				 */
				vec4[2] convolve1x5_stride1(vec4 tmp[3], vec2 w[3])
				{
				    vec4 ret[2];

				    // Both halves use the same formula on neighbouring pairs of input vectors
				    for(int part = 0; part < 2; ++part)
				    {
				        vec4 lo = tmp[part];
				        vec4 hi = tmp[part + 1];

				        ret[part] = lo * w[0].x + vec4(lo.yzw, hi.x) * w[0].y + vec4(lo.zw, hi.xy) * w[1].x + vec4(lo.w, hi.xyz) * w[1].y + hi * w[2].x;
				    }

				    return ret;
				}
				975
				/** Shader entry point: computes eight horizontally adjacent output values of one output plane.
				 *
				 * For every depth slice, each of the five kernel rows is loaded together with the matching
				 * source row and the 1x5 partial sums are accumulated. The eight results are packed to
				 * half precision and written with a single store.
				 */
				void main()
				{
				    Image    src     = GC_CONVERT_TO_IMAGE_STRUCT(src);
				    Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
				    Tensor3D dst     = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);

				#ifdef BIAS
				    Vector bias = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
				#endif /* BIAS */

				    vec4  res[2];
				    vec2  w[3];
				    vec4  s[STRIDE_X + 2];
				    uvec4 packed_d;
				    uint  z_index = gl_GlobalInvocationID.z;

				    res[0] = vec4(0);
				    res[1] = vec4(0);

				    // Jump to the weights belonging to this invocation's Z index
				    weights.current_offset += z_index * weights_stride_w;

				    for(int d = 0; d < int(weights_depth); ++d)
				    {
				        for(int row = 0; row < 5; row++)
				        {
				            w = load_weight(weights, row);
				            s = LOAD_SRC(src, row);

				            // Evaluate the row convolution once and reuse both halves of the result
				            vec4 row_sum[2] = CONVOLVE1x5(s, w);
				            res[0] += row_sum[0];
				            res[1] += row_sum[1];
				        }

				        // Advance both tensors to the next depth slice
				        src.current_offset += src_stride_z;
				        weights.current_offset += weights_stride_z;
				    }

				#ifdef BIAS
				    uint  packed_b;
				    float b;

				    GC_LOAD1_1D_OFFSET(packed_b, bias, z_index);
				    // One 32-bit word carries two half biases; even indices take .x, odd take .y
				    b = (z_index % uint(2) == uint(0)) ? unpackHalf2x16(packed_b).x : unpackHalf2x16(packed_b).y;
				    res[0] += vec4(b);
				    res[1] += vec4(b);
				#endif /* BIAS */

				    // Pack the eight floats into four 32-bit words
				    packed_d.xy = uvec2(packHalf2x16(res[0].xy), packHalf2x16(res[0].zw));
				    packed_d.zw = uvec2(packHalf2x16(res[1].xy), packHalf2x16(res[1].zw));
				    GC_STORE1_3D_OFFSET(packed_d, dst, 0, 0, 0);
				}
				1024
				1025	#else /* defined(PROCESS_4X_1Y_1Z) */
				1026
				1027	#endif /* defined(PROCESS_4X_1Y_1Z) */
				1028
Anthony Barbier	7068f99	2017-10-26 15:23:08 +0100	[diff] [blame]	1029	#else /* DATA_TYPE_FP16 */
				1030	#error Data type not supported
				1031	#endif /* DATA_TYPE_FP16 */