Blame - src/core/CL/cl_kernels/direct_convolution3x3.cl - ml/ComputeLibrary

2017-08-08 10:53:00 +0100

[diff] [blame]

43

/* Select the 1x3 row-convolution implementation from the compile-time value of
 * STRIDE_X: 1 maps CONVOLUTION1x3 onto CONVOLUTION1x3_STRIDE1, 2 maps it onto
 * CONVOLUTION1x3_STRIDE2, and any other value aborts the build with #error.
 * The two target macros are defined further down; this is fine because macro
 * names are only resolved when CONVOLUTION1x3 is expanded. */
#if STRIDE_X == 1

44

#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr)

45

#elif STRIDE_X == 2 /* STRIDE_X == 1 */

46

#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr)

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

47

#else /* STRIDE_X not equals 1 or 2 */

48

#error "STRIDE_X larger than 2 is not supported"

49

#endif /* STRIDE_X == 2 */

50

Michalis Spyrou

2017-08-14 11:26:37 +0100

[diff] [blame]

51

/** Accumulate one 1x3 filter row into 8 horizontally adjacent outputs, stride 1.
 *
 * Loads 3 weights from @p weights_row_ptr and 10 consecutive source elements
 * from @p src_row_ptr (8 via vload8 plus 2 more via vload2 at offset 8), then
 * updates the accumulator so that, for i = 0..7:
 *   acc[i] = ADD_OP(acc[i], src[i] * w0 + src[i + 1] * w1 + src[i + 2] * w2)
 * with each product formed by MUL_OP and each sum by ADD_OP (both defined
 * outside this view).
 *
 * The body is wrapped in a `({ ... })` statement expression so the temporaries
 * (weights_values0, src0, src1) are local to each expansion.
 *
 * @param acc             Accumulator lvalue; read and reassigned three times.
 * @param src_row_ptr     Pointer to the first source element of the row. It is
 *                        expanded twice and used unparenthesized in
 *                        `src_row_ptr + 8`, so pass a cast/primary expression.
 * @param weights_row_ptr Pointer to the 3 weights of the filter row.
 */
#define CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr) \

52

({ \

steniu01

db00668

2017-08-09 16:26:22 +0100

[diff] [blame^]

53

VEC_DATA_TYPE(DATA_TYPE, 3) \

54

weights_values0 = vload3(0, weights_row_ptr); \

Michalis Spyrou

2017-08-14 11:26:37 +0100

[diff] [blame]

55

VEC_DATA_TYPE(DATA_TYPE, 8) \

56

src0 = vload8(0, src_row_ptr); \

57

VEC_DATA_TYPE(DATA_TYPE, 2) \

58

src1 = vload2(0, src_row_ptr + 8); \

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

59

\

Michalis Spyrou

2017-08-14 11:26:37 +0100

[diff] [blame]

60

acc = ADD_OP(acc, MUL_OP(src0, (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0)); \

61

acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1234, src0.s567, src1.s0), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1)); \

62

acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s234, src0.s567, src1.s01), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2)); \

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

63

})

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

64

Michalis Spyrou

2017-08-14 11:26:37 +0100

[diff] [blame]

65

/** Accumulate one 1x3 filter row into 8 outputs, horizontal stride 2.
 *
 * Loads 3 weights from @p weights_row_ptr and 17 consecutive source elements
 * from @p src_row_ptr (16 via vload16 plus one scalar at offset 16), then
 * updates the accumulator so that, for i = 0..7:
 *   acc[i] = ADD_OP(acc[i], src[2i] * w0 + src[2i + 1] * w1 + src[2i + 2] * w2)
 * The three taps are gathered as: the even lanes of src0 (0, 2, ..., 14), the
 * odd lanes (1, 3, ..., 15), and lanes 2, 4, ..., 14 followed by the extra
 * scalar (element 16). Products use MUL_OP and sums use ADD_OP (both defined
 * outside this view).
 *
 * The body is wrapped in a `({ ... })` statement expression so the temporaries
 * (weights_values0, src0, src1) are local to each expansion.
 *
 * @param acc             Accumulator lvalue; read and reassigned three times.
 * @param src_row_ptr     Pointer to the first source element of the row. It is
 *                        expanded twice and used unparenthesized in
 *                        `src_row_ptr + 16`, so pass a cast/primary expression.
 * @param weights_row_ptr Pointer to the 3 weights of the filter row.
 */
#define CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr) \

66

({ \

steniu01

db00668

2017-08-09 16:26:22 +0100

[diff] [blame^]

67

VEC_DATA_TYPE(DATA_TYPE, 3) \

68

weights_values0 = vload3(0, weights_row_ptr); \

Michalis Spyrou

2017-08-14 11:26:37 +0100

[diff] [blame]

69

VEC_DATA_TYPE(DATA_TYPE, 16) \

70

src0 = vload16(0, src_row_ptr); \

71

DATA_TYPE src1 = *(src_row_ptr + 16); \

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

72

\

Michalis Spyrou

2017-08-14 11:26:37 +0100

[diff] [blame]

73

acc = ADD_OP(acc, MUL_OP(src0.even, (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0)); \

74

acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1357, src0.s9BDF), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1)); \

75

acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s2468, src0.sACE, src1), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2)); \

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

76

})

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

77

78

/** This kernel performs a 3x3 direct convolution over the three lowest dimensions of the source tensor.

79

*

80

* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

81

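* @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
* @note The convolution stride along X must be passed at compile time using -DSTRIDE_X: e.g. -DSTRIDE_X=1 (only 1 and 2 are supported)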

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

82

* @note If biases are to be added to the convolution result, -DHAS_BIAS has to be passed at compile time.

83

*

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

84

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

85

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

86

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

87

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

88

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

89

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

90

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

91

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

92

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

93

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

94

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

95

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

96

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

97

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

98

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

99

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

100

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr

101

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

102

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

103

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

104

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

105

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

106

* @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)

107

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

108

* @param[in] biases_ptr Pointer to the biases tensor. Supported data types: same as @p src_ptr

109

* @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)

110

* @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)

111

* @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

112

* @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension (in bytes)

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

113

*/

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

114

#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

115

__kernel void direct_convolution3x3(

116

TENSOR3D_DECLARATION(src),

117

TENSOR3D_DECLARATION(dst),

118

TENSOR3D_DECLARATION(weights),

119

#ifdef HAS_BIAS

120

VECTOR_DECLARATION(biases),

121

#endif /* defined(HAS_BIAS) */

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

122

unsigned int weights_stride_w)

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

123

{

124

Image src = CONVERT_TO_IMAGE_STRUCT(src);

125

Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);

126

Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

127

Michalis Spyrou

2017-08-14 11:26:37 +0100

[diff] [blame]

128

VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

129

pixels0 = 0;

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

130

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

131

__global uchar *weights_addr = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0);

132

__global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

133

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

134

const int kernel_index = get_global_id(2);

135

weights_addr += kernel_index * weights_stride_w;

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

136

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

137

for(int d = 0; d < WEIGHTS_DEPTH; ++d)

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

138

{

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

139

CONVOLUTION1x3(pixels0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y));

140

CONVOLUTION1x3(pixels0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));

141

CONVOLUTION1x3(pixels0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

142

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

143

src_addr += src_stride_z;

144

weights_addr += weights_stride_z;

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

145

}

146

147

#ifdef HAS_BIAS

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

148

Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

149

Michalis Spyrou

2017-08-14 11:26:37 +0100

[diff] [blame]

150

pixels0 = ADD_OP(pixels0, (VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, kernel_index))));

steniu01

2017-07-18 17:37:43 +0100

[diff] [blame]

151

#endif /* defined(HAS_BIAS) */

152

Michalis Spyrou

2017-08-14 11:26:37 +0100

[diff] [blame]

153

vstore8(CONVERT_SAT(pixels0, VEC_DATA_TYPE(DATA_TYPE, 8)), 0, (__global DATA_TYPE *)dst.ptr);

steniu01