COMPMID-631: Merge branches/gles_compute branch

Last commit:
commit b25c5f68042b0c81bf611d59a1bb8535e1c42497
Author: Xinghang Zhou <xinghang.zhou@arm.com>
Date:   Wed Oct 25 18:48:10 2017 +0800

    Synced validation's tolerances of GCSoftmax from cl side

Change-Id: Ibe72054205c1c8721845d679a31af7ed0a7c5cf6
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/93283
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
diff --git a/src/core/GLES_COMPUTE/GCKernelLibrary.cpp b/src/core/GLES_COMPUTE/GCKernelLibrary.cpp
new file mode 100644
index 0000000..fd362f1
--- /dev/null
+++ b/src/core/GLES_COMPUTE/GCKernelLibrary.cpp
@@ -0,0 +1,717 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Utils.h"
+
+#include <cstring>
+#include <fstream>
+#include <functional>
+#include <iomanip>
+#include <iostream>
+#include <regex>
+#include <sstream>
+#include <utility>
+#include <vector>
+
+using namespace arm_compute;
+
+GCProgram::GCProgram()
+    : _name(), _source()
+{
+}
+
+GCProgram::GCProgram(std::string name, std::string source)
+    : _name(std::move(name)), _source(std::move(source))
+{
+}
+
+GLuint GCProgram::link_program(GLuint shader)
+{
+    GLuint program = ARM_COMPUTE_GL_CHECK(glCreateProgram());
+
+    GLint   rvalue;
+    GLsizei length;
+
+    ARM_COMPUTE_GL_CHECK(glAttachShader(program, shader));
+    ARM_COMPUTE_GL_CHECK(glLinkProgram(program));
+    ARM_COMPUTE_GL_CHECK(glDetachShader(program, shader));
+    ARM_COMPUTE_GL_CHECK(glDeleteShader(shader));
+
+    // Check if there were any issues when linking the shader.
+    ARM_COMPUTE_GL_CHECK(glGetProgramiv(program, GL_LINK_STATUS, &rvalue));
+
+    if(rvalue == 0)
+    {
+        ARM_COMPUTE_GL_CHECK(glGetProgramiv(program, GL_INFO_LOG_LENGTH, &length));
+
+        std::vector<GLchar> log(length);
+        ARM_COMPUTE_GL_CHECK(glGetProgramInfoLog(program, length, nullptr, log.data()));
+        ARM_COMPUTE_ERROR("Error: Linker log:\n%s\n", log.data());
+
+        return 0;
+    }
+
+    ARM_COMPUTE_GL_CHECK(glUseProgram(program));
+
+    return program;
+}
+
+GLuint GCProgram::compile_shader(const std::string &build_options)
+{
+    GLuint shader = ARM_COMPUTE_GL_CHECK(glCreateShader(GL_COMPUTE_SHADER));
+
+    const char *src[]
+    {
+        "#version 310 es\n",
+        build_options.c_str(),
+        _source.c_str()
+    };
+
+    ARM_COMPUTE_GL_CHECK(glShaderSource(shader, sizeof(src) / sizeof(src[0]), src, nullptr));
+
+    ARM_COMPUTE_GL_CHECK(glCompileShader(shader));
+
+    // Check if there were any issues when compiling the shader
+    GLint   rvalue;
+    GLsizei length;
+
+    ARM_COMPUTE_GL_CHECK(glGetShaderiv(shader, GL_COMPILE_STATUS, &rvalue));
+
+    if(rvalue == 0)
+    {
+        ARM_COMPUTE_GL_CHECK(glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &length));
+
+        std::vector<GLchar> log(length);
+        ARM_COMPUTE_GL_CHECK(glGetShaderInfoLog(shader, length, nullptr, log.data()));
+
+#ifdef ARM_COMPUTE_DEBUG_ENABLED
+        std::istringstream ss(_source);
+        std::stringstream  output_stream;
+        std::string        line;
+        size_t             line_num = 1;
+
+        ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("GLES Shader build options:\n%s\n", build_options.c_str());
+        while(std::getline(ss, line, '\n'))
+        {
+            output_stream << std::setw(6) << line_num << ": " << line << std::endl;
+            line_num++;
+        }
+        ARM_COMPUTE_LOG_INFO_STREAM_CORE("GLES Shader source code:" << output_stream.rdbuf());
+#endif /* ARM_COMPUTE_DEBUG_ENABLED */
+
+        ARM_COMPUTE_ERROR("Error: Compiler log:\n%s\n", log.data());
+
+        return 0;
+    }
+
+    return shader;
+}
+
+GCKernel::GCKernel()
+    : _name(), _program(), _params(), _shader_params(), _shader_params_binding_point(), _shader_params_index(), _shader_params_size()
+{
+}
+
+GCKernel::GCKernel(std::string name, GLuint program)
+    : _name(std::move(name)),
+      _program(program),
+      _params(),
+      _shader_params(0),
+      _shader_params_binding_point(0),
+      _shader_params_index(0),
+      _shader_params_size(0)
+{
+    _params.clear();
+
+    ARM_COMPUTE_GL_CHECK(glGenBuffers(1, &_shader_params));
+
+    _shader_params_index = ARM_COMPUTE_GL_CHECK(glGetUniformBlockIndex(_program, _shader_params_name));
+    ARM_COMPUTE_ERROR_ON_MSG((_shader_params_index == GL_INVALID_INDEX), "Failed to get index of %s", _shader_params_name);
+    ARM_COMPUTE_GL_CHECK(glGetActiveUniformBlockiv(_program, _shader_params_index, GL_UNIFORM_BLOCK_DATA_SIZE, &_shader_params_size));
+    ARM_COMPUTE_ERROR_ON_MSG((_shader_params_size == 0), "Failed to get size of %s", _shader_params_name);
+}
+
+void GCKernel::cleanup()
+{
+    ARM_COMPUTE_GL_CHECK(glDeleteBuffers(1, &_shader_params));
+    ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_UNIFORM_BUFFER, 0));
+    ARM_COMPUTE_GL_CHECK(glDeleteProgram(_program));
+    ARM_COMPUTE_GL_CHECK(glUseProgram(0));
+}
+
+void GCKernel::use()
+{
+    ARM_COMPUTE_GL_CHECK(glUseProgram(_program));
+}
+
+void GCKernel::unuse()
+{
+    ARM_COMPUTE_GL_CHECK(glUseProgram(0));
+}
+
+void GCKernel::update_shader_params()
+{
+    ARM_COMPUTE_ERROR_ON_MSG((_shader_params_size != (int)(_params.size() * sizeof(_params[0]))), "Params size (%zu) is not equal to shader params block size (%d)", _params.size() * sizeof(_params[0]),
+                             _shader_params_size);
+
+    ARM_COMPUTE_GL_CHECK(glUniformBlockBinding(_program, _shader_params_index, _shader_params_binding_point));
+    ARM_COMPUTE_GL_CHECK(glBindBufferBase(GL_UNIFORM_BUFFER, _shader_params_binding_point, _shader_params));
+    ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_UNIFORM_BUFFER, _shader_params));
+    ARM_COMPUTE_GL_CHECK(glBufferData(GL_UNIFORM_BUFFER, _shader_params_size, _params.data(), GL_DYNAMIC_DRAW));
+    ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_UNIFORM_BUFFER, 0));
+}
+
+const std::map<std::string, std::string> GCKernelLibrary::_shader_program_map =
+{
+    { "absdiff", "absdiff.cs" },
+    { "col2im", "convolution_layer.cs" },
+    { "direct_convolution1x1", "direct_convolution1x1.cs" },
+    { "direct_convolution3x3", "direct_convolution3x3.cs" },
+    { "direct_convolution5x5", "direct_convolution5x5.cs" },
+    { "pooling_layer_2", "pooling_layer.cs" },
+    { "pooling_layer_3", "pooling_layer.cs" },
+    { "pooling_layer_7", "pooling_layer.cs" },
+    { "pooling_layer_3_optimized", "pooling_layer.cs" },
+    { "pooling_layer_n", "pooling_layer.cs" },
+    { "fill_image_borders_replicate", "fill_border.cs" },
+    { "fill_image_borders_constant", "fill_border.cs" },
+    { "gemm_accumulate_biases", "gemm.cs" },
+    { "gemm_interleave4x4", "gemm.cs" },
+    { "gemm_ma", "gemm.cs" },
+    { "gemm_mm_interleaved_transposed", "gemm.cs" },
+    { "gemm_mm_floating_point", "gemm.cs" },
+    { "gemm_transpose1x4", "gemm.cs" },
+    { "im2col_kernel3x3_padx0_pady0", "convolution_layer.cs" },
+    { "im2col_generic", "convolution_layer.cs" },
+    { "im2col_reduced", "convolution_layer.cs" },
+    { "transpose", "transpose.cs" },
+    { "activation_layer", "activation_layer.cs" },
+    { "softmax_layer_max", "softmax_layer.cs" },
+    { "softmax_layer_shift_exp_sum", "softmax_layer.cs" },
+    { "softmax_layer_norm", "softmax_layer.cs" },
+    { "pixelwise_mul_float", "pixelwise_mul_float.cs" },
+    { "normalization_layer", "normalization_layer.cs" },
+    { "batchnormalization_layer", "batchnormalization_layer.cs" },
+    { "concatenate_depth", "concatenate.cs" },
+    { "dropout", "dropout.cs" },
+};
+
+const std::map<std::string, std::string> GCKernelLibrary::_program_source_map =
+{
+#ifdef EMBEDDED_KERNELS
+    {
+        "absdiff.cs",
+#include "./cs_shaders/absdiff.csembed"
+    },
+    {
+        "convolution_layer.cs",
+#include "./cs_shaders/convolution_layer.csembed"
+    },
+    {
+        "direct_convolution1x1.cs",
+#include "./cs_shaders/direct_convolution1x1.csembed"
+    },
+    {
+        "direct_convolution3x3.cs",
+#include "./cs_shaders/direct_convolution3x3.csembed"
+    },
+    {
+        "direct_convolution5x5.cs",
+#include "./cs_shaders/direct_convolution5x5.csembed"
+    },
+    {
+        "pooling_layer.cs",
+#include "./cs_shaders/pooling_layer.csembed"
+    },
+    {
+        "fill_border.cs",
+#include "./cs_shaders/fill_border.csembed"
+    },
+    {
+        "gemm.cs",
+#include "./cs_shaders/gemm.csembed"
+    },
+    {
+        "transpose.cs",
+#include "./cs_shaders/transpose.csembed"
+    },
+    {
+        "activation_layer.cs",
+#include "./cs_shaders/activation_layer.csembed"
+    },
+    {
+        "softmax_layer.cs",
+#include "./cs_shaders/softmax_layer.csembed"
+    },
+    {
+        "pixelwise_mul_float.cs",
+#include "./cs_shaders/pixelwise_mul_float.csembed"
+    },
+    {
+        "normalization_layer.cs",
+#include "./cs_shaders/normalization_layer.csembed"
+    },
+    {
+        "batchnormalization_layer.cs",
+#include "./cs_shaders/batchnormalization_layer.csembed"
+    },
+    {
+        "concatenate.cs",
+#include "./cs_shaders/concatenate.csembed"
+    },
+    {
+        "dropout.cs",
+#include "./cs_shaders/dropout.csembed"
+    },
+#endif /* EMBEDDED_KERNELS */
+};
+
+GCKernelLibrary::GCKernelLibrary()
+    : _display(EGL_NO_DISPLAY), _context(EGL_NO_CONTEXT), _frame_buffer(0), _tex_rt(0), _own_context(false), _shader_path("./"), _programs_map(), _built_programs_map()
+{
+}
+
+GCKernelLibrary &GCKernelLibrary::get()
+{
+    static GCKernelLibrary _kernel_library;
+    return _kernel_library;
+}
+
+GCKernel GCKernelLibrary::create_kernel(const std::string &shader_name, const StringSet &build_options_set) const
+{
+    // Find which program contains the kernel
+    auto shader_program_it = _shader_program_map.find(shader_name);
+
+    if(_shader_program_map.end() == shader_program_it)
+    {
+        ARM_COMPUTE_ERROR("Shader %s not found in the GCKernelLibrary", shader_name.c_str());
+    }
+
+    // Check if the program has been built before with the same build options.
+    const std::string program_name       = shader_program_it->second;
+    const std::string build_options      = stringify_set(build_options_set);
+    const std::string built_program_name = program_name + "_" + build_options;
+    auto              built_program_it   = _built_programs_map.find(built_program_name);
+
+    GCKernel kernel;
+
+    if(_built_programs_map.end() != built_program_it)
+    {
+        // The program has been built before with these options; reuse the cached kernel
+        kernel = built_program_it->second;
+        kernel.use();
+    }
+    else
+    {
+        GCProgram program = load_program(program_name);
+
+        // Compile the shader
+        GLuint shader = program.compile_shader(build_options);
+
+        // Link the program
+        GLuint gles_program = program.link_program(shader);
+
+        // Create GCKernel
+        kernel = GCKernel(shader_name, gles_program);
+
+        // Add built program to internal map
+        _built_programs_map.emplace(built_program_name, kernel);
+    }
+
+    return kernel;
+}
+
+const std::string GCKernelLibrary::preprocess_shader(const std::string &shader_source) const
+{
+    enum class ParserStage
+    {
+        FIRST,
+        SKIP_COMMENTS = FIRST,
+        RESOLVE_INCLUDES,
+        SKIP_PREPROCESSOR_DIRECTIVES,
+        SEARCH_MACRO_DEFINITIONS,
+        EXPAND_MACRO_USES,
+        LAST
+    };
+
+    struct MacroDefinitionInfo
+    {
+        const std::vector<std::string> param_list;
+        const std::string              content;
+    };
+
+    // Found macro definitions so far
+    std::map<const std::string, const MacroDefinitionInfo> macro_definitions;
+
+    // Define a GLES compute shader parser function
+    std::function<std::string(const std::string &, ParserStage, int)> cs_parser;
+    cs_parser = [&](const std::string & src, ParserStage stage, int nested_level) -> std::string
+    {
+        std::string dst;
+
+        if(stage == ParserStage::LAST || std::regex_match(src, std::regex(R"(\s*)")))
+        {
+            return src;
+        }
+        auto next_stage = static_cast<ParserStage>(static_cast<int>(stage) + 1);
+
+        std::string search_pattern;
+        switch(stage)
+        {
+            case ParserStage::SKIP_COMMENTS:
+                search_pattern = R"((/\*([^*]|\n|(\*+([^*/]|\n)))*\*+/)|(//.*))";
+                break;
+            case ParserStage::RESOLVE_INCLUDES:
+                search_pattern = R"rgx((?:^|\n)[ \t]*#include "(.*)")rgx";
+                break;
+            case ParserStage::SKIP_PREPROCESSOR_DIRECTIVES:
+                search_pattern = R"((^|\n)[ \t]*(#ifdef|#ifndef|#if)[^\n]+)";
+                break;
+            case ParserStage::SEARCH_MACRO_DEFINITIONS:
+                search_pattern = R"((?:^|\n)[ \t]*#define[ \t]+(\w+)(?:\((\w+(?:[ \t]*,[ \t]*\w+)*)\))?(?: |\t|\\\n)*((?:(?:[^\\\n]|\\[^\n])*\\+\n)*(?:[ \t]*[^ \t\n]+)*)[ \t]*)";
+                break;
+            case ParserStage::EXPAND_MACRO_USES:
+            {
+                if(macro_definitions.empty())
+                {
+                    // Nothing to expand
+                    return src;
+                }
+                int i = 0;
+                for(auto &def : macro_definitions)
+                {
+                    if(i == 0)
+                    {
+                        search_pattern = R"((\b)" + def.first;
+                    }
+                    else
+                    {
+                        search_pattern += R"(\b|\b)" + def.first;
+                    }
+                    i++;
+                }
+                search_pattern += R"(\b))";
+                break;
+            }
+            default:
+                break;
+        }
+
+        std::regex  search_regex(search_pattern);
+        std::smatch match;
+        ptrdiff_t   parsed_pos = 0;
+        if(std::regex_search(src, match, search_regex))
+        {
+            // Pass the content before the match to the next stage
+            dst.append(cs_parser(src.substr(0, match.position()), next_stage, 0));
+            parsed_pos = match.position() + match.length();
+
+            // Deal with the matched content
+            switch(stage)
+            {
+                case ParserStage::RESOLVE_INCLUDES:
+                {
+                    // Replace with the included file contents
+                    // And parse the content from the first stage
+                    const std::string source_name = _shader_path + match.str(1);
+                    dst.append(cs_parser(read_file(source_name, false), ParserStage::FIRST, 0));
+                    break;
+                }
+                case ParserStage::SEARCH_MACRO_DEFINITIONS:
+                {
+                    std::regex                     params_regex(R"(\b\w+\b)");
+                    const std::string              macro_param_str = match.str(2);
+                    const std::vector<std::string> macro_param_list(
+                        std::sregex_token_iterator(macro_param_str.begin(),
+                                                   macro_param_str.end(),
+                                                   params_regex),
+                        std::sregex_token_iterator());
+
+                    const MacroDefinitionInfo info =
+                    {
+                        macro_param_list,
+                        match.str(3)
+                    };
+                    // Collect the macro definition data without changing the shader source
+                    macro_definitions.insert(std::pair<const std::string, const MacroDefinitionInfo>(match.str(1), info));
+                    dst.append(match.str());
+                    break;
+                }
+                case ParserStage::EXPAND_MACRO_USES:
+                {
+                    ptrdiff_t                args_str_length = 0;
+                    std::vector<std::string> args_list;
+
+                    // Walk through the argument list by hand, because the regular expression does NOT support nested parentheses
+                    size_t cur_args_str_pos = match.position() + match.length();
+                    if(src[cur_args_str_pos++] == '(')
+                    {
+                        int       nested_parentheses = 0;
+                        ptrdiff_t cur_arg_pos        = cur_args_str_pos;
+                        ptrdiff_t cur_arg_length     = 0;
+
+                        args_str_length++;
+                        while(src[cur_args_str_pos] != ')' || nested_parentheses != 0)
+                        {
+                            switch(src[cur_args_str_pos++])
+                            {
+                                case '(':
+                                    nested_parentheses++;
+                                    cur_arg_length++;
+                                    break;
+                                case ',':
+                                    if(nested_parentheses == 0)
+                                    {
+                                        args_list.push_back(src.substr(cur_arg_pos, cur_arg_length));
+                                        cur_arg_pos    = cur_args_str_pos;
+                                        cur_arg_length = 0;
+                                    }
+                                    else
+                                    {
+                                        cur_arg_length++;
+                                    }
+                                    break;
+                                case ' ':
+                                case '\t':
+                                    if(cur_arg_length == 0)
+                                    {
+                                        cur_arg_pos++;
+                                    }
+                                    else
+                                    {
+                                        cur_arg_length++;
+                                    }
+                                    break;
+                                case ')':
+                                    nested_parentheses--;
+                                // No break here: fall through so the character is counted
+                                default:
+                                    cur_arg_length++;
+                                    break;
+                            }
+                            args_str_length++;
+                        }
+                        if(src[cur_args_str_pos] == ')' && nested_parentheses == 0)
+                        {
+                            args_list.push_back(src.substr(cur_arg_pos, cur_arg_length));
+                        }
+                        args_str_length++;
+                    }
+
+                    std::string                    expanded_content = match.str();
+                    const std::vector<std::string> macro_param_list = macro_definitions.at(match.str()).param_list;
+
+                    if((nested_level != 0 || !macro_param_list.empty()) && macro_param_list.size() == args_list.size())
+                    {
+                        parsed_pos += args_str_length;
+                        expanded_content = macro_definitions.at(match.str()).content;
+                        size_t i         = 0;
+                        for(auto &param_name : macro_param_list)
+                        {
+                            std::regex params_regex(R"(\b)" + param_name + R"(\b)");
+                            expanded_content.assign(std::regex_replace(expanded_content, params_regex, args_list[i]));
+                            ++i;
+                        }
+                        // Expand macro recursively
+                        expanded_content = cs_parser(expanded_content, stage, nested_level + 1);
+
+                        if(nested_level == 0)
+                        {
+                            const std::regex token_pasting_rgx = std::regex(R"(\b##\b)");
+                            if(std::regex_search(expanded_content, token_pasting_rgx))
+                            {
+                                // Remove token pasting operator "##"
+                                expanded_content.assign(std::regex_replace(expanded_content, std::regex(token_pasting_rgx), ""));
+                                // Trim trailing whitespace
+                                expanded_content.assign(std::regex_replace(expanded_content, std::regex(R"([ \t]*\\\n)"), "\n"));
+                            }
+                            else
+                            {
+                                // Do not expand the macro if the expansion does not contain the token pasting operator "##"
+                                expanded_content = src.substr(match.position(), match.length() + args_str_length);
+                            }
+                        }
+                    }
+                    dst.append(expanded_content);
+                    break;
+                }
+                case ParserStage::SKIP_COMMENTS:
+                case ParserStage::SKIP_PREPROCESSOR_DIRECTIVES:
+                default:
+                    dst.append(match.str());
+                    break;
+            }
+            next_stage = stage;
+        }
+        dst.append(cs_parser(src.substr(parsed_pos, src.length() - parsed_pos), next_stage, 0));
+
+        return dst;
+    };
+
+    return cs_parser(shader_source, ParserStage::FIRST, 0);
+}
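
To make the parser above concrete, here is an illustrative input/output pair (the file name "helpers_cs.h" and the macro LOAD4 are made up for this sketch). Given a shader fragment such as:

    #include "helpers_cs.h"
    #define LOAD4(r, name, offset) r = name##_buffer[offset]
    LOAD4(data, src, index);

the include is replaced by the (recursively preprocessed) contents of _shader_path + "helpers_cs.h", the macro definition is recorded and left in place for the GLSL compiler, and the macro use is expanded because its expansion contains "##", which GLSL ES does not support. The use becomes:

    data = src_buffer[index];

Macro uses whose expansion contains no "##" are deliberately left unexpanded (see the nested_level == 0 branch of EXPAND_MACRO_USES above), since the shader compiler handles those itself.
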
+
+const GCProgram &GCKernelLibrary::load_program(const std::string &program_name) const
+{
+    const auto program_it = _programs_map.find(program_name);
+
+    if(program_it != _programs_map.end())
+    {
+        return program_it->second;
+    }
+
+    GCProgram program;
+
+#ifdef EMBEDDED_KERNELS
+    const auto program_source_it = _program_source_map.find(program_name);
+
+    if(_program_source_map.end() == program_source_it)
+    {
+        ARM_COMPUTE_ERROR("Embedded program for %s does not exist.", program_name.c_str());
+    }
+
+    // TODO(APPBROWSER-298): Do not call the shader preprocessor here
+    //       We should do the preprocessing at compile time instead.
+    //       The preprocess_shader function exists to support the "#include" directive and the token pasting operator "##".
+    //       This job could be done at compile time by a Python script, which would give better performance at runtime.
+    //       Note that EMBEDDED_KERNELS is usually defined in release builds.
+    program = GCProgram(program_name, preprocess_shader(program_source_it->second));
+#else  /* EMBEDDED_KERNELS */
+    // Check if the shader source file exists
+    std::string source_name = _shader_path + program_name;
+    if(std::ifstream(source_name).is_open())
+    {
+        program = GCProgram(program_name, preprocess_shader(read_file(source_name, false)));
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Shader file %s does not exist.", source_name.c_str());
+    }
+#endif /* EMBEDDED_KERNELS */
+
+    // Insert program to program map
+    const auto new_program = _programs_map.emplace(program_name, std::move(program));
+
+    return new_program.first->second;
+}
+
+void GCKernelLibrary::setup_context()
+{
+    EGLBoolean res;
+    _display = eglGetDisplay(EGL_DEFAULT_DISPLAY);
+
+    ARM_COMPUTE_ERROR_ON_MSG(_display == EGL_NO_DISPLAY, "Failed to get display: 0x%x.", eglGetError());
+
+    res = eglInitialize(_display, nullptr, nullptr);
+
+    ARM_COMPUTE_ERROR_ON_MSG(res == EGL_FALSE, "Failed to initialize egl: 0x%x.", eglGetError());
+    ARM_COMPUTE_UNUSED(res);
+
+    const char *egl_extension_st = eglQueryString(_display, EGL_EXTENSIONS);
+    ARM_COMPUTE_ERROR_ON_MSG((strstr(egl_extension_st, "EGL_KHR_create_context") == nullptr), "Failed to query EGL_KHR_create_context");
+    ARM_COMPUTE_ERROR_ON_MSG((strstr(egl_extension_st, "EGL_KHR_surfaceless_context") == nullptr), "Failed to query EGL_KHR_surfaceless_context");
+    ARM_COMPUTE_UNUSED(egl_extension_st);
+
+    const EGLint config_attribs[] =
+    {
+        EGL_RENDERABLE_TYPE, EGL_OPENGL_ES3_BIT_KHR,
+        EGL_NONE
+    };
+    EGLConfig cfg;
+    EGLint    count;
+
+    res = eglChooseConfig(_display, config_attribs, &cfg, 1, &count);
+
+    ARM_COMPUTE_ERROR_ON_MSG(res == EGL_FALSE, "Failed to choose config: 0x%x.", eglGetError());
+    ARM_COMPUTE_UNUSED(res);
+
+    res = eglBindAPI(EGL_OPENGL_ES_API);
+
+    ARM_COMPUTE_ERROR_ON_MSG(res == EGL_FALSE, "Failed to bind api: 0x%x.", eglGetError());
+
+    const EGLint attribs[] =
+    {
+        EGL_CONTEXT_CLIENT_VERSION, 3,
+        EGL_NONE
+    };
+    _context = eglCreateContext(_display,
+                                cfg,
+                                EGL_NO_CONTEXT,
+                                attribs);
+
+    ARM_COMPUTE_ERROR_ON_MSG(_context == EGL_NO_CONTEXT, "Failed to create context: 0x%x.", eglGetError());
+    ARM_COMPUTE_UNUSED(res);
+
+    res = eglMakeCurrent(_display, EGL_NO_SURFACE, EGL_NO_SURFACE, _context);
+
+    ARM_COMPUTE_ERROR_ON_MSG(res == EGL_FALSE, "Failed to make current: 0x%x.", eglGetError());
+    ARM_COMPUTE_UNUSED(res);
+}
+
+void GCKernelLibrary::setup_dummy_fbo()
+{
+    ARM_COMPUTE_GL_CHECK(glGenFramebuffers(1, &_frame_buffer));
+    ARM_COMPUTE_GL_CHECK(glBindFramebuffer(GL_FRAMEBUFFER, _frame_buffer));
+    ARM_COMPUTE_GL_CHECK(glGenTextures(1, &_tex_rt));
+    ARM_COMPUTE_GL_CHECK(glBindTexture(GL_TEXTURE_2D, _tex_rt));
+    ARM_COMPUTE_GL_CHECK(glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, 1, 1, 0, GL_RGB, GL_UNSIGNED_BYTE, nullptr));
+    ARM_COMPUTE_GL_CHECK(glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, _tex_rt, 0));
+}
+
+GCKernelLibrary::~GCKernelLibrary()
+{
+    for(auto &program : _built_programs_map)
+    {
+        static_cast<GCKernel>(program.second).cleanup();
+    }
+
+    ARM_COMPUTE_GL_CHECK(glBindTexture(GL_TEXTURE_2D, 0));
+    ARM_COMPUTE_GL_CHECK(glBindFramebuffer(GL_FRAMEBUFFER, 0));
+    ARM_COMPUTE_GL_CHECK(glDeleteTextures(1, &_tex_rt));
+    ARM_COMPUTE_GL_CHECK(glDeleteFramebuffers(1, &_frame_buffer));
+
+    if(_own_context)
+    {
+        eglDestroyContext(_display, _context);
+        eglTerminate(_display);
+
+        _context = EGL_NO_CONTEXT;
+        _display = EGL_NO_DISPLAY;
+    }
+}
+
+std::string GCKernelLibrary::stringify_set(const StringSet &s) const
+{
+    std::string concat_set;
+
+    // Concatenate set
+    for(const auto &el : s)
+    {
+        concat_set += el + "\n";
+    }
+
+    return concat_set;
+}
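
A minimal caller-side sketch of the library above (illustrative only: StringSet is assumed to be the std::set<std::string> alias used by create_kernel(), and the build option shown is hypothetical):

    GCKernelLibrary &library = GCKernelLibrary::get();
    library.setup_context();   // surfaceless EGL display + GLES 3 context, as implemented above
    library.setup_dummy_fbo(); // 1x1 texture bound as a colour attachment

    StringSet build_opts = { "#define DATA_TYPE_FP32" };
    GCKernel  kernel     = library.create_kernel("transpose", build_opts);
    kernel.use();

A second create_kernel("transpose", build_opts) call would hit _built_programs_map and reuse the already linked program instead of recompiling the shader.
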
diff --git a/src/core/GLES_COMPUTE/IGCKernel.cpp b/src/core/GLES_COMPUTE/IGCKernel.cpp
new file mode 100644
index 0000000..154a2c0
--- /dev/null
+++ b/src/core/GLES_COMPUTE/IGCKernel.cpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
+
+#include <cstddef>
+#include <sstream>
+
+using namespace arm_compute;
+
+void arm_compute::enqueue(IGCKernel &kernel, const Window &window, const gles::NDRange &lws)
+{
+    ARM_COMPUTE_UNUSED(kernel);
+
+    if(kernel.kernel().get_program() == 0)
+    {
+        return;
+    }
+
+    ARM_COMPUTE_ERROR_ON((0 == (window.x().end() - window.x().start())) || (0 == (window.y().end() - window.y().start())));
+
+    ARM_COMPUTE_ERROR_ON_MSG((((window.x().end() - window.x().start()) % (window.x().step() * lws[0])) != 0),
+                             "window x end=%d, start=%d, step=%d, lws x=%zu", window.x().end(), window.x().start(), window.x().step(), lws[0]);
+    ARM_COMPUTE_ERROR_ON_MSG((((window.y().end() - window.y().start()) % (window.y().step() * lws[1])) != 0),
+                             "window y end=%d, start=%d, step=%d, lws y=%zu", window.y().end(), window.y().start(), window.y().step(), lws[1]);
+    ARM_COMPUTE_ERROR_ON_MSG((((window.z().end() - window.z().start()) % (window.z().step() * lws[2])) != 0),
+                             "window z end=%d, start=%d, step=%d, lws z=%zu", window.z().end(), window.z().start(), window.z().step(), lws[2]);
+
+    ARM_COMPUTE_GL_CHECK(glDispatchCompute((window.x().end() - window.x().start()) / (window.x().step() * lws[0]),
+                                           (window.y().end() - window.y().start()) / (window.y().step() * lws[1]),
+                                           (window.z().end() - window.z().start()) / (window.z().step() * lws[2])));
+}
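
As a worked example of the dispatch arithmetic above: for a window with x in [0, 32), an x step of 4 and lws[0] == 2, each invocation covers 4 elements along x and each work group covers step * lws = 8 of them, so (32 - 0) / (4 * 2) == 4 work groups are dispatched on the x axis. The divisibility assertions exist precisely so that these divisions are exact.
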
+
+IGCKernel::IGCKernel()
+    : _kernel()
+{
+}
+
+GCKernel &IGCKernel::kernel()
+{
+    return _kernel;
+}
+
+template <unsigned int dimension_size>
+unsigned int           IGCKernel::num_arguments_per_tensor() const
+{
+    return 2 + 2 * dimension_size;
+}
+
+template <unsigned int dimension_size>
+void IGCKernel::add_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const BufferParam &param, const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON(tensor == nullptr);
+
+    const ITensorInfo *info    = tensor->info();
+    const Strides     &strides = info->strides_in_bytes();
+
+    // Calculate offset to the start of the window
+    unsigned int offset_first_element = info->offset_first_element_in_bytes();
+
+    for(unsigned int n = 0; n < info->num_dimensions(); ++n)
+    {
+        offset_first_element += window[n].start() * strides[n];
+    }
+
+    unsigned int idx_start = idx;
+
+    for(unsigned int dimension = 0; dimension < dimension_size; dimension++)
+    {
+        _kernel.set_params(idx++, strides[dimension]);
+        _kernel.set_params(idx++, strides[dimension] * window[dimension].step());
+    }
+
+    _kernel.set_params(idx++, offset_first_element);
+    _kernel.set_params(idx++, param.buffer_data_type_shift);
+
+    ARM_COMPUTE_GL_CHECK(glBindBufferBase(GL_SHADER_STORAGE_BUFFER, param.binding_point, tensor->gc_buffer()));
+
+    ARM_COMPUTE_ERROR_ON_MSG(idx_start + num_arguments_per_tensor<dimension_size>() != idx,
+                             "add_%dD_tensor_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_tensor<dimension_size>());
+    ARM_COMPUTE_UNUSED(idx_start);
+}
+
+void IGCKernel::add_1D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const unsigned int binding_point, const Window &window)
+{
+    add_tensor_argument<1>(idx, tensor, BufferParam(binding_point, 0), window);
+}
+
+void IGCKernel::add_1D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const BufferParam &param, const Window &window)
+{
+    add_tensor_argument<1>(idx, tensor, param, window);
+}
+
+void IGCKernel::add_2D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const unsigned int binding_point, const Window &window)
+{
+    add_tensor_argument<2>(idx, tensor, BufferParam(binding_point, 0), window);
+}
+
+void IGCKernel::add_2D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const BufferParam &param, const Window &window)
+{
+    add_tensor_argument<2>(idx, tensor, param, window);
+}
+
+void IGCKernel::add_3D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const unsigned int binding_point, const Window &window)
+{
+    add_tensor_argument<3>(idx, tensor, BufferParam(binding_point, 0), window);
+}
+
+void IGCKernel::add_3D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const BufferParam &param, const Window &window)
+{
+    add_tensor_argument<3>(idx, tensor, param, window);
+}
+
+unsigned int IGCKernel::num_arguments_per_1D_tensor() const
+{
+    return num_arguments_per_tensor<1>();
+}
+
+unsigned int IGCKernel::num_arguments_per_2D_tensor() const
+{
+    return num_arguments_per_tensor<2>();
+}
+
+unsigned int IGCKernel::num_arguments_per_3D_tensor() const
+{
+    return num_arguments_per_tensor<3>();
+}
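
For reference, add_tensor_argument<N>() above packs 2 + 2 * N values per tensor, matching num_arguments_per_tensor<N>(). A host-side mirror of the resulting layout for a 3D tensor might look like this (a sketch only; the real block and field names live in the shader sources and _shader_params_name):

    #include <cstdint>

    struct TensorParams3D
    {
        uint32_t stride_x, step_x; // strides_in_bytes()[0] and stride_x * window.x().step()
        uint32_t stride_y, step_y;
        uint32_t stride_z, step_z;
        uint32_t offset_first_element_in_bytes;
        uint32_t buffer_data_type_shift;
    };
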
diff --git a/src/core/GLES_COMPUTE/IGCSimple2DKernel.cpp b/src/core/GLES_COMPUTE/IGCSimple2DKernel.cpp
new file mode 100644
index 0000000..5bb479e
--- /dev/null
+++ b/src/core/GLES_COMPUTE/IGCSimple2DKernel.cpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/IGCSimple2DKernel.h"
+
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+void IGCSimple2DKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window);
+
+    _kernel.use();
+
+    Window slice = window.first_slice_window_2D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, _input, 1, slice);
+        add_2D_tensor_argument(idx, _output, 2, slice);
+        _kernel.update_shader_params();
+        enqueue(*this, slice);
+    }
+    while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/GLES_COMPUTE/IGCSimple3DKernel.cpp b/src/core/GLES_COMPUTE/IGCSimple3DKernel.cpp
new file mode 100644
index 0000000..61225d8
--- /dev/null
+++ b/src/core/GLES_COMPUTE/IGCSimple3DKernel.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/IGCSimple3DKernel.h"
+
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+void IGCSimple3DKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    Window slice = window.first_slice_window_3D();
+
+    _kernel.use();
+
+    do
+    {
+        unsigned int idx     = 0;
+        unsigned int binding = 1; // SSBO binding starts from 1.
+        add_3D_tensor_argument(idx, _input, binding++, slice);
+        add_3D_tensor_argument(idx, _output, binding++, slice);
+        _kernel.update_shader_params();
+        enqueue(*this, slice);
+    }
+    while(window.slide_window_slice_3D(slice));
+}
diff --git a/src/core/GLES_COMPUTE/IGCSimpleKernel.cpp b/src/core/GLES_COMPUTE/IGCSimpleKernel.cpp
new file mode 100644
index 0000000..459601e
--- /dev/null
+++ b/src/core/GLES_COMPUTE/IGCSimpleKernel.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/IGCSimpleKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+IGCSimpleKernel::IGCSimpleKernel()
+    : _input(nullptr), _output(nullptr)
+{
+}
+
+void IGCSimpleKernel::configure(const IGCTensor *input, IGCTensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined, const BorderSize &border_size)
+{
+    _input  = input;
+    _output = output;
+
+    // Configure kernel window
+    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win,
+                              AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
+                              output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size);
+
+    IGCKernel::configure(win);
+}
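
A derived kernel's configure() typically compiles its shader and then delegates to the helper above. A hypothetical subclass (the class GCCopyKernel and the kernel name "copy_kernel" are illustrative, not part of this patch):

    void GCCopyKernel::configure(const IGCTensor *input, IGCTensor *output)
    {
        constexpr unsigned int num_elems_processed_per_iteration = 4; // e.g. one vec4 per invocation

        _kernel = GCKernelLibrary::get().create_kernel("copy_kernel", StringSet());
        IGCSimpleKernel::configure(input, output, num_elems_processed_per_iteration, false, BorderSize());
    }
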
diff --git a/src/core/GLES_COMPUTE/IGCTensor.cpp b/src/core/GLES_COMPUTE/IGCTensor.cpp
new file mode 100644
index 0000000..5576665
--- /dev/null
+++ b/src/core/GLES_COMPUTE/IGCTensor.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+
+using namespace arm_compute;
+
+IGCTensor::IGCTensor()
+    : _mapping(nullptr)
+{
+}
+
+void IGCTensor::map(bool blocking)
+{
+    _mapping = do_map(blocking);
+}
+
+void IGCTensor::unmap()
+{
+    do_unmap();
+    _mapping = nullptr;
+}
+
+void IGCTensor::clear()
+{
+    this->map();
+    std::memset(static_cast<void *>(_mapping), 0, this->info()->total_size());
+    this->unmap();
+}
+
+uint8_t *IGCTensor::buffer() const
+{
+    return _mapping;
+}
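
The map()/unmap() pair above is the host-access pattern that clear() itself follows. A hypothetical caller-side upload (host_data and its size handling are illustrative):

    tensor.map(); // blocking map, as in clear(); buffer() now returns host-visible memory
    std::memcpy(tensor.buffer(), host_data, tensor.info()->total_size());
    tensor.unmap();
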
diff --git a/src/core/GLES_COMPUTE/OpenGLES.cpp b/src/core/GLES_COMPUTE/OpenGLES.cpp
new file mode 100644
index 0000000..fdfc085
--- /dev/null
+++ b/src/core/GLES_COMPUTE/OpenGLES.cpp
@@ -0,0 +1,820 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+
+#include <dlfcn.h>
+#include <iostream>
+#include <vector>
+
+using eglGetProcAddress_func         = __eglMustCastToProperFunctionPointerType EGLAPIENTRY (*)(const char *procname);
+using eglBindAPI_func                = EGLBoolean EGLAPIENTRY (*)(EGLenum api);
+using eglChooseConfig_func           = EGLBoolean EGLAPIENTRY (*)(EGLDisplay dpy, const EGLint *attrib_list, EGLConfig *configs, EGLint config_size, EGLint *num_config);
+using eglCreateContext_func          = EGLContext EGLAPIENTRY (*)(EGLDisplay dpy, EGLConfig config, EGLContext share_context, const EGLint *attrib_list);
+using eglDestroyContext_func         = EGLBoolean EGLAPIENTRY (*)(EGLDisplay dpy, EGLContext ctx);
+using eglGetDisplay_func             = EGLDisplay EGLAPIENTRY (*)(EGLNativeDisplayType display_id);
+using eglInitialize_func             = EGLBoolean EGLAPIENTRY (*)(EGLDisplay dpy, EGLint *major, EGLint *minor);
+using eglMakeCurrent_func            = EGLBoolean EGLAPIENTRY (*)(EGLDisplay dpy, EGLSurface draw, EGLSurface read, EGLContext ctx);
+using eglTerminate_func              = EGLBoolean EGLAPIENTRY (*)(EGLDisplay dpy);
+using eglGetError_func               = EGLint EGLAPIENTRY (*)();
+using eglQueryString_func            = char const * EGLAPIENTRY (*)(EGLDisplay dpy, EGLint name);
+using glAttachShader_func            = void GL_APIENTRY (*)(GLuint program, GLuint shader);
+using glCompileShader_func           = void GL_APIENTRY (*)(GLuint shader);
+using glCreateProgram_func           = GLuint GL_APIENTRY (*)();
+using glCreateShader_func            = GLuint GL_APIENTRY (*)(GLenum type);
+using glDeleteProgram_func           = void GL_APIENTRY (*)(GLuint program);
+using glDeleteShader_func            = void GL_APIENTRY (*)(GLuint shader);
+using glDetachShader_func            = void GL_APIENTRY (*)(GLuint program, GLuint shader);
+using glGetProgramInfoLog_func       = void GL_APIENTRY (*)(GLuint program, GLsizei bufsize, GLsizei *length, GLchar *infolog);
+using glGetProgramiv_func            = void GL_APIENTRY (*)(GLuint program, GLenum pname, GLint *params);
+using glGetShaderInfoLog_func        = void GL_APIENTRY (*)(GLuint shader, GLsizei bufsize, GLsizei *length, GLchar *infolog);
+using glGetShaderiv_func             = void GL_APIENTRY (*)(GLuint shader, GLenum pname, GLint *params);
+using glLinkProgram_func             = void GL_APIENTRY (*)(GLuint program);
+using glShaderSource_func            = void GL_APIENTRY (*)(GLuint shader, GLsizei count, const GLchar *const *string, const GLint *length);
+using glUseProgram_func              = void GL_APIENTRY (*)(GLuint program);
+using glBindBuffer_func              = void GL_APIENTRY (*)(GLenum target, GLuint buffer);
+using glBindBufferBase_func          = void GL_APIENTRY (*)(GLenum target, GLuint index, GLuint buffer);
+using glBufferData_func              = void GL_APIENTRY (*)(GLenum target, GLsizeiptr size, const GLvoid *data, GLenum usage);
+using glDeleteBuffers_func           = void GL_APIENTRY (*)(GLsizei n, const GLuint *buffers);
+using glDispatchCompute_func         = void GL_APIENTRY (*)(GLuint num_groups_x, GLuint num_groups_y, GLuint num_groups_z);
+using glFlush_func                   = void GL_APIENTRY (*)();
+using glGenBuffers_func              = void GL_APIENTRY (*)(GLsizei n, GLuint *buffers);
+using glGetProgramResourceIndex_func = GLuint GL_APIENTRY (*)(GLuint program, GLenum programInterface, const GLchar *name);
+using glGetUniformLocation_func      = GLint GL_APIENTRY (*)(GLuint program, const GLchar *name);
+using glMapBufferRange_func          = void *GL_APIENTRY (*)(GLenum target, GLintptr offset, GLsizeiptr length, GLbitfield access);
+using glMemoryBarrier_func           = void GL_APIENTRY (*)(GLbitfield barriers);
+using glUniform1ui_func              = void GL_APIENTRY (*)(GLint location, GLuint v0);
+using glUnmapBuffer_func             = GLboolean GL_APIENTRY (*)(GLenum target);
+using glGetError_func                = GLenum GL_APIENTRY (*)();
+using glGetActiveUniformBlockiv_func = void GL_APIENTRY (*)(GLuint program, GLuint uniformBlockIndex, GLenum pname, GLint *params);
+using glUniformBlockBinding_func     = void GL_APIENTRY (*)(GLuint program, GLuint uniformBlockIndex, GLuint uniformBlockBinding);
+using glGetUniformBlockIndex_func    = GLuint GL_APIENTRY (*)(GLuint program, const GLchar *uniformBlockName);
+using glGenTextures_func             = void GL_APIENTRY (*)(GLsizei n, GLuint *textures);
+using glDeleteTextures_func          = void GL_APIENTRY (*)(GLsizei n, const GLuint *textures);
+using glBindTexture_func             = void GL_APIENTRY (*)(GLenum target, GLuint texture);
+using glTexImage2D_func              = void GL_APIENTRY (*)(GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLint border, GLenum format, GLenum type,
+                                                            const GLvoid *pixels);
+using glGenFramebuffers_func      = void GL_APIENTRY (*)(GLsizei n, GLuint *framebuffers);
+using glDeleteFramebuffers_func   = void GL_APIENTRY (*)(GLsizei n, const GLuint *framebuffers);
+using glBindFramebuffer_func      = void GL_APIENTRY (*)(GLenum target, GLuint framebuffer);
+using glFramebufferTexture2D_func = void GL_APIENTRY (*)(GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level);
+
+class GLESSymbols
+{
+private:
+    void init()
+    {
+        void *egl_handle    = dlopen("libEGL.so", RTLD_LAZY | RTLD_LOCAL);
+        void *glesv2_handle = dlopen("libGLESv2.so", RTLD_LAZY | RTLD_LOCAL);
+        void *glesv3_handle = dlopen("libGLESv3.so", RTLD_LAZY | RTLD_LOCAL);
+        if(egl_handle == nullptr)
+        {
+            std::cerr << "Can't load libEGL.so: " << dlerror() << std::endl;
+        }
+        else
+        {
+#undef EGL_ENTRY
+#define EGL_ENTRY(_api) _api = reinterpret_cast<_api##_func>(dlsym(egl_handle, #_api));
+#include "./egl_entries.in"
+#undef EGL_ENTRY
+
+            if(eglGetProcAddress != nullptr)
+            {
+#undef EGL_ENTRY
+#define EGL_ENTRY(_api)   \
+    if((_api) == nullptr) \
+        (_api) = reinterpret_cast<_api##_func>(eglGetProcAddress(#_api));
+#include "./egl_entries.in"
+#undef EGL_ENTRY
+
+#undef GL_ENTRY
+#define GL_ENTRY(_api) _api = reinterpret_cast<_api##_func>(eglGetProcAddress(#_api));
+#include "./gl_entries.in"
+#undef GL_ENTRY
+            }
+
+            std::vector<void *> handles = { glesv3_handle, glesv2_handle };
+            for(auto &handle : handles)
+            {
+                if(handle != nullptr)
+                {
+#undef GL_ENTRY
+#define GL_ENTRY(_api)    \
+    if((_api) == nullptr) \
+        (_api) = reinterpret_cast<_api##_func>(dlsym(handle, #_api));
+#include "./gl_entries.in"
+#undef GL_ENTRY
+                }
+            }
+
+            if(glesv3_handle != nullptr)
+            {
+                dlclose(glesv3_handle);
+            }
+            if(glesv2_handle != nullptr)
+            {
+                dlclose(glesv2_handle);
+            }
+            dlclose(egl_handle);
+        }
+    }
+    bool _initialized = false;
+
+public:
+    static GLESSymbols &get()
+    {
+        static GLESSymbols symbols = GLESSymbols();
+        if(!symbols._initialized)
+        {
+            symbols._initialized = true;
+            symbols.init();
+        }
+
+        return symbols;
+    }
+
+#undef EGL_ENTRY
+#undef GL_ENTRY
+#define EGL_ENTRY(_api) _api##_func _api = nullptr;
+#define GL_ENTRY(_api) EGL_ENTRY(_api)
+#include "./egl_entries.in"
+#include "./gl_entries.in"
+#undef EGL_ENTRY
+#undef GL_ENTRY
+};
+
+bool arm_compute::opengles31_is_available()
+{
+    return GLESSymbols::get().glDispatchCompute != nullptr;
+}
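
This is the runtime guard for the whole GLES compute backend; a sketch of typical use (the fallback policy is the caller's choice):

    if(!arm_compute::opengles31_is_available())
    {
        // glDispatchCompute could not be resolved from libGLESv3/libGLESv2,
        // so GLES compute kernels cannot run on this system.
        return;
    }
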
+
+__eglMustCastToProperFunctionPointerType EGLAPIENTRY eglGetProcAddress(const char *procname)
+{
+    auto func = GLESSymbols::get().eglGetProcAddress;
+    if(func != nullptr)
+    {
+        return func(procname);
+    }
+    else
+    {
+        return nullptr;
+    }
+}
+
+EGLBoolean EGLAPIENTRY eglBindAPI(EGLenum api)
+{
+    auto func = GLESSymbols::get().eglBindAPI;
+    if(func != nullptr)
+    {
+        return func(api);
+    }
+    else
+    {
+        return EGL_FALSE;
+    }
+}
+
+EGLBoolean EGLAPIENTRY eglChooseConfig(EGLDisplay dpy, const EGLint *attrib_list, EGLConfig *configs, EGLint config_size, EGLint *num_config)
+{
+    auto func = GLESSymbols::get().eglChooseConfig;
+    if(func != nullptr)
+    {
+        return func(dpy, attrib_list, configs, config_size, num_config);
+    }
+    else
+    {
+        return EGL_FALSE;
+    }
+}
+
+EGLContext EGLAPIENTRY eglCreateContext(EGLDisplay dpy, EGLConfig config, EGLContext share_context, const EGLint *attrib_list)
+{
+    auto func = GLESSymbols::get().eglCreateContext;
+    if(func != nullptr)
+    {
+        return func(dpy, config, share_context, attrib_list);
+    }
+    else
+    {
+        return nullptr;
+    }
+}
+
+EGLBoolean EGLAPIENTRY eglDestroyContext(EGLDisplay dpy, EGLContext ctx)
+{
+    auto func = GLESSymbols::get().eglDestroyContext;
+    if(func != nullptr)
+    {
+        return func(dpy, ctx);
+    }
+    else
+    {
+        return EGL_FALSE;
+    }
+}
+
+EGLDisplay EGLAPIENTRY eglGetDisplay(EGLNativeDisplayType display_id)
+{
+    auto func = GLESSymbols::get().eglGetDisplay;
+    if(func != nullptr)
+    {
+        return func(display_id);
+    }
+    else
+    {
+        return nullptr;
+    }
+}
+
+EGLBoolean EGLAPIENTRY eglInitialize(EGLDisplay dpy, EGLint *major, EGLint *minor)
+{
+    auto func = GLESSymbols::get().eglInitialize;
+    if(func != nullptr)
+    {
+        return func(dpy, major, minor);
+    }
+    else
+    {
+        return EGL_FALSE;
+    }
+}
+
+EGLBoolean EGLAPIENTRY eglMakeCurrent(EGLDisplay dpy, EGLSurface draw, EGLSurface read, EGLContext ctx)
+{
+    auto func = GLESSymbols::get().eglMakeCurrent;
+    if(func != nullptr)
+    {
+        return func(dpy, draw, read, ctx);
+    }
+    else
+    {
+        return EGL_FALSE;
+    }
+}
+
+EGLBoolean EGLAPIENTRY eglTerminate(EGLDisplay dpy)
+{
+    auto func = GLESSymbols::get().eglTerminate;
+    if(func != nullptr)
+    {
+        return func(dpy);
+    }
+    else
+    {
+        return EGL_FALSE;
+    }
+}
+
+EGLint EGLAPIENTRY eglGetError()
+{
+    auto func = GLESSymbols::get().eglGetError;
+    if(func != nullptr)
+    {
+        return func();
+    }
+    else
+    {
+        return GL_NO_ERROR;
+    }
+}
+
+char const *EGLAPIENTRY eglQueryString(EGLDisplay dpy, EGLint name)
+{
+    auto func = GLESSymbols::get().eglQueryString;
+    if(func != nullptr)
+    {
+        return func(dpy, name);
+    }
+    else
+    {
+        return nullptr;
+    }
+}
+
+void GL_APIENTRY glAttachShader(GLuint program, GLuint shader)
+{
+    auto func = GLESSymbols::get().glAttachShader;
+    if(func != nullptr)
+    {
+        return func(program, shader);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glCompileShader(GLuint shader)
+{
+    auto func = GLESSymbols::get().glCompileShader;
+    if(func != nullptr)
+    {
+        return func(shader);
+    }
+    else
+    {
+        return;
+    }
+}
+
+GLuint GL_APIENTRY glCreateProgram()
+{
+    auto func = GLESSymbols::get().glCreateProgram;
+    if(func != nullptr)
+    {
+        return func();
+    }
+    else
+    {
+        return 0;
+    }
+}
+
+GLuint GL_APIENTRY glCreateShader(GLenum type)
+{
+    auto func = GLESSymbols::get().glCreateShader;
+    if(func != nullptr)
+    {
+        return func(type);
+    }
+    else
+    {
+        return 0;
+    }
+}
+
+void GL_APIENTRY glDeleteProgram(GLuint program)
+{
+    auto func = GLESSymbols::get().glDeleteProgram;
+    if(func != nullptr)
+    {
+        return func(program);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glDeleteShader(GLuint shader)
+{
+    auto func = GLESSymbols::get().glDeleteShader;
+    if(func != nullptr)
+    {
+        return func(shader);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glDetachShader(GLuint program, GLuint shader)
+{
+    auto func = GLESSymbols::get().glDetachShader;
+    if(func != nullptr)
+    {
+        return func(program, shader);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glGetProgramInfoLog(GLuint program, GLsizei bufSize, GLsizei *length, GLchar *infoLog)
+{
+    auto func = GLESSymbols::get().glGetProgramInfoLog;
+    if(func != nullptr)
+    {
+        return func(program, bufSize, length, infoLog);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glGetProgramiv(GLuint program, GLenum pname, GLint *params)
+{
+    auto func = GLESSymbols::get().glGetProgramiv;
+    if(func != nullptr)
+    {
+        return func(program, pname, params);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glGetShaderInfoLog(GLuint shader, GLsizei bufSize, GLsizei *length, GLchar *infoLog)
+{
+    auto func = GLESSymbols::get().glGetShaderInfoLog;
+    if(func != nullptr)
+    {
+        return func(shader, bufSize, length, infoLog);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glGetShaderiv(GLuint shader, GLenum pname, GLint *params)
+{
+    auto func = GLESSymbols::get().glGetShaderiv;
+    if(func != nullptr)
+    {
+        return func(shader, pname, params);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glLinkProgram(GLuint program)
+{
+    auto func = GLESSymbols::get().glLinkProgram;
+    if(func != nullptr)
+    {
+        return func(program);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glShaderSource(GLuint shader, GLsizei count, const GLchar *const *string, const GLint *length)
+{
+    auto func = GLESSymbols::get().glShaderSource;
+    if(func != nullptr)
+    {
+        return func(shader, count, string, length);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glUseProgram(GLuint program)
+{
+    auto func = GLESSymbols::get().glUseProgram;
+    if(func != nullptr)
+    {
+        return func(program);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glBindBuffer(GLenum target, GLuint buffer)
+{
+    auto func = GLESSymbols::get().glBindBuffer;
+    if(func != nullptr)
+    {
+        return func(target, buffer);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glBindBufferBase(GLenum target, GLuint index, GLuint buffer)
+{
+    auto func = GLESSymbols::get().glBindBufferBase;
+    if(func != nullptr)
+    {
+        return func(target, index, buffer);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glBufferData(GLenum target, GLsizeiptr size, const GLvoid *data, GLenum usage)
+{
+    auto func = GLESSymbols::get().glBufferData;
+    if(func != nullptr)
+    {
+        return func(target, size, data, usage);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glDeleteBuffers(GLsizei n, const GLuint *buffers)
+{
+    auto func = GLESSymbols::get().glDeleteBuffers;
+    if(func != nullptr)
+    {
+        return func(n, buffers);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glDispatchCompute(GLuint num_groups_x, GLuint num_groups_y, GLuint num_groups_z)
+{
+    auto func = GLESSymbols::get().glDispatchCompute;
+    if(func != nullptr)
+    {
+        return func(num_groups_x, num_groups_y, num_groups_z);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glFlush(void)
+{
+    auto func = GLESSymbols::get().glFlush;
+    if(func != nullptr)
+    {
+        return func();
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glGenBuffers(GLsizei n, GLuint *buffers)
+{
+    auto func = GLESSymbols::get().glGenBuffers;
+    if(func != nullptr)
+    {
+        return func(n, buffers);
+    }
+    else
+    {
+        return;
+    }
+}
+
+GLuint GL_APIENTRY glGetProgramResourceIndex(GLuint program, GLenum programInterface, const GLchar *name)
+{
+    auto func = GLESSymbols::get().glGetProgramResourceIndex;
+    if(func != nullptr)
+    {
+        return func(program, programInterface, name);
+    }
+    else
+    {
+        return GL_INVALID_INDEX;
+    }
+}
+
+GLint GL_APIENTRY glGetUniformLocation(GLuint program, const GLchar *name)
+{
+    auto func = GLESSymbols::get().glGetUniformLocation;
+    if(func != nullptr)
+    {
+        return func(program, name);
+    }
+    else
+    {
+        return -1;
+    }
+}
+
+void *GL_APIENTRY glMapBufferRange(GLenum target, GLintptr offset, GLsizeiptr length, GLbitfield access)
+{
+    auto func = GLESSymbols::get().glMapBufferRange;
+    if(func != nullptr)
+    {
+        return func(target, offset, length, access);
+    }
+    else
+    {
+        return nullptr;
+    }
+}
+
+void GL_APIENTRY glMemoryBarrier(GLbitfield barriers)
+{
+    auto func = GLESSymbols::get().glMemoryBarrier;
+    if(func != nullptr)
+    {
+        return func(barriers);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glUniform1ui(GLint location, GLuint v0)
+{
+    auto func = GLESSymbols::get().glUniform1ui;
+    if(func != nullptr)
+    {
+        return func(location, v0);
+    }
+    else
+    {
+        return;
+    }
+}
+
+GLboolean GL_APIENTRY glUnmapBuffer(GLenum target)
+{
+    auto func = GLESSymbols::get().glUnmapBuffer;
+    if(func != nullptr)
+    {
+        return func(target);
+    }
+    else
+    {
+        return GL_FALSE;
+    }
+}
+
+GLenum GL_APIENTRY glGetError(void)
+{
+    auto func = GLESSymbols::get().glGetError;
+    if(func != nullptr)
+    {
+        return func();
+    }
+    else
+    {
+        return GL_NO_ERROR;
+    }
+}
+
+void GL_APIENTRY glGetActiveUniformBlockiv(GLuint program, GLuint uniformBlockIndex, GLenum pname, GLint *params)
+{
+    auto func = GLESSymbols::get().glGetActiveUniformBlockiv;
+    if(func != nullptr)
+    {
+        return func(program, uniformBlockIndex, pname, params);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glUniformBlockBinding(GLuint program, GLuint uniformBlockIndex, GLuint uniformBlockBinding)
+{
+    auto func = GLESSymbols::get().glUniformBlockBinding;
+    if(func != nullptr)
+    {
+        return func(program, uniformBlockIndex, uniformBlockBinding);
+    }
+    else
+    {
+        return;
+    }
+}
+
+GLuint GL_APIENTRY glGetUniformBlockIndex(GLuint program, const GLchar *uniformBlockName)
+{
+    auto func = GLESSymbols::get().glGetUniformBlockIndex;
+    if(func != nullptr)
+    {
+        return func(program, uniformBlockName);
+    }
+    else
+    {
+        return GL_INVALID_INDEX;
+    }
+}
+
+void GL_APIENTRY glGenTextures(GLsizei n, GLuint *textures)
+{
+    auto func = GLESSymbols::get().glGenTextures;
+    if(func != nullptr)
+    {
+        return func(n, textures);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glDeleteTextures(GLsizei n, const GLuint *textures)
+{
+    auto func = GLESSymbols::get().glDeleteTextures;
+    if(func != nullptr)
+    {
+        return func(n, textures);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glBindTexture(GLenum target, GLuint texture)
+{
+    auto func = GLESSymbols::get().glBindTexture;
+    if(func != nullptr)
+    {
+        return func(target, texture);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glTexImage2D(GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLint border, GLenum format, GLenum type, const GLvoid *pixels)
+{
+    auto func = GLESSymbols::get().glTexImage2D;
+    if(func != nullptr)
+    {
+        return func(target, level, internalformat, width, height, border, format, type, pixels);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glGenFramebuffers(GLsizei n, GLuint *framebuffers)
+{
+    auto func = GLESSymbols::get().glGenFramebuffers;
+    if(func != nullptr)
+    {
+        return func(n, framebuffers);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glDeleteFramebuffers(GLsizei n, const GLuint *framebuffers)
+{
+    auto func = GLESSymbols::get().glDeleteFramebuffers;
+    if(func != nullptr)
+    {
+        return func(n, framebuffers);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glBindFramebuffer(GLenum target, GLuint framebuffer)
+{
+    auto func = GLESSymbols::get().glBindFramebuffer;
+    if(func != nullptr)
+    {
+        return func(target, framebuffer);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glFramebufferTexture2D(GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level)
+{
+    auto func = GLESSymbols::get().glFramebufferTexture2D;
+    if(func != nullptr)
+    {
+        return func(target, attachment, textarget, texture, level);
+    }
+    else
+    {
+        return;
+    }
+}
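
Every wrapper above follows one shape: resolve the driver entry point through GLESSymbols::get(), forward the call when the symbol was found, and otherwise return a benign default (EGL_FALSE, 0, EGL_SUCCESS, nullptr, or a plain no-op) so callers keep working when no GLES driver could be loaded. A minimal standalone sketch of that resolve-once pattern, assuming a "libGLESv2.so" soname and plain dlopen/dlsym (GLESSymbols itself is defined elsewhere in this patch):

    #include <dlfcn.h>

    // Hedged sketch of the resolve-once-and-fall-back pattern used by the
    // wrappers above; the soname and helper names are assumptions.
    using glFlush_t = void (*)(void);

    static glFlush_t resolve_glFlush()
    {
        static glFlush_t cached = []() -> glFlush_t
        {
            void *handle = dlopen("libGLESv2.so", RTLD_LAZY);
            return (handle != nullptr) ? reinterpret_cast<glFlush_t>(dlsym(handle, "glFlush"))
                                       : nullptr;
        }();
        return cached;
    }

    void safe_glFlush(void)
    {
        if(auto func = resolve_glFlush())
        {
            func(); // forward to the real driver entry point
        }
        // else: silent no-op, matching the void wrappers above
    }
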
diff --git a/src/core/GLES_COMPUTE/cs_shaders/absdiff.cs b/src/core/GLES_COMPUTE/cs_shaders/absdiff.cs
new file mode 100644
index 0000000..f6113e1
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/absdiff.cs
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+#include "helpers.h"
+
+layout(std140) uniform shader_params
+{
+    IMAGE_PARAM_DECLARATION(src1);
+    IMAGE_PARAM_DECLARATION(src2);
+    IMAGE_PARAM_DECLARATION(dst);
+};
+
+BUFFER_DECLARATION(src1, 1, uint, readonly);
+BUFFER_DECLARATION(src2, 2, uint, readonly);
+BUFFER_DECLARATION(dst, 3, uint, writeonly);
+
+/** Calculate the absolute difference of two input images.
+ *
+ * @param[in]  src1_ptr                           Pointer to the first source image. Supported data types: U8
+ * @param[in]  src1_stride_x                      Stride of the first source image in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the first source image in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[in]  src2_ptr                           Pointer to the second source image. Supported data types: Same as @p src1_ptr
+ * @param[in]  src2_stride_x                      Stride of the second source image in X dimension (in bytes)
+ * @param[in]  src2_step_x                        src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_stride_y                      Stride of the second source image in Y dimension (in bytes)
+ * @param[in]  src2_step_y                        src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes The offset of the first element in the second source image
+ * @param[out] dst_ptr                            Pointer to the destination image. Supported data types: Same as @p src1_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination image
+ */
+void main(void)
+{
+    Image src1 = CONVERT_TO_IMAGE_STRUCT(src1);
+    Image src2 = CONVERT_TO_IMAGE_STRUCT(src2);
+    Image dst  = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    uvec4 tmp1 = UNPACK(LOAD4(src1, CURRENT_OFFSET(src1)), uint, uvec4);
+    uvec4 tmp2 = UNPACK(LOAD4(src2, CURRENT_OFFSET(src2)), uint, uvec4);
+    uvec4 diff = uvec4(abs(ivec4(tmp1 - tmp2)));
+
+    STORE4(dst, CURRENT_OFFSET(dst), PACK(diff, uvec4, uint));
+}
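
The shader above reads its U8 images through uint-typed SSBOs, so each 32-bit word carries four packed pixels; UNPACK/PACK from helpers.h widen one word to a uvec4 of bytes and narrow the result back. A host-side reference of the same per-byte computation, handy for validating the kernel (the function name is illustrative):

    #include <cstdint>
    #include <cstdlib>

    // Per-byte |a - b| over a 32-bit word holding four packed U8 pixels.
    uint32_t absdiff_u8x4(uint32_t a, uint32_t b)
    {
        uint32_t out = 0u;
        for(int i = 0; i < 4; ++i)
        {
            const int pa = static_cast<int>((a >> (8 * i)) & 0xFFu);
            const int pb = static_cast<int>((b >> (8 * i)) & 0xFFu);
            out |= static_cast<uint32_t>(std::abs(pa - pb)) << (8 * i);
        }
        return out;
    }
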
diff --git a/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs
new file mode 100644
index 0000000..fc9da11
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+
+#include "helpers.h"
+
+#ifdef DATA_TYPE_FP32
+precision highp float;
+#elif defined(DATA_TYPE_FP16)
+#if defined(LOGISTIC) || defined(TANH) || defined(SRELU) || defined(SQRT)
+precision highp float;
+#else  /*LOGISTIC_TANH_SRELU_SQRT*/
+precision mediump float;
+#endif /*LOGISTIC_TANH_SRELU_SQRT*/
+#endif /*DATA_TYPE_FP32*/
+
+#define ABS_OP(a) abs((a))
+#define ADD_OP(a, b) ((a) + (b))
+#define SUB_OP(a, b) ((a) - (b))
+#define MUL_OP(a, b) ((a) * (b))
+#define MLA_OP(a, b, c) ((b) * (c) + (a))
+#define DIV_OP(a, b) ((a) / (b))
+#define EXP_OP(a) exp((a))
+#define LOG_OP(a) log((a))
+#define SQRT_OP(a) sqrt((a))
+#define CONST_ONE (1.f)
+
+// Logistic Activation
+float logistic_op(float x)
+{
+    return DIV_OP(CONST_ONE, ADD_OP(CONST_ONE, EXP_OP(-x)));
+}
+// Hyperbolic Tangent Activation
+float tanh_op(float x)
+{
+    float tmp = float(B_VAL) * x;
+    if(tmp > 10.f)
+    {
+        return MUL_OP(float(A_VAL), 1.f);
+    }
+    else if(tmp < -10.f)
+    {
+        return MUL_OP(float(A_VAL), -1.f);
+    }
+    else
+    {
+        return MUL_OP(float(A_VAL), tanh(tmp + 0.000001f));
+    }
+}
+// RELU Activation
+float relu_op(float x)
+{
+    return max(0.f, x);
+}
+// Bounded RELU Activation
+float brelu_op(float x)
+{
+    return min(float(A_VAL), max(float(0.0), x));
+}
+// Lower Upper Bounded RELU Activation
+float lu_brelu_op(float x)
+{
+    return min(max(x, float(B_VAL)), float(A_VAL));
+}
+// Leaky RELU Activation
+float lrelu_op(float x)
+{
+    return (x > float(0.0)) ? x : MUL_OP(float(A_VAL), x);
+}
+// Soft RELU Activation
+float srelu_op(float x)
+{
+    return LOG_OP(ADD_OP(CONST_ONE, EXP_OP(x)));
+}
+// Absolute Activation
+float abs_op(float x)
+{
+    return ABS_OP(x);
+}
+// Square Activation
+float square_op(float x)
+{
+    return MUL_OP(x, x);
+}
+// Square-root Activation
+float sqrt_op(float x)
+{
+    return SQRT_OP(x);
+}
+// Linear Activation
+float linear_op(float x)
+{
+    return MLA_OP(float(B_VAL), float(A_VAL), x);
+}
+
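
The clamp in tanh_op above guards the mediump FP16 path: exp() inside tanh overflows for large arguments at reduced precision, and once |B_VAL * x| exceeds 10 the true tanh is already within float round-off of +/-1, so returning +/-A_VAL directly is numerically safe. A quick host-side check of that threshold (illustrative only):

    #include <cmath>
    #include <cstdio>

    int main()
    {
        // tanh saturates long before the clamp threshold of 10:
        std::printf("tanh(10)  = %.10f\n", std::tanh(10.0));  // ~0.9999999959
        std::printf("tanh(-10) = %.10f\n", std::tanh(-10.0)); // ~-0.9999999959
        return 0;
    }
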
+layout(std140) uniform shader_params
+{
+    TENSOR3D_PARAM_DECLARATION(src);
+    TENSOR3D_PARAM_DECLARATION(dst);
+};
+
+#ifdef DATA_TYPE_FP32
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(dst, 2, float, writeonly);
+
+/** This performs an activation function on floating point inputs.
+ *
+ * @note Activation function should be given as a preprocessor argument using "#define act_name". e.g. "#define TANH"
+ * @note A, B variables required by some activation functions are set using A_VAL= and B_VAL= respectively.
+ *
+ * @param[in]  src_ptr                              Pointer to the source image. Supported data types: F32
+ * @param[in]  src_stride_x                         Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                         Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                         Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                           src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source image
+ * @param[out] dst_ptr                              Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                         Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                           dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                         Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                           dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                         Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                           dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes    The offset of the first element in the destination image
+ */
+void main(void)
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    float data     = src_ptr[src.current_offset];
+    float data_out = 0.f;
+    // Perform activation
+
+#ifdef LOGISTIC
+    data_out = logistic_op(data);
+#elif defined(TANH)     /*TANH*/
+    data_out = tanh_op(data);
+#elif defined(RELU)     /*RELU*/
+    data_out = relu_op(data);
+#elif defined(BRELU)    /*BRELU*/
+    data_out = brelu_op(data);
+#elif defined(LU_BRELU) /*LU_BRELU*/
+    data_out = lu_brelu_op(data);
+#elif defined(LRELU)    /*LRELU*/
+    data_out = lrelu_op(data);
+#elif defined(SRELU)    /*SRELU*/
+    data_out = srelu_op(data);
+#elif defined(ABS)      /*ABS*/
+    data_out = abs_op(data);
+#elif defined(SQUARE)   /*SQUARE*/
+    data_out = square_op(data);
+#elif defined(SQRT)     /*SQRT*/
+    data_out = sqrt_op(data);
+#elif defined(LINEAR)   /*LINEAR*/
+    data_out = linear_op(data);
+#else                   /*LOGISTIC*/
+#error Activation function not provided
+#endif /*LOGISTIC*/
+
+    dst_ptr[dst.current_offset] = data_out;
+}
+
+#elif defined(DATA_TYPE_FP16)
+BUFFER_DECLARATION(src, 1, uint, readonly);
+BUFFER_DECLARATION(dst, 2, uint, writeonly);
+
+/** This performs an activation function on floating point inputs.
+ *
+ * @note Activation function should be given as a preprocessor argument using "#define act_name". e.g. "#define TANH"
+ * @note A, B variables required by some activation functions are set using A_VAL= and B_VAL= respectively.
+ *
+ * @param[in]  src_ptr                              Pointer to the source image. Supported data types: F16
+ * @param[in]  src_stride_x                         Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                         Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                         Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                           src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source image
+ * @param[out] dst_ptr                              Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                         Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                           dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                         Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                           dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                         Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                           dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes    The offset of the first element in the destination image
+ */
+void main(void)
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+
+    uint data = src_ptr[src.current_offset >> 2];
+    // Perform activation
+    float a = unpackHalf2x16(data).x;
+    float b = unpackHalf2x16(data).y;
+    vec2  data_out;
+#ifdef LOGISTIC         /*LOGISTIC*/
+    data_out.x = logistic_op(a);
+    data_out.y = logistic_op(b);
+#elif defined(TANH)     /*TANH*/
+    data_out.x = tanh_op(a);
+    data_out.y = tanh_op(b);
+#elif defined(RELU)     /*RELU*/
+    data_out.x = relu_op(a);
+    data_out.y = relu_op(b);
+#elif defined(BRELU)    /*BRELU*/
+    data_out.x = brelu_op(a);
+    data_out.y = brelu_op(b);
+#elif defined(LU_BRELU) /*LU_BRELU*/
+    data_out.x = lu_brelu_op(a);
+    data_out.y = lu_brelu_op(b);
+#elif defined(LRELU)    /*LRELU*/
+    data_out.x = lrelu_op(a);
+    data_out.y = lrelu_op(b);
+#elif defined(SRELU)    /*SRELU*/
+    data_out.x = srelu_op(a);
+    data_out.y = srelu_op(b);
+#elif defined(ABS)      /*ABS*/
+    data_out.x = abs_op(a);
+    data_out.y = abs_op(b);
+#elif defined(SQUARE)   /*SQUARE*/
+    data_out.x = square_op(a);
+    data_out.y = square_op(b);
+#elif defined(SQRT)     /*SQRT*/
+    data_out.x = sqrt_op(a);
+    data_out.y = sqrt_op(b);
+#elif defined(LINEAR)   /*LINEAR*/
+    data_out.x = linear_op(a);
+    data_out.y = linear_op(b);
+#else                   /*LOGISTIC*/
+#error Activation function not provided
+#endif /*LOGISTIC*/
+
+    dst_ptr[dst.current_offset >> 2] = packHalf2x16(data_out);
+}
+#endif /*DATA_TYPE_FP32*/
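
The activation is selected entirely at shader-compile time: exactly one of LOGISTIC, TANH, RELU, BRELU, LU_BRELU, LRELU, SRELU, ABS, SQUARE, SQRT or LINEAR must be defined, together with DATA_TYPE_FP32 or DATA_TYPE_FP16 and, where the function uses them, A_VAL and B_VAL. A hedged sketch of the option strings a host might assemble before compiling this source; the define names come from the shader, the surrounding function is an assumption:

    #include <set>
    #include <string>

    std::set<std::string> activation_build_options(bool fp16)
    {
        std::set<std::string> opts;
        opts.insert(fp16 ? "#define DATA_TYPE_FP16" : "#define DATA_TYPE_FP32");
        opts.insert("#define TANH");       // exactly one activation define
        opts.insert("#define A_VAL 1.0f"); // scale parameter used by TANH
        opts.insert("#define B_VAL 1.0f"); // slope parameter used by TANH
        opts.insert("#define LOCAL_SIZE_X 4");
        opts.insert("#define LOCAL_SIZE_Y 4");
        opts.insert("#define LOCAL_SIZE_Z 4");
        return opts;
    }
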
diff --git a/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs
new file mode 100644
index 0000000..5488092
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+
+#include "helpers.h"
+
+#ifdef DATA_TYPE_FP32
+precision highp float;
+#elif defined(DATA_TYPE_FP16)
+precision mediump float;
+#endif /*DATA_TYPE_FP32*/
+
+#define ADD_OP(a, b) ((a) + (b))
+#define SUB_OP(a, b) ((a) - (b))
+#define MUL_OP(a, b) ((a) * (b))
+#define INVSQRT_OP(a) inversesqrt((a))
+#define SQCVT_SAT(a) (a)
+
+layout(std140) uniform shader_params
+{
+    TENSOR3D_PARAM_DECLARATION(src);
+    TENSOR3D_PARAM_DECLARATION(dst);
+    VECTOR_PARAM_DECLARATION(mean);
+    VECTOR_PARAM_DECLARATION(var);
+    VECTOR_PARAM_DECLARATION(beta);
+    VECTOR_PARAM_DECLARATION(gamma);
+};
+
+#ifdef DATA_TYPE_FP32
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(dst, 2, float, writeonly);
+BUFFER_DECLARATION(mean, 3, float, readonly);
+BUFFER_DECLARATION(var, 4, float, readonly);
+BUFFER_DECLARATION(beta, 5, float, readonly);
+BUFFER_DECLARATION(gamma, 6, float, readonly);
+
+/** Apply batch normalization.
+ *
+ * @note Epsilon parameter in the batch normalization equation should be given as a preprocessor argument using "#define EPSILON". e.g. "#define EPSILON 0.1"
+ *
+ * @param[in]  src_ptr                              Pointer to the first source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                         Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                         Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                         Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                           src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the first source tensor
+ * @param[out] dst_ptr                              Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                         Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                           dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                         Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                           dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                         Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                           dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes    The offset of the first element in the destination tensor
+ * @param[in]  mean_ptr                             Pointer to the mean source tensor. Supported data types: same as @p src_ptr
+ * @param[in]  mean_stride_x                        Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in]  mean_step_x                          mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  mean_offset_first_element_in_bytes   The offset of the first element in the mean source tensor
+ * @param[in]  var_ptr                              Pointer to the var tensor. Supported data types: same as @p src_ptr
+ * @param[in]  var_stride_x                         Stride of the var tensor in X dimension (in bytes)
+ * @param[in]  var_step_x                           var_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  var_offset_first_element_in_bytes    The offset of the first element in the var source tensor
+ * @param[in]  beta_ptr                             Pointer to the beta source tensor. Supported data types: same as @p src_ptr
+ * @param[in]  beta_stride_x                        Stride of the beta source tensor in X dimension (in bytes)
+ * @param[in]  beta_step_x                          beta_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  beta_offset_first_element_in_bytes   The offset of the first element in the beta source tensor
+ * @param[in]  gamma_ptr                            Pointer to the gamma source tensor. Supported data types: same as @p src_ptr
+ * @param[in]  gamma_stride_x                       Stride of the gamma source tensor in X dimension (in bytes)
+ * @param[in]  gamma_step_x                         gamma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  gamma_offset_first_element_in_bytes  The offset of the first element in the gamma source tensor
+ */
+void main(void)
+{
+    Tensor3D src   = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst   = CONVERT_TO_TENSOR3D_STRUCT(dst);
+    Vector   mean  = CONVERT_TO_VECTOR_STRUCT(mean);
+    Vector   var   = CONVERT_TO_VECTOR_STRUCT(var);
+    Vector   beta  = CONVERT_TO_VECTOR_STRUCT(beta);
+    Vector   gamma = CONVERT_TO_VECTOR_STRUCT(gamma);
+
+    float input_value = 0.f;
+    float denominator = 0.f;
+    float numerator   = 0.f;
+    float x_bar       = 0.f;
+    float gamma_param = 0.f;
+    float beta_param  = 0.f;
+
+    uint current_slice = gl_GlobalInvocationID.z;
+
+    input_value = src_ptr[src.current_offset];
+    denominator = var_ptr[var.current_offset + ((current_slice * var.stride_x) >> 2)];
+    denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(float(EPSILON))));
+
+    // Calculate x bar and store results
+    numerator = mean_ptr[mean.current_offset + ((current_slice * mean.stride_x) >> 2)];
+    numerator = SUB_OP(input_value, numerator);
+    x_bar     = MUL_OP(numerator, denominator);
+
+    gamma_param = gamma_ptr[gamma.current_offset + ((current_slice * gamma.stride_x) >> 2)];
+    beta_param  = beta_ptr[beta.current_offset + ((current_slice * beta.stride_x) >> 2)];
+
+    dst_ptr[dst.current_offset] = ADD_OP(MUL_OP(gamma_param, x_bar), beta_param);
+}
+
+#elif defined(DATA_TYPE_FP16)
+BUFFER_DECLARATION(src, 1, uint, );
+BUFFER_DECLARATION(dst, 2, uint, writeonly);
+BUFFER_DECLARATION(mean, 3, uint, );
+BUFFER_DECLARATION(var, 4, uint, );
+BUFFER_DECLARATION(beta, 5, uint, );
+BUFFER_DECLARATION(gamma, 6, uint, );
+
+/** Apply batch normalization.
+ *
+ * @note Epsilon parameter in the batch normalization equation should be given as a preprocessor argument using "#define EPSILON". e.g. "#define EPSILON 0.1"
+ *
+ * @param[in]  src_ptr                              Pointer to the first source tensor. Supported data types: F16
+ * @param[in]  src_stride_x                         Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                         Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                         Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                           src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the first source tensor
+ * @param[out] dst_ptr                              Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                         Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                           dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                         Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                           dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                         Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                           dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes    The offset of the first element in the destination tensor
+ * @param[in]  mean_ptr                             Pointer to the mean source tensor. Supported data types: same as @p src_ptr
+ * @param[in]  mean_stride_x                        Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in]  mean_step_x                          mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  mean_offset_first_element_in_bytes   The offset of the first element in the mean source tensor
+ * @param[in]  var_ptr                              Pointer to the var tensor. Supported data types: same as @p src_ptr
+ * @param[in]  var_stride_x                         Stride of the var tensor in X dimension (in bytes)
+ * @param[in]  var_step_x                           var_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  var_offset_first_element_in_bytes    The offset of the first element in the var source tensor
+ * @param[in]  beta_ptr                             Pointer to the beta source tensor. Supported data types: same as @p src_ptr
+ * @param[in]  beta_stride_x                        Stride of the beta source tensor in X dimension (in bytes)
+ * @param[in]  beta_step_x                          beta_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  beta_offset_first_element_in_bytes   The offset of the first element in the beta source tensor
+ * @param[in]  gamma_ptr                            Pointer to the gamma source tensor. Supported data types: same as @p src_ptr
+ * @param[in]  gamma_stride_x                       Stride of the gamma source tensor in X dimension (in bytes)
+ * @param[in]  gamma_step_x                         gamma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  gamma_offset_first_element_in_bytes  The offset of the first element in the gamma source tensor
+ */
+void main(void)
+{
+    Tensor3D src   = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
+    Tensor3D dst   = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+    Vector   mean  = CONVERT_TO_VECTOR_STRUCT_FP16(mean);
+    Vector   var   = CONVERT_TO_VECTOR_STRUCT_FP16(var);
+    Vector   beta  = CONVERT_TO_VECTOR_STRUCT_FP16(beta);
+    Vector   gamma = CONVERT_TO_VECTOR_STRUCT_FP16(gamma);
+
+    vec2  input_value;
+    float denominator;
+    float numerator;
+    vec2  x_bar;
+    float gamma_param;
+    float beta_param;
+
+    uint current_slice = gl_GlobalInvocationID.z;
+    if((current_slice % uint(2)) == uint(0))
+    {
+        input_value = unpackHalf2x16(src_ptr[src.current_offset >> 2]);
+        denominator = unpackHalf2x16(var_ptr[(var.current_offset + current_slice * var.stride_x) >> 2]).x;
+        denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(float(EPSILON))));
+
+        //Calculate x bar and store results
+        numerator = unpackHalf2x16(mean_ptr[(mean.current_offset + current_slice * mean.stride_x) >> 2]).x;
+        x_bar     = MUL_OP(SUB_OP(input_value, numerator), denominator);
+
+        gamma_param = unpackHalf2x16(gamma_ptr[(gamma.current_offset + current_slice * gamma.stride_x) >> 2]).x;
+        beta_param  = unpackHalf2x16(beta_ptr[(beta.current_offset + current_slice * beta.stride_x) >> 2]).x;
+
+        dst_ptr[dst.current_offset >> 2] = packHalf2x16(ADD_OP(MUL_OP(gamma_param, x_bar), beta_param));
+    }
+    else
+    {
+        input_value = unpackHalf2x16(src_ptr[src.current_offset >> 2]);
+        denominator = unpackHalf2x16(var_ptr[(var.current_offset + current_slice * var.stride_x) >> 2]).y;
+        denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(float(EPSILON))));
+
+        //Calculate x bar and store results
+        numerator = unpackHalf2x16(mean_ptr[(mean.current_offset + current_slice * mean.stride_x) >> 2]).y;
+        x_bar     = MUL_OP(SUB_OP(input_value, numerator), denominator);
+
+        gamma_param = unpackHalf2x16(gamma_ptr[(gamma.current_offset + current_slice * gamma.stride_x) >> 2]).y;
+        beta_param  = unpackHalf2x16(beta_ptr[(beta.current_offset + current_slice * beta.stride_x) >> 2]).y;
+
+        dst_ptr[dst.current_offset >> 2] = packHalf2x16(ADD_OP(MUL_OP(gamma_param, x_bar), beta_param));
+    }
+}
+#endif /*DATA_TYPE_FP32*/
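
Both precision variants above evaluate the same per-element expression, with mean, var, beta and gamma indexed by the current Z slice: x_bar = (x - mean) * inversesqrt(var + EPSILON) and out = gamma * x_bar + beta. A scalar reference for validation (illustrative):

    #include <cmath>

    // out = gamma * (x - mean) / sqrt(var + epsilon) + beta
    float batch_norm_ref(float x, float mean, float var,
                         float beta, float gamma, float epsilon)
    {
        const float x_bar = (x - mean) / std::sqrt(var + epsilon);
        return gamma * x_bar + beta;
    }
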
diff --git a/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs b/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs
new file mode 100644
index 0000000..65000f2
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+#include "helpers.h"
+
+#ifdef DATA_TYPE_FP32
+precision highp float;
+
+layout(std140) uniform shader_params
+{
+    TENSOR3D_PARAM_DECLARATION(src);
+    TENSOR3D_PARAM_DECLARATION(dst);
+};
+
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(dst, 2, float, writeonly);
+
+/** This kernel concatenates the input tensor into the output tensor along the third dimension
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+void main(void)
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    dst_ptr[dst.current_offset + uint(OFFSETS_Z >> 2)] = src_ptr[tensor3D_offset(src, -OFFSETS_X, -OFFSETS_Y, 0)];
+}
+
+#elif defined(DATA_TYPE_FP16)
+precision mediump float;
+
+layout(std140) uniform shader_params
+{
+    TENSOR3D_PARAM_DECLARATION(src);
+    TENSOR3D_PARAM_DECLARATION(dst);
+};
+
+BUFFER_DECLARATION(src, 1, uvec2, readonly);
+BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
+
+/** This kernel concatenates the input tensor into the output tensor along the third dimension
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+void main(void)
+{
+    Tensor3D src = GC_CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    uvec2 packed_s;
+    GC_LOAD1_3D_OFFSET(packed_s, src, -OFFSETS_X, -OFFSETS_Y, 0);
+    dst_ptr[(dst.current_offset + uint(OFFSETS_Z)) >> 3] = packed_s;
+}
+#endif /*DATA_TYPE_FP32*/
\ No newline at end of file
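
Depth concatenation is a strided copy: each input tensor lands in the shared output at a fixed byte offset along Z (OFFSETS_Z), while OFFSETS_X and OFFSETS_Y compensate for any border the inputs carry. A hedged host-side sketch of the destination address computed per element (all names illustrative):

    #include <cstddef>

    // Byte offset of element (x, y, z) of one input inside the concatenated
    // output, given the output strides and that input's Z placement.
    std::size_t concat_dst_offset(std::size_t x, std::size_t y, std::size_t z,
                                  std::size_t dst_stride_x, std::size_t dst_stride_y,
                                  std::size_t dst_stride_z, std::size_t offset_z_bytes)
    {
        return offset_z_bytes + x * dst_stride_x + y * dst_stride_y + z * dst_stride_z;
    }
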
diff --git a/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
new file mode 100644
index 0000000..1a0c9f1
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
@@ -0,0 +1,302 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+#include "helpers.h"
+
+#ifdef DATA_TYPE_FP16
+BUFFER_DECLARATION(src, 1, uint, readonly);
+BUFFER_DECLARATION(dst, 2, uint, restrict);
+#else  // DATA_TYPE_FP16
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(dst, 2, float, restrict);
+#endif // DATA_TYPE_FP16
+
+layout(std140) uniform shader_params
+{
+#ifdef IM2COL_GENERIC
+    TENSOR3D_PARAM_DECLARATION(src);
+    IMAGE_PARAM_DECLARATION(dst);
+    uint filter_depth;
+    uint src_stride_w;
+    uint dst_stride_w;
+#endif // IM2COL_GENERIC
+
+#ifdef IM2COL_REDUCED
+    TENSOR3D_PARAM_DECLARATION(src);
+    VECTOR_PARAM_DECLARATION(dst);
+    uint width;
+    uint height;
+#endif // IM2COL_REDUCED
+
+#ifdef COL2IM
+    IMAGE_PARAM_DECLARATION(src);
+    TENSOR3D_PARAM_DECLARATION(dst);
+    uint width;
+#endif // COL2IM
+};
+
+#ifdef DATA_TYPE_FP16
+
+precision mediump float;
+
+#ifdef IM2COL_REDUCED
+/** This kernel reshapes the tensor's lowest three dimensions to a single row for the GEMM operation
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
+ * @note In case biases will be added in a later stage, "#define HAS_BIAS" has to be passed so that a 1 is appended to each row of the final matrix.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  width                             The width of the input tensor
+ * @param[in]  height                            The height of the input tensor
+ */
+void main(void)
+{
+    uvec3    pos            = uvec3(gl_GlobalInvocationID.xyz);
+    uvec3    size           = uvec3(gl_WorkGroupSize.xyz);
+    Tensor3D src            = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
+    Tensor3D src_nostep     = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP_FP16(src);
+    Vector   dst            = CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(dst);
+    uint     image_size     = width * height;
+    uint     element_count  = src_step_x / src_stride_x;
+    uint     tmp_out_offset = dst.current_offset + ((pos.x * element_count + pos.y * width + pos.z * image_size) * dst.stride_x);
+    uint     width_fp16     = ((width + uint(1)) >> uint(1));
+    uint     tmp;
+
+    // odd width
+    if(width % uint(2) != uint(0))
+    {
+        // even row
+        if((pos.y + pos.z * height) % uint(2) == uint(0))
+        {
+            LOAD1(tmp, src, src.current_offset >> uint(2));
+            STORE1(dst, tmp_out_offset >> uint(2), tmp);
+        }
+        else
+        {
+            // Odd width misaligns the fp16 pairs between rows: rebuild the output pair from two neighbouring loads
+            uint tmpleft  = uint(0);
+            uint tmpright = uint(0);
+            LOAD1(tmpright, src, src.current_offset >> uint(2)); // right half
+            if(pos.x == uint(0))
+            {
+                LOAD1(tmpleft, src, tensor3D_offset_fp16(src_nostep, int(width), int(pos.y) - 1, int(pos.z)) >> uint(2)); // left half
+                tmpright = (tmpleft & uint(0xffff)) + (tmpright << uint(16));
+            }
+            else
+            {
+                LOAD1(tmpleft, src, tensor3D_offset_fp16(src_nostep, (int(pos.x) - 1) * int(element_count), int(pos.y), int(pos.z)) >> uint(2)); // left half
+                tmpright = ((tmpleft >> uint(16)) + (tmpright << uint(16)));
+            }
+            STORE1(dst, tmp_out_offset >> uint(2), tmpright);
+        }
+    }
+    else
+    {
+        LOAD1(tmp, src, src.current_offset >> uint(2));
+        STORE1(dst, tmp_out_offset >> uint(2), tmp);
+    }
+
+#ifdef HAS_BIAS
+    // If it is the last thread in the 3 dimensional workgroup
+    if(pos.x == (size.x - 1) && pos.y == (size.y - 1) && pos.z == (size.z - 1))
+    {
+        tmp_out_offset += dst.stride_x;
+
+        // FIXME: need odd/even detection for tmp_out_offset?
+        mediump vec2 bias_vec = vec2(1.0f, 1.0f);
+        uint         bias_u   = packHalf2x16(bias_vec);
+        STORE1(dst, tmp_out_offset >> uint(2), bias_u);
+    }
+#endif // HAS_BIAS
+}
+#endif // IM2COL_REDUCED
+
+#elif defined(DATA_TYPE_FP32)
+
+#ifdef IM2COL_GENERIC
+/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
+ * @note In case biases will be added to the convolution, "#define HAS_BIAS" has to be passed so that a 1 is appended to each row of the final matrix.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  filter_depth                      The depth of the used filter
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
+ */
+void main(void)
+{
+    uint xc    = gl_GlobalInvocationID.x;                // x coordinate in the convolved tensor
+    uint yc    = gl_GlobalInvocationID.y;                // y coordinate in the convolved tensor
+    uint ch    = gl_GlobalInvocationID.z % filter_depth; // input feature map
+    uint batch = gl_GlobalInvocationID.z / filter_depth; // the batch
+
+    // Calculate input indices
+    uint xi           = xc * uint(STRIDE_X) - uint(PAD_X);
+    uint yi           = yc * uint(STRIDE_Y) - uint(PAD_Y);
+    uint input_offset = (src_offset_first_element_in_bytes + (ch * src_stride_z) + (batch * src_stride_w)) >> uint(2);
+
+    // Calculate output indices
+    uint xo            = ch * uint(KERNEL_WIDTH) * uint(KERNEL_HEIGHT);
+    uint yo            = xc + yc * uint(CONVOLVED_WIDTH); // Index of the convolution
+    uint output_offset = (dst_offset_first_element_in_bytes + (yo * dst_stride_y) + (batch * dst_stride_w) + (xo * dst_stride_x)) >> uint(2);
+
+    // Linearize convolution elements
+    for(uint y = yi, y_e = yi + uint(KERNEL_HEIGHT); y < y_e; ++y)
+    {
+        for(uint x = xi, x_e = xi + uint(KERNEL_WIDTH); x < x_e; ++x, ++output_offset)
+        {
+#if PAD_X == 0 && PAD_Y == 0
+            uint src_pos = input_offset + ((x * src_stride_x + y * src_stride_y) >> uint(2));
+            STORE4(dst, output_offset, LOAD4(src, src_pos));
+#else  // PAD_X == 0 && PAD_Y == 0
+            if(int(x) < 0 || x >= uint(SRC_WIDTH) || int(y) < 0 || y >= uint(SRC_HEIGHT))
+            {
+                STORE4(dst, output_offset, 0.0f);
+            }
+            else
+            {
+                uint src_pos = input_offset + ((x * src_stride_x + y * src_stride_y) >> uint(2));
+                STORE4(dst, output_offset, LOAD4(src, src_pos));
+            }
+#endif // PAD_X == 0 && PAD_Y == 0
+        }
+    }
+
+#ifdef HAS_BIAS
+    if(ch == (uint(KERNEL_DEPTH) - 1))
+    {
+        STORE4(dst, output_offset, 1.0f);
+    }
+#endif // HAS_BIAS
+}
+#endif // IM2COL_GENERIC
+
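
IM2COL_GENERIC above is the lowering that turns convolution into a matrix multiply: each KERNEL_WIDTH x KERNEL_HEIGHT receptive field is linearised into one row of the output matrix, with zeros where the padded window reaches outside the source. A compact single-channel host reference of that indexing, assuming unit dilation; all names are illustrative:

    #include <cstddef>
    #include <vector>

    std::vector<float> im2col_ref(const std::vector<float> &src, int w, int h,
                                  int kw, int kh, int stride, int pad)
    {
        const int out_w = (w + 2 * pad - kw) / stride + 1;
        const int out_h = (h + 2 * pad - kh) / stride + 1;
        std::vector<float> dst(static_cast<std::size_t>(out_w) * out_h * kw * kh, 0.f);
        for(int yc = 0; yc < out_h; ++yc)
        {
            for(int xc = 0; xc < out_w; ++xc)
            {
                // One output row per convolution position (xc, yc)
                float *row = &dst[(static_cast<std::size_t>(yc) * out_w + xc) * kw * kh];
                for(int ky = 0; ky < kh; ++ky)
                {
                    for(int kx = 0; kx < kw; ++kx)
                    {
                        const int x = xc * stride - pad + kx;
                        const int y = yc * stride - pad + ky;
                        const bool inside = (x >= 0 && x < w && y >= 0 && y < h);
                        row[ky * kw + kx] = inside ? src[static_cast<std::size_t>(y) * w + x] : 0.f;
                    }
                }
            }
        }
        return dst;
    }
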
+#ifdef IM2COL_REDUCED
+/** This kernel reshapes the tensor's lowest three dimensions to a single row for the GEMM operation
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
+ * @note In case biases will be added in a later stage, "#define HAS_BIAS" has to be passed so that a 1 is appended to each row of the final matrix.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  width                             The width of the input tensor
+ * @param[in]  height                            The height of the input tensor
+ */
+void main(void)
+{
+    uvec3    pos            = uvec3(gl_GlobalInvocationID.xyz);
+    uvec3    size           = uvec3(gl_WorkGroupSize.xyz);
+    Tensor3D src            = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Vector   dst            = CONVERT_TO_VECTOR_STRUCT_NO_STEP(dst);
+    uint     image_size     = width * height;
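+    // Flatten (x, y, z) into a single linear element index, then convert it to a
+    // 32-bit word offset in the destination row (stride_x is in bytes, hence >> 2).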
+    uint     tmp_out_offset = dst.current_offset + (((pos.x + pos.y * width + pos.z * image_size) * dst.stride_x) >> 2);
+
+    STORE4(dst, tmp_out_offset, LOAD4(src, src.current_offset));
+
+#ifdef HAS_BIAS
+    // If it is the last thread in the 3 dimensional workgroup
+    if(pos.x == (size.x - 1) && pos.y == (size.y - 1) && pos.z == (size.z - 1))
+    {
+        tmp_out_offset += (dst.stride_x >> uint(2));
+        STORE4(dst, tmp_out_offset, 1.f);
+    }
+#endif // HAS_BIAS
+}
+#endif // IM2COL_REDUCED
+
+#ifdef COL2IM
+/** This kernel performs a reshaping of the output of the convolution layer.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  width                             The width of the output tensor
+ */
+void main(void)
+{
+    uvec2    pos = uvec2(gl_GlobalInvocationID.xy);
+    Image    src = CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
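+    // Map the GEMM output back to the spatial tensor: pos.x selects the output
+    // channel (z), while pos.y encodes the spatial position (y = pos.y / width, x = pos.y % width).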
+    uint idx            = pos.x * dst.stride_z + (pos.y / width) * dst.stride_y + (pos.y % width) * dst.stride_x;
+    uint tmp_out_offset = dst.current_offset + (idx >> 2);
+
+    STORE4(dst, tmp_out_offset, LOAD4(src, src.current_offset));
+}
+#endif // COL2IM
+
+#else // DATA_TYPE_FP16
+#error Data type not supported
+#endif // DATA_TYPE_FP16
diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs
new file mode 100644
index 0000000..3a31cb8
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+
+#include "helpers.h"
+
+layout(std140) uniform shader_params
+{
+    TENSOR3D_PARAM_DECLARATION(src);
+    TENSOR3D_PARAM_DECLARATION(dst);
+    TENSOR3D_PARAM_DECLARATION(weights);
+#ifdef BIAS
+    VECTOR_PARAM_DECLARATION(biases);
+#endif /* BIAS */
+    uint weights_stride_w;
+    uint weights_depth;
+};
+
+#if defined(DATA_TYPE_FP32)
+precision highp float;
+
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(dst, 2, float, writeonly);
+BUFFER_DECLARATION(weights, 3, float, readonly);
+#ifdef BIAS
+BUFFER_DECLARATION(biases, 4, float, readonly);
+#endif /* BIAS */
+
+/** This kernel performs a direct convolution to convolve the lowest three dimensions.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
+ * @note The convolution stride x must be passed at compile time using "#define STRIDE_X" e.g. "#define STRIDE_X 1"
+ * @note If biases are used then "#define BIAS" has to be passed at compile time
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[out] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ * @param[in]  weights_depth                         The third dimension of the weights tensor
+ */
+void main()
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#ifdef BIAS
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+#endif /* BIAS */
+
+    float pixels  = CONVERT(0, float);
+    uint  z_index = gl_GlobalInvocationID.z;
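+    // Offsets for the float SSBOs are kept in 32-bit words, so byte strides are converted with >> 2.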
+    weights.current_offset += (z_index * weights_stride_w) >> 2;
+    float temp;
+    float temp_weight;
+
+    for(int d = 0; d < int(weights_depth); ++d)
+    {
+        temp        = LOAD4(src, CURRENT_OFFSET(src));
+        temp_weight = LOAD4(weights, CURRENT_OFFSET(weights));
+        pixels += temp * temp_weight;
+
+        src.current_offset += (src_stride_z >> 2);
+        weights.current_offset += (weights_stride_z >> 2);
+    }
+
+#ifdef BIAS
+    pixels += LOAD4(biases, vector_offset(biases, int(z_index)));
+#endif /* BIAS */
+
+    STORE4(dst, CURRENT_OFFSET(dst), pixels);
+}
+#elif defined(DATA_TYPE_FP16)
+precision mediump float;
+
+BUFFER_DECLARATION(src, 1, uvec4, readonly);
+BUFFER_DECLARATION(dst, 2, uvec4, writeonly);
+BUFFER_DECLARATION(weights, 3, uint, readonly);
+#ifdef BIAS
+BUFFER_DECLARATION(biases, 4, uint, readonly);
+#endif /* BIAS */
+
+#if STRIDE_X == 2
+#define CONVOLVE(s, w) convolve_stride2(s, w)
+#elif STRIDE_X == 1 /* STRIDE_X == 1 */
+#define CONVOLVE(s, w) convolve_stride1(s, w)
+#else /* STRIDE_X not equal to 1 or 2 */
+#error STRIDE_X larger than 2 is not supported
+#endif /* STRIDE_X == 2 */
+
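+// F16 elements are stored two per uint; a single uvec4 load therefore fetches
+// 8 consecutive f16 values, unpacked below with unpackHalf2x16().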
+vec4[2] convolve_stride1(Image src, float w)
+{
+    uvec4 packed_s;
+    vec4  s[2];
+
+    GC_LOAD1_2D_OFFSET(packed_s, src, 0, 0);
+
+    s[0] = vec4(unpackHalf2x16(packed_s.x), unpackHalf2x16(packed_s.y));
+    s[1] = vec4(unpackHalf2x16(packed_s.z), unpackHalf2x16(packed_s.w));
+
+    s[0] *= w;
+    s[1] *= w;
+
+    return s;
+}
+
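+// For stride 2, 16 consecutive f16 inputs are loaded (two uvec4 reads, 8 elements
+// apart) and only the even-indexed elements are kept for the 8 outputs.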
+vec4[2] convolve_stride2(Image src, float w)
+{
+    uvec4 packed_s;
+    vec4  s[2];
+    vec4  r[2];
+
+    GC_LOAD1_2D_OFFSET(packed_s, src, 0, 0);
+    s[0] = vec4(unpackHalf2x16(packed_s.x), unpackHalf2x16(packed_s.y));
+    s[1] = vec4(unpackHalf2x16(packed_s.z), unpackHalf2x16(packed_s.w));
+
+    r[0] = vec4(s[0].xz, s[1].xz);
+
+    GC_LOAD1_2D_OFFSET(packed_s, src, 8, 0);
+    s[0] = vec4(unpackHalf2x16(packed_s.x), unpackHalf2x16(packed_s.y));
+    s[1] = vec4(unpackHalf2x16(packed_s.z), unpackHalf2x16(packed_s.w));
+
+    r[1] = vec4(s[0].xz, s[1].xz);
+
+    r[0] *= w;
+    r[1] *= w;
+
+    return r;
+}
+
+/** This kernel performs a direct convolution to convolve the lowest three dimensions.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
+ * @note The convolution stride x must be passed at compile time using "#define STRIDE_X" e.g. "#define STRIDE_X 1"
+ * @note If biases are used then "#define BIAS" has to be passed at compile time
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F16
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[out] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ * @param[in]  weights_depth                         The third dimension of the weights tensor
+ */
+void main()
+{
+    Image    src     = GC_CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+    Tensor3D dst     = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#ifdef BIAS
+    Vector   biases  = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+#endif /* BIAS */
+
+    vec4 pixels[2];
+    pixels[0] = vec4(0.f);
+    pixels[1] = vec4(0.f);
+
+    uint z_index = gl_GlobalInvocationID.z;
+
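+    // Unlike the FP32 path above, offsets here are advanced in raw bytes to match the GC_*_OFFSET helpers.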
+    weights.current_offset += z_index * weights_stride_w;
+
+    uint  packed_w;
+    float w;
+
+    for(int d = 0; d < int(weights_depth); ++d)
+    {
+        GC_LOAD1_3D_OFFSET(packed_w, weights, 0, 0, 0);
+        w = unpackHalf2x16(packed_w).x;
+
+        vec4 r[2] = CONVOLVE(src, w);
+        pixels[0] += r[0];
+        pixels[1] += r[1];
+
+        src.current_offset += src_stride_z;
+        weights.current_offset += weights_stride_z;
+    }
+
+#ifdef BIAS
+    uint  packed_b;
+    float b;
+
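+    // Biases are f16 values packed two per uint; the parity of z_index selects the lane.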
+    GC_LOAD1_1D_OFFSET(packed_b, biases, z_index);
+
+    if(z_index % uint(2) == uint(0))
+    {
+        b = unpackHalf2x16(packed_b).x;
+    }
+    else
+    {
+        b = unpackHalf2x16(packed_b).y;
+    }
+
+    pixels[0] += vec4(b);
+    pixels[1] += vec4(b);
+#endif /* BIAS */
+
+    uvec4 packed_d;
+    packed_d = uvec4(packHalf2x16(pixels[0].xy), packHalf2x16(pixels[0].zw),
+                     packHalf2x16(pixels[1].xy), packHalf2x16(pixels[1].zw));
+    GC_STORE1_3D_OFFSET(packed_d, dst, 0, 0, 0);
+}
+#else  /* DATA_TYPE_FP32 */
+#error Data type not supported
+#endif /* DATA_TYPE_FP32 */
diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs
new file mode 100644
index 0000000..67b92cb
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs
@@ -0,0 +1,1583 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+
+#include "helpers.h"
+
+layout(std140) uniform shader_params
+{
+    TENSOR3D_PARAM_DECLARATION(src);
+    TENSOR3D_PARAM_DECLARATION(dst);
+    TENSOR3D_PARAM_DECLARATION(weights);
+#ifdef BIAS
+    VECTOR_PARAM_DECLARATION(biases);
+#endif /* BIAS */
+    uint weights_stride_w;
+    uint weights_depth;
+};
+
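+// LOAD12 reads three consecutive 32-bit floats (12 bytes) into r.{x,y,z};
+// LOAD3X3 reads a full 3x3 weight window at the current tensor offset.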
+#define LOAD12(r, name, offset)          \
+    r.x = LOAD4(name, offset);           \
+    r.y = LOAD4(name, offset + uint(1)); \
+    r.z = LOAD4(name, offset + uint(2))
+
+#define LOAD3X3(r, name)                                \
+    r[0] = LOAD4(name, tensor3D_offset(name, 0, 0, 0)); \
+    r[1] = LOAD4(name, tensor3D_offset(name, 1, 0, 0)); \
+    r[2] = LOAD4(name, tensor3D_offset(name, 2, 0, 0)); \
+    r[3] = LOAD4(name, tensor3D_offset(name, 0, 1, 0)); \
+    r[4] = LOAD4(name, tensor3D_offset(name, 1, 1, 0)); \
+    r[5] = LOAD4(name, tensor3D_offset(name, 2, 1, 0)); \
+    r[6] = LOAD4(name, tensor3D_offset(name, 0, 2, 0)); \
+    r[7] = LOAD4(name, tensor3D_offset(name, 1, 2, 0)); \
+    r[8] = LOAD4(name, tensor3D_offset(name, 2, 2, 0))
+
+#if defined(PROCESS_1_ELEMENT)
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(dst, 2, float, writeonly);
+BUFFER_DECLARATION(weights, 3, float, readonly);
+#ifdef BIAS
+BUFFER_DECLARATION(biases, 4, float, readonly);
+#endif /* BIAS */
+
+/** This kernel performs a direct convolution to convolve the lowest three dimensions.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
+ * @note If biases are used then "#define BIAS" has to be passed at compile time
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[out] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ * @param[in]  weights_depth                         The third dimension of the weights tensor
+ */
+void main()
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#ifdef BIAS
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+#endif /* BIAS */
+
+    float pixels = CONVERT(0, float);
+
+    uint z_index = gl_GlobalInvocationID.z;
+
+    weights.current_offset += (z_index * weights_stride_w) >> 2;
+
+    for(int d = 0; d < int(weights_depth); ++d)
+    {
+        vec3 temp;
+        vec3 w;
+
+        LOAD12(temp, src, offset(src, 0, 0));
+        LOAD12(w, weights, tensor3D_offset(weights, 0, 0, 0));
+
+        pixels += temp.x * w[0] + temp.y * w[1] + temp.z * w[2];
+
+        LOAD12(temp, src, offset(src, 0, 1));
+        LOAD12(w, weights, tensor3D_offset(weights, 0, 1, 0));
+
+        pixels += temp.x * w[0] + temp.y * w[1] + temp.z * w[2];
+
+        LOAD12(temp, src, offset(src, 0, 2));
+        LOAD12(w, weights, tensor3D_offset(weights, 0, 2, 0));
+
+        pixels += temp.x * w[0] + temp.y * w[1] + temp.z * w[2];
+
+        src.current_offset += src_stride_z >> 2;
+        weights.current_offset += weights_stride_z >> 2;
+    }
+
+#ifdef BIAS
+    pixels += LOAD4(biases, vector_offset(biases, int(z_index)));
+#endif /* BIAS */
+
+    STORE4(dst, CURRENT_OFFSET(dst), pixels);
+}
+#elif defined(PROCESS_8_ELEMENT)
+BUFFER_DECLARATION(src, 1, vec4, readonly);
+BUFFER_DECLARATION(dst, 2, vec4, writeonly);
+BUFFER_DECLARATION(weights, 3, float, readonly);
+#ifdef BIAS
+BUFFER_DECLARATION(biases, 4, float, readonly);
+#endif /* BIAS */
+
+#if STRIDE_X == 2
+#define CONVOLVE1x3(offset, w) convolve1x3_stride2(offset, w)
+#elif STRIDE_X == 1 /* STRIDE_X == 1 */
+#define CONVOLVE1x3(offset, w) convolve1x3_stride1(offset, w)
+#else /* STRIDE_X not equal to 1 or 2 */
+#error STRIDE_X larger than 2 is not supported
+#endif /* STRIDE_X == 2 */
+
+vec4[2] convolve1x3_stride1(uint offset, vec3 w)
+{
+    vec4 middle;
+    vec4 right;
+    vec4 tmp[3];
+    vec4 r[2];
+
+    LOAD3(tmp, src, offset);
+
+    middle = vec4(tmp[0].yzw, tmp[1].x);
+    right  = vec4(tmp[0].zw, tmp[1].xy);
+
+    r[0] = tmp[0] * w[0] + middle * w[1] + right * w[2];
+
+    middle = vec4(tmp[1].yzw, tmp[2].x);
+    right  = vec4(tmp[1].zw, tmp[2].xy);
+
+    r[1] = tmp[1] * w[0] + middle * w[1] + right * w[2];
+
+    return r;
+}
+
+vec4[2] convolve1x3_stride2(uint offset, vec3 w)
+{
+    vec4 left;
+    vec4 middle;
+    vec4 right;
+    vec4 tmp[3];
+    vec4 r[2];
+
+    LOAD3(tmp, src, offset);
+
+    left   = vec4(tmp[0].xz, tmp[1].xz);
+    middle = vec4(tmp[0].yw, tmp[1].yw);
+    right  = vec4(tmp[0].z, tmp[1].xz, tmp[2].x);
+
+    r[0] = left * w[0] + middle * w[1] + right * w[2];
+
+    LOAD2(tmp, src, offset + ((uint(3) * src_stride_x) >> 2));
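+    // tmp[2] still holds the last four elements of the previous LOAD3, so together
+    // with the two freshly loaded vec4s it covers the even-indexed inputs needed here.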
+
+    left   = vec4(tmp[2].xz, tmp[0].xz);
+    middle = vec4(tmp[2].yw, tmp[0].yw);
+    right  = vec4(tmp[2].z, tmp[0].xz, tmp[1].x);
+
+    r[1] = left * w[0] + middle * w[1] + right * w[2];
+
+    return r;
+}
+
+/** An optimized direct convolution 3x3 OpenGL ES compute shader for processing 8 elements at once
+ *
+ * @note This OpenGL ES shader works with stride_x = 1 and 2
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
+ * @note If biases are used then "#define BIAS" has to be passed at compile time
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[out] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ * @param[in]  weights_depth                         The third dimension of the weights tensor
+ */
+void main()
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#ifdef BIAS
+    Vector   biases  = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+#endif /* BIAS */
+
+    vec4 pixels[2];
+    pixels[0] = vec4(0);
+    pixels[1] = vec4(0);
+
+    uint z_index = gl_GlobalInvocationID.z;
+
+    weights.current_offset += (z_index * weights_stride_w) >> 2;
+
+    for(int d = 0; d < int(weights_depth); ++d)
+    {
+        // load one row of 3 weights at a time
+        vec3 w;
+        vec4 r[2];
+
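+        // src.current_offset is kept in 32-bit words; >> uint(2) converts it to an
+        // index into the vec4 SSBO, and (src_stride_y >> 2) converts the byte stride to words.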
+        // first line
+        LOAD3(w, weights, tensor3D_offset(weights, 0, 0, 0));
+
+        r = CONVOLVE1x3(src.current_offset >> uint(2), w);
+        pixels[0] += r[0];
+        pixels[1] += r[1];
+
+        // second line
+        LOAD3(w, weights, tensor3D_offset(weights, 0, 1, 0));
+
+        r = CONVOLVE1x3((src.current_offset + (src_stride_y >> 2)) >> uint(2), w);
+        pixels[0] += r[0];
+        pixels[1] += r[1];
+
+        // third line
+        LOAD3(w, weights, tensor3D_offset(weights, 0, 2, 0));
+
+        r = CONVOLVE1x3((src.current_offset + (src_stride_y >> 1)) >> uint(2), w);
+        pixels[0] += r[0];
+        pixels[1] += r[1];
+
+        src.current_offset += src_stride_z >> 2;
+        weights.current_offset += weights_stride_z >> 2;
+    }
+
+#ifdef BIAS
+    float b;
+    LOAD1(b, biases, vector_offset(biases, int(z_index)));
+    pixels[0] += vec4(b);
+    pixels[1] += vec4(b);
+#endif /* BIAS */
+
+    STORE2(dst, dst.current_offset >> uint(2), pixels);
+}
+#elif defined(PROCESS_4_ELEMENT)
+BUFFER_DECLARATION(src, 1, vec4, readonly);
+BUFFER_DECLARATION(dst, 2, vec4, writeonly);
+BUFFER_DECLARATION(weights, 3, float, readonly);
+#ifdef BIAS
+BUFFER_DECLARATION(biases, 4, float, readonly);
+#endif /* BIAS */
+
+#if STRIDE_X == 2
+#define CONVOLVE1x3(offset, w) convolve1x3_stride2(offset, w)
+#elif STRIDE_X == 1 /* STRIDE_X == 1 */
+#define CONVOLVE1x3(offset, w) convolve1x3_stride1(offset, w)
+#else /* STRIDE_X not equal to 1 or 2 */
+#error STRIDE_X larger than 2 is not supported
+#endif /* STRIDE_X == 2 */
+
+vec4 convolve1x3_stride1(uint offset, vec3 w)
+{
+    vec4 tmp[2];
+    vec4 middle;
+    vec4 right;
+
+    LOAD2(tmp, src, offset);
+
+    middle = vec4(tmp[0].yzw, tmp[1].x);
+    right  = vec4(tmp[0].zw, tmp[1].xy);
+
+    tmp[1] = tmp[0] * w[0] + middle * w[1] + right * w[2];
+
+    return tmp[1];
+}
+
+vec4 convolve1x3_stride2(uint offset, vec3 w)
+{
+    vec4 left;
+    vec4 middle;
+    vec4 right;
+
+    vec4 tmp[3];
+
+    LOAD3(tmp, src, offset);
+
+    left   = vec4(tmp[0].xz, tmp[1].xz);
+    middle = vec4(tmp[0].yw, tmp[1].yw);
+    right  = vec4(tmp[0].z, tmp[1].xz, tmp[2].x);
+
+    tmp[0] = left * w[0] + middle * w[1] + right * w[2];
+
+    return tmp[0];
+}
+
+/** An optimized direct convolution 3x3 OpenGL ES compute shader for processing 4 elements at once
+ *
+ * @note This OpenGL ES shader works with stride_x = 1 and 2
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
+ * @note If biases are used then "#define BIAS" has to be passed at compile time
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[out] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ * @param[in]  weights_depth                         The third dimension of the weights tensor
+ */
+void main()
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#ifdef BIAS
+    Vector   biases  = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+#endif /* BIAS */
+
+    vec4 pixels;
+    pixels = vec4(0);
+
+    uint z_index = gl_GlobalInvocationID.z;
+
+    weights.current_offset += (z_index * weights_stride_w) >> 2;
+
+    for(int d = 0; d < int(weights_depth); ++d)
+    {
+        // load one row of 3 weights at a time
+        vec3 w;
+
+        // first line
+        LOAD3(w, weights, tensor3D_offset(weights, 0, 0, 0));
+
+        pixels += CONVOLVE1x3(src.current_offset >> uint(2), w);
+
+        // second line
+        LOAD3(w, weights, tensor3D_offset(weights, 0, 1, 0));
+
+        pixels += CONVOLVE1x3((src.current_offset + (src_stride_y >> 2)) >> uint(2), w);
+
+        // third line
+        LOAD3(w, weights, tensor3D_offset(weights, 0, 2, 0));
+
+        pixels += CONVOLVE1x3((src.current_offset + (src_stride_y >> 1)) >> uint(2), w);
+
+        src.current_offset += src_stride_z >> 2;
+        weights.current_offset += weights_stride_z >> 2;
+    }
+
+#ifdef BIAS
+    float b;
+    LOAD1(b, biases, vector_offset(biases, int(z_index)));
+    pixels += vec4(b);
+#endif /* BIAS */
+
+    STORE1(dst, dst.current_offset >> uint(2), pixels);
+}
+#elif defined(PROCESS_X_4ELEMENTS_Y_3ELEMENTS)
+BUFFER_DECLARATION(src, 1, vec4, readonly);
+BUFFER_DECLARATION(dst, 2, vec4, writeonly);
+BUFFER_DECLARATION(weights, 3, float, readonly);
+#ifdef BIAS
+BUFFER_DECLARATION(biases, 4, float, readonly);
+#endif /* BIAS */
+
+#define CONVOLVE1x3(left, middle, right, w) convolve1x3_stride1(left, middle, right, w)
+
+vec4 convolve1x3_stride1(vec4 left, vec4 middle, vec4 right, vec3 w)
+{
+    vec4 r;
+
+    r = left * w[0] + middle * w[1] + right * w[2];
+
+    return r;
+}
+
+/** An optimized direct convolution 3x3 OpenGL ES compute shader for processing 4x3 elements at once
+ *
+ * @note This OpenGL ES shader works with stride_x = 1
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
+ * @note If biases are used then "#define BIAS" has to be passed at compile time
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[out] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ * @param[in]  weights_depth                         The third dimension of the weights tensor
+ */
+void main()
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#ifdef BIAS
+    Vector   biases  = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+#endif /* BIAS */
+
+    vec4 pixels[3];
+    pixels[0] = vec4(0);
+    pixels[1] = vec4(0);
+    pixels[2] = vec4(0);
+
+    uint z_index = gl_GlobalInvocationID.z;
+
+    weights.current_offset += (z_index * weights_stride_w) >> 2;
+
+    for(int d = 0; d < int(weights_depth); ++d)
+    {
+        // load all 3x3 weights once
+        vec3 w[3];
+
+        LOAD3(w[0], weights, tensor3D_offset(weights, 0, 0, 0));
+        LOAD3(w[1], weights, tensor3D_offset(weights, 0, 1, 0));
+        LOAD3(w[2], weights, tensor3D_offset(weights, 0, 2, 0));
+
+        vec4 s[2];
+        vec4 middle;
+        vec4 right;
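+        // Rolling scheme: input row n contributes to output row m (with n - m in [0, 2])
+        // using weight row w[n - m], so 5 input rows produce 3 output rows.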
+        // first line
+        LOAD2(s, src, src.current_offset >> uint(2));
+        middle = vec4(s[0].yzw, s[1].x);
+        right  = vec4(s[0].zw, s[1].xy);
+        pixels[0] += CONVOLVE1x3(s[0], middle, right, w[0]);
+
+        // second line
+        LOAD2(s, src, (src.current_offset + (src_stride_y >> 2)) >> uint(2));
+        middle = vec4(s[0].yzw, s[1].x);
+        right  = vec4(s[0].zw, s[1].xy);
+        pixels[0] += CONVOLVE1x3(s[0], middle, right, w[1]);
+        pixels[1] += CONVOLVE1x3(s[0], middle, right, w[0]);
+
+        // third line
+        LOAD2(s, src, (src.current_offset + (src_stride_y >> 1)) >> uint(2));
+        middle = vec4(s[0].yzw, s[1].x);
+        right  = vec4(s[0].zw, s[1].xy);
+        pixels[0] += CONVOLVE1x3(s[0], middle, right, w[2]);
+        pixels[1] += CONVOLVE1x3(s[0], middle, right, w[1]);
+        pixels[2] += CONVOLVE1x3(s[0], middle, right, w[0]);
+
+        // fourth line
+        LOAD2(s, src, (src.current_offset + (uint(3) * (src_stride_y >> 2))) >> uint(2));
+        middle = vec4(s[0].yzw, s[1].x);
+        right  = vec4(s[0].zw, s[1].xy);
+        pixels[1] += CONVOLVE1x3(s[0], middle, right, w[2]);
+        pixels[2] += CONVOLVE1x3(s[0], middle, right, w[1]);
+
+        // fifth line
+        LOAD2(s, src, (src.current_offset + (src_stride_y)) >> uint(2));
+        middle = vec4(s[0].yzw, s[1].x);
+        right  = vec4(s[0].zw, s[1].xy);
+        pixels[2] += CONVOLVE1x3(s[0], middle, right, w[2]);
+
+        src.current_offset += src_stride_z >> 2;
+        weights.current_offset += weights_stride_z >> 2;
+    }
+
+#ifdef BIAS
+    float b;
+    LOAD1(b, biases, vector_offset(biases, int(z_index)));
+
+    pixels[0] += vec4(b);
+    pixels[1] += vec4(b);
+    pixels[2] += vec4(b);
+#endif /* BIAS */
+
+    STORE1(dst, dst.current_offset >> uint(2), pixels[0]);
+    STORE1(dst, (dst.current_offset + (dst_stride_y >> 2)) >> uint(2), pixels[1]);
+    STORE1(dst, (dst.current_offset + (dst_stride_y >> 1)) >> uint(2), pixels[2]);
+}
+#elif defined(PROCESS_X_8ELEMENTS_Y_3ELEMENTS_FP16)
+precision mediump float;
+
+BUFFER_DECLARATION(src, 1, uvec4, readonly);
+BUFFER_DECLARATION(dst, 2, uvec4, writeonly);
+BUFFER_DECLARATION(weights, 3, uint, readonly);
+#ifdef BIAS
+BUFFER_DECLARATION(biases, 4, uint, readonly);
+#endif /* BIAS */
+
+#define CONVOLVE1x3(s, w) convolve1x3_stride1(s, w)
+
+vec4[2] convolve1x3_stride1(vec4 tmp[3], vec3 w)
+{
+    vec4 middle;
+    vec4 right;
+    vec4 r[2];
+
+    middle = vec4(tmp[0].yzw, tmp[1].x);
+    right  = vec4(tmp[0].zw, tmp[1].xy);
+
+    r[0] = tmp[0] * w[0] + middle * w[1] + right * w[2];
+
+    middle = vec4(tmp[1].yzw, tmp[2].x);
+    right  = vec4(tmp[1].zw, tmp[2].xy);
+
+    r[1] = tmp[1] * w[0] + middle * w[1] + right * w[2];
+
+    return r;
+}
+
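+// Loads two uvec4 (16 packed f16 values) and unpacks the first 12 into three vec4s,
+// enough for the 8-wide 1x3 convolution above.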
+vec4[3] load_and_unpack(uint offset)
+{
+    uvec4 packed_s[2];
+    vec4  s[3];
+
+    LOAD1(packed_s[0], src, offset);
+    LOAD1(packed_s[1], src, offset + uint(1));
+
+    s[0] = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y));
+    s[1] = vec4(unpackHalf2x16(packed_s[0].z), unpackHalf2x16(packed_s[0].w));
+    s[2] = vec4(unpackHalf2x16(packed_s[1].x), unpackHalf2x16(packed_s[1].y));
+
+    return s;
+}
+
+/** An optimized direct convolution 3x3 OpenGL ES compute shader for processing 8x3 elements at once
+ *
+ * @note This OpenGL ES shader works with stride_x = 1
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
+ * @note If biases are used then "#define BIAS" has to be passed at compile time
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F16
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[out] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ * @param[in]  weights_depth                         The third dimension of the weights tensor
+ */
+void main()
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT_FP16(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP_FP16(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+
+#ifdef BIAS
+    Vector   biases  = CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(biases);
+#endif /* BIAS */
+
+    uvec2 packed_d[2];
+    uvec4 vd;
+
+    vec4 pixels[3][2];
+    int  i, j;
+    for(i = 0; i < 3; i++)
+    {
+        for(j = 0; j < 2; j++)
+        {
+            pixels[i][j] = vec4(0);
+        }
+    }
+
+    uint z_index = gl_GlobalInvocationID.z;
+
+    weights.current_offset += z_index * weights_stride_w;
+
+    for(int d = 0; d < int(weights_depth); ++d)
+    {
+        // load all 3x3 weights once
+        uvec2 packed_w[3];
+
+        LOAD2(packed_w[0], weights, tensor3D_offset_fp16(weights, 0, 0, 0) >> 2);
+        LOAD2(packed_w[1], weights, tensor3D_offset_fp16(weights, 0, 1, 0) >> 2);
+        LOAD2(packed_w[2], weights, tensor3D_offset_fp16(weights, 0, 2, 0) >> 2);
+
+        vec3 w[3];
+        w[0] = vec3(unpackHalf2x16(packed_w[0].x), unpackHalf2x16(packed_w[0].y).x);
+        w[1] = vec3(unpackHalf2x16(packed_w[1].x), unpackHalf2x16(packed_w[1].y).x);
+        w[2] = vec3(unpackHalf2x16(packed_w[2].x), unpackHalf2x16(packed_w[2].y).x);
+
+        vec4  s[3];
+        vec4  r[2];
+        uint  offset;
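+        // current_offset is in bytes in this FP16 kernel; >> uint(4) converts it to
+        // an index into the uvec4 SSBO (16 bytes per element).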
+        // first line
+        offset = src.current_offset >> uint(4);
+        s      = load_and_unpack(offset);
+
+        r = CONVOLVE1x3(s, w[0]);
+        pixels[0][0] += r[0];
+        pixels[0][1] += r[1];
+
+        // second line
+        offset = (src.current_offset + src_stride_y) >> uint(4);
+        s      = load_and_unpack(offset);
+
+        r = CONVOLVE1x3(s, w[1]);
+        pixels[0][0] += r[0];
+        pixels[0][1] += r[1];
+        r = CONVOLVE1x3(s, w[0]);
+        pixels[1][0] += r[0];
+        pixels[1][1] += r[1];
+
+        // third line
+        offset = (src.current_offset + (src_stride_y << 1)) >> uint(4);
+        s      = load_and_unpack(offset);
+
+        r = CONVOLVE1x3(s, w[2]);
+        pixels[0][0] += r[0];
+        pixels[0][1] += r[1];
+        r = CONVOLVE1x3(s, w[1]);
+        pixels[1][0] += r[0];
+        pixels[1][1] += r[1];
+        r = CONVOLVE1x3(s, w[0]);
+        pixels[2][0] += r[0];
+        pixels[2][1] += r[1];
+
+        // fourth line
+        offset = (src.current_offset + uint(3) * (src_stride_y)) >> uint(4);
+        s      = load_and_unpack(offset);
+
+        r = CONVOLVE1x3(s, w[2]);
+        pixels[1][0] += r[0];
+        pixels[1][1] += r[1];
+        r = CONVOLVE1x3(s, w[1]);
+        pixels[2][0] += r[0];
+        pixels[2][1] += r[1];
+
+        // fifth line
+        offset = (src.current_offset + (src_stride_y << 2)) >> uint(4);
+        s      = load_and_unpack(offset);
+
+        r = CONVOLVE1x3(s, w[2]);
+        pixels[2][0] += r[0];
+        pixels[2][1] += r[1];
+
+        src.current_offset += src_stride_z;
+        weights.current_offset += weights_stride_z;
+    }
+
+#ifdef BIAS
+    uint  packed_b;
+    float b;
+    LOAD1(packed_b, biases, vector_offset_fp16(biases, int(z_index)) >> 2);
+
+    if(z_index % uint(2) == uint(0))
+    {
+        b = unpackHalf2x16(packed_b).x;
+    }
+    else
+    {
+        b = unpackHalf2x16(packed_b).y;
+    }
+
+    for(i = 0; i < 3; i++)
+    {
+        for(j = 0; j < 2; j++)
+        {
+            pixels[i][j] += vec4(b);
+        }
+    }
+#endif /* BIAS */
+
+    packed_d[0] = uvec2(packHalf2x16(pixels[0][0].xy), packHalf2x16(pixels[0][0].zw));
+    packed_d[1] = uvec2(packHalf2x16(pixels[0][1].xy), packHalf2x16(pixels[0][1].zw));
+    vd          = uvec4(packed_d[0], packed_d[1]);
+    STORE1(dst, dst.current_offset >> uint(4), vd);
+
+    packed_d[0] = uvec2(packHalf2x16(pixels[1][0].xy), packHalf2x16(pixels[1][0].zw));
+    packed_d[1] = uvec2(packHalf2x16(pixels[1][1].xy), packHalf2x16(pixels[1][1].zw));
+    vd          = uvec4(packed_d[0], packed_d[1]);
+    STORE1(dst, (dst.current_offset + dst_stride_y) >> uint(4), vd);
+
+    packed_d[0] = uvec2(packHalf2x16(pixels[2][0].xy), packHalf2x16(pixels[2][0].zw));
+    packed_d[1] = uvec2(packHalf2x16(pixels[2][1].xy), packHalf2x16(pixels[2][1].zw));
+    vd          = uvec4(packed_d[0], packed_d[1]);
+    STORE1(dst, (dst.current_offset + (dst_stride_y << 1)) >> uint(4), vd);
+}
+#elif defined(PROCESS_X_4ELEMENTS_FP16)
+precision mediump float;
+
+BUFFER_DECLARATION(src, 1, uvec2, readonly);
+BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
+BUFFER_DECLARATION(weights, 3, uint, readonly);
+#ifdef BIAS
+BUFFER_DECLARATION(biases, 4, uint, readonly);
+#endif /* BIAS */
+
+#if STRIDE_X == 2
+#define CONVOLVE1x3(s, w) convolve1x3_stride2(s, w)
+#define LOAD_AND_UNPACK(offset) load_and_unpack_stride2(offset)
+#elif STRIDE_X == 1 /* STRIDE_X == 1 */
+#define CONVOLVE1x3(s, w) convolve1x3_stride1(s, w)
+#define LOAD_AND_UNPACK(offset) load_and_unpack_stride1(offset)
+#else /* STRIDE_X not equal to 1 or 2 */
+#error STRIDE_X larger than 2 is not supported
+#endif /* STRIDE_X == 2 */
+
+vec4 convolve1x3_stride1(vec4 tmp[2], vec3 w)
+{
+    vec4 middle;
+    vec4 right;
+    vec4 r;
+
+    middle = vec4(tmp[0].yzw, tmp[1].x);
+    right  = vec4(tmp[0].zw, tmp[1].xy);
+
+    r = tmp[0] * w[0] + middle * w[1] + right * w[2];
+
+    return r;
+}
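+// With tmp = (a b c d | e f g h), the three taps slide across the row:
+//   tmp[0] = (a b c d), middle = (b c d e), right = (c d e f)
+// so r.x = a * w[0] + b * w[1] + c * w[2] and so on: one call yields four
+// stride-1 outputs of the 1x3 row convolution.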
+
+vec4 convolve1x3_stride2(vec4 tmp[3], vec3 w)
+{
+    vec4 left;
+    vec4 middle;
+    vec4 right;
+    vec4 r;
+
+    left   = vec4(tmp[0].xz, tmp[1].xz);
+    middle = vec4(tmp[0].yw, tmp[1].yw);
+    right  = vec4(tmp[0].z, tmp[1].xz, tmp[2].x);
+
+    r = left * w[0] + middle * w[1] + right * w[2];
+
+    return r;
+}
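+// With tmp = (a b c d | e f g h | i j k l), the stride-2 variant gathers
+//   left = (a c e g), middle = (b d f h), right = (c e g i)
+// so output n = in[2n] * w[0] + in[2n + 1] * w[1] + in[2n + 2] * w[2]:
+// four stride-2 outputs from the first 9 of the 12 loaded values.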
+
+vec4[2] load_and_unpack_stride1(uint offset)
+{
+    uvec2 packed_s[2];
+    vec4  s[2];
+
+    LOAD1(packed_s[0], src, offset);
+    LOAD1(packed_s[1], src, offset + uint(1));
+
+    s[0] = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y));
+    s[1] = vec4(unpackHalf2x16(packed_s[1].x), unpackHalf2x16(packed_s[1].y));
+
+    return s;
+}
+
+vec4[3] load_and_unpack_stride2(uint offset)
+{
+    uvec2 packed_s[3];
+    vec4  s[3];
+
+    LOAD1(packed_s[0], src, offset);
+    LOAD1(packed_s[1], src, offset + uint(1));
+    LOAD1(packed_s[2], src, offset + uint(2));
+
+    s[0] = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y));
+    s[1] = vec4(unpackHalf2x16(packed_s[1].x), unpackHalf2x16(packed_s[1].y));
+    s[2] = vec4(unpackHalf2x16(packed_s[2].x), unpackHalf2x16(packed_s[2].y));
+
+    return s;
+}
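+// Each uvec2 element of the src buffer packs four FP16 values, expanded
+// with one unpackHalf2x16 per 32-bit word: stride 1 therefore unpacks 8
+// input values (two loads) and stride 2 unpacks 12 (three loads).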
+
+/** An optimized direct convolution 3x3 OpenGL ES compute shader that processes 4 elements at once
+ *
+ * @note This OpenGL ES shader works with stride_x = 1 and 2
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
+ * @note If biases are used then "#define BIAS" has to be passed at compile time
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F16
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Supported data types: same as @p src_ptr
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ * @param[in]  weights_depth                         The third dimension of the weights tensor
+ */
+void main()
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT_FP16(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP_FP16(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+
+#ifdef BIAS
+    Vector   biases  = CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(biases);
+#endif /* BIAS */
+
+    uvec2 packed_d;
+
+    vec4 pixels = vec4(0);
+
+    uint z_index = gl_GlobalInvocationID.z;
+
+    weights.current_offset += z_index * weights_stride_w;
+
+    for(int d = 0; d < int(weights_depth); ++d)
+    {
+        // load the 3 weight rows at once
+        uvec2 packed_w[3];
+
+        LOAD2(packed_w[0], weights, tensor3D_offset_fp16(weights, 0, 0, 0) >> 2);
+        LOAD2(packed_w[1], weights, tensor3D_offset_fp16(weights, 0, 1, 0) >> 2);
+        LOAD2(packed_w[2], weights, tensor3D_offset_fp16(weights, 0, 2, 0) >> 2);
+
+        vec3 w[3];
+        w[0] = vec3(unpackHalf2x16(packed_w[0].x), unpackHalf2x16(packed_w[0].y).x);
+        w[1] = vec3(unpackHalf2x16(packed_w[1].x), unpackHalf2x16(packed_w[1].y).x);
+        w[2] = vec3(unpackHalf2x16(packed_w[2].x), unpackHalf2x16(packed_w[2].y).x);
+
+#if STRIDE_X == 2
+        vec4 s[3];
+#elif STRIDE_X == 1 /* STRIDE_X == 1 */
+        vec4 s[2];
+#else               /* STRIDE_X not equal to 1 or 2 */
+#error STRIDE_X larger than 2 is not supported
+#endif /* STRIDE_X == 2 */
+        uint offset;
+        // first line
+        offset = src.current_offset >> uint(3);
+        s      = LOAD_AND_UNPACK(offset);
+
+        pixels += CONVOLVE1x3(s, w[0]);
+
+        // second line
+        offset = (src.current_offset + src_stride_y) >> uint(3);
+        s      = LOAD_AND_UNPACK(offset);
+
+        pixels += CONVOLVE1x3(s, w[1]);
+
+        // third line
+        offset = (src.current_offset + (src_stride_y << 1)) >> uint(3);
+        s      = LOAD_AND_UNPACK(offset);
+
+        pixels += CONVOLVE1x3(s, w[2]);
+
+        src.current_offset += src_stride_z;
+        weights.current_offset += weights_stride_z;
+    }
+
+#ifdef BIAS
+    uint  packed_b;
+    float b;
+    LOAD1(packed_b, biases, vector_offset_fp16(biases, int(z_index)) >> 2);
+
+    if(z_index % uint(2) == uint(0))
+    {
+        b = unpackHalf2x16(packed_b).x;
+    }
+    else
+    {
+        b = unpackHalf2x16(packed_b).y;
+    }
+
+    pixels += vec4(b);
+#endif /* BIAS */
+
+    packed_d = uvec2(packHalf2x16(pixels.xy), packHalf2x16(pixels.zw));
+    STORE1(dst, dst.current_offset >> uint(3), packed_d);
+}
+#elif defined(PROCESS_X_4ELEMENTS_Y_3ELEMENTS_FP16)
+precision mediump float;
+
+BUFFER_DECLARATION(src, 1, uvec2, readonly);
+BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
+BUFFER_DECLARATION(weights, 3, uint, readonly);
+#ifdef BIAS
+BUFFER_DECLARATION(biases, 4, uint, readonly);
+#endif /* BIAS */
+
+#define CONVOLVE1x3(s, w) convolve1x3_stride1(s, w)
+
+vec4 convolve1x3_stride1(vec4 tmp[2], vec3 w)
+{
+    vec4 middle;
+    vec4 right;
+    vec4 r;
+
+    middle = vec4(tmp[0].yzw, tmp[1].x);
+    right  = vec4(tmp[0].zw, tmp[1].xy);
+
+    r = tmp[0] * w[0] + middle * w[1] + right * w[2];
+
+    return r;
+}
+
+vec4[2] load_and_unpack(uint offset)
+{
+    uvec2 packed_s[2];
+    vec4  s[2];
+
+    LOAD1(packed_s[0], src, offset);
+    LOAD1(packed_s[1], src, offset + uint(1));
+
+    s[0] = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y));
+    s[1] = vec4(unpackHalf2x16(packed_s[1].x), unpackHalf2x16(packed_s[1].y));
+
+    return s;
+}
+
+/** An optimized direct convolution 3x3 OpenGL ES compute shader that processes 4x3 elements at once
+ *
+ * @note This OpenGL ES shader works with stride_x = 1
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
+ * @note If biases are used then "#define BIAS" has to be passed at compile time
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F16
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Supported data types: same as @p src_ptr
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ * @param[in]  weights_depth                         The third dimension of the weights tensor
+ */
+void main()
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT_FP16(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP_FP16(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+
+#ifdef BIAS
+    Vector   biases  = CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(biases);
+#endif /* BIAS */
+
+    uvec2 packed_d;
+
+    vec4 pixels[3];
+    int  i;
+
+    for(i = 0; i < 3; i++)
+    {
+        pixels[i] = vec4(0);
+    }
+
+    uint z_index = gl_GlobalInvocationID.z;
+
+    weights.current_offset += z_index * weights_stride_w;
+
+    for(int d = 0; d < int(weights_depth); ++d)
+    {
+        // load the 3 weight rows at once
+        uvec2 packed_w[3];
+
+        LOAD2(packed_w[0], weights, tensor3D_offset_fp16(weights, 0, 0, 0) >> 2);
+        LOAD2(packed_w[1], weights, tensor3D_offset_fp16(weights, 0, 1, 0) >> 2);
+        LOAD2(packed_w[2], weights, tensor3D_offset_fp16(weights, 0, 2, 0) >> 2);
+
+        vec3 w[3];
+        w[0] = vec3(unpackHalf2x16(packed_w[0].x), unpackHalf2x16(packed_w[0].y).x);
+        w[1] = vec3(unpackHalf2x16(packed_w[1].x), unpackHalf2x16(packed_w[1].y).x);
+        w[2] = vec3(unpackHalf2x16(packed_w[2].x), unpackHalf2x16(packed_w[2].y).x);
+
+        vec4 s[2];
+        uint offset;
+        // first line
+        offset = src.current_offset >> uint(3);
+        s      = load_and_unpack(offset);
+
+        pixels[0] += CONVOLVE1x3(s, w[0]);
+
+        // second line
+        offset = (src.current_offset + src_stride_y) >> uint(3);
+        s      = load_and_unpack(offset);
+
+        pixels[0] += CONVOLVE1x3(s, w[1]);
+        pixels[1] += CONVOLVE1x3(s, w[0]);
+
+        // third line
+        offset = (src.current_offset + (src_stride_y << 1)) >> uint(3);
+        s      = load_and_unpack(offset);
+
+        pixels[0] += CONVOLVE1x3(s, w[2]);
+        pixels[1] += CONVOLVE1x3(s, w[1]);
+        pixels[2] += CONVOLVE1x3(s, w[0]);
+
+        // fourth line
+        offset = (src.current_offset + uint(3) * (src_stride_y)) >> uint(3);
+        s      = load_and_unpack(offset);
+
+        pixels[1] += CONVOLVE1x3(s, w[2]);
+        pixels[2] += CONVOLVE1x3(s, w[1]);
+
+        // fifth line
+        offset = (src.current_offset + (src_stride_y << 2)) >> uint(3);
+        s      = load_and_unpack(offset);
+
+        pixels[2] += CONVOLVE1x3(s, w[2]);
+
+        src.current_offset += src_stride_z;
+        weights.current_offset += weights_stride_z;
+    }
+
+#ifdef BIAS
+    uint  packed_b;
+    float b;
+    LOAD1(packed_b, biases, vector_offset_fp16(biases, int(z_index)) >> 2);
+
+    if(z_index % uint(2) == uint(0))
+    {
+        b = unpackHalf2x16(packed_b).x;
+    }
+    else
+    {
+        b = unpackHalf2x16(packed_b).y;
+    }
+
+    for(i = 0; i < 3; i++)
+    {
+        pixels[i] += vec4(b);
+    }
+#endif /* BIAS */
+
+    packed_d = uvec2(packHalf2x16(pixels[0].xy), packHalf2x16(pixels[0].zw));
+    STORE1(dst, dst.current_offset >> uint(3), packed_d);
+
+    packed_d = uvec2(packHalf2x16(pixels[1].xy), packHalf2x16(pixels[1].zw));
+    STORE1(dst, (dst.current_offset + dst_stride_y) >> uint(3), packed_d);
+
+    packed_d = uvec2(packHalf2x16(pixels[2].xy), packHalf2x16(pixels[2].zw));
+    STORE1(dst, (dst.current_offset + (dst_stride_y << 1)) >> uint(3), packed_d);
+}
+#elif defined(PROCESS_X_4ELEMENTS_Y_4ELEMENTS_FP16)
+precision mediump float;
+
+BUFFER_DECLARATION(src, 1, uvec2, readonly);
+BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
+BUFFER_DECLARATION(weights, 3, uint, readonly);
+#ifdef BIAS
+BUFFER_DECLARATION(biases, 4, uint, readonly);
+#endif /* BIAS */
+
+#define CONVOLVE1x3(s, w) convolve1x3_stride1(s, w)
+
+vec4 convolve1x3_stride1(vec4 tmp[2], vec3 w)
+{
+    vec4 middle;
+    vec4 right;
+    vec4 r;
+
+    middle = vec4(tmp[0].yzw, tmp[1].x);
+    right  = vec4(tmp[0].zw, tmp[1].xy);
+
+    r = tmp[0] * w[0] + middle * w[1] + right * w[2];
+
+    return r;
+}
+
+vec4[2] load_and_unpack(uint offset)
+{
+    uvec2 packed_s[2];
+    vec4  s[2];
+
+    LOAD1(packed_s[0], src, offset);
+    LOAD1(packed_s[1], src, offset + uint(1));
+
+    s[0] = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y));
+    s[1] = vec4(unpackHalf2x16(packed_s[1].x), unpackHalf2x16(packed_s[1].y));
+
+    return s;
+}
+
+/** An optimized direct convolution 3x3 OpenGL ES compute shader that processes 4x4 elements at once
+ *
+ * @note This OpenGL ES shader works with stride_x = 1
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
+ * @note If biases are used then "#define BIAS" has to be passed at compile time
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F16
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Supported data types: same as @p src_ptr
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ * @param[in]  weights_depth                         The third dimension of the weights tensor
+ */
+void main()
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT_FP16(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP_FP16(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+
+#ifdef BIAS
+    Vector   biases  = CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(biases);
+#endif /* BIAS */
+
+    uvec2 packed_d;
+
+    vec4 pixels[4];
+    int  i;
+
+    for(i = 0; i < 4; i++)
+    {
+        pixels[i] = vec4(0);
+    }
+
+    uint z_index = gl_GlobalInvocationID.z;
+
+    weights.current_offset += z_index * weights_stride_w;
+
+    for(int d = 0; d < int(weights_depth); ++d)
+    {
+        // load the 3 weight rows at once
+        uvec2 packed_w[3];
+
+        LOAD2(packed_w[0], weights, tensor3D_offset_fp16(weights, 0, 0, 0) >> 2);
+        LOAD2(packed_w[1], weights, tensor3D_offset_fp16(weights, 0, 1, 0) >> 2);
+        LOAD2(packed_w[2], weights, tensor3D_offset_fp16(weights, 0, 2, 0) >> 2);
+
+        vec3 w[3];
+        w[0] = vec3(unpackHalf2x16(packed_w[0].x), unpackHalf2x16(packed_w[0].y).x);
+        w[1] = vec3(unpackHalf2x16(packed_w[1].x), unpackHalf2x16(packed_w[1].y).x);
+        w[2] = vec3(unpackHalf2x16(packed_w[2].x), unpackHalf2x16(packed_w[2].y).x);
+
+        vec4 s[2];
+        uint offset;
+        // first line
+        offset = src.current_offset >> uint(3);
+        s      = load_and_unpack(offset);
+
+        pixels[0] += CONVOLVE1x3(s, w[0]);
+
+        // second line
+        offset = (src.current_offset + src_stride_y) >> uint(3);
+        s      = load_and_unpack(offset);
+
+        pixels[0] += CONVOLVE1x3(s, w[1]);
+        pixels[1] += CONVOLVE1x3(s, w[0]);
+
+        // third line
+        offset = (src.current_offset + (src_stride_y << 1)) >> uint(3);
+        s      = load_and_unpack(offset);
+
+        pixels[0] += CONVOLVE1x3(s, w[2]);
+        pixels[1] += CONVOLVE1x3(s, w[1]);
+        pixels[2] += CONVOLVE1x3(s, w[0]);
+
+        // fourth line
+        offset = (src.current_offset + uint(3) * (src_stride_y)) >> uint(3);
+        s      = load_and_unpack(offset);
+
+        pixels[1] += CONVOLVE1x3(s, w[2]);
+        pixels[2] += CONVOLVE1x3(s, w[1]);
+        pixels[3] += CONVOLVE1x3(s, w[0]);
+
+        // fifth line
+        offset = (src.current_offset + (src_stride_y << 2)) >> uint(3);
+        s      = load_and_unpack(offset);
+
+        pixels[2] += CONVOLVE1x3(s, w[2]);
+        pixels[3] += CONVOLVE1x3(s, w[1]);
+
+        // sixth line
+        offset = (src.current_offset + uint(5) * (src_stride_y)) >> uint(3);
+        s      = load_and_unpack(offset);
+
+        pixels[3] += CONVOLVE1x3(s, w[2]);
+
+        src.current_offset += src_stride_z;
+        weights.current_offset += weights_stride_z;
+    }
+
+#ifdef BIAS
+    uint  packed_b;
+    float b;
+    LOAD1(packed_b, biases, vector_offset_fp16(biases, int(z_index)) >> 2);
+
+    if(z_index % uint(2) == uint(0))
+    {
+        b = unpackHalf2x16(packed_b).x;
+    }
+    else
+    {
+        b = unpackHalf2x16(packed_b).y;
+    }
+
+    for(i = 0; i < 4; i++)
+    {
+        pixels[i] += vec4(b);
+    }
+#endif /* BIAS */
+
+    packed_d = uvec2(packHalf2x16(pixels[0].xy), packHalf2x16(pixels[0].zw));
+    STORE1(dst, dst.current_offset >> uint(3), packed_d);
+
+    packed_d = uvec2(packHalf2x16(pixels[1].xy), packHalf2x16(pixels[1].zw));
+    STORE1(dst, (dst.current_offset + dst_stride_y) >> uint(3), packed_d);
+
+    packed_d = uvec2(packHalf2x16(pixels[2].xy), packHalf2x16(pixels[2].zw));
+    STORE1(dst, (dst.current_offset + (dst_stride_y << 1)) >> uint(3), packed_d);
+
+    packed_d = uvec2(packHalf2x16(pixels[3].xy), packHalf2x16(pixels[3].zw));
+    STORE1(dst, (dst.current_offset + uint(3) * (dst_stride_y)) >> uint(3), packed_d);
+}
+#elif defined(PROCESS_X_4ELEMENTS_Y_3ELEMENTS_Z_2ELEMENTS_FP16)
+precision mediump float;
+
+BUFFER_DECLARATION(src, 1, uvec2, readonly);
+BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
+BUFFER_DECLARATION(weights, 3, uint, readonly);
+#ifdef BIAS
+BUFFER_DECLARATION(biases, 4, uint, readonly);
+#endif /* BIAS */
+
+#define CONVOLVE1x3(s, w) convolve1x3_stride1(s, w)
+
+vec4 convolve1x3_stride1(vec4 tmp[2], vec3 w)
+{
+    vec4 middle;
+    vec4 right;
+    vec4 r;
+
+    middle = vec4(tmp[0].yzw, tmp[1].x);
+    right  = vec4(tmp[0].zw, tmp[1].xy);
+
+    r = tmp[0] * w[0] + middle * w[1] + right * w[2];
+
+    return r;
+}
+
+vec4[2] load_and_unpack(uint offset)
+{
+    uvec2 packed_s[2];
+    vec4  s[2];
+
+    LOAD1(packed_s[0], src, offset);
+    LOAD1(packed_s[1], src, offset + uint(1));
+
+    s[0] = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y));
+    s[1] = vec4(unpackHalf2x16(packed_s[1].x), unpackHalf2x16(packed_s[1].y));
+
+    return s;
+}
+
+/** An optimized direct convolution 3x3 OpenGL ES compute shader that processes 4x3x2 elements at once
+ *
+ * @note This OpenGL ES shader works with stride_x = 1
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
+ * @note If biases are used then "#define BIAS" has to be passed at compile time
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F16
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Supported data types: same as @p src_ptr
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ * @param[in]  weights_depth                         The third dimension of the weights tensor
+ */
+void main()
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT_FP16(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP_FP16(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+
+#ifdef BIAS
+    Vector   biases  = CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(biases);
+#endif /* BIAS */
+
+    uvec2 packed_d;
+
+    vec4 pixels[3];
+    int  i;
+
+    uint z_base_index = gl_GlobalInvocationID.z << 1;
+
+    // store the original src offset
+    uint s_offset = src.current_offset;
+
+    weights.current_offset += z_base_index * weights_stride_w;
+
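+    // Each invocation computes two consecutive output feature maps: src is
+    // rewound to s_offset for the second z iteration, while the weights
+    // offset simply keeps advancing, which assumes the weights of map
+    // z + 1 start right after the weights_depth slices of map z.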
+    for(int z = 0; z < 2; ++z)
+    {
+        uint z_index = z_base_index + uint(z);
+
+        src.current_offset = s_offset;
+
+        for(i = 0; i < 3; i++)
+        {
+            pixels[i] = vec4(0);
+        }
+
+        for(int d = 0; d < int(weights_depth); ++d)
+        {
+            // load the 3 weight rows at once
+            uvec2 packed_w[3];
+
+            LOAD2(packed_w[0], weights, tensor3D_offset_fp16(weights, 0, 0, 0) >> 2);
+            LOAD2(packed_w[1], weights, tensor3D_offset_fp16(weights, 0, 1, 0) >> 2);
+            LOAD2(packed_w[2], weights, tensor3D_offset_fp16(weights, 0, 2, 0) >> 2);
+
+            vec3 w[3];
+            w[0] = vec3(unpackHalf2x16(packed_w[0].x), unpackHalf2x16(packed_w[0].y).x);
+            w[1] = vec3(unpackHalf2x16(packed_w[1].x), unpackHalf2x16(packed_w[1].y).x);
+            w[2] = vec3(unpackHalf2x16(packed_w[2].x), unpackHalf2x16(packed_w[2].y).x);
+
+            vec4 s[2];
+            uint offset;
+            // first line
+            offset = src.current_offset >> uint(3);
+            s      = load_and_unpack(offset);
+
+            pixels[0] += CONVOLVE1x3(s, w[0]);
+
+            // second line
+            offset = (src.current_offset + src_stride_y) >> uint(3);
+            s      = load_and_unpack(offset);
+
+            pixels[0] += CONVOLVE1x3(s, w[1]);
+            pixels[1] += CONVOLVE1x3(s, w[0]);
+
+            // third line
+            offset = (src.current_offset + (src_stride_y << 1)) >> uint(3);
+            s      = load_and_unpack(offset);
+
+            pixels[0] += CONVOLVE1x3(s, w[2]);
+            pixels[1] += CONVOLVE1x3(s, w[1]);
+            pixels[2] += CONVOLVE1x3(s, w[0]);
+
+            // fourth line
+            offset = (src.current_offset + uint(3) * (src_stride_y)) >> uint(3);
+            s      = load_and_unpack(offset);
+
+            pixels[1] += CONVOLVE1x3(s, w[2]);
+            pixels[2] += CONVOLVE1x3(s, w[1]);
+
+            // fifth line
+            offset = (src.current_offset + (src_stride_y << 2)) >> uint(3);
+            s      = load_and_unpack(offset);
+
+            pixels[2] += CONVOLVE1x3(s, w[2]);
+
+            src.current_offset += src_stride_z;
+            weights.current_offset += weights_stride_z;
+        }
+
+#ifdef BIAS
+        uint  packed_b;
+        float b;
+        LOAD1(packed_b, biases, vector_offset_fp16(biases, int(z_index)) >> 2);
+
+        if(z_index % uint(2) == uint(0))
+        {
+            b = unpackHalf2x16(packed_b).x;
+        }
+        else
+        {
+            b = unpackHalf2x16(packed_b).y;
+        }
+
+        for(i = 0; i < 3; i++)
+        {
+            pixels[i] += vec4(b);
+        }
+#endif /* BIAS */
+
+        packed_d = uvec2(packHalf2x16(pixels[0].xy), packHalf2x16(pixels[0].zw));
+        STORE1(dst, dst.current_offset >> uint(3), packed_d);
+
+        packed_d = uvec2(packHalf2x16(pixels[1].xy), packHalf2x16(pixels[1].zw));
+        STORE1(dst, (dst.current_offset + dst_stride_y) >> uint(3), packed_d);
+
+        packed_d = uvec2(packHalf2x16(pixels[2].xy), packHalf2x16(pixels[2].zw));
+        STORE1(dst, (dst.current_offset + (dst_stride_y << 1)) >> uint(3), packed_d);
+
+        dst.current_offset += dst_stride_z;
+    }
+}
+#endif /* PROCESS_1_ELEMENT */
diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs
new file mode 100644
index 0000000..4fdbf0d
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs
@@ -0,0 +1,313 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+
+#include "helpers.h"
+
+layout(std140) uniform shader_params
+{
+    TENSOR3D_PARAM_DECLARATION(src);
+    TENSOR3D_PARAM_DECLARATION(dst);
+    TENSOR3D_PARAM_DECLARATION(weights);
+#ifdef BIAS
+    VECTOR_PARAM_DECLARATION(biases);
+#endif /* BIAS */
+    uint weights_stride_w;
+    uint weights_depth;
+};
+
+#ifdef DATA_TYPE_FP32
+
+precision highp float;
+
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(dst, 2, float, writeonly);
+BUFFER_DECLARATION(weights, 3, float, readonly);
+#ifdef BIAS
+BUFFER_DECLARATION(biases, 4, float, readonly);
+#endif /* BIAS */
+
+#define LOAD20(r, name, offset)           \
+    r[0] = LOAD4(name, offset);           \
+    r[1] = LOAD4(name, offset + uint(1)); \
+    r[2] = LOAD4(name, offset + uint(2)); \
+    r[3] = LOAD4(name, offset + uint(3)); \
+    r[4] = LOAD4(name, offset + uint(4))
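+
+// LOAD20 reads 5 consecutive 4-byte floats (20 bytes): one full 1x5 row of
+// either the source image or the current 5x5 weights plane.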
+
+/** This kernel performs a direct convolution over the lowest three dimensions.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
+ * @note If biases are used then "#define BIAS" has to be passed at compile time
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Supported data types: same as @p src_ptr
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ * @param[in]  weights_depth                         The third dimension of the weights tensor
+ */
+void main()
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#ifdef BIAS
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+#endif /* BIAS */
+
+    float pixels  = CONVERT(0, float);
+    uint  z_index = gl_GlobalInvocationID.z;
+    weights.current_offset += (z_index * weights_stride_w) >> 2;
+    float temp[5];
+    float temp_weight[5];
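+    // current_offset counts float elements here, so the byte strides
+    // (weights_stride_w, src_stride_z, weights_stride_z) are converted
+    // with a >> 2 before being added.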
+
+    for(int d = 0; d < int(weights_depth); ++d)
+    {
+        LOAD20(temp, src, offset(src, 0, 0));
+        LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 0, 0));
+        pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4];
+
+        LOAD20(temp, src, offset(src, 0, 1));
+        LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 1, 0));
+        pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4];
+
+        LOAD20(temp, src, offset(src, 0, 2));
+        LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 2, 0));
+        pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4];
+
+        LOAD20(temp, src, offset(src, 0, 3));
+        LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 3, 0));
+        pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4];
+
+        LOAD20(temp, src, offset(src, 0, 4));
+        LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 4, 0));
+        pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4];
+
+        src.current_offset += (src_stride_z >> 2);
+        weights.current_offset += (weights_stride_z >> 2);
+    }
+
+#ifdef BIAS
+    pixels += LOAD4(biases, vector_offset(biases, int(z_index)));
+#endif /* BIAS */
+
+    STORE4(dst, CURRENT_OFFSET(dst), pixels);
+}
+
+#elif defined(DATA_TYPE_FP16)
+
+precision mediump float;
+
+BUFFER_DECLARATION(src, 1, uvec2, readonly);
+BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
+BUFFER_DECLARATION(weights, 3, uint, readonly);
+#ifdef BIAS
+BUFFER_DECLARATION(biases, 4, uint, readonly);
+#endif /* BIAS */
+
+#if STRIDE_X == 1
+#define LOAD_SRC(src, row) load_src_stride1(src, row)
+#define CONVOLVE1x5(src, weight) convolve1x5_stride1(src, weight)
+#elif STRIDE_X == 2 /* STRIDE_X == 2 */
+#define LOAD_SRC(src, row) load_src_stride2(src, row)
+#define CONVOLVE1x5(src, weight) convolve1x5_stride2(src, weight)
+#else /* STRIDE_X not equal to 1 or 2 */
+#error STRIDE_X larger than 2 is not supported
+#endif /* STRIDE_X == 1 */
+
+vec4[2] load_src_stride1(Image src, int row)
+{
+    uvec2 packed[2];
+    vec4  ret[2];
+
+    GC_LOAD2_2D_OFFSET(packed, src, 0, row);
+
+    ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y));
+    ret[1] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y));
+
+    return ret;
+}
+
+vec4[3] load_src_stride2(Image src, int row)
+{
+    uvec2 packed[3];
+    vec4  ret[3];
+
+    GC_LOAD3_2D_OFFSET(packed, src, 0, row);
+
+    ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y));
+    ret[1] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y));
+    ret[2] = vec4(unpackHalf2x16(packed[2].x), unpackHalf2x16(packed[2].y));
+
+    return ret;
+}
+
+vec2[3] load_weight(Tensor3D weights, int row)
+{
+    uvec3 packed_w;
+    vec2  ret[3];
+
+    GC_LOAD3_3D_OFFSET(packed_w, weights, 0, row, 0);
+
+    ret[0] = vec2(unpackHalf2x16(packed_w[0]));
+    ret[1] = vec2(unpackHalf2x16(packed_w[1]));
+    ret[2] = vec2(unpackHalf2x16(packed_w[2]));
+
+    return ret;
+}
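+// A 1x5 FP16 weight row spans 3 packed uints (6 halves); the sixth value,
+// ret[2].y, is padding and is never referenced by the convolve helpers.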
+
+// output 4 elements per thread
+vec4 convolve1x5_stride1(vec4 tmp[2], vec2 w[3])
+{
+    vec4 src0 = tmp[0];
+    vec4 src1 = vec4(tmp[0].yzw, tmp[1].x);
+    vec4 src2 = vec4(tmp[0].zw, tmp[1].xy);
+    vec4 src3 = vec4(tmp[0].w, tmp[1].xyz);
+    vec4 src4 = tmp[1];
+    vec4 ret  = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x;
+
+    return ret;
+}
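+// With tmp = (a b c d | e f g h), src0..src4 hold the columns of four
+// sliding windows, so output n = dot(in[n..n+4], w) and one call yields
+// four stride-1 outputs of the 1x5 row convolution.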
+
+vec4 convolve1x5_stride2(vec4 tmp[3], vec2 w[3])
+{
+    vec4 src0 = vec4(tmp[0].xz, tmp[1].xz);
+    vec4 src1 = vec4(tmp[0].yw, tmp[1].yw);
+    vec4 src2 = vec4(tmp[0].z, tmp[1].xz, tmp[2].x);
+    vec4 src3 = vec4(tmp[0].w, tmp[1].yw, tmp[2].y);
+    vec4 src4 = vec4(tmp[1].x, tmp[1].z, tmp[2].xz);
+    vec4 ret  = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x;
+
+    return ret;
+}
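+// The stride-2 variant gathers even and odd lanes from the 12 loaded
+// values, so output n covers inputs 2n..2n+4 (src0 = (a c e g) supplies
+// the first tap of each of the four windows).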
+
+/** This kernel performs a direct convolution over the lowest three dimensions.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
+ * @note If biases are used then "#define BIAS" has to be passed at compile time
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F16
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Supported data types: same as @p src_ptr
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ * @param[in]  weights_depth                         The third dimension of the weights tensor
+ */
+void main()
+{
+    Image    src     = GC_CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+    Tensor3D dst     = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#ifdef BIAS
+    Vector   biases  = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+#endif /* BIAS */
+
+    vec4  res = vec4(0);
+    vec2  w[3];
+    vec4  s[STRIDE_X + 1];
+    uvec2 packed_d;
+    uint  z_index = gl_GlobalInvocationID.z;
+
+    weights.current_offset += z_index * weights_stride_w;
+
+    for(int d = 0; d < int(weights_depth); ++d)
+    {
+        for(int row = 0; row < 5; row++)
+        {
+            w = load_weight(weights, row);
+            s = LOAD_SRC(src, row);
+            res += CONVOLVE1x5(s, w);
+        }
+
+        src.current_offset += src_stride_z;
+        weights.current_offset += weights_stride_z;
+    }
+
+#ifdef BIAS
+    uint  packed_b;
+    float b;
+
+    GC_LOAD1_1D_OFFSET(packed_b, biases, z_index);
+    b = (z_index % uint(2) == uint(0)) ? unpackHalf2x16(packed_b).x : unpackHalf2x16(packed_b).y;
+    res += vec4(b);
+#endif /* BIAS */
+
+    packed_d = uvec2(packHalf2x16(res.xy), packHalf2x16(res.zw));
+    GC_STORE1_3D_OFFSET(packed_d, dst, 0, 0, 0);
+}
+
+#else /* DATA_TYPE_FP16 */
+#error Data type not supported
+#endif /* DATA_TYPE_FP16 */
diff --git a/src/core/GLES_COMPUTE/cs_shaders/dropout.cs b/src/core/GLES_COMPUTE/cs_shaders/dropout.cs
new file mode 100644
index 0000000..54e08b1
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/dropout.cs
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+
+#include "helpers.h"
+
+layout(std140) uniform shader_params
+{
+    TENSOR3D_PARAM_DECLARATION(src);
+    TENSOR3D_PARAM_DECLARATION(mask);
+    TENSOR3D_PARAM_DECLARATION(dst);
+};
+
+uint hash(uint x)
+{
+    x += (x << 10u);
+    x ^= (x >> 6u);
+    x += (x << 3u);
+    x ^= (x >> 11u);
+    x += (x << 15u);
+    return x;
+}
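+// Cheap integer mixing hash (add/xor/shift rounds) used as a stateless
+// per-invocation pseudo-random generator on the GPU.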
+
+uint hash(uvec3 v)
+{
+    return hash(v.x ^ hash(v.y) ^ hash(v.z));
+}
+
+float float_construct(uint m)
+{
+    const uint ieee_mantissa = 0x007FFFFFu;
+    const uint ieee_one      = 0x3F800000u;
+
+    m &= ieee_mantissa;
+    m |= ieee_one;
+
+    float f = uintBitsToFloat(m);
+    return f - 1.0;
+}
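+// Keeps the 23 mantissa bits of m and forces the exponent bits of 1.0,
+// producing a float uniformly distributed in [1.0, 2.0); subtracting 1.0
+// maps it onto [0.0, 1.0).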
+
+float rand(vec3 v, float seed)
+{
+    return float_construct(hash(floatBitsToUint(v + seed)));
+}
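+// rand() hashes the invocation coordinates offset by the compile-time SEED,
+// giving every element a reproducible value in [0, 1) that is compared
+// against RATIO below to build the dropout mask.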
+
+#ifdef DATA_TYPE_FP32
+
+precision highp float;
+
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(mask, 2, float, );
+BUFFER_DECLARATION(dst, 3, float, writeonly);
+
+/** Dropout is used to reduce over-fitting in neural networks.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
+ *
+ * @param[in]  src_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[in,out] mask_ptr                        Pointer to the mask tensor. Supported data types: same as @p src_ptr
+ * @param[in]  mask_stride_x                      Stride of the mask tensor in X dimension (in bytes)
+ * @param[in]  mask_step_x                        mask_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  mask_stride_y                      Stride of the mask tensor in Y dimension (in bytes)
+ * @param[in]  mask_step_y                        mask_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  mask_stride_z                      Stride of the mask tensor in Z dimension (in bytes)
+ * @param[in]  mask_step_z                        mask_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  mask_offset_first_element_in_bytes The offset of the first element in the mask tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ */
+void main(void)
+{
+    Tensor3D src  = GC_CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D mask = GC_CONVERT_TO_TENSOR3D_STRUCT(mask);
+    Tensor3D dst  = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    float random  = 0.f;
+    float inputv  = 0.f;
+    float maskv   = 0.f;
+    float outputv = 0.f;
+
+#ifdef FORWARD
+    random = rand(vec3(gl_GlobalInvocationID.xyz), SEED);
+    maskv  = (random > RATIO) ? 1.f : 0.f;
+    GC_STORE1_3D_OFFSET(maskv, mask, 0, 0, 0);
+#else  /* FORWARD */
+    GC_LOAD1_3D_OFFSET(maskv, mask, 0, 0, 0);
+#endif /* FORWARD */
+
+    GC_LOAD1_3D_OFFSET(inputv, src, 0, 0, 0);
+    outputv = maskv * inputv * float(SCALE);
+    GC_STORE1_3D_OFFSET(outputv, dst, 0, 0, 0);
+}
+
+#elif defined(DATA_TYPE_FP16)
+
+precision mediump float;
+
+BUFFER_DECLARATION(src, 1, uint, readonly);
+BUFFER_DECLARATION(mask, 2, uint, );
+BUFFER_DECLARATION(dst, 3, uint, writeonly);
+
+/** Dropout is used to reduce over-fitting in neural networks.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
+ *
+ * @param[in]  src_ptr                            Pointer to the source tensor. Supported data types: F16
+ * @param[in]  src_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] mask_ptr                           Pointer to the mask tensor. Supported data types: same as @p src_ptr
+ * @param[in]  mask_stride_x                      Stride of the mask tensor in X dimension (in bytes)
+ * @param[in]  mask_step_x                        mask_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  mask_stride_y                      Stride of the mask tensor in Y dimension (in bytes)
+ * @param[in]  mask_step_y                        mask_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  mask_stride_z                      Stride of the mask tensor in Z dimension (in bytes)
+ * @param[in]  mask_step_z                        mask_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  mask_offset_first_element_in_bytes The offset of the first element in the mask tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ */
+void main(void)
+{
+    Tensor3D src  = GC_CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D mask = GC_CONVERT_TO_TENSOR3D_STRUCT(mask);
+    Tensor3D dst  = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    float random1    = 0.f;
+    float random2    = 0.f;
+    uint  inputv     = uint(0);
+    uint  outputv    = uint(0);
+    uint  maskv      = uint(0);
+    vec2  input_vec  = vec2(0, 0);
+    vec2  output_vec = vec2(0, 0);
+    vec2  mask_vec   = vec2(0, 0);
+
+#ifdef FORWARD
+    random1          = rand(vec3(gl_GlobalInvocationID.xyz), SEED);
+    random2          = rand(vec3(float(gl_GlobalInvocationID.x) + 0.5f, gl_GlobalInvocationID.yz), SEED);
+    mask_vec.x       = (random1 > RATIO) ? 1.f : 0.f;
+    mask_vec.y       = (random2 > RATIO) ? 1.f : 0.f;
+    maskv            = packHalf2x16(mask_vec);
+    GC_STORE1_3D_OFFSET(maskv, mask, 0, 0, 0);
+#else  /* FORWARD */
+    GC_LOAD1_3D_OFFSET(maskv, mask, 0, 0, 0);
+    mask_vec = unpackHalf2x16(maskv);
+#endif /* FORWARD */
+
+    GC_LOAD1_3D_OFFSET(inputv, src, 0, 0, 0);
+
+    input_vec  = unpackHalf2x16(inputv);
+    output_vec = mask_vec * input_vec * float(SCALE);
+    outputv    = packHalf2x16(output_vec);
+
+    GC_STORE1_3D_OFFSET(outputv, dst, 0, 0, 0);
+}
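+// FP16 packing note: each 32-bit load carries two packed half floats, so every
+// invocation processes a pair of elements; two independent random draws (the
+// second offset by 0.5 in x) mask the two lanes independently before the pair
+// is repacked with packHalf2x16.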
+
+#else /* DATA_TYPE_FP32 */
+
+#endif /* DATA_TYPE_FP32 */
diff --git a/src/core/GLES_COMPUTE/cs_shaders/fill_border.cs b/src/core/GLES_COMPUTE/cs_shaders/fill_border.cs
new file mode 100644
index 0000000..01a3986
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/fill_border.cs
@@ -0,0 +1,553 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+#include "helpers.h"
+
+#if defined(DATA_TYPE_FP32)
+#ifdef FILL_IMAGE_BORDERS_REPLICATE
+BUFFER_DECLARATION(buf, 1, float, restrict);
+layout(std140) uniform shader_params
+{
+    TENSOR3D_PARAM_DECLARATION(buf);
+    uint width;
+    uint height;
+    int  start_pos_x;
+    int  start_pos_y;
+};
+
+/** Fill N pixels of the padding edge of a single-channel image by replicating the closest valid pixel.
+ *
+ * @attention  The border sizes for top, bottom, left and right need to be passed at compile time.
+ * e.g. BORDER_SIZE_TOP=0 BORDER_SIZE_BOTTOM=2 BORDER_SIZE_LEFT=0 BORDER_SIZE_RIGHT=2
+ *
+ * @param[in,out] buf_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in]     buf_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]     buf_step_x                        buf_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     buf_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]     buf_step_y                        buf_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     buf_stride_z                      Stride between images if batching images (in bytes)
+ * @param[in]     buf_step_z                        buf_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     buf_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in]     width                             Width of the valid region of the image
+ * @param[in]     height                            Height of the valid region of the image
+ * @param[in]     start_pos_x                       X coordinate indicating the start point of the valid region
+ * @param[in]     start_pos_y                       Y coordinate indicating the start point of the valid region
+ */
+void main()
+{
+    Image buf = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(buf);
+
+    // Update pointer to point to the starting point of the valid region
+    buf.current_offset = uint(int(buf.current_offset) + ((start_pos_y * int(buf_stride_y) + start_pos_x * int(buf_stride_x)) >> 2));
+
+    int total_width = BORDER_SIZE_LEFT + int(width) + BORDER_SIZE_RIGHT;
+    int gid0        = int(gl_GlobalInvocationID.x);
+    int gidH        = gid0 - total_width;
+    int gidW        = gid0 - BORDER_SIZE_LEFT;
+
+    if(gidH >= 0)
+    {
+        // Handle left border
+        float left_val = LOAD4(buf, offset(buf, 0, gidH));
+        for(int i = -BORDER_SIZE_LEFT; i < 0; ++i)
+        {
+            STORE4(buf, offset(buf, i, gidH), left_val);
+        }
+        // Handle right border
+        float right_val = LOAD4(buf, offset(buf, int(width) - 1, gidH));
+        for(int i = 0; i < BORDER_SIZE_RIGHT; ++i)
+        {
+            STORE4(buf, offset(buf, int(width) + i, gidH), right_val);
+        }
+    }
+    else
+    {
+        // Get value for corners
+        int val_idx = gidW;
+        if(gidW < 0 || gidW > (int(width) - 1))
+        {
+            val_idx = gidW < 0 ? 0 : int(width) - 1;
+        }
+
+        // Handle top border
+        float top_val = LOAD4(buf, offset(buf, val_idx, 0));
+        for(int i = -BORDER_SIZE_TOP; i < 0; ++i)
+        {
+            STORE4(buf, offset(buf, gidW, i), top_val);
+        }
+        // Handle bottom border
+        float bottom_val = LOAD4(buf, offset(buf, val_idx, int(height) - 1));
+        for(int i = 0; i < BORDER_SIZE_BOTTOM; ++i)
+        {
+            STORE4(buf, offset(buf, gidW, int(height) + i), bottom_val);
+        }
+    }
+}
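+// Worked example (a sketch, assuming BORDER_SIZE_LEFT=2, width=4,
+// BORDER_SIZE_RIGHT=2): total_width is 8, so invocations with gid0 >= 8 take
+// the gidH branch and replicate the first/last pixel of row gidH into the left
+// and right borders, while gid0 < 8 fills one column of the top and bottom
+// borders, with val_idx clamped into [0, width-1] to source the corner pixels.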
+#endif /* FILL_IMAGE_BORDERS_REPLICATE */
+
+#ifdef FILL_IMAGE_BORDERS_CONSTANT
+BUFFER_DECLARATION(buf, 1, float, writeonly);
+layout(std140) uniform shader_params
+{
+    TENSOR3D_PARAM_DECLARATION(buf);
+    uint  width;
+    uint  height;
+    int   start_pos_x;
+    int   start_pos_y;
+    float constant_value;
+};
+
+/** Fill N pixels of the padding edge of a single-channel image with a constant value.
+ *
+ * @attention  The border sizes for top, bottom, left and right need to be passed at compile time.
+ * e.g. BORDER_SIZE_TOP=0 BORDER_SIZE_BOTTOM=2 BORDER_SIZE_LEFT=0 BORDER_SIZE_RIGHT=2
+ *
+ * @param[out] buf_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in]  buf_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  buf_step_x                        buf_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  buf_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  buf_step_y                        buf_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  buf_stride_z                      Stride between images if batching images (in bytes)
+ * @param[in]  buf_step_z                        buf_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  buf_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in]  width                             Width of the valid region of the image
+ * @param[in]  height                            Height of the valid region of the image
+ * @param[in]  start_pos_x                       X coordinate indicating the start point of the valid region
+ * @param[in]  start_pos_y                       Y coordinate indicating the start point of the valid region
+ * @param[in]  constant_value                    Constant value to use to fill the edges
+ */
+void main()
+{
+    Image buf = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(buf);
+
+    // Update pointer to point to the starting point of the valid region
+    buf.current_offset = uint(int(buf.current_offset) + ((start_pos_y * int(buf_stride_y) + start_pos_x * int(buf_stride_x)) >> 2));
+
+    int total_width = BORDER_SIZE_LEFT + int(width) + BORDER_SIZE_RIGHT;
+    int gid0        = int(gl_GlobalInvocationID.x);
+    int gidH        = gid0 - total_width;
+    int gidW        = gid0 - BORDER_SIZE_LEFT;
+
+    if(gidH >= 0)
+    {
+        // Handle left border
+        for(int i = -BORDER_SIZE_LEFT; i < 0; ++i)
+        {
+            STORE1(buf, offset(buf, i, gidH), constant_value);
+        }
+        // Handle right border
+        for(int i = 0; i < BORDER_SIZE_RIGHT; ++i)
+        {
+            STORE1(buf, offset(buf, int(width) + i, gidH), constant_value);
+        }
+    }
+    else
+    {
+        // Handle top border
+        for(int i = -BORDER_SIZE_TOP; i < 0; ++i)
+        {
+            STORE1(buf, offset(buf, gidW, i), constant_value);
+        }
+        // Handle bottom border
+        for(int i = 0; i < BORDER_SIZE_BOTTOM; ++i)
+        {
+            STORE1(buf, offset(buf, gidW, int(height) + i), constant_value);
+        }
+    }
+}
+#endif /* FILL_IMAGE_BORDERS_CONSTANT */
+
+#elif defined(DATA_TYPE_FP16)
+precision mediump float;
+
+#ifdef FILL_IMAGE_BORDERS_REPLICATE
+BUFFER_DECLARATION(buf, 1, uint, restrict);
+layout(std140) uniform shader_params
+{
+    TENSOR3D_PARAM_DECLARATION(buf);
+    uint width;
+    uint height;
+    int  start_pos_x;
+    int  start_pos_y;
+};
+
+void set_replicate(uint offset, int pos, uint replicate_value)
+{
+    uint packed_b;
+    LOAD1(packed_b, buf, offset);
+
+    vec2 b = unpackHalf2x16(packed_b);
+    vec2 c = unpackHalf2x16(replicate_value);
+
+    if(pos % 2 == 0)
+    {
+        b.x = c.y;
+    }
+    else
+    {
+        b.y = c.x;
+    }
+
+    packed_b = packHalf2x16(b);
+
+    STORE1(buf, offset, packed_b);
+}
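+// set_replicate is a read-modify-write helper: FP16 pixels sit two to a 32-bit
+// word, so writing a single border pixel means loading the packed word,
+// overwriting one half (the .x lane for an even element position, .y for odd),
+// then repacking and storing without disturbing the neighbouring pixel.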
+
+/** Fill N pixels of the padding edge of a single-channel image by replicating the closest valid pixel.
+ *
+ * @attention  The border sizes for top, bottom, left and right need to be passed at compile time.
+ * e.g. BORDER_SIZE_TOP=0 BORDER_SIZE_BOTTOM=2 BORDER_SIZE_LEFT=0 BORDER_SIZE_RIGHT=2
+ *
+ * @param[in,out] buf_ptr                           Pointer to the source image. Supported data types: F16
+ * @param[in]     buf_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]     buf_step_x                        buf_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     buf_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]     buf_step_y                        buf_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     buf_stride_z                      Stride between images if batching images (in bytes)
+ * @param[in]     buf_step_z                        buf_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     buf_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in]     width                             Width of the valid region of the image
+ * @param[in]     height                            Height of the valid region of the image
+ * @param[in]     start_pos_x                       X coordinate indicating the start point of the valid region
+ * @param[in]     start_pos_y                       Y coordinate indicating the start point of the valid region
+ */
+void main()
+{
+    Image buf = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP_FP16(buf);
+
+    // Update pointer to point to the starting point of the valid region
+    buf.current_offset = uint(buf.current_offset + uint(start_pos_y) * buf_stride_y + uint(start_pos_x) * buf_stride_x);
+
+    int total_width = BORDER_SIZE_LEFT + int(width) + BORDER_SIZE_RIGHT;
+    int gid0        = int(gl_GlobalInvocationID.x);
+    int gidH        = gid0 - total_width;
+    int gidW        = gid0 - BORDER_SIZE_LEFT;
+
+    if(gidH >= 0)
+    {
+        // Handle left border
+        uint left_val;
+        LOAD1(left_val, buf, offset_fp16(buf, 0, gidH) >> uint(2));
+        for(int i = -BORDER_SIZE_LEFT; i < 0; ++i)
+        {
+            uint offset = offset_fp16(buf, i, gidH) >> 2;
+            int  pos    = i + BORDER_SIZE_LEFT;
+            if(i == -1)
+            {
+                if(pos % 2 == 0)
+                {
+                    set_replicate(offset, pos, left_val);
+                }
+            }
+            else
+            {
+                if(pos % 2 == 0)
+                {
+                    vec2 a = unpackHalf2x16(left_val);
+                    uint b = packHalf2x16(a.xx);
+                    STORE1(buf, offset, b);
+                }
+            }
+        }
+        // Handle right border
+        uint right_val;
+        LOAD1(right_val, buf, offset_fp16(buf, int(width) - 1, gidH) >> uint(2));
+        for(int i = 0; i < BORDER_SIZE_RIGHT; ++i)
+        {
+            uint offset = offset_fp16(buf, int(width) + i, gidH) >> 2;
+            int  pos    = i + BORDER_SIZE_LEFT + int(width);
+
+            if(i == 0)
+            {
+                if(pos % 2 == 0)
+                {
+                    vec2 a = unpackHalf2x16(right_val);
+                    uint b = packHalf2x16(a.yy);
+                    STORE1(buf, offset, b);
+                }
+                else
+                {
+                    set_replicate(offset, pos, right_val);
+                }
+            }
+            else
+            {
+                if(pos % 2 == 0)
+                {
+                    vec2 a = unpackHalf2x16(right_val);
+                    uint b = packHalf2x16(a.yy);
+                    STORE1(buf, offset, b);
+                }
+            }
+        }
+    }
+    else
+    {
+        // Get value for corners
+        int val_idx = gidW;
+        if(gidW < 0 || (gidW > (int(width) - 1)))
+        {
+            val_idx = gidW < 0 ? 0 : (int(width) - 1);
+        }
+
+        // Handle top border
+        uint top_val;
+        LOAD1(top_val, buf, offset_fp16(buf, val_idx, 0) >> uint(2));
+        for(int i = -BORDER_SIZE_TOP; i < 0; ++i)
+        {
+            uint offset = offset_fp16(buf, gidW, i) >> 2;
+
+            if(gid0 % 2 == 0)
+            {
+                if(gidW == (int(width) - 1))
+                {
+                    vec2 a = unpackHalf2x16(top_val);
+                    uint b = packHalf2x16(a.xx);
+                    STORE1(buf, offset, b);
+                }
+                else
+                {
+                    if(gidW < 0)
+                    {
+                        vec2 a = unpackHalf2x16(top_val);
+                        uint b;
+                        if(BORDER_SIZE_LEFT % 2 == 0)
+                        {
+                            b = packHalf2x16(a.xx);
+                        }
+                        else
+                        {
+                            b = packHalf2x16(a.yy);
+                        }
+                        STORE1(buf, offset, b);
+                    }
+                    else if(gidW >= int(width))
+                    {
+                        vec2 a = unpackHalf2x16(top_val);
+                        uint b;
+                        if((BORDER_SIZE_LEFT + int(width)) % 2 == 0)
+                        {
+                            b = packHalf2x16(a.yy);
+                        }
+                        STORE1(buf, offset, b);
+                    }
+                    else
+                    {
+                        STORE1(buf, offset, top_val);
+                    }
+                }
+            }
+        }
+        // Handle bottom border
+        uint bottom_val;
+        LOAD1(bottom_val, buf, offset_fp16(buf, val_idx, int(height) - 1) >> uint(2));
+        for(int i = 0; i < BORDER_SIZE_BOTTOM; ++i)
+        {
+            uint offset = offset_fp16(buf, gidW, int(height) + i) >> 2;
+
+            if(gid0 % 2 == 0)
+            {
+                if(gidW == (int(width) - 1))
+                {
+                    vec2 a = unpackHalf2x16(bottom_val);
+                    uint b = packHalf2x16(a.xx);
+                    STORE1(buf, offset, b);
+                }
+                else
+                {
+                    if(gidW < 0)
+                    {
+                        vec2 a = unpackHalf2x16(bottom_val);
+                        uint b;
+                        if(BORDER_SIZE_LEFT % 2 == 0)
+                        {
+                            b = packHalf2x16(a.xx);
+                        }
+                        else
+                        {
+                            b = packHalf2x16(a.yy);
+                        }
+                        STORE1(buf, offset, b);
+                    }
+                    else if(gidW >= int(width))
+                    {
+                        vec2 a = unpackHalf2x16(bottom_val);
+                        uint b;
+                        if((BORDER_SIZE_LEFT + int(width)) % 2 == 0)
+                        {
+                            b = packHalf2x16(a.yy);
+                        }
+                        STORE1(buf, offset, b);
+                    }
+                    else
+                    {
+                        STORE1(buf, offset, bottom_val);
+                    }
+                }
+            }
+        }
+    }
+}
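+// In the top/bottom branch only even invocations store (gid0 % 2 == 0), since
+// each 32-bit store covers two adjacent FP16 pixels; the parities of
+// BORDER_SIZE_LEFT and BORDER_SIZE_LEFT + width decide which lane of the
+// replicated value each half of the packed word receives.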
+#endif /* FILL_IMAGE_BORDERS_REPLICATE */
+
+#ifdef FILL_IMAGE_BORDERS_CONSTANT
+BUFFER_DECLARATION(buf, 1, uint, restrict);
+
+layout(std140) uniform shader_params
+{
+    TENSOR3D_PARAM_DECLARATION(buf);
+    uint  width;
+    uint  height;
+    int   start_pos_x;
+    int   start_pos_y;
+    float constant_value;
+};
+
+void set_constant(uint offset, int pos)
+{
+    uint packed_b;
+    LOAD1(packed_b, buf, offset);
+
+    vec2 b = unpackHalf2x16(packed_b);
+
+    if(pos % 2 == 0)
+    {
+        b.x = constant_value;
+    }
+    else
+    {
+        b.y = constant_value;
+    }
+
+    packed_b = packHalf2x16(b);
+
+    STORE1(buf, offset, packed_b);
+}
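+// Same read-modify-write idea as set_replicate in the variant above, except
+// the injected half is the uniform constant_value rather than a copy of a
+// neighbouring pixel.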
+
+/** Fill N pixels of the padding edge of a single-channel image with a constant value.
+ *
+ * @attention  The border sizes for top, bottom, left and right need to be passed at compile time.
+ * e.g. BORDER_SIZE_TOP=0 BORDER_SIZE_BOTTOM=2 BORDER_SIZE_LEFT=0 BORDER_SIZE_RIGHT=2
+ *
+ * @param[out] buf_ptr                           Pointer to the source image. Supported data types: F16
+ * @param[in]  buf_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  buf_step_x                        buf_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  buf_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  buf_step_y                        buf_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  buf_stride_z                      Stride between images if batching images (in bytes)
+ * @param[in]  buf_step_z                        buf_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  buf_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in]  width                             Width of the valid region of the image
+ * @param[in]  height                            Height of the valid region of the image
+ * @param[in]  start_pos_x                       X coordinate indicating the start point of the valid region
+ * @param[in]  start_pos_y                       Y coordinate indicating the start point of the valid region
+ * @param[in]  constant_value                    Constant value to use to fill the edges
+ */
+void main()
+{
+    Image buf = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP_FP16(buf);
+
+    int total_width = BORDER_SIZE_LEFT + int(width) + BORDER_SIZE_RIGHT;
+    int gid0        = int(gl_GlobalInvocationID.x);
+    int gidH        = gid0 - total_width;
+    int gidW        = gid0 - BORDER_SIZE_LEFT;
+
+    // Update pointer to point to the starting point of the valid region
+    buf.current_offset = uint(int(buf.current_offset) + ((start_pos_y * int(buf_stride_y) + start_pos_x * int(buf_stride_x))));
+
+    vec2 b = vec2(constant_value, constant_value);
+
+    uint packed_b = packHalf2x16(b);
+
+    if(gidH >= 0)
+    {
+        // Handle left border
+        for(int i = -BORDER_SIZE_LEFT; i < 0; ++i)
+        {
+            uint offset = offset_fp16(buf, i, gidH) >> 2;
+            int  pos    = i + BORDER_SIZE_LEFT;
+
+            if(i == -1)
+            {
+                if(pos % 2 == 0)
+                {
+                    set_constant(offset, pos);
+                }
+            }
+            else
+            {
+                if(pos % 2 == 0)
+                {
+                    STORE1(buf, offset, packed_b);
+                }
+            }
+        }
+        // Handle right border
+        for(int i = 0; i < BORDER_SIZE_RIGHT; ++i)
+        {
+            uint offset = offset_fp16(buf, int(width) + i, gidH) >> 2;
+            int  pos    = i + BORDER_SIZE_LEFT + int(width);
+
+            if(i == 0)
+            {
+                if(pos % 2 == 0)
+                {
+                    STORE1(buf, offset, packed_b);
+                }
+                else
+                {
+                    set_constant(offset, pos);
+                }
+            }
+            else
+            {
+                if(pos % 2 == 0)
+                {
+                    STORE1(buf, offset, packed_b);
+                }
+            }
+        }
+    }
+    else
+    {
+        // Handle top border
+        for(int i = -BORDER_SIZE_TOP; i < 0; ++i)
+        {
+            uint offset = offset_fp16(buf, gidW, i) >> 2;
+
+            if(gid0 % 2 == 0)
+            {
+                STORE1(buf, offset, packed_b);
+            }
+        }
+        // Handle bottom border
+        for(int i = 0; i < BORDER_SIZE_BOTTOM; ++i)
+        {
+            uint offset = offset_fp16(buf, gidW, int(height) + i) >> 2;
+
+            if(gid0 % 2 == 0)
+            {
+                STORE1(buf, offset, packed_b);
+            }
+        }
+    }
+}
+#endif /* FILL_IMAGE_BORDERS_CONSTANT */
+#endif /* DATA_TYPE_FP32 */
diff --git a/src/core/GLES_COMPUTE/cs_shaders/gemm.cs b/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
new file mode 100755
index 0000000..3313b88
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
@@ -0,0 +1,623 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+#include "helpers.h"
+
+#if defined(DATA_TYPE_FP32)
+#define LOAD8(r, name, offset) \
+    r.x = LOAD4(name, offset); \
+    r.y = LOAD4(name, offset + uint(1))
+
+#define LOAD16(r, name, offset)          \
+    r.x = LOAD4(name, offset);           \
+    r.y = LOAD4(name, offset + uint(1)); \
+    r.z = LOAD4(name, offset + uint(2)); \
+    r.w = LOAD4(name, offset + uint(3))
+
+#define STORE16(name, offset, r)         \
+    STORE4(name, offset, r.x);           \
+    STORE4(name, offset + uint(1), r.y); \
+    STORE4(name, offset + uint(2), r.z); \
+    STORE4(name, offset + uint(3), r.w)
+
+#ifdef GEMM_TRANSPOSE1xW
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(dst, 2, float, writeonly);
+
+layout(std140) uniform shader_params
+{
+    IMAGE_PARAM_DECLARATION(src);
+    IMAGE_PARAM_DECLARATION(dst);
+};
+
+/** This OpenGL ES kernel computes the "vector" 1x4 transposition of the input matrix
+ *
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+void main(void)
+{
+    /* Compute address for Matrix B - source */
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    /* Compute address for Matrix B transposed - destination. X and Y are swapped */
+    uint dst_addr_in_bytes = (gl_GlobalInvocationID.y * uint(16) + gl_GlobalInvocationID.x * dst.stride_y + dst.offset_first_element_in_bytes) >> 2;
+    vec4 b0;
+    LOAD16(b0, src, offset(src, 0, 0));
+    STORE16(dst, dst_addr_in_bytes, b0);
+}
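+// Layout sketch: invocation (x, y) reads the 1x4 float block starting at
+// element 4*x of source row y and stores it contiguously at byte offset y*16
+// of output row x, i.e. matrix B is transposed at float4-block granularity,
+// which is the layout the interleaved/transposed GEMM kernel expects.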
+#endif /* GEMM_TRANSPOSE1xW */
+
+#ifdef GEMM_INTERLEAVE4x4
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(dst, 2, float, writeonly);
+
+layout(std140) uniform shader_params
+{
+    IMAGE_PARAM_DECLARATION(src);
+    IMAGE_PARAM_DECLARATION(dst);
+};
+
+/** This OpenGL ES kernel reshapes the input matrix by interleaving the values
+ *
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+void main(void)
+{
+    /* Compute source and destination addresses */
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    int i;
+    int j;
+
+    for(i = 0; i < 4; ++i)
+    {
+        for(j = 0; j < 4; ++j)
+        {
+            float res    = LOAD4(src, offset(src, i, j));
+            uint  offset0 = CURRENT_OFFSET(dst) + uint(i * 4 + j);
+            STORE4(dst, offset0, res);
+        }
+    }
+}
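+// Layout sketch: the element at column i, row j of each 4x4 tile lands at
+// dst[i*4 + j], so the four rows of a column are emitted together, which is
+// the interleaved layout consumed by the transposed matrix-multiply kernel.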
+#endif /* GEMM_INTERLEAVE4x4 */
+
+#ifdef GEMM_ACCUMULATE_BIASES
+BUFFER_DECLARATION(accum, 1, float, restrict);
+BUFFER_DECLARATION(biases, 2, float, readonly);
+
+layout(std140) uniform shader_params
+{
+    IMAGE_PARAM_DECLARATION(accum);
+    VECTOR_PARAM_DECLARATION(biases);
+};
+
+/** This kernel accumulates each row with the biases vector
+ *
+ * @param[in, out] accum_ptr                            Pointer to the accumulate tensor. Supported data type: F32
+ * @param[in]      accum_stride_x                       Stride of the accumulate tensor in X dimension (in bytes)
+ * @param[in]      accum_step_x                         accum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]      accum_stride_y                       Stride of the accumulate tensor in Y dimension (in bytes)
+ * @param[in]      accum_step_y                         accum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]      accum_offset_first_element_in_bytes  The offset of the first element in the accumulate tensor
+ * @param[in]      biases_ptr                           Pointer to the biases vector. Supported data types: same as @p accum_ptr
+ * @param[in]      biases_stride_x                      Stride of the biases vector in X dimension (in bytes)
+ * @param[in]      biases_step_x                        biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]      biases_offset_first_element_in_bytes The offset of the first element in the biases vector
+ */
+void main(void)
+{
+    Image  accum  = CONVERT_TO_IMAGE_STRUCT(accum);
+    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
+
+    for(int i = 0; i < 16; ++i)
+    {
+        float accum_value  = LOAD4(accum, CURRENT_OFFSET(accum) + uint(i));
+        float biases_value = LOAD4(biases, CURRENT_OFFSET(biases) + uint(i));
+        accum_value        = biases_value + accum_value;
+
+        // Store result in the accummulate buffer
+        STORE4(accum, CURRENT_OFFSET(accum) + uint(i), accum_value);
+    }
+}
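+// A sketch of the intent: each invocation adds 16 consecutive bias values to
+// 16 consecutive accumulator values in place; broadcasting the same biases
+// across every output row is assumed to come from the step/offset setup done
+// on the host side.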
+#endif /* GEMM_ACCUMULATE_BIASES */
+
+#ifdef GEMM_MM_INTERLEAVED_TRANSPOSED /* unvalidated */
+BUFFER_DECLARATION(src0, 1, float, readonly);
+BUFFER_DECLARATION(src1, 2, float, readonly);
+BUFFER_DECLARATION(dst, 3, float, writeonly);
+
+layout(std140) uniform shader_params
+{
+    IMAGE_PARAM_DECLARATION(src0);
+    IMAGE_PARAM_DECLARATION(src1);
+    IMAGE_PARAM_DECLARATION(dst);
+};
+
+/** This OpenGL ES kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
+ *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication.
+ *
+ * @attention The number of columns of matrix B and the value of alpha need to be passed at compile time using COLS_B and ALPHA
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ */
+void main()
+{
+    Image src0 = CONVERT_TO_IMAGE_STRUCT(src0);
+    Image src1 = CONVERT_TO_IMAGE_STRUCT(src1);
+    Image dst  = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    /* Compute address for matrix A and B */
+    src0.current_offset = (src0.offset_first_element_in_bytes + (uint(gl_GlobalInvocationID.y) * uint(src0.stride_y))) >> uint(2);
+    src1.current_offset = (src1.offset_first_element_in_bytes + (uint(gl_GlobalInvocationID.x) * uint(src1.stride_y))) >> uint(2);
+
+    /* Compute end row address for matrix B */
+    int end_row_mtx_b = int(src1.current_offset) + int(COLS_B);
+
+    /* Reset accumulators */
+    vec4 c00 = vec4(0.0f);
+    vec4 c10 = vec4(0.0f);
+    vec4 c20 = vec4(0.0f);
+    vec4 c30 = vec4(0.0f);
+
+    // FIXME: loop unrolling really needed for GLES?
+    for(; int(src1.current_offset) <= (end_row_mtx_b - 8); src0.current_offset += uint(8), src1.current_offset += uint(8))
+    {
+        /* Load values from matrix A (interleaved) and matrix B (transposed) */
+        vec4 a0;
+        vec4 b0;
+        LOAD16(a0, src0, src0.current_offset);
+        LOAD16(b0, src1, src1.current_offset);
+
+        c00 += vec4(a0.x) * b0;
+        c10 += vec4(a0.y) * b0;
+        c20 += vec4(a0.z) * b0;
+        c30 += vec4(a0.w) * b0;
+
+        /* Load values from matrix A (interleaved) and matrix B (transposed) */
+        LOAD16(a0, src0, src0.current_offset + uint(4));
+        LOAD16(b0, src1, src1.current_offset + uint(4));
+
+        c00 += vec4(a0.x) * b0;
+        c10 += vec4(a0.y) * b0;
+        c20 += vec4(a0.z) * b0;
+        c30 += vec4(a0.w) * b0;
+    }
+
+    for(; int(src1.current_offset) < end_row_mtx_b; src0.current_offset += uint(4), src1.current_offset += uint(4))
+    {
+        /* Load values from matrix A (interleaved) and matrix B (transposed) */
+        vec4 a0;
+        vec4 b0;
+        LOAD16(a0, src0, src0.current_offset);
+        LOAD16(b0, src1, src1.current_offset);
+
+        c00 += vec4(a0.x) * b0;
+        c10 += vec4(a0.y) * b0;
+        c20 += vec4(a0.z) * b0;
+        c30 += vec4(a0.w) * b0;
+    }
+
+    /* Multiply by the weight of matrix product */
+    c00 = c00 * vec4(ALPHA);
+    c10 = c10 * vec4(ALPHA);
+    c20 = c20 * vec4(ALPHA);
+    c30 = c30 * vec4(ALPHA);
+
+    /* Store 4x4 block */
+    STORE16(dst, offset(dst, 0, 0), c00);
+    STORE16(dst, offset(dst, 0, 1), c10);
+    STORE16(dst, offset(dst, 0, 2), c20);
+    STORE16(dst, offset(dst, 0, 3), c30);
+}
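+// Inner-loop sketch: at step k, a0 holds four interleaved elements of the A
+// block and b0 four elements of the transposed B block, so c00..c30 accumulate
+// a 4x4 output tile as rank-1 updates c_ij += a_i * b_j; the first loop is
+// manually unrolled by two (see the FIXME above on whether that pays off under
+// GLES).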
+#endif /* GEMM_MM_INTERLEAVED_TRANSPOSED */
+
+#ifdef GEMM_MM_FLOATING_POINT
+BUFFER_DECLARATION(src0, 1, float, readonly);
+BUFFER_DECLARATION(src1, 2, float, readonly);
+BUFFER_DECLARATION(dst, 3, float, writeonly);
+
+layout(std140) uniform shader_params
+{
+    IMAGE_PARAM_DECLARATION(src0);
+    IMAGE_PARAM_DECLARATION(src1);
+    IMAGE_PARAM_DECLARATION(dst);
+};
+
+/** This OpenGL ES kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
+ *  Matrix A and matrix B are used in their original (non-reshaped) layout
+ *
+ * @attention The width of matrix A and the value of alpha need to be passed at compile time using COLS_A and ALPHA
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ */
+void main()
+{
+    Image src0 = CONVERT_TO_IMAGE_STRUCT(src0);
+    Image src1 = CONVERT_TO_IMAGE_STRUCT(src1);
+    Image dst  = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    int idx = int(gl_GlobalInvocationID.x) * int(NUM_ELEMS_PROCESSED_PER_THREAD_X);
+    /* Compute the address for the vector A and matrix B */
+    src0.current_offset = (src0_offset_first_element_in_bytes + uint(gl_GlobalInvocationID.y) * src0_stride_y * uint(NUM_ELEMS_PROCESSED_PER_THREAD_Y)) >> uint(2);
+    src1.current_offset = (src1_offset_first_element_in_bytes + uint(idx * 4)) >> uint(2);
+
+    /* Compute end row address for matrix A */
+    int end_row_vec_a = int(src0.current_offset) + ((COLS_A * 4) >> 2);
+
+    /* Reset accumulators */
+    vec4 acc0 = vec4(0.0f);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    vec4 acc1 = vec4(0.0f);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    vec4 acc2 = vec4(0.0f);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    vec4 acc3 = vec4(0.0f);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+    for(; int(src0.current_offset) <= (end_row_vec_a - 2); src0.current_offset += uint(2), src1.current_offset += uint((2 * int(src1_stride_y)) >> 2))
+    {
+        vec2 a0;
+        LOAD8(a0, src0, src0.current_offset);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        vec2 a1;
+        LOAD8(a1, src0, src0.current_offset + (src0_stride_y >> uint(2)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        vec2 a2;
+        LOAD8(a2, src0, src0.current_offset + ((uint(2) * src0_stride_y) >> uint(2)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        vec2 a3;
+        LOAD8(a3, src0, src0.current_offset + ((uint(3) * src0_stride_y) >> uint(2)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        vec4 b0;
+        vec4 b1;
+        LOAD16(b0, src1, src1.current_offset);
+        LOAD16(b1, src1, src1.current_offset + (src1_stride_y >> uint(2)));
+
+        acc0 += b0 * vec4(a0.x);
+        acc0 += b1 * vec4(a0.y);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1 += b0 * vec4(a1.x);
+        acc1 += b1 * vec4(a1.y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2 += b0 * vec4(a2.x);
+        acc2 += b1 * vec4(a2.y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3 += b0 * vec4(a3.x);
+        acc3 += b1 * vec4(a3.y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    }
+
+    for(; int(src0.current_offset) < end_row_vec_a; src0.current_offset += uint(1), src1.current_offset += uint(int(src1_stride_y) >> 2))
+    {
+        // Load values from matrix A
+        float a0;
+        a0 = LOAD4(src0, src0.current_offset);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        float a1;
+        a1 = LOAD4(src0, src0.current_offset + ((uint(1) * src0_stride_y) >> uint(2)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        float a2;
+        a2 = LOAD4(src0, src0.current_offset + ((uint(2) * src0_stride_y) >> uint(2)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        float a3;
+        a3 = LOAD4(src0, src0.current_offset + ((uint(3) * src0_stride_y) >> uint(2)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        vec4 b0;
+        LOAD16(b0, src1, src1.current_offset);
+
+        acc0 += b0 * vec4(a0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1 += b0 * vec4(a1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2 += b0 * vec4(a2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3 += b0 * vec4(a3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    }
+
+    /* Multiply by the weight of vector-matrix product */
+    acc0 = acc0 * vec4(ALPHA);
+    STORE16(dst, offset(dst, 0, 0), acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    acc1 = acc1 * vec4(ALPHA);
+    STORE16(dst, offset(dst, 0, 1), acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    acc2 = acc2 * vec4(ALPHA);
+    STORE16(dst, offset(dst, 0, 2), acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    acc3 = acc3 * vec4(ALPHA);
+    STORE16(dst, offset(dst, 0, 3), acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+}
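+// Usage note (a sketch): NUM_ELEMS_PROCESSED_PER_THREAD_X/_Y are compile-time
+// tiling factors supplied by the host; each invocation produces one vec4-wide
+// strip for up to four consecutive output rows, and the #if ladder compiles
+// away the accumulators of any unused rows.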
+#endif /* GEMM_MM_FLOATING_POINT */
+
+#ifdef GEMM_MATRIXADDITION
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(dst, 2, float, restrict);
+
+layout(std140) uniform shader_params
+{
+    IMAGE_PARAM_DECLARATION(src);
+    IMAGE_PARAM_DECLARATION(dst);
+};
+
+/** This OpenGL ES kernel performs the in-place matrix addition between two matrices, taking into account that the second matrix might be weighted by a scalar value beta.
+ *
+ * @attention The value of beta needs to be passed at compile time using BETA
+ *
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+void main(void)
+{
+    /* Compute source and destination addresses */
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    /* Load values from A x B */
+    vec4 alpha_ab;
+    vec4 c;
+    vec4 out1;
+
+    LOAD16(alpha_ab, dst, dst.current_offset);
+    LOAD16(c, src, src.current_offset);
+
+    /* Computes alpha * axb + beta * c */
+    out1 = alpha_ab + vec4(BETA * c);
+
+    /* Store final result in axb matrix */
+    STORE16(dst, dst.current_offset, out1);
+}
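+// Worked equation: dst already holds alpha*(A*B) from the matrix-multiply
+// pass, so after this kernel dst = alpha*(A*B) + BETA*C, i.e. the full GEMM
+// update D = alpha*A*B + beta*C is completed in place.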
+#endif /* GEMM_MATRIXADDITION */
+#elif defined(DATA_TYPE_FP16)
+precision mediump float;
+#ifdef GEMM_MM_FLOATING_POINT
+BUFFER_DECLARATION(src0, 1, uint, readonly);
+BUFFER_DECLARATION(src1, 2, uvec2, readonly);
+BUFFER_DECLARATION(dst, 3, uvec2, writeonly);
+
+layout(std140) uniform shader_params
+{
+    IMAGE_PARAM_DECLARATION(src0);
+    IMAGE_PARAM_DECLARATION(src1);
+    IMAGE_PARAM_DECLARATION(dst);
+};
+
+/** This OpenGL ES kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
+ *  Matrix A and matrix B are used in their original (non-reshaped) layout
+ *
+ * @attention The width of matrix A and the value of alpha need to be passed at compile time using COLS_A and ALPHA
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ */
+void main()
+{
+    Image src0 = GC_CONVERT_TO_IMAGE_STRUCT(src0);
+    Image src1 = GC_CONVERT_TO_IMAGE_STRUCT(src1);
+    Image dst  = GC_CONVERT_TO_IMAGE_STRUCT(dst);
+
+    int idx = int(gl_GlobalInvocationID.x) * int(NUM_ELEMS_PROCESSED_PER_THREAD_X);
+    /* Compute the address for the vector A and matrix B */
+    src0.current_offset = (src0_offset_first_element_in_bytes + uint(gl_GlobalInvocationID.y) * src0_stride_y * uint(NUM_ELEMS_PROCESSED_PER_THREAD_Y));
+    src1.current_offset = src1_offset_first_element_in_bytes + uint(idx) * src1_stride_x;
+
+    /* Compute end row address for matrix A */
+    uint end_row_vec_a = src0.current_offset + uint(COLS_A << 1);
+
+    /* Reset accumulators */
+    vec4 acc0 = vec4(0.0f);
+
+    for(; src0.current_offset < (end_row_vec_a - uint(2)); src0.current_offset += uint(2 * 2), src1.current_offset += uint(2) * src1_stride_y)
+    {
+        uint packed_a0;
+        vec2 a0;
+
+        GC_LOAD1_2D_OFFSET(packed_a0, src0, 0, 0);
+        a0 = vec2(unpackHalf2x16(packed_a0));
+
+        uvec2 packed_b0;
+        uvec2 packed_b1;
+        vec4  b0;
+        vec4  b1;
+
+        GC_LOAD1_2D_OFFSET(packed_b0, src1, 0, 0);
+        GC_LOAD1_2D_OFFSET(packed_b1, src1, 0, 1);
+
+        b0 = vec4(unpackHalf2x16(packed_b0.x), unpackHalf2x16(packed_b0.y));
+        b1 = vec4(unpackHalf2x16(packed_b1.x), unpackHalf2x16(packed_b1.y));
+
+        acc0 += b0 * vec4(a0.x);
+        acc0 += b1 * vec4(a0.y);
+    }
+
+    for(; src0.current_offset < end_row_vec_a; src0.current_offset += uint(2 * 2), src1.current_offset += src1_stride_y)
+    {
+        uint packed_a0;
+        vec2 a0;
+
+        GC_LOAD1_2D_OFFSET(packed_a0, src0, 0, 0);
+        a0 = vec2(unpackHalf2x16(packed_a0));
+
+        uvec2 packed_b0;
+        vec4  b0;
+
+        GC_LOAD1_2D_OFFSET(packed_b0, src1, 0, 0);
+
+        b0 = vec4(unpackHalf2x16(packed_b0.x), unpackHalf2x16(packed_b0.y));
+
+        acc0 += b0 * (a0.x);
+    }
+
+    /* Multiply by the weight of vector-matrix product */
+    acc0 = acc0 * vec4(ALPHA);
+
+    uvec2 packed_d;
+    packed_d = uvec2(packHalf2x16(acc0.xy), packHalf2x16(acc0.zw));
+    GC_STORE1_2D_OFFSET(packed_d, dst, 0, 0);
+}
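+// FP16 addressing note: unlike the FP32 path, current_offset here counts
+// bytes, so A advances by uint(2 * 2) (two half-float elements) per step and B
+// by whole rows via src1_stride_y; each uvec2 of B unpacks to a vec4 of
+// halves.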
+#endif /* GEMM_MM_FLOATING_POINT */
+
+#ifdef GEMM_ACCUMULATE_BIASES
+BUFFER_DECLARATION(accum, 1, uvec2, restrict);
+BUFFER_DECLARATION(biases, 2, uvec2, readonly);
+
+layout(std140) uniform shader_params
+{
+    IMAGE_PARAM_DECLARATION(accum);
+    VECTOR_PARAM_DECLARATION(biases);
+};
+
+/** This kernel accumulates each row with the biases vector
+ *
+ * @param[in, out] accum_ptr                            Pointer to the accumulate tensor. Supported data type: F16
+ * @param[in]      accum_stride_x                       Stride of the accumulate tensor in X dimension (in bytes)
+ * @param[in]      accum_step_x                         accum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]      accum_stride_y                       Stride of the accumulate tensor in Y dimension (in bytes)
+ * @param[in]      accum_step_y                         accum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]      accum_offset_first_element_in_bytes  The offset of the first element in the accumulate tensor
+ * @param[in]      biases_ptr                           Pointer to the biases vector. Supported data types: same as @p accum_ptr
+ * @param[in]      biases_stride_x                      Stride of the biases vector in X dimension (in bytes)
+ * @param[in]      biases_step_x                        biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]      biases_offset_first_element_in_bytes The offset of the first element in the biases vector
+ */
+void main(void)
+{
+    Image  accum  = GC_CONVERT_TO_IMAGE_STRUCT(accum);
+    Vector biases = GC_CONVERT_TO_VECTOR_STRUCT(biases);
+
+    vec4  u[2];
+    uvec2 packed_s[2];
+    GC_LOAD1_2D_OFFSET(packed_s[0], accum, 0, 0);
+    GC_LOAD1_1D_OFFSET(packed_s[1], biases, 0);
+    u[0] = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y));
+    u[1] = vec4(unpackHalf2x16(packed_s[1].x), unpackHalf2x16(packed_s[1].y));
+
+    vec4 tmp;
+    tmp         = u[0] + u[1];
+    packed_s[0] = uvec2(packHalf2x16(tmp.xy), packHalf2x16(tmp.zw));
+    GC_STORE1_2D_OFFSET(packed_s[0], accum, 0, 0);
+}
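+// As in the FP32 variant, but four FP16 values travel per uvec2: both packed
+// words are unpacked, the biases are added in float precision, and the result
+// is repacked and stored in place.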
+#endif /* GEMM_ACCUMULATE_BIASES */
+#else  /* DATA_TYPE_FP32 */
+#error Data type not supported
+#endif /* DATA_TYPE_FP32 */
diff --git a/src/core/GLES_COMPUTE/cs_shaders/helpers.h b/src/core/GLES_COMPUTE/cs_shaders/helpers.h
new file mode 100644
index 0000000..86dedf5
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/helpers.h
@@ -0,0 +1,582 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT(x, type) type(x)
+
+#define PACK(value, stype, dtype) \
+    pack_##stype##_##dtype(value)
+
+#define UNPACK(value, stype, dtype) \
+    unpack_##stype##_##dtype(value)
+
+#define BUFFER_DECLARATION(name, location, type, access)          \
+    layout(std430, binding = location) access buffer name##Buffer \
+    {                                                             \
+        type name##_ptr[];                                        \
+    }
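+
+// For illustration: BUFFER_DECLARATION(src, 1, float, readonly); expands to
+//   layout(std430, binding = 1) readonly buffer srcBuffer
+//   {
+//       float src_ptr[];
+//   };
+// i.e. each declared buffer is exposed to the kernels as a flat name##_ptr[] array.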
+
+#define VECTOR_PARAM_DECLARATION(name)         \
+    uint name##_stride_x;                      \
+    uint name##_step_x;                        \
+    uint name##_offset_first_element_in_bytes; \
+    uint name##_buffer_data_type_size
+
+#define IMAGE_PARAM_DECLARATION(name)          \
+    uint name##_stride_x;                      \
+    uint name##_step_x;                        \
+    uint name##_stride_y;                      \
+    uint name##_step_y;                        \
+    uint name##_offset_first_element_in_bytes; \
+    uint name##_buffer_data_type_size
+
+#define TENSOR3D_PARAM_DECLARATION(name)       \
+    uint name##_stride_x;                      \
+    uint name##_step_x;                        \
+    uint name##_stride_y;                      \
+    uint name##_step_y;                        \
+    uint name##_stride_z;                      \
+    uint name##_step_z;                        \
+    uint name##_offset_first_element_in_bytes; \
+    uint name##_buffer_data_type_size
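+
+// For illustration: inside the std140 uniform block, IMAGE_PARAM_DECLARATION(accum); expands to
+//   uint accum_stride_x;
+//   uint accum_step_x;
+//   uint accum_stride_y;
+//   uint accum_step_y;
+//   uint accum_offset_first_element_in_bytes;
+//   uint accum_buffer_data_type_size;
+// so every tensor argument contributes its strides and steps (in bytes) plus bookkeeping fields.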
+
+/** Structure to hold Vector information */
+struct Vector
+{
+    uint current_offset;                /**< Current offset of vector */
+    uint offset_first_element_in_bytes; /**< The offset of the first element in the source vector */
+    uint stride_x;                      /**< Stride of the vector in X dimension (in bytes) */
+};
+
+/** Structure to hold Image information */
+struct Image
+{
+    uint current_offset;                /**< Current offset of image */
+    uint offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    uint stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    uint stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+};
+
+/** Structure to hold 3D tensor information */
+struct Tensor3D
+{
+    uint current_offset;                /**< Current offset of tensor */
+    uint offset_first_element_in_bytes; /**< The offset of the first element in the source tensor */
+    uint stride_x;                      /**< Stride of the tensor in X dimension (in bytes) */
+    uint stride_y;                      /**< Stride of the tensor in Y dimension (in bytes) */
+    uint stride_z;                      /**< Stride of the tensor in Z dimension (in bytes) */
+};
+
+/////////////////////////////////////////////////////////////
+// TODO: Old helpers below, to be removed
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_FP16(name) \
+    update_vector_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0))
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(name) \
+    update_vector_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, uint(0))
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_FP16(name) \
+    update_image_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0))
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP_FP16(name) \
+    update_image_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0))
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0), name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP_FP16(name) \
+    update_image_from_tensor3D_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0), name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_FP16(name) \
+    update_image_from_tensor3D_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                  \
+    update_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                    name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_FP16(name)                                                                                                  \
+    update_tensor3D_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                         name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0), name##_stride_z, uint(0))
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP_FP16(name) \
+    update_tensor3D_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0), name##_stride_z, uint(0))
+
+// FIXME: Redesign the macros if different data types are supported.
+#define LOAD4(name, offset) \
+    name##_ptr[offset]
+
+#define STORE4(name, offset, value) \
+    name##_ptr[offset] = value
+
+// Load 1 element, whose size is determined by the SSBO type.
+#define LOAD1(r, name, offset) \
+    r = name##_ptr[offset]
+
+#define STORE1(name, offset, value) \
+    name##_ptr[offset] = value
+
+#define LOAD2(r, name, offset) \
+    LOAD1(r[0], name, offset); \
+    LOAD1(r[1], name, (offset) + uint(1))
+
+#define STORE2(name, offset, value)            \
+    name##_ptr[offset]             = value[0]; \
+    name##_ptr[(offset) + uint(1)] = value[1]
+
+#define LOAD3(r, name, offset)             \
+    LOAD1(r[0], name, offset);             \
+    LOAD1(r[1], name, (offset) + uint(1)); \
+    LOAD1(r[2], name, (offset) + uint(2))
+
+#define CURRENT_OFFSET(name) \
+    name.current_offset
+
+/** Wrap vector information into a Vector structure, and set the offset to this workitem's position.
+ *
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return A vector object
+ */
+Vector update_vector_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector;
+    vector.offset_first_element_in_bytes = offset_first_element_in_bytes;
+    vector.stride_x                      = stride_x;
+    vector.current_offset                = (vector.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x) >> 2;
+
+    return vector;
+}
+
+/** Wrap vector information into a Vector structure, and set the offset to this workitem's position.
+ *
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return A vector object
+ */
+Vector update_vector_workitem_offset_fp16(uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector;
+    vector.offset_first_element_in_bytes = offset_first_element_in_bytes;
+    vector.stride_x                      = stride_x;
+    vector.current_offset                = vector.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x;
+
+    return vector;
+}
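+
+// Note the difference between the two variants above: the non-FP16 helper stores
+// current_offset as a 32-bit word index (byte offset >> 2, matching float/uint SSBOs),
+// while the _fp16 helper keeps a plain byte offset and lets the load/store macros do the
+// final conversion. For example, with offset_first_element_in_bytes = 16 and step_x = 4,
+// work item x = 2 yields current_offset = 6 (words) in the first case and 24 (bytes) in
+// the second.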
+
+/** Wrap image information into an Image structure, and set the offset to this workitem's position.
+ *
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+Image update_image_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img;
+    img.offset_first_element_in_bytes = offset_first_element_in_bytes;
+    img.stride_x                      = stride_x;
+    img.stride_y                      = stride_y;
+    img.current_offset                = (img.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y) >> 2;
+
+    return img;
+}
+
+/** Wrap image information into an Image structure, and set the offset to this workitem's position.
+ *
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+Image update_image_workitem_offset_fp16(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img;
+    img.offset_first_element_in_bytes = offset_first_element_in_bytes;
+    img.stride_x                      = stride_x;
+    img.stride_y                      = stride_y;
+    img.current_offset                = img.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y;
+
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and set the offset to this workitem's position.
+ *
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 2D Image object
+ */
+Image update_image_from_tensor3D_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img;
+    img.offset_first_element_in_bytes = offset_first_element_in_bytes;
+    img.stride_x                      = stride_x;
+    img.stride_y                      = stride_y;
+    img.current_offset                = (img.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y + gl_GlobalInvocationID.z * step_z) >> 2;
+
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and set the offset to this workitem's position.
+ *
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 2D Image object
+ */
+Image update_image_from_tensor3D_workitem_offset_fp16(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img;
+    img.offset_first_element_in_bytes = offset_first_element_in_bytes;
+    img.stride_x                      = stride_x;
+    img.stride_y                      = stride_y;
+    img.current_offset                = img.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y + gl_GlobalInvocationID.z * step_z;
+
+    return img;
+}
+
+/** Wrap 3D tensor information into a tensor structure, and set the offset to this workitem's position.
+ *
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+Tensor3D update_tensor3D_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor;
+    tensor.offset_first_element_in_bytes = offset_first_element_in_bytes;
+    tensor.stride_x                      = stride_x;
+    tensor.stride_y                      = stride_y;
+    tensor.stride_z                      = stride_z;
+    tensor.current_offset                = (tensor.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y + gl_GlobalInvocationID.z * step_z) >> 2;
+
+    return tensor;
+}
+
+/** Wrap 3D tensor information into a tensor structure, and set the offset to this workitem's position.
+ *
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+Tensor3D update_tensor3D_workitem_offset_fp16(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor;
+    tensor.offset_first_element_in_bytes = offset_first_element_in_bytes;
+    tensor.stride_x                      = stride_x;
+    tensor.stride_y                      = stride_y;
+    tensor.stride_z                      = stride_z;
+    tensor.current_offset                = tensor.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y + gl_GlobalInvocationID.z * step_z;
+
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+uint vector_offset(Vector vec, int x)
+{
+    return CONVERT(CONVERT(vec.current_offset << 2, int) + x * CONVERT(vec.stride_x, int), uint) >> 2;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+uint vector_offset_fp16(Vector vec, int x)
+{
+    return CONVERT(CONVERT(vec.current_offset, int) + x * CONVERT(vec.stride_x, int), uint);
+}
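+
+// The non-FP16 offset helpers shift the stored word index back to bytes (<< 2), apply the
+// byte displacement, then return a word index again (>> 2). For example, for a float
+// vector with stride_x = 4 and current_offset = 5 (i.e. byte 20), vector_offset(vec, 3)
+// computes ((5 << 2) + 3 * 4) >> 2 = 8, the array index of the element 3 floats ahead.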
+
+/** Get the pointer position of an Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+uint offset(Image img, int x, int y)
+{
+    return CONVERT(CONVERT(img.current_offset << 2, int) + x * CONVERT(img.stride_x, int) + y * CONVERT(img.stride_y, int), uint) >> 2;
+}
+
+/** Get the pointer position of an Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+uint offset_fp16(Image img, int x, int y)
+{
+    return CONVERT(CONVERT(img.current_offset, int) + x * CONVERT(img.stride_x, int) + y * CONVERT(img.stride_y, int), uint);
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+uint tensor3D_offset(Tensor3D tensor, int x, int y, int z)
+{
+    return CONVERT(CONVERT(tensor.current_offset << 2, int) + x * CONVERT(tensor.stride_x, int) + y * CONVERT(tensor.stride_y, int) + z * CONVERT(tensor.stride_z, int), uint) >> 2;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+uint tensor3D_offset_fp16(Tensor3D tensor, int x, int y, int z)
+{
+    return CONVERT(CONVERT(tensor.current_offset, int) + x * CONVERT(tensor.stride_x, int) + y * CONVERT(tensor.stride_y, int) + z * CONVERT(tensor.stride_z, int), uint);
+}
+
+/////////////////////////////////////////////////////////////
+// New helpers
+
+#define GC_CONVERT_TO_VECTOR_STRUCT(name) \
+    gc_update_vector_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    gc_update_vector_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0))
+
+#define GC_CONVERT_TO_IMAGE_STRUCT(name) \
+    gc_update_image_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define GC_CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    gc_update_image_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0))
+
+#define GC_CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                  \
+    gc_update_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                       name##_stride_z, name##_step_z)
+
+#define GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    gc_update_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0), name##_stride_z, uint(0))
+
+#define GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    gc_update_image_from_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    gc_update_image_from_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0), name##_stride_z, name##_step_z)
+
+Vector gc_update_vector_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector;
+    vector.offset_first_element_in_bytes = offset_first_element_in_bytes;
+    vector.stride_x                      = stride_x;
+    vector.current_offset                = vector.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x;
+
+    return vector;
+}
+
+Image gc_update_image_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img;
+    img.offset_first_element_in_bytes = offset_first_element_in_bytes;
+    img.stride_x                      = stride_x;
+    img.stride_y                      = stride_y;
+    img.current_offset                = img.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y;
+
+    return img;
+}
+
+Tensor3D gc_update_tensor3D_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor;
+    tensor.offset_first_element_in_bytes = offset_first_element_in_bytes;
+    tensor.stride_x                      = stride_x;
+    tensor.stride_y                      = stride_y;
+    tensor.stride_z                      = stride_z;
+    tensor.current_offset                = tensor.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y + gl_GlobalInvocationID.z * step_z;
+
+    return tensor;
+}
+
+Image gc_update_image_from_tensor3D_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img;
+    img.offset_first_element_in_bytes = offset_first_element_in_bytes;
+    img.stride_x                      = stride_x;
+    img.stride_y                      = stride_y;
+    img.current_offset                = img.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y + gl_GlobalInvocationID.z * step_z;
+
+    return img;
+}
+
+#define GC_CURRENT_OFFSET(name) \
+    name.current_offset
+
+uint gc_vector_offset(Vector vec, int x)
+{
+    return CONVERT(CONVERT(vec.current_offset, int) + x * CONVERT(vec.stride_x, int), uint);
+}
+
+uint gc_image_offset(Image img, int x, int y)
+{
+    return CONVERT(CONVERT(img.current_offset, int) + x * CONVERT(img.stride_x, int) + y * CONVERT(img.stride_y, int), uint);
+}
+
+uint gc_tensor3D_offset(Tensor3D tensor, int x, int y, int z)
+{
+    return CONVERT(CONVERT(tensor.current_offset, int) + x * CONVERT(tensor.stride_x, int) + y * CONVERT(tensor.stride_y, int) + z * CONVERT(tensor.stride_z, int), uint);
+}
+
+// The number of elements loaded/stored per access depends on the buffer type.
+#define GC_LOAD1(r, name, offset) \
+    r = name##_ptr[offset]
+
+#define GC_LOAD2(r, name, offset) \
+    GC_LOAD1(r[0], name, offset); \
+    GC_LOAD1(r[1], name, (offset) + uint(1))
+
+#define GC_LOAD3(r, name, offset)             \
+    GC_LOAD1(r[0], name, offset);             \
+    GC_LOAD1(r[1], name, (offset) + uint(1)); \
+    GC_LOAD1(r[2], name, (offset) + uint(2))
+
+#define GC_STORE1(value, name, offset) \
+    name##_ptr[offset] = value
+
+#define GC_STORE2(value, name, offset) \
+    GC_STORE1(value[0], name, offset); \
+    GC_STORE1(value[1], name, (offset) + uint(1))
+
+#define GC_STORE3(value, name, offset)             \
+    GC_STORE1(value[0], name, offset);             \
+    GC_STORE1(value[1], name, (offset) + uint(1)); \
+    GC_STORE1(value[2], name, (offset) + uint(2))
+
+// These have to be expanded manually, since this is not supported by the compiler.
+#define GC_LOAD1_1D_OFFSET(r, name, x) \
+    GC_LOAD1(r, name, gc_vector_offset(name, int(x)) >> name##_buffer_data_type_size)
+
+#define GC_LOAD1_2D_OFFSET(r, name, x, y) \
+    GC_LOAD1(r, name, gc_image_offset(name, int(x), int(y)) >> name##_buffer_data_type_size)
+
+#define GC_LOAD1_3D_OFFSET(r, name, x, y, z) \
+    GC_LOAD1(r, name, gc_tensor3D_offset(name, int(x), int(y), int(z)) >> name##_buffer_data_type_size)
+
+#define GC_STORE1_1D_OFFSET(value, name, x) \
+    GC_STORE1(value, name, gc_vector_offset(name, int(x)) >> name##_buffer_data_type_size)
+
+#define GC_STORE1_2D_OFFSET(value, name, x, y) \
+    GC_STORE1(value, name, gc_image_offset(name, int(x), int(y)) >> name##_buffer_data_type_size)
+
+#define GC_STORE1_3D_OFFSET(value, name, x, y, z) \
+    GC_STORE1(value, name, gc_tensor3D_offset(name, int(x), int(y), int(z)) >> name##_buffer_data_type_size)
+
+#define GC_LOAD2_1D_OFFSET(r, name, x) \
+    GC_LOAD2(r, name, gc_vector_offset(name, int(x)) >> name##_buffer_data_type_size)
+
+#define GC_LOAD2_2D_OFFSET(r, name, x, y) \
+    GC_LOAD2(r, name, gc_image_offset(name, int(x), int(y)) >> name##_buffer_data_type_size)
+
+#define GC_LOAD2_3D_OFFSET(r, name, x, y, z) \
+    GC_LOAD2(r, name, gc_tensor3D_offset(name, int(x), int(y), int(z)) >> name##_buffer_data_type_size)
+
+#define GC_STORE2_1D_OFFSET(value, name, x) \
+    GC_STORE2(value, name, gc_vector_offset(name, int(x)) >> name##_buffer_data_type_size)
+
+#define GC_STORE2_2D_OFFSET(value, name, x, y) \
+    GC_STORE2(value, name, gc_image_offset(name, int(x), int(y)) >> name##_buffer_data_type_size)
+
+#define GC_STORE2_3D_OFFSET(value, name, x, y, z) \
+    GC_STORE2(value, name, gc_tensor3D_offset(name, int(x), int(y), int(z)) >> name##_buffer_data_type_size)
+
+#define GC_LOAD3_1D_OFFSET(r, name, x) \
+    GC_LOAD3(r, name, gc_vector_offset(name, int(x)) >> name##_buffer_data_type_size)
+
+#define GC_LOAD3_2D_OFFSET(r, name, x, y) \
+    GC_LOAD3(r, name, gc_image_offset(name, int(x), int(y)) >> name##_buffer_data_type_size)
+
+#define GC_LOAD3_3D_OFFSET(r, name, x, y, z) \
+    GC_LOAD3(r, name, gc_tensor3D_offset(name, int(x), int(y), int(z)) >> name##_buffer_data_type_size)
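+
+// The gc_*_offset helpers return byte offsets; the GC_* macros above convert them to
+// array indices by shifting right by name##_buffer_data_type_size, which presumably holds
+// log2 of the SSBO element size in bytes (2 for a float/uint buffer, 3 for a uvec2
+// buffer). For illustration, GC_LOAD1_2D_OFFSET(packed, src, 0, 0) on a uvec2 buffer
+// expands to
+//   packed = src_ptr[gc_image_offset(src, int(0), int(0)) >> src_buffer_data_type_size];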
+
+/////////////////////////////////////////////////////////////
+
+#endif /* ARM_COMPUTE_HELPER_H */
diff --git a/src/core/GLES_COMPUTE/cs_shaders/normalization_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/normalization_layer.cs
new file mode 100755
index 0000000..5699340
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/normalization_layer.cs
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+
+#include "helpers.h"
+
+layout(std140) uniform shader_params
+{
+    TENSOR3D_PARAM_DECLARATION(src1);
+    TENSOR3D_PARAM_DECLARATION(src2);
+    TENSOR3D_PARAM_DECLARATION(dst);
+};
+
+BUFFER_DECLARATION(src1, 1, float, readonly);
+BUFFER_DECLARATION(src2, 2, float, readonly);
+BUFFER_DECLARATION(dst, 3, float, writeonly);
+
+#ifdef CROSS_MAP
+/** Apply cross map normalization.
+ *
+ * @note The normalization coefficient (alpha parameter / norm_size) should be given as a preprocessor argument using "#define COEFF x"
+ * @note BETA parameter in the normalization equation should be given as a preprocessor argument using "#define BETA x"
+ * @note KAPPA parameter in the normalization equation should be given as a preprocessor argument using "#define KAPPA x"
+ * @note Number of elements on the right or left side to normalize across should be given as a preprocessor argument using "#define RADIUS x"
+ *
+ * @param[in]  src1_ptr                                    Pointer to the first source tensor. Supported data types: F32
+ * @param[in]  src1_stride_x                               Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  src1_step_x                                 src1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                               Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  src1_step_y                                 src1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_stride_z                               Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src1_step_z                                 src1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes          The offset of the first element in the first source tensor
+ * @param[in]  src2_ptr                                    Pointer to the second source tensor. Supported data types: Same as @p src1_ptr
+ * @param[in]  src2_stride_x                               Stride of the second source tensor in X dimension (in bytes)
+ * @param[in]  src2_step_x                                 src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_stride_y                               Stride of the second source tensor in Y dimension (in bytes)
+ * @param[in]  src2_step_y                                 src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src2_stride_z                               Stride of the second source tensor in Z dimension (in bytes)
+ * @param[in]  src2_step_z                                 src2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes          The offset of the first element in the second source tensor
+ * @param[out] dst_ptr                                     Pointer to the destination tensor. Supported data types: Same as @p src1_ptr
+ * @param[in]  dst_stride_x                                Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                                  dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                                Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                                  dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                                Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                                  dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes           The offset of the first element in the destination tensor
+ */
+void main(void)
+{
+    Tensor3D src1 = CONVERT_TO_TENSOR3D_STRUCT(src1);
+    Tensor3D src2 = CONVERT_TO_TENSOR3D_STRUCT(src2);
+    Tensor3D dst  = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    float acc = 0.0;
+
+    int num_of_slices = int(gl_NumWorkGroups.z * gl_WorkGroupSize.z);
+    int current_slice = int(gl_GlobalInvocationID.z);
+
+    int left_slice  = max(current_slice - int(RADIUS), int(0));
+    int right_slice = min(current_slice + int(RADIUS), int(num_of_slices - 1));
+
+    for(int i = left_slice; i <= right_slice; i++)
+    {
+        acc += src2_ptr[tensor3D_offset(src2, 0, 0, i - current_slice)];
+    }
+
+    float normalized = pow(float(KAPPA) + float(COEFF) * acc, float(BETA));
+
+    float normalized_pixel = (src1_ptr[src1.current_offset]) / normalized;
+
+    dst_ptr[dst.current_offset] = normalized_pixel;
+}
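+
+// Worked example of the equation above (illustrative values): with KAPPA = 1.0,
+// COEFF = 0.0001, BETA = 0.75 and an accumulated neighbourhood sum acc = 10.0, the
+// denominator is pow(1.0 + 0.0001 * 10.0, 0.75) ~= 1.00075, so the output is the
+// input value scaled by roughly 0.99925.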
+
+#elif defined(IN_MAP_1D)
+/** Apply in map normalization.
+ *
+ * @note The normalization coefficient (alpha parameter / norm_size) should be given as a preprocessor argument using "#define COEFF x"
+ * @note BETA parameter in the normalization equation should be given as a preprocessor argument using "#define BETA x"
+ * @note KAPPA parameter in the normalization equation should be given as a preprocessor argument using "#define KAPPA x"
+ * @note Number of elements on the right or left side to normalize across should be given as a preprocessor argument using "#define RADIUS x"
+ *
+ * @param[in]  src1_ptr                                    Pointer to the first source tensor. Supported data types: F32
+ * @param[in]  src1_stride_x                               Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  src1_step_x                                 src1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                               Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  src1_step_y                                 src1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_stride_z                               Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src1_step_z                                 src1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes          The offset of the first element in the first source tensor
+ * @param[in]  src2_ptr                                    Pointer to the second source tensor. Supported data types: Same as @p src1_ptr
+ * @param[in]  src2_stride_x                               Stride of the second source tensor in X dimension (in bytes)
+ * @param[in]  src2_step_x                                 src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_stride_y                               Stride of the second source tensor in Y dimension (in bytes)
+ * @param[in]  src2_step_y                                 src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src2_stride_z                               Stride of the second source tensor in Z dimension (in bytes)
+ * @param[in]  src2_step_z                                 src2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes          The offset of the first element in the second source tensor
+ * @param[out] dst_ptr                                     Pointer to the destination tensor. Supported data types: Same as @p src1_ptr
+ * @param[in]  dst_stride_x                                Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                                  dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                                Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                                  dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                                Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                                  dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes           The offset of the first element in the destination tensor
+ */
+void main(void)
+{
+    Tensor3D src1 = CONVERT_TO_TENSOR3D_STRUCT(src1);
+    Tensor3D src2 = CONVERT_TO_TENSOR3D_STRUCT(src2);
+    Tensor3D dst  = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    float acc = 0.0;
+
+    int num_of_items_x = int(gl_NumWorkGroups.x * gl_WorkGroupSize.x);
+    int current_pos    = int(gl_GlobalInvocationID.x);
+
+    int left_pos  = max(current_pos - int(RADIUS), int(0));
+    int right_pos = min(current_pos + int(RADIUS), int(num_of_items_x - 1));
+
+    for(int i = left_pos; i <= right_pos; i++)
+    {
+        acc += src2_ptr[tensor3D_offset(src2, i - current_pos, 0, 0)];
+    }
+
+    float normalized = pow(float(KAPPA) + float(COEFF) * acc, float(BETA));
+
+    float normalized_pixel = (src1_ptr[src1.current_offset]) / normalized;
+
+    dst_ptr[dst.current_offset] = normalized_pixel;
+}
+#endif /* CROSS_MAP */
diff --git a/src/core/GLES_COMPUTE/cs_shaders/pixelwise_mul_float.cs b/src/core/GLES_COMPUTE/cs_shaders/pixelwise_mul_float.cs
new file mode 100644
index 0000000..031687a
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/pixelwise_mul_float.cs
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+#include "helpers.h"
+
+layout(std140) uniform shader_params
+{
+    TENSOR3D_PARAM_DECLARATION(src1);
+    TENSOR3D_PARAM_DECLARATION(src2);
+    TENSOR3D_PARAM_DECLARATION(dst);
+};
+
+BUFFER_DECLARATION(src1, 1, float, readonly);
+BUFFER_DECLARATION(src2, 2, float, readonly);
+BUFFER_DECLARATION(dst, 3, float, writeonly);
+
+/** Performs a pixelwise multiplication of float inputs with a float scale.
+ *
+ * @param[in]  src1_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in]  src1_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_stride_z                      Stride of the source image in Z dimension (in bytes)
+ * @param[in]  src1_step_z                        src1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in]  src2_ptr                           Pointer to the source image. Supported data types: Same as @p src1_ptr
+ * @param[in]  src2_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src2_step_x                        src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src2_step_y                        src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src2_stride_z                      Stride of the source image in Z dimension (in bytes)
+ * @param[in]  src2_step_z                        src2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                            Pointer to the destination image. Supported data types: Same as @p src1_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination image in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination image
+ * @note The float scaling factor must be given as a preprocessor argument using "#define SCALE x"
+ */
+void main()
+{
+    // Get pixels pointer
+    Tensor3D src1 = CONVERT_TO_TENSOR3D_STRUCT(src1);
+    Tensor3D src2 = CONVERT_TO_TENSOR3D_STRUCT(src2);
+    Tensor3D dst  = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    dst_ptr[dst.current_offset] = (src1_ptr[src1.current_offset] * src2_ptr[src2.current_offset] * float(SCALE));
+}
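+
+// For illustration: with SCALE defined at build time, e.g. "#define SCALE 0.5", input
+// values 3.0 and 4.0 produce 3.0 * 4.0 * 0.5 = 6.0 at the corresponding output offset.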
diff --git a/src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs
new file mode 100644
index 0000000..1e0fee4
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs
@@ -0,0 +1,1444 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+#include "helpers.h"
+
+#if defined(DATA_TYPE_FP32)
+
+float calculate_max(const int, Tensor3D, const int, const int, const int, const int, const int, const int);
+float calculate_avg(const int, Tensor3D, const int, const int, const int, const int, const int, const int);
+
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(dst, 2, float, writeonly);
+
+layout(std140) uniform shader_params
+{
+    TENSOR3D_PARAM_DECLARATION(src);
+    TENSOR3D_PARAM_DECLARATION(dst);
+};
+
+#define LOAD8(r, name, offset) \
+    r.x = LOAD4(name, offset); \
+    r.y = LOAD4(name, offset + uint(1))
+
+#define LOAD16(r, name, offset)          \
+    r.x = LOAD4(name, offset);           \
+    r.y = LOAD4(name, offset + uint(1)); \
+    r.z = LOAD4(name, offset + uint(2)); \
+    r.w = LOAD4(name, offset + uint(3))
+
+#define STORE16(name, offset, r)         \
+    STORE4(name, offset, r.x);           \
+    STORE4(name, offset + uint(1), r.y); \
+    STORE4(name, offset + uint(2), r.z); \
+    STORE4(name, offset + uint(3), r.w)
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+#define POOL_OP(res, a, b) ((res) = (a) + (b))
+#define POOL_OP_float(res, a, b) (res = a + b)
+#define POOL_OP_vec2(res, a, b) ((res) = (a) + (b))
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
+#define POOL_OP(res, a, b)        \
+    (res) = (a);                  \
+    if(isnan(a.x) || (a.x < b.x)) \
+    {                             \
+        res.x = b.x;              \
+    }                             \
+    if(isnan(a.y) || (a.y < b.y)) \
+    {                             \
+        res.y = b.y;              \
+    }                             \
+    if(isnan(a.z) || (a.z < b.z)) \
+    {                             \
+        res.z = b.z;              \
+    }                             \
+    if(isnan(a.w) || (a.w < b.w)) \
+    {                             \
+        res.w = b.w;              \
+    }
+#define POOL_OP_float(res, a, b) \
+    (res) = (a);                 \
+    if(isnan(a) || (a < b))      \
+    {                            \
+        res = b;                 \
+    }
+#define POOL_OP_vec2(res, a, b)   \
+    (res) = (a);                  \
+    if(isnan(a.x) || (a.x < b.x)) \
+    {                             \
+        res.x = b.x;              \
+    }                             \
+    if(isnan(a.y) || (a.y < b.y)) \
+    {                             \
+        res.y = b.y;              \
+    }
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
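+
+// In the max-pooling variant above, each lane of the result starts as the running value a
+// and is overwritten by the candidate b whenever a is NaN or smaller, so NaNs in the
+// accumulator do not poison the final maximum. For example, POOL_OP_float(res, a, b) with
+// a = NaN, b = 2.0 yields res = 2.0, while a = 5.0, b = 2.0 leaves res = 5.0.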
+
+#if defined(POOL_L2)
+#define POW2_OP(x, vec_size) ((x) * (x))
+#else /* defined(POOL_L2) */
+#define POW2_OP(x, vec_size) (x)
+#endif /* defined(POOL_L2) */
+
+#define DIV_OP(x, y) (x * (1.f / y))
+#define SQRT_OP(x) sqrt((x))
+
+#if defined(POOL_SIZE)
+// Set the initial value for the pooling operation according to the data type
+#if defined(POOL_AVG) || defined(POOL_L2)
+#define INITIAL_VALUE 0.0f
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
+#define INITIAL_VALUE -3.402823466385289e+38
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+#endif /* POOL_SIZE */
+
+#define POOLING3x3_STRIDE1(res, input, output)                                                                     \
+    vec4 data00;                                                                                                   \
+    vec2 data01;                                                                                                   \
+    vec4 data10;                                                                                                   \
+    vec2 data11;                                                                                                   \
+    vec4 data20;                                                                                                   \
+    vec2 data21;                                                                                                   \
+    LOAD16(data00, input, tensor3D_offset(input, 0, 0, 0));                                                        \
+    LOAD8(data01, input, tensor3D_offset(input, 0, 0, 0) + uint(4));                                               \
+    LOAD16(data10, input, tensor3D_offset(input, 0, 1, 0));                                                        \
+    LOAD8(data11, input, tensor3D_offset(input, 0, 1, 0) + uint(4));                                               \
+    LOAD16(data20, input, tensor3D_offset(input, 0, 2, 0));                                                        \
+    LOAD8(data21, input, tensor3D_offset(input, 0, 2, 0) + uint(4));                                               \
+    data00 = POW2_OP(data00, 4);                                                                                   \
+    data01 = POW2_OP(data01, 2);                                                                                   \
+    data10 = POW2_OP(data10, 4);                                                                                   \
+    data11 = POW2_OP(data11, 2);                                                                                   \
+    data20 = POW2_OP(data20, 4);                                                                                   \
+    data21 = POW2_OP(data21, 2);                                                                                   \
+    \
+    vec4 values000;                                                                                                \
+    vec4 values001;                                                                                                \
+    vec4 values010;                                                                                                \
+    vec4 values100;                                                                                                \
+    vec4 values101;                                                                                                \
+    vec4 values11;                                                                                                 \
+    vec4 values200;                                                                                                \
+    vec4 values201;                                                                                                \
+    vec4 values21;                                                                                                 \
+    values000.xyzw = data00.xyzy;                                                                                  \
+    values001.xyzw = data00.zwzw;                                                                                  \
+    values010.x    = data01.x;                                                                                     \
+    values010.y    = data00.w;                                                                                     \
+    values010.zw   = data01.xy;                                                                                    \
+    values100.xyzw = data10.xyzy;                                                                                  \
+    values101.xyzw = data10.zwzw;                                                                                  \
+    values11.x     = data11.x;                                                                                     \
+    values11.y     = data10.w;                                                                                     \
+    values11.zw    = data11.xy;                                                                                    \
+    values200.xyzw = data20.xyzy;                                                                                  \
+    values201.xyzw = data20.zwzw;                                                                                  \
+    values21.x     = data21.x;                                                                                     \
+    values21.y     = data20.w;                                                                                     \
+    values21.zw    = data21.xy;                                                                                    \
+    POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw);                                                       \
+    POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw);                                                       \
+    POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw);                                                        \
+    POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw);                                                       \
+    POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw);                                                       \
+    POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw);                                                        \
+    POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \
+    POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw))
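+
+// The stride-1 variant above loads three rows of 6 values each (a vec4 plus a vec2 per
+// row) and shuffles them into overlapping 3x3 windows, so a single invocation produces 4
+// adjacent horizontal results in res.xyzw instead of re-reading the input once per output.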
+
+#define POOLING3x3_STRIDE2(res, input, output)                                                                     \
+    vec4  data000;                                                                                                 \
+    vec4  data001;                                                                                                 \
+    float data010;                                                                                                 \
+    vec4  data100;                                                                                                 \
+    vec4  data101;                                                                                                 \
+    float data11;                                                                                                  \
+    vec4  data200;                                                                                                 \
+    vec4  data201;                                                                                                 \
+    float data21;                                                                                                  \
+    LOAD16(data000, input, tensor3D_offset(input, 0, 0, 0));                                                       \
+    LOAD16(data001, input, tensor3D_offset(input, 0, 0, 0) + uint(4));                                             \
+    data010 = LOAD4(input, tensor3D_offset(input, 0, 0, 0) + uint(8));                                             \
+    LOAD16(data100, input, tensor3D_offset(input, 0, 1, 0));                                                       \
+    LOAD16(data101, input, tensor3D_offset(input, 0, 1, 0) + uint(4));                                             \
+    data11 = LOAD4(input, tensor3D_offset(input, 0, 1, 0) + uint(8));                                              \
+    LOAD16(data200, input, tensor3D_offset(input, 0, 2, 0));                                                       \
+    LOAD16(data201, input, tensor3D_offset(input, 0, 2, 0) + uint(4));                                             \
+    data21  = LOAD4(input, tensor3D_offset(input, 0, 2, 0) + uint(8));                                             \
+    data000 = POW2_OP(data000, 4);                                                                                 \
+    data001 = POW2_OP(data001, 4);                                                                                 \
+    data010 = POW2_OP(data010, 1);                                                                                 \
+    data100 = POW2_OP(data100, 4);                                                                                 \
+    data101 = POW2_OP(data101, 4);                                                                                 \
+    data11  = POW2_OP(data11, 1);                                                                                  \
+    data200 = POW2_OP(data200, 4);                                                                                 \
+    data201 = POW2_OP(data201, 4);                                                                                 \
+    data21  = POW2_OP(data21, 1);                                                                                  \
+    \
+    vec4 values000;                                                                                                \
+    vec4 values001;                                                                                                \
+    vec4 values010;                                                                                                \
+    vec4 values100;                                                                                                \
+    vec4 values101;                                                                                                \
+    vec4 values11;                                                                                                 \
+    vec4 values200;                                                                                                \
+    vec4 values201;                                                                                                \
+    vec4 values21;                                                                                                 \
+    values000.xyzw = data000.xyzz;                                                                                 \
+    values001.xyzw = vec4(data000.w, data001.xxy);                                                                 \
+    values010.xyzw = vec4(data001.zzw, data010);                                                                   \
+    values100.xyzw = data100.xyzz;                                                                                 \
+    values101.xyzw = vec4(data100.w, data101.xxy);                                                                 \
+    values11.xyzw  = vec4(data101.zzw, data11);                                                                    \
+    values200.xyzw = data200.xyzz;                                                                                 \
+    values201.xyzw = vec4(data200.w, data201.xxy);                                                                 \
+    values21.xyzw  = vec4(data201.zzw, data21);                                                                    \
+    POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw);                                                       \
+    POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw);                                                       \
+    POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw);                                                        \
+    POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw);                                                       \
+    POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw);                                                       \
+    POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw);                                                        \
+    POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \
+    POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw))
+
+#define POOLING3x3_STRIDE3(res, input, output)                                                         \
+    vec4 data000;                                                                                      \
+    vec4 data001;                                                                                      \
+    vec4 data010;                                                                                      \
+    vec4 data100;                                                                                      \
+    vec4 data101;                                                                                      \
+    vec4 data11;                                                                                       \
+    vec4 data200;                                                                                      \
+    vec4 data201;                                                                                      \
+    vec4 data21;                                                                                       \
+    LOAD16(data000, input, tensor3D_offset(input, 0, 0, 0));                                           \
+    LOAD16(data001, input, tensor3D_offset(input, 0, 0, 0) + uint(4));                                 \
+    LOAD16(data010, input, tensor3D_offset(input, 0, 0, 0) + uint(8));                                 \
+    LOAD16(data100, input, tensor3D_offset(input, 0, 1, 0));                                           \
+    LOAD16(data101, input, tensor3D_offset(input, 0, 1, 0) + uint(4));                                 \
+    LOAD16(data11, input, tensor3D_offset(input, 0, 1, 0) + uint(8));                                  \
+    LOAD16(data200, input, tensor3D_offset(input, 0, 2, 0));                                           \
+    LOAD16(data201, input, tensor3D_offset(input, 0, 2, 0) + uint(4));                                 \
+    LOAD16(data21, input, tensor3D_offset(input, 0, 2, 0) + uint(8));                                  \
+    data000 = POW2_OP(data000, 4);                                                                     \
+    data001 = POW2_OP(data001, 4);                                                                     \
+    data010 = POW2_OP(data010, 4);                                                                     \
+    data100 = POW2_OP(data100, 4);                                                                     \
+    data101 = POW2_OP(data101, 4);                                                                     \
+    data11  = POW2_OP(data11, 4);                                                                      \
+    data200 = POW2_OP(data200, 4);                                                                     \
+    data201 = POW2_OP(data201, 4);                                                                     \
+    data21  = POW2_OP(data21, 4);                                                                      \
+    \
+    POOL_OP(data000.xyzw, data000.xyzw, data100.xyzw);                                                 \
+    POOL_OP(data001.xyzw, data001.xyzw, data101.xyzw);                                                 \
+    POOL_OP(data010.xyzw, data010.xyzw, data11.xyzw);                                                  \
+    POOL_OP(data000.xyzw, data000.xyzw, data200.xyzw);                                                 \
+    POOL_OP(data001.xyzw, data001.xyzw, data201.xyzw);                                                 \
+    POOL_OP(data010.xyzw, data010.xyzw, data21.xyzw);                                                  \
+    POOL_OP(res.xyzw, vec4(data000.xw, data001.z, data010.y), vec4(data000.y, data001.xw, data010.z)); \
+    POOL_OP(res.xyzw, res.xyzw, vec4(data000.z, data001.y, data010.xw))
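+
+// Note on the POOLING3x3_STRIDEn helpers above: each one computes four horizontally
+// adjacent outputs at once. The three window rows are first reduced pairwise with
+// POOL_OP, then the three columns of every window are folded into res by the two
+// final POOL_OP calls on swizzled vec4s.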
+
+float calculate_max(const int pool_size, Tensor3D src, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+    int start_x = int(gl_GlobalInvocationID.x) * stride_x - pad_x;
+    int start_y = int(gl_GlobalInvocationID.y) * stride_y - pad_y;
+    int end_x   = int(min(start_x + pool_size, upper_bound_w));
+    int end_y   = int(min(start_y + pool_size, upper_bound_h));
+
+    float data_max;
+    data_max = LOAD4(src, tensor3D_offset(src, 0, 0, 0));
+
+    for(int i = 0; (start_x + i) < end_x; ++i)
+    {
+        for(int j = 0; (start_y + j) < end_y; ++j)
+        {
+            float data = LOAD4(src, tensor3D_offset(src, i, j, 0));
+            POOL_OP_float(data_max, data_max, data);
+        }
+    }
+
+    return data_max;
+}
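+
+// Example (illustrative values): pool_size = 2, stride = 2, pad = 0 and
+// gl_GlobalInvocationID.xy = (3, 1) give (start_x, start_y) = (6, 2) and
+// (end_x, end_y) = (min(8, upper_bound_w), min(4, upper_bound_h)); the
+// (start + i) < end loop bounds shrink the window at the clamped edges
+// instead of reading past them.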
+
+float calculate_avg(const int pool_size, Tensor3D src, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+    int start_x = int(gl_GlobalInvocationID.x) * stride_x - pad_x;
+    int start_y = int(gl_GlobalInvocationID.y) * stride_y - pad_y;
+    int end_x   = int(min(start_x + pool_size, upper_bound_w));
+    int end_y   = int(min(start_y + pool_size, upper_bound_h));
+
+    float data_total = 0.0f;
+    for(int i = 0; (start_x + i) < end_x; i++)
+    {
+        for(int j = 0; (start_y + j) < end_y; ++j)
+        {
+            float data = LOAD4(src, tensor3D_offset(src, i, j, 0));
+            if(isnan(data))
+            {
+                data = 0.0f;
+            }
+#if defined(POOL_L2)
+            // Raise to power of 2 for L2 Pooling
+            data = POW2_OP(data, 1);
+#endif /* defined(POOL_L2) */
+            data_total = data_total + data;
+        }
+    }
+
+    return data_total / float((end_y - start_y) * (end_x - start_x));
+}
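+
+// Worked example (illustrative values): pool_size = 3 at an interior position with
+// (start_x, start_y) = (4, 4) gives (end_x, end_y) = (7, 7), so the nine summed
+// values are divided by (7 - 4) * (7 - 4) = 9; at the clamped edges the divisor
+// shrinks to the area of the valid region.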
+
+#ifdef POOLING_LAYER_2
+/** Performs a pooling function of pool size equal to 2.
+ *
+ * @note Supported data types are F32;
+ * @note In case of average pooling the following information must be passed at compile time:
+ *       POOL_AVG must be provided otherwise max pooling will be performed.
+ *       MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in the x and y dimensions (width + pad)
+ *       STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ *       PAD_X and PAD_Y which are the pooling paddings in the x and y dimensions
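+ *       For example (illustrative values only), a 2x2 average pooling with stride 2 and
+ *       no padding could be compiled with POOL_AVG, MAX_WIDTH=112, MAX_HEIGHT=112,
+ *       STRIDE_X=2, STRIDE_Y=2, PAD_X=0 and PAD_Y=0.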
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+    // Get pixels pointer
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    //Load and calculate data
+    float res;
+#if defined(POOL_AVG) || defined(POOL_L2)
+    res = calculate_avg(2, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#else  /* defined(POOL_AVG) || defined(POOL_L2) */
+    res = calculate_max(2, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+    // Store result
+    STORE4(dst, CURRENT_OFFSET(dst), res);
+}
+
+#elif defined(POOLING_LAYER_3)
+/** Performs a pooling function of pool size equal to 3.
+ *
+ * @note Supported data types are F32;
+ * @note In case of average pooling the following information must be passed at compile time:
+ *       POOL_AVG must be provided otherwise max pooling will be performed.
+ *       MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in the x and y dimensions (width + pad)
+ *       STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ *       PAD_X and PAD_Y which are the pooling paddings in the x and y dimensions
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+    // Get pixels pointer
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    //Load and calculate data
+    float res;
+#if defined(POOL_AVG) || defined(POOL_L2)
+    res = calculate_avg(3, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#else  /* defined(POOL_AVG) || defined(POOL_L2) */
+    res = calculate_max(3, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+    // Store result
+    STORE4(dst, CURRENT_OFFSET(dst), res);
+}
+
+#elif defined(POOLING_LAYER_3_OPTIMIZED)
+/** Performs an optimized pooling function of pool size equal to 3 when stride_x is less than or equal to 3
+ *
+ * @note Supported data types are F32;
+ * @note In case of average pooling the following information must be passed at compile time:
+ *       POOL_AVG must be provided otherwise max pooling will be performed.
+ *       MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in the x and y dimensions (width + pad)
+ *       STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ *       PAD_X and PAD_Y which are the pooling paddings in the x and y dimensions
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+    // Get pixels pointer
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    vec4 res;
+    // Perform pooling 3x3 for 4 output elements
+#if STRIDE_X == 1
+    POOLING3x3_STRIDE1(res, src, dst);
+#elif STRIDE_X == 2
+    POOLING3x3_STRIDE2(res, src, dst);
+#elif STRIDE_X == 3
+    POOLING3x3_STRIDE3(res, src, dst);
+#endif /*STRIDE_X == 1*/
+
+    // Divide by pool region in case of average pooling
+#if defined(POOL_AVG) || defined(POOL_L2)
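+    // Each of the four output lanes has its own x-window: lane k starts at
+    // (4 * gl_GlobalInvocationID.x + k) * STRIDE_X - PAD_X, so the bounds are held in
+    // ivec4s and the per-lane divisor is the element-wise product of the clamped extents.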
+    ivec4 start_x = ((ivec4(int(gl_GlobalInvocationID.x) * 4) + ivec4(0, 1, 2, 3)) * (ivec4(STRIDE_X))) - (ivec4(PAD_X));
+    int   start_y = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y;
+    ivec4 end_x   = min((start_x + (ivec4(3))), (ivec4(MAX_WIDTH)));
+    int   end_y   = min((start_y + 3), MAX_HEIGHT);
+    res *= (vec4((1.f)) / vec4((ivec4(end_y - start_y)) * (end_x - start_x)));
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+    STORE16(dst, CURRENT_OFFSET(dst), res);
+}
+
+#elif defined(POOLING_LAYER_7)
+/** Performs a pooling function of pool size equal to 7.
+ *
+ * @note Supported data types are F32;
+ * @note In case of average pooling the following information must be passed at compile time:
+ *       POOL_AVG must be provided otherwise max pooling will be performed.
+ *       MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in the x and y dimensions (width + pad)
+ *       STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ *       PAD_X and PAD_Y which are the pooling paddings in the x and y dimensions
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+    // Get pixels pointer
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    //Load and calculate data
+    float res;
+#if defined(POOL_AVG) || defined(POOL_L2)
+    res = calculate_avg(7, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#else  /* defined(POOL_AVG) || defined(POOL_L2) */
+    res = calculate_max(7, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+    // Store result
+    STORE4(dst, CURRENT_OFFSET(dst), res);
+}
+
+#elif defined(POOLING_LAYER_N)
+/** Performs a pooling function of pool size equal to N
+ *
+ * @note Supported data types are F32;
+ * @note Pool size must be passed using POOL_SIZE e.g. POOL_SIZE=13;
+ * @note In case of average pooling the following information must be passed at compile time:
+ *       POOL_AVG must be provided otherwise max pooling will be performed.
+ *       MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in the x and y dimensions (width + pad)
+ *       STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ *       PAD_X and PAD_Y which are the pooling paddings in the x and y dimensions
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+    // Get pixels pointer
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    vec4 vdata0;
+    vdata0 = vec4(INITIAL_VALUE);
+    vec4 vdata1;
+    vdata1 = vec4(INITIAL_VALUE);
+    float sdata;
+    sdata = float(INITIAL_VALUE);
+
+    for(int y = 0; y < int(POOL_SIZE); y++)
+    {
+        int x = 0;
+        for(; x <= (int(POOL_SIZE) - 8); x += 8)
+        {
+            vec4 data2;
+            vec4 data3;
+            LOAD16(data2, src, tensor3D_offset(src, x, y, 0));
+            LOAD16(data3, src, tensor3D_offset(src, x, y, 0) + uint(4));
+
+#if defined(POOL_L2)
+            // Raise to power of 2 for L2 Pooling
+            data2 *= data2;
+            data3 *= data3;
+#endif /* defined(POOL_L2) */
+
+            POOL_OP(vdata0, vdata0, data2);
+            POOL_OP(vdata1, vdata1, data3);
+        }
+
+        // Leftover
+        for(; x < int(POOL_SIZE); ++x)
+        {
+            float data4 = LOAD4(src, tensor3D_offset(src, x, y, 0));
+#if defined(POOL_L2)
+            // Raise to power of 2 for L2 Pooling
+            data4 *= data4;
+#endif /* defined(POOL_L2) */
+            POOL_OP_float(sdata, sdata, data4);
+        }
+    }
+
+    //Reduce result: fold the two vec4 accumulators into one scalar, then merge in the leftover lane
+    vec4 reduce4;
+    POOL_OP(reduce4, vdata0.xyzw, vdata1.xyzw);
+    vec2 reduce2;
+    POOL_OP_vec2(reduce2, reduce4.xy, reduce4.zw);
+    float res;
+    POOL_OP_float(res, reduce2.x, reduce2.y);
+    POOL_OP_float(res, res, sdata);
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+    {
+        // Divide by pool region in case of average pooling
+        int   start_x = int(gl_GlobalInvocationID.x) * STRIDE_X - PAD_X;
+        int   start_y = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y;
+        int   end_x   = int(min(start_x + POOL_SIZE, MAX_WIDTH));
+        int   end_y   = int(min(start_y + POOL_SIZE, MAX_HEIGHT));
+        float res1    = float((end_y - start_y) * (end_x - start_x));
+        res           = DIV_OP(res, res1);
+    }
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+    // Store result
+    STORE4(dst, CURRENT_OFFSET(dst), res);
+}
+#endif /* POOLING_LAYER_2 */
+
+#elif defined(DATA_TYPE_FP16)
+
+precision mediump float;
+
+vec2 load_and_unpack(Tensor3D, uint);
+vec2 calculate_max(const int, Tensor3D, const int, const int, const int, const int, const int, const int);
+vec2 calculate_avg(const int, Tensor3D, const int, const int, const int, const int, const int, const int);
+
+BUFFER_DECLARATION(src, 1, uint, readonly);
+BUFFER_DECLARATION(dst, 2, uint, writeonly);
+
+layout(std140) uniform shader_params
+{
+    TENSOR3D_PARAM_DECLARATION(src);
+    TENSOR3D_PARAM_DECLARATION(dst);
+};
+
+#define LOAD2_fp16(r, name, offset) \
+    r.xy = load_and_unpack(name, offset)
+
+#define LOAD4_fp16(r, name, offset)       \
+    r.xy = load_and_unpack(name, offset); \
+    r.zw = load_and_unpack(name, offset + uint(1))
+
+#define STORE4_fp16(name, offset, r)             \
+    uint datastore1;                             \
+    uint datastore2;                             \
+    datastore1 = uint(packHalf2x16(r.xy));       \
+    datastore2 = uint(packHalf2x16(r.zw));       \
+    STORE1(name, offset << uint(1), datastore1); \
+    STORE1(name, (offset << uint(1)) + uint(1), datastore2)
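+
+// packHalf2x16 folds two 32-bit floats into one uint holding two IEEE 754 half floats,
+// so STORE4_fp16 emits a vec4 as two consecutive uints; the offset is doubled (<< 1)
+// because callers address in 4-half granules while the buffer is declared as uint
+// (2 halves per element).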
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+#define POOL_OP(res, a, b) ((res) = (a) + (b))
+#define POOL_OP_float(res, a, b) ((res) = (a) + (b))
+#define POOL_OP_vec2(res, a, b) ((res) = (a) + (b))
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
+#define POOL_OP(res, a, b)        \
+    (res) = (a);                  \
+    if(isnan(a.x) || (a.x < b.x)) \
+    {                             \
+        res.x = b.x;              \
+    }                             \
+    if(isnan(a.y) || (a.y < b.y)) \
+    {                             \
+        res.y = b.y;              \
+    }                             \
+    if(isnan(a.z) || (a.z < b.z)) \
+    {                             \
+        res.z = b.z;              \
+    }                             \
+    if(isnan(a.w) || (a.w < b.w)) \
+    {                             \
+        res.w = b.w;              \
+    }
+#define POOL_OP_float(res, a, b) \
+    (res) = (a);                 \
+    if(isnan(a) || (a < b))      \
+    {                            \
+        res = b;                 \
+    }
+#define POOL_OP_vec2(res, a, b)   \
+    (res) = (a);                  \
+    if(isnan(a.x) || (a.x < b.x)) \
+    {                             \
+        res.x = b.x;              \
+    }                             \
+    if(isnan(a.y) || (a.y < b.y)) \
+    {                             \
+        res.y = b.y;              \
+    }
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+#define POW2_OP(x, vec_size) ((x) * (x))
+#else /* defined(POOL_L2) */
+#define POW2_OP(x, vec_size) (x)
+#endif /* defined(POOL_L2) */
+
+#define DIV_OP(x, y) (x * (1.f / y))
+#define SQRT_OP(x) sqrt((x))
+
+#if defined(POOL_SIZE)
+// Set the initial value for the pooling operation accordingly with the data type
+#if defined(POOL_AVG) || defined(POOL_L2)
+#define INITIAL_VALUE 0.0f
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
+#define INITIAL_VALUE -65504.0f
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+#endif /* POOL_SIZE */
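+
+// Note: -65504.0 is the most negative finite IEEE 754 half-precision value, i.e. the
+// identity element for max pooling on F16 data, while 0.0 is the identity for the
+// accumulating AVG/L2 variants.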
+
+#define POOLING3x3_STRIDE1_fp16(res, input, output)                                                                \
+    vec4 data00;                                                                                                   \
+    vec2 data01;                                                                                                   \
+    vec4 data10;                                                                                                   \
+    vec2 data11;                                                                                                   \
+    vec4 data20;                                                                                                   \
+    vec2 data21;                                                                                                   \
+    LOAD4_fp16(data00, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)));                                  \
+    LOAD2_fp16(data01, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(2));                        \
+    LOAD4_fp16(data10, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)));                                  \
+    LOAD2_fp16(data11, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(2));                        \
+    LOAD4_fp16(data20, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)));                                  \
+    LOAD2_fp16(data21, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(2));                        \
+    data00 = POW2_OP(data00, 4);                                                                                   \
+    data01 = POW2_OP(data01, 2);                                                                                   \
+    data10 = POW2_OP(data10, 4);                                                                                   \
+    data11 = POW2_OP(data11, 2);                                                                                   \
+    data20 = POW2_OP(data20, 4);                                                                                   \
+    data21 = POW2_OP(data21, 2);                                                                                   \
+    \
+    vec4 values000;                                                                                                \
+    vec4 values001;                                                                                                \
+    vec4 values010;                                                                                                \
+    vec4 values100;                                                                                                \
+    vec4 values101;                                                                                                \
+    vec4 values11;                                                                                                 \
+    vec4 values200;                                                                                                \
+    vec4 values201;                                                                                                \
+    vec4 values21;                                                                                                 \
+    values000.xyzw = data00.xyzy;                                                                                  \
+    values001.xyzw = data00.zwzw;                                                                                  \
+    values010.x    = data01.x;                                                                                     \
+    values010.y    = data00.w;                                                                                     \
+    values010.zw   = data01.xy;                                                                                    \
+    values100.xyzw = data10.xyzy;                                                                                  \
+    values101.xyzw = data10.zwzw;                                                                                  \
+    values11.x     = data11.x;                                                                                     \
+    values11.y     = data10.w;                                                                                     \
+    values11.zw    = data11.xy;                                                                                    \
+    values200.xyzw = data20.xyzy;                                                                                  \
+    values201.xyzw = data20.zwzw;                                                                                  \
+    values21.x     = data21.x;                                                                                     \
+    values21.y     = data20.w;                                                                                     \
+    values21.zw    = data21.xy;                                                                                    \
+    POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw);                                                       \
+    POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw);                                                       \
+    POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw);                                                        \
+    POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw);                                                       \
+    POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw);                                                       \
+    POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw);                                                        \
+    POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \
+    POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw))
+
+#define POOLING3x3_STRIDE2_fp16(res, input, output)                                                                \
+    vec4  data000;                                                                                                 \
+    vec4  data001;                                                                                                 \
+    float data010;                                                                                                 \
+    vec4  data100;                                                                                                 \
+    vec4  data101;                                                                                                 \
+    float data11;                                                                                                  \
+    vec4  data200;                                                                                                 \
+    vec4  data201;                                                                                                 \
+    float data21;                                                                                                  \
+    vec2  datamiddle0;                                                                                             \
+    vec2  datamiddle1;                                                                                             \
+    vec2  datamiddle2;                                                                                             \
+    LOAD4_fp16(data000, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)));                                 \
+    LOAD4_fp16(data001, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(2));                       \
+    datamiddle0 = load_and_unpack(input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(4));             \
+    data010     = datamiddle0.x;                                                                                   \
+    LOAD4_fp16(data100, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)));                                 \
+    LOAD4_fp16(data101, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(2));                       \
+    datamiddle1 = load_and_unpack(input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(4));             \
+    data11      = datamiddle1.x;                                                                                   \
+    LOAD4_fp16(data200, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)));                                 \
+    LOAD4_fp16(data201, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(2));                       \
+    datamiddle2 = load_and_unpack(input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(4));             \
+    data21      = datamiddle2.x;                                                                                   \
+    data000     = POW2_OP(data000, 4);                                                                             \
+    data001     = POW2_OP(data001, 4);                                                                             \
+    data010     = POW2_OP(data010, 1);                                                                             \
+    data100     = POW2_OP(data100, 4);                                                                             \
+    data101     = POW2_OP(data101, 4);                                                                             \
+    data11      = POW2_OP(data11, 1);                                                                              \
+    data200     = POW2_OP(data200, 4);                                                                             \
+    data201     = POW2_OP(data201, 4);                                                                             \
+    data21      = POW2_OP(data21, 1);                                                                              \
+    \
+    vec4 values000;                                                                                                \
+    vec4 values001;                                                                                                \
+    vec4 values010;                                                                                                \
+    vec4 values100;                                                                                                \
+    vec4 values101;                                                                                                \
+    vec4 values11;                                                                                                 \
+    vec4 values200;                                                                                                \
+    vec4 values201;                                                                                                \
+    vec4 values21;                                                                                                 \
+    values000.xyzw = data000.xyzz;                                                                                 \
+    values001.xyzw = vec4(data000.w, data001.xxy);                                                                 \
+    values010.xyzw = vec4(data001.zzw, data010);                                                                   \
+    values100.xyzw = data100.xyzz;                                                                                 \
+    values101.xyzw = vec4(data100.w, data101.xxy);                                                                 \
+    values11.xyzw  = vec4(data101.zzw, data11);                                                                    \
+    values200.xyzw = data200.xyzz;                                                                                 \
+    values201.xyzw = vec4(data200.w, data201.xxy);                                                                 \
+    values21.xyzw  = vec4(data201.zzw, data21);                                                                    \
+    POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw);                                                       \
+    POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw);                                                       \
+    POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw);                                                        \
+    POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw);                                                       \
+    POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw);                                                       \
+    POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw);                                                        \
+    POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \
+    POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw))
+
+#define POOLING3x3_STRIDE3_fp16(res, input, output)                                                    \
+    vec4 data000;                                                                                      \
+    vec4 data001;                                                                                      \
+    vec4 data010;                                                                                      \
+    vec4 data100;                                                                                      \
+    vec4 data101;                                                                                      \
+    vec4 data11;                                                                                       \
+    vec4 data200;                                                                                      \
+    vec4 data201;                                                                                      \
+    vec4 data21;                                                                                       \
+    LOAD4_fp16(data000, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)));                     \
+    LOAD4_fp16(data001, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(2));           \
+    LOAD4_fp16(data010, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(4));           \
+    LOAD4_fp16(data100, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)));                     \
+    LOAD4_fp16(data101, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(2));           \
+    LOAD4_fp16(data11, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(4));            \
+    LOAD4_fp16(data200, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)));                     \
+    LOAD4_fp16(data201, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(2));           \
+    LOAD4_fp16(data21, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(4));            \
+    data000 = POW2_OP(data000, 4);                                                                     \
+    data001 = POW2_OP(data001, 4);                                                                     \
+    data010 = POW2_OP(data010, 4);                                                                     \
+    data100 = POW2_OP(data100, 4);                                                                     \
+    data101 = POW2_OP(data101, 4);                                                                     \
+    data11  = POW2_OP(data11, 4);                                                                      \
+    data200 = POW2_OP(data200, 4);                                                                     \
+    data201 = POW2_OP(data201, 4);                                                                     \
+    data21  = POW2_OP(data21, 4);                                                                      \
+    \
+    POOL_OP(data000.xyzw, data000.xyzw, data100.xyzw);                                                 \
+    POOL_OP(data001.xyzw, data001.xyzw, data101.xyzw);                                                 \
+    POOL_OP(data010.xyzw, data010.xyzw, data11.xyzw);                                                  \
+    POOL_OP(data000.xyzw, data000.xyzw, data200.xyzw);                                                 \
+    POOL_OP(data001.xyzw, data001.xyzw, data201.xyzw);                                                 \
+    POOL_OP(data010.xyzw, data010.xyzw, data21.xyzw);                                                  \
+    POOL_OP(res.xyzw, vec4(data000.xw, data001.z, data010.y), vec4(data000.y, data001.xw, data010.z)); \
+    POOL_OP(res.xyzw, res.xyzw, vec4(data000.z, data001.y, data010.xw))
+
+vec2 load_and_unpack(Tensor3D src, uint offset)
+{
+    uint packed_s;
+    vec2 s;
+    LOAD1(packed_s, src, offset);
+
+    s = vec2(unpackHalf2x16(packed_s));
+    return s;
+}
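+
+// Example (illustrative): a packed word 0x40003C00u unpacks to vec2(1.0, 2.0): the
+// low half (0x3C00) decodes to 1.0 into s.x and the high half (0x4000) to 2.0 into
+// s.y, matching the lane order of GLSL's unpackHalf2x16.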
+
+vec2 calculate_max(const int pool_size, Tensor3D src, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+    int start_x1 = int(gl_GlobalInvocationID.x) * stride_x - pad_x;
+    int start_y1 = int(gl_GlobalInvocationID.y) * stride_y - pad_y;
+    int end_x1   = int(min(start_x1 + pool_size, upper_bound_w));
+    int end_y1   = int(min(start_y1 + pool_size, upper_bound_h));
+
+    int start_x2 = start_x1 + stride_x;
+    int start_y2 = start_y1;
+    int end_x2   = int(min(start_x2 + pool_size, upper_bound_w));
+    int end_y2   = int(min(start_y2 + pool_size, upper_bound_h));
+
+    //Initialize maximum
+    vec2 data_max = vec2(0);
+
+    //Load and Set initial maximum1
+    vec2 data_init1 = load_and_unpack(src, tensor3D_offset_fp16(src, 0, 0, 0) >> uint(2));
+    data_max.x      = data_init1.x;
+
+    //Load and Set initial maximum2
+    if(end_x1 < upper_bound_w)
+    {
+        if((stride_x % 2) == 0)
+        {
+            vec2 data_init2 = load_and_unpack(src, tensor3D_offset_fp16(src, stride_x, 0, 0) >> uint(2));
+            data_max.y      = data_init2.x;
+        }
+        else
+        {
+            vec2 data_init2 = load_and_unpack(src, tensor3D_offset_fp16(src, stride_x - 1, 0, 0) >> uint(2));
+            data_max.y      = data_init2.y;
+        }
+    }
+
+    for(int i = 0; (start_y1 + i) < end_y1; i++)
+        for(int j = 0; (start_x1 + j) < end_x1; j = j + 2)
+        {
+            //Calculate maximum1
+            if((start_x1 + j + 1) < end_x1)
+            {
+                vec2  data1 = load_and_unpack(src, tensor3D_offset_fp16(src, j, i, 0) >> uint(2));
+                float data_mr1;
+                POOL_OP_float(data_mr1, data1.x, data1.y);
+                POOL_OP_float(data_max.x, data_max.x, data_mr1);
+            }
+            else
+            {
+                vec2 data1 = load_and_unpack(src, tensor3D_offset_fp16(src, j, i, 0) >> uint(2));
+                POOL_OP_float(data_max.x, data_max.x, data1.x);
+            }
+
+            //Calculate maximum2
+            if((start_x2 + j) < end_x2 && end_x1 < upper_bound_w)
+            {
+                if((stride_x % 2) == 0)
+                {
+                    vec2 data2 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x), i, 0) >> uint(2)));
+
+                    if((start_x2 + j + 1) < end_x2)
+                    {
+                        float data_mr2;
+                        POOL_OP_float(data_mr2, data2.x, data2.y);
+                        POOL_OP_float(data_max.y, data_max.y, data_mr2);
+                    }
+                    else
+                    {
+                        POOL_OP_float(data_max.y, data_max.y, data2.x);
+                    }
+                }
+                else
+                {
+                    vec2 data2 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x - 1), i, 0) >> uint(2)));
+                    vec2 data3 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x + 1), i, 0) >> uint(2)));
+                    if((start_x2 + j + 1) < end_x2)
+                    {
+                        float data_mr2;
+                        POOL_OP_float(data_mr2, data3.x, data2.y);
+                        POOL_OP_float(data_max.y, data_max.y, data_mr2);
+                    }
+                    else
+                    {
+                        POOL_OP_float(data_max.y, data_max.y, data2.y);
+                    }
+                }
+            }
+        }
+    return data_max;
+}
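+
+// The FP16 variant produces two horizontally adjacent outputs per invocation
+// (data_max.x and data_max.y) because halves are stored in packed pairs; the
+// stride_x parity branches select the correct half within a packed word when the
+// second window starts on an odd element.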
+
+vec2 calculate_avg(const int pool_size, Tensor3D src, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+    int start_x1 = int(gl_GlobalInvocationID.x) * stride_x - pad_x;
+    int start_y1 = int(gl_GlobalInvocationID.y) * stride_y - pad_y;
+    int end_x1   = int(min(start_x1 + pool_size, upper_bound_w));
+    int end_y1   = int(min(start_y1 + pool_size, upper_bound_h));
+
+    int start_x2 = start_x1 + stride_x;
+    int start_y2 = start_y1;
+    int end_x2   = int(min(start_x2 + pool_size, upper_bound_w));
+    int end_y2   = int(min(start_y2 + pool_size, upper_bound_h));
+
+    //Initialize sum
+    float data_total1 = float(0);
+    float data_total2 = float(0);
+    for(int i = 0; (start_y1 + i) < end_y1; i++)
+        for(int j = 0; (start_x1 + j) < end_x1; j = j + 2)
+        {
+            vec2 data1 = load_and_unpack(src, tensor3D_offset_fp16(src, j, i, 0) >> uint(2));
+#if defined(POOL_L2)
+            // Raise to power of 2 for L2 Pooling
+            data1 = POW2_OP(data1, 2);
+#endif /* defined(POOL_L2) */
+            //Calculate sum1
+            if((start_x1 + j + 1) < end_x1)
+            {
+                data_total1 = data_total1 + data1.x + data1.y;
+            }
+            else
+            {
+                data_total1 = data_total1 + data1.x;
+            }
+
+            //Calculate sum2
+            if((start_x2 + j) < end_x2 && end_x1 < upper_bound_w)
+            {
+                if((stride_x % 2) == 0)
+                {
+                    vec2 data2 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x + 1), i, 0) >> uint(2)));
+#if defined(POOL_L2)
+                    // Raise to power of 2 for L2 Pooling
+                    data2 = POW2_OP(data2, 2);
+#endif /* defined(POOL_L2) */
+                    if((start_x2 + j + 1) < end_x2)
+                    {
+                        data_total2 = data_total2 + data2.x + data2.y;
+                    }
+                    else
+                    {
+                        data_total2 = data_total2 + data2.x;
+                    }
+                }
+                else
+                {
+                    vec2 data2 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x - 1), i, 0) >> uint(2)));
+                    vec2 data3 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x + 1), i, 0) >> uint(2)));
+#if defined(POOL_L2)
+                    // Raise to power of 2 for L2 Pooling
+                    data2 = POW2_OP(data2, 2);
+                    data3 = POW2_OP(data3, 2);
+#endif /* defined(POOL_L2) */
+                    if((start_x2 + j + 1) < end_x2)
+                    {
+                        data_total2 = data_total2 + data3.x + data2.y;
+                    }
+                    else
+                    {
+                        data_total2 = data_total2 + data2.y;
+                    }
+                }
+            }
+        }
+    //Calculate average
+    vec2 data_avg;
+    data_avg.x = data_total1 / float((end_y1 - start_y1) * (end_x1 - start_x1));
+    data_avg.y = data_total2 / float((end_y2 - start_y2) * (end_x2 - start_x2));
+
+    return data_avg;
+}
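+
+// As in the F32 path, each of the two outputs is divided by its own clamped window
+// area; the second window is offset by stride_x, so its area can differ from the
+// first one near the right edge.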
+
+#ifdef POOLING_LAYER_2
+/** Performs a pooling function of pool size equal to 2.
+ *
+ * @note Supported data types are F16;
+ * @note In case of average pooling the following information must be passed at compile time:
+ *       POOL_AVG must be provided otherwise max pooling will be performed.
+ *       MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in the x and y dimensions (width + pad)
+ *       STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ *       PAD_X and PAD_Y which are the pooling paddings in the x and y dimensions
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: F16
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+    // Get pixels pointer
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+
+    //Load and calculate data
+    vec2 data;
+    uint res;
+#if defined(POOL_AVG) || defined(POOL_L2)
+    data = calculate_avg(2, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#else  /* POOL_AVG || POOL_L2 */
+    data = calculate_max(2, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#endif /* POOL_AVG || POOL_L2 */
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    data = SQRT_OP(data);
+#endif /* defined(POOL_L2) */
+
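+    // Pack the two FP16 results into a single 32-bit word; the byte offset below is converted to a uint index by >> uint(2)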
+    res = uint(packHalf2x16(data));
+
+    // Store result
+    STORE1(dst, CURRENT_OFFSET(dst) >> uint(2), res);
+}
+
+#elif defined(POOLING_LAYER_3)
+/** Performs a pooling function of pool size equal to 3.
+ *
+ * @note Supported data types are F16;
+ * @note In case of average pooling the following information must be passed at compile time:
+ *       POOL_AVG must be provided otherwise max pooling will be performed.
+ *       MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ *       STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ *       PAD_X and PAD_Y which are the pooling paddings in x and y dimensions
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: F16
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+    // Get pixels pointer
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+
+    // Load and calculate data
+    vec2 data;
+    uint res;
+#if defined(POOL_AVG) || defined(POOL_L2)
+    data = calculate_avg(3, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#else  /* POOL_AVG || POOL_L2 */
+    data = calculate_max(3, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#endif /* POOL_AVG || POOL_L2 */
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    data = SQRT_OP(data);
+#endif /* defined(POOL_L2) */
+
+    res = uint(packHalf2x16(data));
+
+    // Store result
+    STORE1(dst, CURRENT_OFFSET(dst) >> uint(2), res);
+}
+
+#elif defined(POOLING_LAYER_3_OPTIMIZED)
+/** Performs an optimized pooling function of pool size equal to 3 when stride_x is less than or equal to 3
+ *
+ * @note Supported data types are F16;
+ * @note In case of average pooling the following information must be passed at compile time:
+ *       POOL_AVG must be provided otherwise max pooling will be performed.
+ *       MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ *       STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ *       PAD_X and PAD_Y which are the pooling paddings in x and y dimensions
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: F16
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+    // Get pixels pointer
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+
+    vec4 res;
+    // Perform pooling 3x3 for 4 output elements
+#if STRIDE_X == 1
+    POOLING3x3_STRIDE1_fp16(res, src, dst);
+#elif STRIDE_X == 2
+    POOLING3x3_STRIDE2_fp16(res, src, dst);
+#elif STRIDE_X == 3
+    POOLING3x3_STRIDE3_fp16(res, src, dst);
+#endif /*STRIDE_X == 1*/
+
+    // Divide by pool region in case of average pooling
+#if defined(POOL_AVG) || defined(POOL_L2)
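+    // Compute the clamped pooling-window bounds of the 4 output elements; windows crossing the right/bottom border shrink so the average divides by valid elements only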
+    ivec4 start_x = ((ivec4(int(gl_GlobalInvocationID.x) * 4) + ivec4(0, 1, 2, 3)) * (ivec4(STRIDE_X))) - (ivec4(PAD_X));
+    int   start_y = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y;
+    ivec4 end_x   = min((start_x + (ivec4(3))), (ivec4(MAX_WIDTH)));
+    int   end_y   = min((start_y + 3), MAX_HEIGHT);
+    res *= (vec4((1.f)) / vec4((ivec4(end_y - start_y)) * (end_x - start_x)));
+#endif /* POOL_AVG || POOL_L2 */
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+    STORE4_fp16(dst, CURRENT_OFFSET(dst) >> uint(3), res);
+}
+
+#elif defined(POOLING_LAYER_7)
+/** Performs a pooling function of pool size equal to 7.
+ *
+ * @note Supported data types are F16;
+ * @note In case of average pooling the following information must be passed at compile time:
+ *       POOL_AVG must be provided otherwise max pooling will be performed.
+ *       MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ *       STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ *       PAD_X and PAD_Y which are the pooling paddings in x and y dimensions
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: F16
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+    // Get pixels pointer
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+
+    // Load and calculate data
+    vec2 data;
+    uint res;
+#if defined(POOL_AVG) || defined(POOL_L2)
+    data = calculate_avg(7, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#else  /* POOL_AVG || POOL_L2 */
+    data = calculate_max(7, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#endif /* POOL_AVG || POOL_L2 */
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    data = SQRT_OP(data);
+#endif /* defined(POOL_L2) */
+
+    res = uint(packHalf2x16(data));
+
+    // Store result
+    STORE1(dst, CURRENT_OFFSET(dst) >> uint(2), res);
+}
+
+#elif defined(POOLING_LAYER_N)
+/** Performs a pooling function of pool size equal to N
+ *
+ * @note Supported data types are F16;
+ * @note Pool size must be passed using POOL_SIZE e.g. POOL_SIZE=13;
+ * @note In case of average pooling the following information must be passed at compile time:
+ *       POOL_AVG must be provided otherwise max pooling will be performed.
+ *       MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ *       STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ *       PAD_X and PAD_Y which are the pooling paddings in x and y dimensions
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: F16
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+    // Get pixels pointer
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+
+    vec4 vdata00;
+    vdata00 = vec4(INITIAL_VALUE);
+    vec4 vdata01;
+    vdata01 = vec4(INITIAL_VALUE);
+    vec4 vdata10;
+    vdata10 = vec4(INITIAL_VALUE);
+    vec4 vdata11;
+    vdata11 = vec4(INITIAL_VALUE);
+    vec2 sdata;
+    sdata = vec2(INITIAL_VALUE);
+
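+    // Each invocation produces two horizontally adjacent outputs: the first window accumulates
+    // into vdata00/vdata10 (8 values per vectorized iteration, odd tail in sdata.x), the second
+    // window into vdata01/vdata11 and sdata.y.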
+    for(int y = 0; y < int(POOL_SIZE); y++)
+    {
+        int x = 0;
+        for(; x <= (int(POOL_SIZE) - 8); x += 8)
+        {
+            vec4 data2;
+            vec4 data3;
+            LOAD4_fp16(data2, src, (tensor3D_offset_fp16(src, x, y, 0) >> uint(2)));
+            LOAD4_fp16(data3, src, (tensor3D_offset_fp16(src, x, y, 0) >> uint(2)) + uint(2));
+
+#if defined(POOL_L2)
+            // Raise to power of 2 for L2 Pooling
+            data2 *= data2;
+            data3 *= data3;
+#endif /* defined(POOL_L2) */
+
+            POOL_OP(vdata00, vdata00, data2);
+            POOL_OP(vdata10, vdata10, data3);
+        }
+
+        // Leftover
+        for(; x < int(POOL_SIZE); x = x + 2)
+        {
+            vec2 data4middle;
+            data4middle = load_and_unpack(src, (tensor3D_offset_fp16(src, x, y, 0) >> uint(2)));
+#if defined(POOL_L2)
+            // Raise to power of 2 for L2 Pooling
+            data4middle *= data4middle;
+#endif /* defined(POOL_L2) */
+            if((x + 1) >= int(POOL_SIZE))
+            {
+                POOL_OP_float(sdata.x, sdata.x, data4middle.x);
+            }
+            else
+            {
+                float data4;
+                POOL_OP_float(data4, data4middle.x, data4middle.y);
+                POOL_OP_float(sdata.x, sdata.x, data4);
+            }
+        }
+    }
+
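+    // Accumulate the second pooling window, whose x range is shifted by STRIDE_X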
+    for(int y = 0; y < int(POOL_SIZE); y++)
+    {
+        int x1 = STRIDE_X;
+        for(; x1 <= (int(POOL_SIZE + STRIDE_X) - 8); x1 += 8)
+        {
+            vec4 data2;
+            vec4 data3;
+            LOAD4_fp16(data2, src, (tensor3D_offset_fp16(src, x1, y, 0) >> uint(2)));
+            LOAD4_fp16(data3, src, (tensor3D_offset_fp16(src, x1, y, 0) >> uint(2)) + uint(2));
+
+#if defined(POOL_L2)
+            // Raise to power of 2 for L2 Pooling
+            data2 *= data2;
+            data3 *= data3;
+#endif /* defined(POOL_L2) */
+
+            POOL_OP(vdata01, vdata01, data2);
+            POOL_OP(vdata11, vdata11, data3);
+        }
+
+        // Leftover
+        for(; x1 < int(POOL_SIZE + STRIDE_X); x1 = x1 + 2)
+        {
+            vec2 data4middle;
+            data4middle = load_and_unpack(src, (tensor3D_offset_fp16(src, x1, y, 0) >> uint(2)));
+#if defined(POOL_L2)
+            // Raise to power of 2 for L2 Pooling
+            data4middle *= data4middle;
+#endif /* defined(POOL_L2) */
+            if((x1 + 1) >= int(POOL_SIZE + STRIDE_X))
+            {
+                POOL_OP_float(sdata.y, sdata.y, data4middle.x);
+            }
+            else
+            {
+                float data4;
+                POOL_OP_float(data4, data4middle.x, data4middle.y);
+                POOL_OP_float(sdata.y, sdata.y, data4);
+            }
+        }
+    }
+
+    // Reduce result
+    vec4 reduce40;
+    POOL_OP(reduce40, vdata00.xyzw, vdata10.xyzw);
+    vec2 reduce20;
+    POOL_OP_vec2(reduce20, reduce40.xy, reduce40.zw);
+    vec4 reduce41;
+    POOL_OP(reduce41, vdata01.xyzw, vdata11.xyzw);
+    vec2 reduce21;
+    POOL_OP_vec2(reduce21, reduce41.xy, reduce41.zw);
+    vec2 data;
+    POOL_OP_float(data.x, reduce20.x, reduce20.y);
+    POOL_OP_float(data.x, data.x, sdata.x);
+    POOL_OP_float(data.y, reduce21.x, reduce21.y);
+    POOL_OP_float(data.y, data.y, sdata.y);
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+    {
+        // Divide by pool region in case of average pooling
+        int  start_x1 = int(gl_GlobalInvocationID.x) * STRIDE_X - PAD_X;
+        int  start_y1 = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y;
+        int  end_x1   = int(min(start_x1 + POOL_SIZE, MAX_WIDTH));
+        int  end_y1   = int(min(start_y1 + POOL_SIZE, MAX_HEIGHT));
+        int  start_x2 = start_x1 + STRIDE_X;
+        int  start_y2 = start_y1;
+        int  end_x2   = int(min(start_x2 + POOL_SIZE, MAX_WIDTH));
+        int  end_y2   = int(min(start_y2 + POOL_SIZE, MAX_HEIGHT));
+        vec2 res1;
+        res1.x = float((end_y1 - start_y1) * (end_x1 - start_x1));
+        res1.y = float((end_y2 - start_y2) * (end_x2 - start_x2));
+        data.x = DIV_OP(data.x, res1.x);
+        data.y = DIV_OP(data.y, res1.y);
+    }
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    data = SQRT_OP(data);
+#endif /* defined(POOL_L2) */
+    uint res;
+    res = uint(packHalf2x16(data));
+
+    // Store result
+    STORE1(dst, CURRENT_OFFSET(dst) >> uint(2), res);
+}
+#endif /*POOLING_LAYER_2*/
+#endif /*DATA_TYPE_FP32 */
diff --git a/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs
new file mode 100644
index 0000000..0bbabea
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs
@@ -0,0 +1,541 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+
+#include "helpers.h"
+
+#define MAX_OP(x, y) max((x), (y))
+#define ADD_OP(x, y) ((x) + (y))
+#define SUB_OP(x, y) ((x) - (y))
+#define DIV_OP(x, y) ((x) / (y))
+#define EXP_OP(x) exp((x))
+
+#if defined(DATA_TYPE_FP32)
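+// -1.0 / 0.0 evaluates to negative infinity, the identity element for the max reduction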
+const float MINVAL   = -1.0 / 0.0;
+vec4        type_min = CONVERT(MINVAL, vec4);
+
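+// LOAD16/STORE16 emulate a 16-byte vec4 access with four scalar loads/stores at consecutive uint offsets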
+#define LOAD16(name, offset)            \
+    vec4(LOAD4(name, offset),           \
+         LOAD4(name, offset + uint(1)), \
+         LOAD4(name, offset + uint(2)), \
+         LOAD4(name, offset + uint(3)))
+
+#define STORE16(name, offset, value)         \
+    STORE4(name, offset, value.x);           \
+    STORE4(name, offset + uint(1), value.y); \
+    STORE4(name, offset + uint(2), value.z); \
+    STORE4(name, offset + uint(3), value.w)
+
+#ifdef SOFTMAX_LAYER_MAX
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(dst, 2, float, writeonly);
+#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM)
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(max, 2, float, readonly);
+BUFFER_DECLARATION(dst, 3, float, writeonly);
+BUFFER_DECLARATION(sum, 4, float, writeonly);
+#elif defined(SOFTMAX_LAYER_NORM)
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(sum, 2, float, readonly);
+BUFFER_DECLARATION(dst, 3, float, writeonly);
+#endif // SOFTMAX_LAYER_MAX
+
+layout(std140) uniform shader_params
+{
+#ifdef SOFTMAX_LAYER_MAX
+    TENSOR3D_PARAM_DECLARATION(src);
+    TENSOR3D_PARAM_DECLARATION(dst);
+    uint width;
+#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM)
+    TENSOR3D_PARAM_DECLARATION(src);
+    TENSOR3D_PARAM_DECLARATION(max);
+    TENSOR3D_PARAM_DECLARATION(dst);
+    TENSOR3D_PARAM_DECLARATION(sum);
+    uint width;
+#elif defined(SOFTMAX_LAYER_NORM)
+    TENSOR3D_PARAM_DECLARATION(src);
+    TENSOR3D_PARAM_DECLARATION(sum);
+    TENSOR3D_PARAM_DECLARATION(dst);
+#endif // SOFTMAX_LAYER_MAX
+};
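+
+// Softmax runs in three passes: SOFTMAX_LAYER_MAX finds the per-row maximum,
+// SOFTMAX_LAYER_SHIFT_EXP_SUM computes exp(x - max) and the per-row sum, and
+// SOFTMAX_LAYER_NORM divides every element by that sum.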
+
+#ifdef SOFTMAX_LAYER_MAX
+/** Identifies the maximum value across the 1st dimension.
+ *
+ * @note Datatype must be given as a preprocessor argument using "#define DATA_TYPE_FP32"
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  width                             Input image width
+ */
+void main(void)
+{
+    Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+
+    // Initialize local maximum
+    vec4 max_val = CONVERT(type_min, vec4);
+
+    // Calculate max of row
+    uint width2 = width >> 2;
+    for(int i = 0; i < int(width2); i++)
+    {
+        vec4 data = LOAD16(src, offset(src, i << 2, 0));
+        max_val   = MAX_OP(data, max_val);
+    }
+
+#ifdef NON_MULTIPLE_OF_4
+    // Handle non multiple of 4
+    for(int i = int(width2 << 2); i < int(width); i++)
+    {
+        float data = LOAD4(src, offset(src, i, 0));
+        max_val.x  = MAX_OP(data, max_val.x);
+    }
+#endif /* NON_MULTIPLE_OF_4 */
+
+    // Perform max reduction
+    max_val.xy = MAX_OP(max_val.xy, max_val.zw);
+    max_val.x  = MAX_OP(max_val.x, max_val.y);
+
+    // Store result
+    STORE4(dst, CURRENT_OFFSET(dst), max_val.x);
+}
+#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM) // SOFTMAX_LAYER_MAX
+/** Shifts the values of the input tensor by the max calculated in the softmax_layer_max kernel,
+ * then takes the exponent of each element and sums all elements across each row.
+ *
+ * @note Datatype must be given as a preprocessor argument using "#define DATA_TYPE_FP32"
+ *
+ * @note In case the input is not a multiple of 4, NON_MULTIPLE_OF_4 must be passed.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  max_ptr                           Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  max_stride_x                      Stride of the max values tensor in X dimension (in bytes)
+ * @param[in]  max_step_x                        max_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  max_stride_y                      Stride of the max values tensor in Y dimension (in bytes)
+ * @param[in]  max_step_y                        max_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  max_stride_z                      Stride of the max values tensor in Z dimension (in bytes)
+ * @param[in]  max_step_z                        max_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  max_offset_first_element_in_bytes The offset of the first element in the max values tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[out] sum_ptr                           Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  sum_stride_x                      Stride of the sum values tensor in X dimension (in bytes)
+ * @param[in]  sum_step_x                        sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_stride_y                      Stride of the sum values tensor in Y dimension (in bytes)
+ * @param[in]  sum_step_y                        sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_stride_z                      Stride of the sum values tensor in Z dimension (in bytes)
+ * @param[in]  sum_step_z                        sum_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
+ * @param[in]  width                             Input image width
+ */
+void main(void)
+{
+    Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    Image max = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(max);
+    Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
+
+    // Load max value of 1D logits vector (row)
+    vec4 max_val = CONVERT(LOAD4(max, CURRENT_OFFSET(max)), vec4);
+
+    // Set sum vector
+    vec4 sum1D = CONVERT(0, vec4);
+
+    // Shift values, exp and sum
+    uint width2 = width >> 2;
+    for(int i = 0; i < int(width2); i++)
+    {
+        vec4 data = LOAD16(src, offset(src, i << 2, 0));
+        data      = SUB_OP(data, max_val);
+        data      = EXP_OP(data);
+        STORE16(dst, offset(dst, i << 2, 0), data);
+        sum1D = ADD_OP(sum1D, data);
+    }
+
+#ifdef NON_MULTIPLE_OF_4
+    // Handle non multiple of 4
+    for(int i = int(width2 << 2); i < int(width); i++)
+    {
+        float data;
+        data = LOAD4(src, offset(src, i, 0));
+        data = SUB_OP(data, max_val.x);
+        data = EXP_OP(data);
+        STORE4(dst, offset(dst, i, 0), data);
+        sum1D.x = ADD_OP(sum1D.x, data);
+    }
+#endif                            /* NON_MULTIPLE_OF_4 */
+
+    // Perform sum reduction
+    sum1D.xy = ADD_OP(sum1D.xy, sum1D.zw);
+    sum1D.x  = ADD_OP(sum1D.x, sum1D.y);
+
+    // Calculate and store result
+    STORE4(sum, CURRENT_OFFSET(sum), sum1D.x);
+}
+#elif defined(SOFTMAX_LAYER_NORM) // SOFTMAX_LAYER_MAX
+/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel.
+ *
+ * @note Datatype must be given as a preprocessor argument using "#define DATA_TYPE_FP32"
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  sum_ptr                           Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  sum_stride_x                      Stride of the sum values tensor in X dimension (in bytes)
+ * @param[in]  sum_step_x                        sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_stride_y                      Stride of the sum values tensor in Y dimension (in bytes)
+ * @param[in]  sum_step_y                        sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_stride_z                      Stride of the sum values tensor in Z dimension (in bytes)
+ * @param[in]  sum_step_z                        sum_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+void main(void)
+{
+    Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);
+
+    // Load the sum value of the 1D logits vector (row)
+    vec4 sum_val = CONVERT(LOAD4(sum, offset(sum, 0, int(gl_GlobalInvocationID.y))), vec4);
+    vec4 data    = LOAD16(src, CURRENT_OFFSET(src));
+    STORE16(dst, CURRENT_OFFSET(dst), DIV_OP(data, sum_val));
+}
+#endif                            // SOFTMAX_LAYER_MAX
+
+#elif defined(DATA_TYPE_FP16)
+precision mediump float;
+
+const float MINVAL1   = -1.0 / 0.0;
+vec4        type_min1 = CONVERT(MINVAL1, vec4);
+
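+// Load/store four FP16 values as two packed 32-bit words at a 2D image offset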
+#define GC_LOAD4_IMAGE(r, name, x, y)  \
+    load_and_unpack(r.xy, name, x, y); \
+    load_and_unpack(r.zw, name, (x + 2), y)
+
+#define GC_STORE4_IMAGE(r, name, x, y)                         \
+    GC_STORE1_2D_OFFSET(uint(packHalf2x16(r.xy)), name, x, y); \
+    GC_STORE1_2D_OFFSET(uint(packHalf2x16(r.zw)), name, (x + 2), y)
+
+#ifdef SOFTMAX_LAYER_MAX
+BUFFER_DECLARATION(src, 1, uint, readonly);
+BUFFER_DECLARATION(dst, 2, uint, writeonly);
+#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM)
+BUFFER_DECLARATION(src, 1, uint, readonly);
+BUFFER_DECLARATION(max, 2, uint, readonly);
+BUFFER_DECLARATION(dst, 3, uint, writeonly);
+BUFFER_DECLARATION(sum, 4, uint, writeonly);
+#elif defined(SOFTMAX_LAYER_NORM)
+BUFFER_DECLARATION(src, 1, uint, readonly);
+BUFFER_DECLARATION(sum, 2, uint, readonly);
+BUFFER_DECLARATION(dst, 3, uint, writeonly);
+#endif // SOFTMAX_LAYER_MAX
+
+layout(std140) uniform shader_params
+{
+#ifdef SOFTMAX_LAYER_MAX
+    TENSOR3D_PARAM_DECLARATION(src);
+    TENSOR3D_PARAM_DECLARATION(dst);
+    uint width;
+#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM)
+    TENSOR3D_PARAM_DECLARATION(src);
+    TENSOR3D_PARAM_DECLARATION(max);
+    TENSOR3D_PARAM_DECLARATION(dst);
+    TENSOR3D_PARAM_DECLARATION(sum);
+    uint width;
+#elif defined(SOFTMAX_LAYER_NORM)
+    TENSOR3D_PARAM_DECLARATION(src);
+    TENSOR3D_PARAM_DECLARATION(sum);
+    TENSOR3D_PARAM_DECLARATION(dst);
+#endif // SOFTMAX_LAYER_MAX
+};
+
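+// Reads one packed 32-bit word (two FP16 values) at a 2D offset and unpacks it into a vec2;
+// the do/while(false) keeps the multi-statement macro safe in single-statement contexts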
+#define load_and_unpack(rs, names, xs, ys)           \
+    do                                               \
+    {                                                \
+        uint packed_s;                               \
+        GC_LOAD1_2D_OFFSET(packed_s, names, xs, ys); \
+        rs = vec2(unpackHalf2x16(packed_s));         \
+    } while(false)
+
+#ifdef SOFTMAX_LAYER_MAX
+/** Identifies the maximum value across the 1st dimension.
+ *
+ * @note Datatype must be given as a preprocessor argument using "#define DATA_TYPE_FP16"
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  width                             Input image width
+ */
+void main(void)
+{
+    Image src = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image dst = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+
+    // Initialize local maximum
+    vec4 max_val1 = CONVERT(type_min1, vec4);
+
+    // Calculate max of row
+    uint width2 = width >> 2;
+    for(int i = 0; i < int(width2); i++)
+    {
+        vec4 data1;
+        GC_LOAD4_IMAGE(data1, src, (i << 2), 0);
+        max_val1 = MAX_OP(data1, max_val1);
+    }
+
+#ifdef NON_MULTIPLE_OF_4
+    // Handle non multiple of 4
+    for(int i = int(width2 << 2); i < int(width); i = i + 2)
+    {
+        vec2 data;
+        load_and_unpack(data, src, i, 0);
+        max_val1.x = MAX_OP(data.x, max_val1.x);
+        if((i + 1) < int(width))
+        {
+            max_val1.x = MAX_OP(data.y, max_val1.x);
+        }
+    }
+#endif                                     /* NON_MULTIPLE_OF_4 */
+
+    // Perform max reduction
+    max_val1.xy = MAX_OP(max_val1.xy, max_val1.zw);
+    max_val1.x  = MAX_OP(max_val1.x, max_val1.y);
+    vec2 res1   = vec2(max_val1.x, 0.f);
+    uint res;
+    res = uint(packHalf2x16(res1));
+
+    // Store result
+    GC_STORE1_2D_OFFSET(res, dst, 0, 0);
+}
+#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM) // SOFTMAX_LAYER_MAX
+/** Shifts the values of the input tensor by the max calculated in the softmax_layer_max kernel,
+ * then takes the exponent of each element and sums all elements across each row.
+ *
+ * @note Datatype must be given as a preprocessor argument using "#define DATA_TYPE_FP16"
+ *
+ * @note In case the input is not a multiple of 4, NON_MULTIPLE_OF_4 must be passed.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  max_ptr                           Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  max_stride_x                      Stride of the max values tensor in X dimension (in bytes)
+ * @param[in]  max_step_x                        max_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  max_stride_y                      Stride of the max values tensor in Y dimension (in bytes)
+ * @param[in]  max_step_y                        max_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  max_stride_z                      Stride of the max values tensor in Z dimension (in bytes)
+ * @param[in]  max_step_z                        max_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  max_offset_first_element_in_bytes The offset of the first element in the max values tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[out] sum_ptr                           Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  sum_stride_x                      Stride of the sum values tensor in X dimension (in bytes)
+ * @param[in]  sum_step_x                        sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_stride_y                      Stride of the sum values tensor in Y dimension (in bytes)
+ * @param[in]  sum_step_y                        sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_stride_z                      Stride of the sum values tensor in Z dimension (in bytes)
+ * @param[in]  sum_step_z                        sum_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
+ * @param[in]  width                             Input image width
+ */
+void main(void)
+{
+    Image src = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image dst = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    Image max = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(max);
+    Image sum = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
+
+    // Load max value of 1D logits vector (row)
+    vec2 datamaxinit;
+    load_and_unpack(datamaxinit, max, 0, 0);
+    vec4 max_val = CONVERT(datamaxinit.x, vec4);
+
+    // Set sum vector
+    vec4 sum1D1 = CONVERT(0.f, vec4);
+
+    // Shift values, exp and sum
+    uint width2 = width >> 2;
+    for(int i = 0; i < int(width2); i++)
+    {
+        vec4 data;
+        GC_LOAD4_IMAGE(data, src, (i << 2), 0);
+        data = SUB_OP(data, max_val);
+        data = EXP_OP(data);
+        GC_STORE4_IMAGE(data, dst, (i << 2), 0);
+        sum1D1 = ADD_OP(sum1D1, data);
+    }
+
+#ifdef NON_MULTIPLE_OF_4
+    // Handle non multiple of 4
+    for(int i = int(width2 << 2); i < int(width); i = i + 2)
+    {
+        vec2  datamiddle;
+        float data1;
+        load_and_unpack(datamiddle, src, i, 0);
+        data1 = SUB_OP(datamiddle.x, max_val.x);
+        data1 = EXP_OP(data1);
+        vec2 datares1;
+        if((i + 1) < int(width))
+        {
+            float data2;
+            data2    = SUB_OP(datamiddle.y, max_val.x);
+            data2    = EXP_OP(data2);
+            datares1 = vec2(data1, data2);
+            data1    = ADD_OP(data2, data1);
+        }
+        else
+        {
+            datares1 = vec2(data1, 0.f);
+        }
+        uint datares;
+        datares = uint(packHalf2x16(datares1));
+        GC_STORE1_2D_OFFSET(datares, dst, i, 0);
+        sum1D1.x = ADD_OP(sum1D1.x, data1);
+    }
+#endif                            /* NON_MULTIPLE_OF_4 */
+
+    // Perform sum reduction
+    sum1D1.xy = ADD_OP(sum1D1.xy, sum1D1.zw);
+    sum1D1.x  = ADD_OP(sum1D1.x, sum1D1.y);
+    vec2 res1 = vec2(sum1D1.x, 0.f);
+    uint res;
+    res = uint(packHalf2x16(res1));
+    // Calculate and store result
+    GC_STORE1_2D_OFFSET(res, sum, 0, 0);
+}
+#elif defined(SOFTMAX_LAYER_NORM) // SOFTMAX_LAYER_MAX
+/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel.
+ *
+ * @note Datatype must be given as a preprocessor argument using "#define DATA_TYPE_FP16"
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  sum_ptr                           Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  sum_stride_x                      Stride of the sum values tensor in X dimension (in bytes)
+ * @param[in]  sum_step_x                        sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_stride_y                      Stride of the sum values tensor in Y dimension (in bytes)
+ * @param[in]  sum_step_y                        sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_stride_z                      Stride of the sum values tensor in Z dimension (in bytes)
+ * @param[in]  sum_step_z                        sum_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+void main(void)
+{
+    Image src = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image dst = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    Image sum = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);
+
+    // Load the sum value of the 1D logits vector (row)
+    vec2 sum1;
+    load_and_unpack(sum1, sum, 0, int(gl_GlobalInvocationID.y));
+    vec4 sum_val1 = CONVERT(sum1.x, vec4);
+
+    vec4 data1;
+    GC_LOAD4_IMAGE(data1, src, 0, 0);
+    vec4 res = DIV_OP(data1, sum_val1);
+    GC_STORE4_IMAGE(res, dst, 0, 0);
+}
+#endif                            // SOFTMAX_LAYER_MAX
+#endif                            // DATA_TYPE_FP32
\ No newline at end of file
diff --git a/src/core/GLES_COMPUTE/cs_shaders/transpose.cs b/src/core/GLES_COMPUTE/cs_shaders/transpose.cs
new file mode 100755
index 0000000..6d020fe
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/transpose.cs
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+#include "helpers.h"
+
+#ifdef DATA_TYPE_FP32
+precision highp float;
+
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(dst, 2, float, writeonly);
+
+layout(std140) uniform shader_params
+{
+    IMAGE_PARAM_DECLARATION(src);
+    IMAGE_PARAM_DECLARATION(dst);
+};
+
+#define LOAD16(r, name, offset)          \
+    r.x = LOAD4(name, offset);           \
+    r.y = LOAD4(name, offset + uint(1)); \
+    r.z = LOAD4(name, offset + uint(2)); \
+    r.w = LOAD4(name, offset + uint(3))
+
+#define STORE16(name, offset, r)         \
+    STORE4(name, offset, r.x);           \
+    STORE4(name, offset + uint(1), r.y); \
+    STORE4(name, offset + uint(2), r.z); \
+    STORE4(name, offset + uint(3), r.w)
+
+/** This OpenGL ES kernel computes the matrix transposition of input matrix
+ *
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+void main(void)
+{
+    // Compute source address
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Load the NxN block at (x, y)
+    vec4 u0;
+    vec4 u1;
+    vec4 u2;
+    vec4 u3;
+    LOAD16(u0, src, offset(src, 0, 0));
+    LOAD16(u1, src, offset(src, 0, 1));
+    LOAD16(u2, src, offset(src, 0, 2));
+    LOAD16(u3, src, offset(src, 0, 3));
+
+    // Transpose the block
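+    // Swap elements across the main diagonal in registers, parking values that would be overwritten in tmp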
+    vec4 tmp;
+    tmp.xyz = u0.yzw;
+    u0.y    = u1.x;
+    u0.z    = u2.x;
+    u0.w    = u3.x;
+    u1.x    = tmp.x;
+    u2.x    = tmp.y;
+    u3.x    = tmp.z;
+    tmp.xy  = u1.zw;
+    u1.z    = u2.y;
+    u1.w    = u3.y;
+    u2.y    = tmp.x;
+    u3.y    = tmp.y;
+    tmp.x   = u2.w;
+    u2.w    = u3.z;
+    u3.z    = tmp.x;
+
+    // Store the block at (y, x)
+    uint dst_offset_in_bytes = uint(16) * uint(gl_GlobalInvocationID.y) + uint(4) * uint(gl_GlobalInvocationID.x) * (dst.stride_y) + (dst.offset_first_element_in_bytes);
+
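+    // Offsets are in bytes; >> 2 converts them to float-element indices for the stores below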
+    STORE16(dst, uint((dst_offset_in_bytes + uint(0) * dst.stride_y) >> 2), u0);
+    STORE16(dst, uint((dst_offset_in_bytes + uint(1) * dst.stride_y) >> 2), u1);
+    STORE16(dst, uint((dst_offset_in_bytes + uint(2) * dst.stride_y) >> 2), u2);
+    STORE16(dst, uint((dst_offset_in_bytes + uint(3) * dst.stride_y) >> 2), u3);
+}
+
+#elif defined(DATA_TYPE_FP16)
+precision mediump float;
+
+BUFFER_DECLARATION(src, 1, uvec2, readonly);
+BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
+
+layout(std140) uniform shader_params
+{
+    IMAGE_PARAM_DECLARATION(src);
+    IMAGE_PARAM_DECLARATION(dst);
+};
+
+/** This OpenGL ES kernel computes the matrix transposition of input matrix
+ *
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: F16
+ * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+void main(void)
+{
+    // Compute source address
+    Image src = GC_CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = GC_CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Load the NxN block at (x, y)
+    vec4  u0;
+    vec4  u1;
+    vec4  u2;
+    vec4  u3;
+    uvec2 packed_s[4];
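+    // Each uvec2 row holds four packed FP16 values, two per 32-bit component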
+    GC_LOAD1_2D_OFFSET(packed_s[0], src, 0, 0);
+    GC_LOAD1_2D_OFFSET(packed_s[1], src, 0, 1);
+    GC_LOAD1_2D_OFFSET(packed_s[2], src, 0, 2);
+    GC_LOAD1_2D_OFFSET(packed_s[3], src, 0, 3);
+    u0 = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y));
+    u1 = vec4(unpackHalf2x16(packed_s[1].x), unpackHalf2x16(packed_s[1].y));
+    u2 = vec4(unpackHalf2x16(packed_s[2].x), unpackHalf2x16(packed_s[2].y));
+    u3 = vec4(unpackHalf2x16(packed_s[3].x), unpackHalf2x16(packed_s[3].y));
+
+    // Transpose the block
+    vec4 tmp;
+    tmp.xyz = u0.yzw;
+    u0.y    = u1.x;
+    u0.z    = u2.x;
+    u0.w    = u3.x;
+    u1.x    = tmp.x;
+    u2.x    = tmp.y;
+    u3.x    = tmp.z;
+    tmp.xy  = u1.zw;
+    u1.z    = u2.y;
+    u1.w    = u3.y;
+    u2.y    = tmp.x;
+    u3.y    = tmp.y;
+    tmp.x   = u2.w;
+    u2.w    = u3.z;
+    u3.z    = tmp.x;
+
+    // Store the block at (y, x)
+    uint dst_offset_in_bytes = uint(8) * uint(gl_GlobalInvocationID.y) + uint(gl_GlobalInvocationID.x) * (dst_step_y) + (dst.offset_first_element_in_bytes);
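+    // Offsets are in bytes; >> 3 converts them to uvec2-element (8-byte) indices for the stores below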
+
+    packed_s[0] = uvec2(packHalf2x16(u0.xy), packHalf2x16(u0.zw));
+    packed_s[1] = uvec2(packHalf2x16(u1.xy), packHalf2x16(u1.zw));
+    packed_s[2] = uvec2(packHalf2x16(u2.xy), packHalf2x16(u2.zw));
+    packed_s[3] = uvec2(packHalf2x16(u3.xy), packHalf2x16(u3.zw));
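+    // dst elements are uvec2 (8 bytes each), so byte offsets are converted to
+    // element indices with >> 3 before storing.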
+    GC_STORE1(packed_s[0], dst, uint((dst_offset_in_bytes + uint(0) * dst_stride_y) >> 3));
+    GC_STORE1(packed_s[1], dst, uint((dst_offset_in_bytes + uint(1) * dst_stride_y) >> 3));
+    GC_STORE1(packed_s[2], dst, uint((dst_offset_in_bytes + uint(2) * dst_stride_y) >> 3));
+    GC_STORE1(packed_s[3], dst, uint((dst_offset_in_bytes + uint(3) * dst_stride_y) >> 3));
+}
+#endif /* DATA_TYPE_FP16 */
diff --git a/src/core/GLES_COMPUTE/egl_entries.in b/src/core/GLES_COMPUTE/egl_entries.in
new file mode 100644
index 0000000..64ccda6
--- /dev/null
+++ b/src/core/GLES_COMPUTE/egl_entries.in
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
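+/*
+ * X-macro list of the EGL entry points resolved at runtime. The including
+ * translation unit is expected to define EGL_ENTRY before including this
+ * file; a minimal illustrative sketch (not part of the library):
+ *
+ *   struct EGLEntries
+ *   {
+ *   #define EGL_ENTRY(name) decltype(&::name) name = nullptr;
+ *   #include "egl_entries.in"
+ *   #undef EGL_ENTRY
+ *   };
+ */
+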
+EGL_ENTRY(eglGetProcAddress)
+EGL_ENTRY(eglBindAPI)
+EGL_ENTRY(eglChooseConfig)
+EGL_ENTRY(eglCreateContext)
+EGL_ENTRY(eglDestroyContext)
+EGL_ENTRY(eglGetDisplay)
+EGL_ENTRY(eglInitialize)
+EGL_ENTRY(eglMakeCurrent)
+EGL_ENTRY(eglTerminate)
+EGL_ENTRY(eglGetError)
+EGL_ENTRY(eglQueryString)
diff --git a/src/core/GLES_COMPUTE/gl_entries.in b/src/core/GLES_COMPUTE/gl_entries.in
new file mode 100644
index 0000000..15ce8ee
--- /dev/null
+++ b/src/core/GLES_COMPUTE/gl_entries.in
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
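+/*
+ * X-macro list of the GLES entry points used by the backend; consumed the
+ * same way as egl_entries.in, by defining GL_ENTRY before inclusion.
+ */
+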
+GL_ENTRY(glAttachShader)
+GL_ENTRY(glCompileShader)
+GL_ENTRY(glCreateProgram)
+GL_ENTRY(glCreateShader)
+GL_ENTRY(glDeleteProgram)
+GL_ENTRY(glDeleteShader)
+GL_ENTRY(glDetachShader)
+GL_ENTRY(glGetProgramInfoLog)
+GL_ENTRY(glGetProgramiv)
+GL_ENTRY(glGetShaderInfoLog)
+GL_ENTRY(glGetShaderiv)
+GL_ENTRY(glLinkProgram)
+GL_ENTRY(glShaderSource)
+GL_ENTRY(glUseProgram)
+GL_ENTRY(glBindBuffer)
+GL_ENTRY(glBindBufferBase)
+GL_ENTRY(glBufferData)
+GL_ENTRY(glDeleteBuffers)
+GL_ENTRY(glDispatchCompute)
+GL_ENTRY(glFlush)
+GL_ENTRY(glGenBuffers)
+GL_ENTRY(glGetProgramResourceIndex)
+GL_ENTRY(glGetUniformLocation)
+GL_ENTRY(glMapBufferRange)
+GL_ENTRY(glMemoryBarrier)
+GL_ENTRY(glUniform1ui)
+GL_ENTRY(glUnmapBuffer)
+GL_ENTRY(glGetError)
+GL_ENTRY(glGetActiveUniformBlockiv)
+GL_ENTRY(glUniformBlockBinding)
+GL_ENTRY(glGetUniformBlockIndex)
+GL_ENTRY(glGenTextures)
+GL_ENTRY(glDeleteTextures)
+GL_ENTRY(glBindTexture)
+GL_ENTRY(glTexImage2D)
+GL_ENTRY(glGenFramebuffers)
+GL_ENTRY(glDeleteFramebuffers)
+GL_ENTRY(glBindFramebuffer)
+GL_ENTRY(glFramebufferTexture2D)
diff --git a/src/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.cpp
new file mode 100644
index 0000000..d76ae8f
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.cpp
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+GCAbsoluteDifferenceKernel::GCAbsoluteDifferenceKernel()
+    : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void GCAbsoluteDifferenceKernel::configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output);
+
+    _input1 = input1;
+    _input2 = input2;
+    _output = output;
+
+    constexpr unsigned int num_elems_processed_per_iteration = 4;
+
+    // Set kernel build options
+    std::set<std::string> build_opts;
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+
+    // Create kernel
+    _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("absdiff", build_opts));
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowRectangle input1_access(input1->info(), 0, 0, 4, 1);
+    AccessWindowRectangle input2_access(input2->info(), 0, 0, 4, 1);
+    AccessWindowRectangle output_access(output->info(), 0, 0, 4, 1);
+
+    update_window_and_padding(win, input1_access, input2_access, output_access);
+
+    ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
+                                                       input2->info()->valid_region());
+
+    output_access.set_valid_region(win, valid_region);
+
+    _kernel.clear_params();
+
+    // set shader params binding point
+    _kernel.set_shader_params_binding_point(0);
+
+    IGCKernel::configure(win);
+}
+
+void GCAbsoluteDifferenceKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window);
+
+    _kernel.use();
+
+    Window slice = window.first_slice_window_2D();
+    do
+    {
+        unsigned int idx     = 0;
+        unsigned int binding = 1; // SSBO binding starts from 1.
+        add_2D_tensor_argument(idx, _input1, binding++, slice);
+        add_2D_tensor_argument(idx, _input2, binding++, slice);
+        add_2D_tensor_argument(idx, _output, binding++, slice);
+
+        _kernel.update_shader_params();
+
+        enqueue(*this, slice);
+    }
+    while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp
new file mode 100644
index 0000000..42433cf
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.h"
+
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+GCActivationLayerKernel::GCActivationLayerKernel()
+    : _input(nullptr), _output(nullptr)
+{
+}
+
+void GCActivationLayerKernel::configure(IGCTensor *input, IGCTensor *output, ActivationLayerInfo act_info)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+
+    // Default to running in place; _output is overwritten below if an output tensor was provided
+    _input  = input;
+    _output = input;
+
+    if(output != nullptr)
+    {
+        // Output auto initialization if not yet initialized
+        auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+        _output = output;
+    }
+
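+    // Each invocation consumes 4 bytes of input: one F32 element or two packed F16 elements.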
+    unsigned int num_elems_processed_per_iteration = 4 / input->info()->element_size();
+
+    // Set build options
+    std::set<std::string> build_opts;
+    std::string           dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
+    build_opts.emplace(("#define " + string_from_activation_func(act_info.activation())));
+    build_opts.emplace(("#define " + dt_name));
+    build_opts.emplace(("#define A_VAL " + float_to_string_with_full_precision(act_info.a())));
+    build_opts.emplace(("#define B_VAL " + float_to_string_with_full_precision(act_info.b())));
+    build_opts.emplace(("#define LOCAL_SIZE_X " + support::cpp11::to_string(1)));
+    build_opts.emplace(("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1)));
+    build_opts.emplace(("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1)));
+
+    // Create kernel
+    _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("activation_layer", build_opts));
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+    if(output != nullptr)
+    {
+        AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+        update_window_and_padding(win,
+                                  AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
+                                  output_access);
+
+        output_access.set_valid_region(win, input->info()->valid_region());
+    }
+    else
+    {
+        update_window_and_padding(win,
+                                  AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
+    }
+
+    _kernel.clear_params();
+
+    _kernel.set_shader_params_binding_point(0);
+
+    IGCKernel::configure(win);
+}
+
+void GCActivationLayerKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window);
+
+    _kernel.use();
+
+    Window slice = window.first_slice_window_3D();
+
+    do
+    {
+        unsigned int idx     = 0;
+        unsigned int binding = 1;
+        add_3D_tensor_argument(idx, _input, binding++, slice);
+        add_3D_tensor_argument(idx, _output, binding++, slice);
+        _kernel.update_shader_params();
+        enqueue(*this, slice);
+    }
+    while(window.slide_window_slice_3D(slice));
+}
diff --git a/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp
new file mode 100644
index 0000000..9c24d2e
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+GCBatchNormalizationLayerKernel::GCBatchNormalizationLayerKernel()
+    : _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _beta(nullptr), _gamma(nullptr), _epsilon(0.0f)
+{
+}
+
+void GCBatchNormalizationLayerKernel::configure(const IGCTensor *input, IGCTensor *output, const IGCTensor *mean, const IGCTensor *var, const IGCTensor *beta, const IGCTensor *gamma,
+                                                float epsilon)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, mean, var, beta, gamma);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output, mean, var, beta, gamma);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, var, beta, gamma);
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != mean->info()->dimension(0));
+
+    _input   = input;
+    _output  = output;
+    _mean    = mean;
+    _var     = var;
+    _beta    = beta;
+    _gamma   = gamma;
+    _epsilon = epsilon;
+
+    const unsigned int num_elems_processed_per_iteration = 4 / input->info()->element_size();
+
+    // Set build options
+    std::set<std::string> build_opts;
+    std::string           dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
+    build_opts.emplace(("#define " + dt_name));
+    build_opts.emplace(("#define ESPILON " + float_to_string_with_full_precision(_epsilon)));
+    build_opts.emplace(("#define LOCAL_SIZE_X " + support::cpp11::to_string(1)));
+    build_opts.emplace(("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1)));
+    build_opts.emplace(("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1)));
+
+    // Create kernel
+    _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("batchnormalization_layer", build_opts));
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowStatic     mean_access(mean->info(), 0, 0, mean->info()->dimension(0) + 1, mean->info()->dimension(1));
+    AccessWindowStatic     var_access(var->info(), 0, 0, var->info()->dimension(0) + 1, var->info()->dimension(1));
+    AccessWindowStatic     beta_access(beta->info(), 0, 0, beta->info()->dimension(0) + 1, beta->info()->dimension(1));
+    AccessWindowStatic     gamma_access(gamma->info(), 0, 0, gamma->info()->dimension(0) + 1, gamma->info()->dimension(1));
+
+    update_window_and_padding(win, input_access, output_access, mean_access, var_access, beta_access, gamma_access);
+    output_access.set_valid_region(win, input->info()->valid_region());
+
+    _kernel.clear_params();
+
+    _kernel.set_shader_params_binding_point(0);
+
+    IGCKernel::configure(win);
+}
+
+void GCBatchNormalizationLayerKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    _kernel.use();
+
+    Window slice = window.first_slice_window_3D();
+
+    Window vector_slice = window.first_slice_window_1D();
+    vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
+
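+    // The four 1D vectors follow the two 3D tensors in the shader params block,
+    // so their argument index starts at 2 * num_arguments_per_3D_tensor().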
+    unsigned int idx = 2 * num_arguments_per_3D_tensor();
+    add_1D_tensor_argument(idx, _mean, 3, vector_slice);
+    add_1D_tensor_argument(idx, _var, 4, vector_slice);
+    add_1D_tensor_argument(idx, _beta, 5, vector_slice);
+    add_1D_tensor_argument(idx, _gamma, 6, vector_slice);
+
+    do
+    {
+        idx = 0;
+        add_3D_tensor_argument(idx, _input, 1, slice);
+        add_3D_tensor_argument(idx, _output, 2, slice);
+
+        _kernel.update_shader_params();
+        enqueue(*this, slice);
+    }
+    while(window.slide_window_slice_3D(slice));
+}
diff --git a/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp
new file mode 100644
index 0000000..1071623
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCCol2ImKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+GCCol2ImKernel::GCCol2ImKernel()
+    : _input(nullptr), _output(nullptr), _convolved_dims()
+{
+}
+
+void GCCol2ImKernel::configure(const IGCTensor *input, IGCTensor *output,
+                               std::pair<unsigned int, unsigned int> convolved_dims)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+    _kernel.clear_params();
+
+    _input          = input;
+    _output         = output;
+    _convolved_dims = convolved_dims;
+
+    // Create kernel
+    std::set<std::string>  build_opts;
+    constexpr unsigned int num_elems_processed_per_iteration = 8;
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+    build_opts.insert("#define COL2IM");
+    _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("col2im", build_opts));
+
+    // Set static kernel arguments
+    unsigned int idx = num_arguments_per_2D_tensor() + num_arguments_per_3D_tensor();
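+    // Pass the convolved output width so the shader can map each input column back to an (x, y) position in the output.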
+    _kernel.set_params(idx++, _convolved_dims.first);
+
+    // Configure window
+    Window win = calculate_max_window(*input->info(), Steps());
+
+    // The GCCol2ImKernel doesn't need padding so update_window_and_padding() can be skipped
+    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    // set shader params binding point
+    _kernel.set_shader_params_binding_point(0);
+
+    IGCKernel::configure(win);
+}
+
+void GCCol2ImKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IGCKernel::window(), window);
+
+    Window slice_in  = window.first_slice_window_2D();
+    Window slice_out = window.first_slice_window_3D();
+
+    _kernel.use();
+
+    do
+    {
+        // Set inputs
+        unsigned int idx     = 0;
+        unsigned int binding = 1;
+        add_2D_tensor_argument(idx, _input, binding++, slice_in);
+        add_3D_tensor_argument(idx, _output, binding++, slice_out);
+        _kernel.update_shader_params();
+        enqueue(*this, slice_in);
+    }
+    while(window.slide_window_slice_2D(slice_in) && window.slide_window_slice_3D(slice_out));
+}
diff --git a/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateKernel.cpp
new file mode 100644
index 0000000..7f9f438
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateKernel.cpp
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCDepthConcatenateKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "support/ToolchainSupport.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+GCDepthConcatenateKernel::GCDepthConcatenateKernel()
+    : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0)
+{
+}
+
+BorderSize GCDepthConcatenateKernel::border_size() const
+{
+    return BorderSize(_top_bottom, _left_right);
+}
+
+void GCDepthConcatenateKernel::configure(const IGCTensor *input, unsigned int depth_offset, IGCTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2));
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0));
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1));
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(3, input, output);
+
+    // The gaps between the two lowest dimensions of input and output need to be divisible by 2
+    // Otherwise it is not clear how the padding should be added onto the input tensor
+    ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2);
+    ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2);
+
+    _input  = input;
+    _output = output;
+
+    // Add build options
+    std::set<std::string> build_opts;
+    std::string           dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
+    build_opts.emplace(("#define " + dt_name));
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+
+    // Configure kernel window
+    _left_right = (output->info()->dimension(0) - input->info()->dimension(0)) / 2;
+    _top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2;
+
+    const int offset_to_first_elements_in_bytes = depth_offset * output->info()->strides_in_bytes()[2];
+
+    build_opts.emplace("#define OFFSETS_X " + support::cpp11::to_string(_left_right));
+    build_opts.emplace("#define OFFSETS_Y " + support::cpp11::to_string(_top_bottom));
+    build_opts.emplace("#define OFFSETS_Z " + support::cpp11::to_string(offset_to_first_elements_in_bytes));
+
+    // Create kernel
+    _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("concatenate_depth", build_opts));
+
+    unsigned int num_elems_processed_per_iteration = 1;
+    unsigned int num_elems_read_per_iteration      = 1;
+    if(input->info()->data_type() == DataType::F32)
+    {
+        num_elems_processed_per_iteration = 1;
+        num_elems_read_per_iteration      = 1;
+    }
+    else if(input->info()->data_type() == DataType::F16)
+    {
+        num_elems_processed_per_iteration = 4;
+        num_elems_read_per_iteration      = 4;
+    }
+    const unsigned int num_rows_read_per_iteration = 1;
+
+    // The window is based on the output, but Z iterates over the input depth since every input depth is copied
+    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+    win.set(Window::DimZ, Window::Dimension(0, input->info()->tensor_shape().z(), 1));
+
+    AccessWindowRectangle  input_access(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+    update_window_and_padding(win, input_access, output_access);
+    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+    _kernel.clear_params();
+    _kernel.set_shader_params_binding_point(0);
+    IGCKernel::configure(win);
+}
+
+void GCDepthConcatenateKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window);
+
+    _kernel.use();
+
+    Window slice = window.first_slice_window_3D();
+
+    do
+    {
+        if(_input->info()->data_type() == DataType::F32)
+        {
+            unsigned int idx = 0;
+            add_3D_tensor_argument(idx, _input, 1, slice);
+            add_3D_tensor_argument(idx, _output, 2, slice);
+        }
+        else if(_input->info()->data_type() == DataType::F16)
+        {
+            unsigned int idx = 0;
+            add_3D_tensor_argument(idx, _input, BufferParam(1, 3), slice);
+            add_3D_tensor_argument(idx, _output, BufferParam(2, 3), slice);
+        }
+
+        _kernel.update_shader_params();
+
+        enqueue(*this, slice);
+    }
+    while(window.slide_window_slice_3D(slice));
+}
diff --git a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
new file mode 100644
index 0000000..1fa2a71
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
@@ -0,0 +1,394 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
+
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+template <unsigned int kernel_size>
+GCDirectConvolutionLayerKernel<kernel_size>::GCDirectConvolutionLayerKernel()
+    : _input(nullptr), _bias(nullptr), _weights(nullptr), _output(nullptr), _border_size(0), _conv_stride_x(0), _conv_stride_y(0), _conv_pad_x(0), _conv_pad_y(0), _lws(gles::NDRange(1U, 1U, 1U))
+{
+}
+
+template <unsigned int kernel_size>
+BorderSize             GCDirectConvolutionLayerKernel<kernel_size>::border_size() const
+{
+    return _border_size;
+}
+
+template <unsigned int kernel_size>
+void GCDirectConvolutionLayerKernel<kernel_size>::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *bias, IGCTensor *output, const PadStrideInfo &conv_info)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != weights->info()->dimension(1));
+    ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
+    ARM_COMPUTE_ERROR_ON_MSG((kernel_size == 3 && std::get<0>(conv_info.stride()) > 2), "Strides larger than 2 not supported in 3x3 direct convolution!");
+    ARM_COMPUTE_ERROR_ON(kernel_size != weights->info()->dimension(0));
+
+    if(bias != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias);
+        // FIXME: Bug in the framework; worked around in the tests for now.
+        //ARM_COMPUTE_ERROR_ON(bias->info()->dimension(0) != weights->info()->dimension(3));
+        ARM_COMPUTE_ERROR_ON(bias->info()->num_dimensions() > 1);
+    }
+
+    _conv_stride_x = std::get<0>(conv_info.stride());
+    _conv_stride_y = std::get<1>(conv_info.stride());
+    _conv_pad_x    = std::get<0>(conv_info.pad());
+    _conv_pad_y    = std::get<1>(conv_info.pad());
+
+    _input       = input;
+    _weights     = weights;
+    _output      = output;
+    _bias        = bias;
+    _border_size = BorderSize(_conv_pad_y, _conv_pad_x);
+
+    std::set<std::string> options;
+
+    options.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(_lws[0]));
+    options.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(_lws[1]));
+    options.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(_lws[2]));
+    options.emplace("#define STRIDE_X " + support::cpp11::to_string(_conv_stride_x));
+
+    std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
+    options.emplace(("#define " + dt_name));
+
+    unsigned int num_elems_read_per_iteration_x    = kernel_size * _conv_stride_x;
+    unsigned int num_elems_read_per_iteration_y    = 1;
+    unsigned int num_elems_written_per_iteration_x = 1;
+    unsigned int num_elems_written_per_iteration_y = 1;
+    unsigned int num_elems_written_per_iteration_z = 1;
+
+    if(kernel_size == 3)
+    {
+        if((_conv_stride_x == 1) && (_conv_stride_y == 1))
+        {
+            switch(input->info()->data_type())
+            {
+                    // TODO(APPBROWSER-299): Choose the optimal path and remove the others.
+#define PROCESS_X_4ELEMENTS_Y_3ELEMENTS_FP16
+
+                case DataType::F16:
+#if defined(PROCESS_X_8ELEMENTS_Y_3ELEMENTS_FP16)
+                    options.emplace("#define PROCESS_X_8ELEMENTS_Y_3ELEMENTS_FP16");
+                    num_elems_read_per_iteration_x    = 16;
+                    num_elems_read_per_iteration_y    = 5;
+                    num_elems_written_per_iteration_x = 8;
+                    num_elems_written_per_iteration_y = 3;
+#elif defined(PROCESS_X_4ELEMENTS_Y_3ELEMENTS_FP16)
+                    options.emplace("#define PROCESS_X_4ELEMENTS_Y_3ELEMENTS_FP16");
+                    num_elems_read_per_iteration_x    = 8;
+                    num_elems_read_per_iteration_y    = 5;
+                    num_elems_written_per_iteration_x = 4;
+                    num_elems_written_per_iteration_y = 3;
+#elif defined(PROCESS_X_4ELEMENTS_Y_4ELEMENTS_FP16)
+                    options.emplace("#define PROCESS_X_4ELEMENTS_Y_4ELEMENTS_FP16");
+                    num_elems_read_per_iteration_x    = 8;
+                    num_elems_read_per_iteration_y    = 6;
+                    num_elems_written_per_iteration_x = 4;
+                    num_elems_written_per_iteration_y = 4;
+#elif defined(PROCESS_X_4ELEMENTS_Y_3ELEMENTS_Z_2ELEMENTS_FP16)
+                    options.emplace("#define PROCESS_X_4ELEMENTS_Y_3ELEMENTS_Z_2ELEMENTS_FP16");
+                    num_elems_read_per_iteration_x    = 8;
+                    num_elems_read_per_iteration_y    = 5;
+                    num_elems_written_per_iteration_x = 4;
+                    num_elems_written_per_iteration_y = 3;
+                    num_elems_written_per_iteration_z = 2;
+#endif /* PROCESS_X_8ELEMENTS_Y_3ELEMENTS_FP16 */
+                    break;
+
+                case DataType::F32:
+                    options.emplace("#define PROCESS_X_4ELEMENTS_Y_3ELEMENTS");
+                    num_elems_read_per_iteration_x    = 8;
+                    num_elems_read_per_iteration_y    = 5;
+                    num_elems_written_per_iteration_x = 4;
+                    num_elems_written_per_iteration_y = 3;
+                    break;
+
+                default:
+                    ARM_COMPUTE_ERROR("Current data type is not supported");
+                    break;
+            }
+        }
+        // FIXME: Just keep one in release
+        else
+        {
+            switch(input->info()->data_type())
+            {
+                case DataType::F16:
+                    options.emplace("#define PROCESS_X_4ELEMENTS_FP16");
+                    num_elems_read_per_iteration_x    = 8;
+                    num_elems_written_per_iteration_x = 4;
+                    break;
+
+                case DataType::F32:
+                    // TODO(APPBROWSER-299): Choose the optimal path and remove the others.
+#define PROCESS_4_ELEMENT
+
+#if defined(PROCESS_1_ELEMENT)
+                    options.emplace("#define PROCESS_1_ELEMENT");
+                    num_elems_read_per_iteration_x    = 3;
+                    num_elems_written_per_iteration_x = 1;
+#elif defined(PROCESS_4_ELEMENT)
+                    options.emplace("#define PROCESS_4_ELEMENT");
+                    num_elems_read_per_iteration_x    = 8;
+                    num_elems_written_per_iteration_x = 4;
+#elif defined(PROCESS_8_ELEMENT)
+                    options.emplace("#define PROCESS_8_ELEMENT");
+                    num_elems_read_per_iteration_x    = 12;
+                    num_elems_written_per_iteration_x = 8;
+#else /* PROCESS_1_ELEMENT */
+#error Have to declare how many elements to process in one thread.
+#endif /* PROCESS_1_ELEMENT */
+                    break;
+
+                default:
+                    ARM_COMPUTE_ERROR("Current data type is not supported");
+                    break;
+            }
+        }
+    }
+    else if(kernel_size == 1)
+    {
+        switch(input->info()->data_type())
+        {
+            case DataType::F16:
+                num_elems_read_per_iteration_x    = 8;
+                num_elems_written_per_iteration_x = 8;
+                break;
+
+            case DataType::F32:
+                num_elems_read_per_iteration_x    = 1;
+                num_elems_written_per_iteration_x = 1;
+                break;
+
+            default:
+                break;
+        }
+    }
+    else if(kernel_size == 5)
+    {
+        switch(input->info()->data_type())
+        {
+            case DataType::F16:
+                num_elems_read_per_iteration_x    = 8;
+                num_elems_written_per_iteration_x = 4;
+                break;
+
+            default:
+                break;
+        }
+    }
+    else
+    {
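+        // Other kernel sizes keep the default read/write configuration computed above.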
+    }
+
+    if(_bias != nullptr)
+    {
+        options.emplace("#define BIAS");
+    }
+
+    std::stringstream kernel_name;
+    kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size;
+
+    _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel(kernel_name.str(), options));
+
+    _kernel.clear_params();
+
+    unsigned int idx = (_bias == nullptr) ? 3 * num_arguments_per_3D_tensor() : (num_arguments_per_1D_tensor() + 3 * num_arguments_per_3D_tensor());
+
+    // Calculate output right and bottom border
+    const int output_width          = output->info()->dimension(0);
+    const int output_height         = output->info()->dimension(1);
+    const int output_padding_right  = ceil_to_multiple(output_width, num_elems_written_per_iteration_x * _lws[0]) - output_width;
+    const int output_padding_bottom = ceil_to_multiple(output_height, num_elems_written_per_iteration_y * _lws[1]) - output_height;
+
+    // Calculate input right and bottom border
+    const int input_width    = input->info()->dimension(0);
+    const int input_height   = input->info()->dimension(1);
+    const int upper_bound_w  = ceil_to_multiple(((output_width + output_padding_right) * _conv_stride_x + (kernel_size - 1)), num_elems_read_per_iteration_x * _lws[0]) - _conv_pad_x - input_width;
+    const int upper_bound_h  = ceil_to_multiple(((output_height + output_padding_bottom) * _conv_stride_y + (kernel_size - 1)), num_elems_read_per_iteration_y * _lws[1]) - _conv_pad_y - input_height;
+    const int padding_right  = std::max(upper_bound_w, _conv_pad_x);
+    const int padding_bottom = std::max(upper_bound_h, _conv_pad_y);
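+    // Round the borders up so that every read issued for the enlarged output window stays inside the padded input.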
+
+    BorderSize border = BorderSize(0, output_padding_right, output_padding_bottom, 0);
+
+    Window win = calculate_max_enlarged_window(*output->info(), Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y, num_elems_written_per_iteration_z), border);
+
+    AccessWindowStatic input_access(input->info(), -_conv_pad_x, -_conv_pad_y, input_width + padding_right, input_height + padding_bottom);
+    AccessWindowStatic weights_access = AccessWindowStatic(nullptr, 0, 0, 0, 0);
+    AccessWindowStatic bias_access    = AccessWindowStatic(nullptr, 0, 0, 0, 1);
+
+    switch(weights->info()->data_type())
+    {
+        case DataType::F16:
+            weights_access = AccessWindowStatic(weights->info(), 0, 0, kernel_size + 1, kernel_size);
+            if(_bias != nullptr)
+            {
+                bias_access = AccessWindowStatic(_bias->info(), 0, 0, _bias->info()->dimension(0) + 1, 1);
+            }
+            break;
+
+        case DataType::F32:
+            weights_access = AccessWindowStatic(weights->info(), 0, 0, kernel_size, kernel_size);
+            if(_bias != nullptr)
+            {
+                bias_access = AccessWindowStatic(_bias->info(), 0, 0, _bias->info()->dimension(0), 1);
+            }
+            break;
+
+        default:
+            ARM_COMPUTE_ERROR("Current data type is not supported");
+            break;
+    }
+
+    AccessWindowStatic output_access(output->info(), 0, 0, output_width + output_padding_right, output_height + output_padding_bottom);
+
+    if(_bias != nullptr)
+    {
+        update_window_and_padding(win, input_access, weights_access, bias_access, output_access);
+    }
+    else
+    {
+        update_window_and_padding(win, input_access, weights_access, output_access);
+    }
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    _kernel.set_params(idx++, _weights->info()->strides_in_bytes()[3]); // weights_stride_w
+    _kernel.set_params(idx++, _weights->info()->dimension(2));          // weights_depth
+
+    // set shader params binding point
+    _kernel.set_shader_params_binding_point(0);
+
+    IGCKernel::configure(win);
+}
+
+template <unsigned int kernel_size>
+void GCDirectConvolutionLayerKernel<kernel_size>::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    _kernel.use();
+
+    // Get initial windows
+    Window slice  = window.first_slice_window_3D();
+    Window win_in = window;
+
+    win_in.adjust(Window::DimX, -_conv_pad_x, true);
+    win_in.adjust(Window::DimY, -_conv_pad_y, true);
+    win_in.set_dimension_step(Window::DimX, window.x().step() * _conv_stride_x);
+    win_in.set_dimension_step(Window::DimY, window.y().step() * _conv_stride_y);
+
+    Window slice_in = win_in.first_slice_window_3D();
+
+    unsigned int idx1 = 2 * num_arguments_per_3D_tensor();
+    add_3D_tensor_argument(idx1, _weights, BufferParam(3, 2), slice);
+
+    if(_bias != nullptr)
+    {
+        Window slice_bias;
+        slice_bias.use_tensor_dimensions(_bias->info()->tensor_shape());
+        add_1D_tensor_argument(idx1, _bias, BufferParam(4, 2), slice_bias);
+    }
+
+    do
+    {
+        unsigned int idx = 0;
+
+        switch(_input->info()->data_type())
+        {
+            case DataType::F16:
+                switch(kernel_size)
+                {
+                    case 1:
+                        add_3D_tensor_argument(idx, _input, BufferParam(1, 4), slice_in);
+                        add_3D_tensor_argument(idx, _output, BufferParam(2, 4), slice);
+                        break;
+
+                    case 3:
+                        add_3D_tensor_argument(idx, _input, BufferParam(1, 3), slice_in);
+                        add_3D_tensor_argument(idx, _output, BufferParam(2, 3), slice);
+                        break;
+
+                    case 5:
+                        add_3D_tensor_argument(idx, _input, BufferParam(1, 3), slice_in);
+                        add_3D_tensor_argument(idx, _output, BufferParam(2, 3), slice);
+                        break;
+
+                    default:
+                        ARM_COMPUTE_ERROR("Current kernel size %d is not supported", kernel_size);
+                        break;
+                }
+                break;
+
+            case DataType::F32:
+                switch(kernel_size)
+                {
+                    case 1:
+                    case 5:
+                        add_3D_tensor_argument(idx, _input, BufferParam(1, 2), slice_in);
+                        add_3D_tensor_argument(idx, _output, BufferParam(2, 2), slice);
+                        break;
+
+                    case 3:
+                        add_3D_tensor_argument(idx, _input, BufferParam(1, 4), slice_in);
+                        add_3D_tensor_argument(idx, _output, BufferParam(2, 4), slice);
+                        break;
+
+                    default:
+                        ARM_COMPUTE_ERROR("Current kernel size %d is not supported", kernel_size);
+                        break;
+                }
+                break;
+
+            default:
+                ARM_COMPUTE_ERROR("Current data type is not supported");
+                break;
+        }
+
+        _kernel.update_shader_params();
+        enqueue(*this, slice, _lws);
+    }
+    while(window.slide_window_slice_3D(slice) && win_in.slide_window_slice_3D(slice_in));
+}
+
+template class arm_compute::GCDirectConvolutionLayerKernel<1>;
+template class arm_compute::GCDirectConvolutionLayerKernel<3>;
+template class arm_compute::GCDirectConvolutionLayerKernel<5>;
diff --git a/src/core/GLES_COMPUTE/kernels/GCDropoutKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDropoutKernel.cpp
new file mode 100644
index 0000000..6244fbe
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCDropoutKernel.cpp
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCDropoutKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
+
+#include <cmath>
+#include <random>
+#include <set>
+#include <string>
+#include <tuple>
+
+using namespace arm_compute;
+
+GCDropoutKernel::GCDropoutKernel()
+    : _input(nullptr), _mask(nullptr), _output(nullptr), _num_elems_processed_per_iteration(0)
+{
+}
+
+void GCDropoutKernel::configure(const IGCTensor *input, IGCTensor *mask, IGCTensor *output, float ratio, bool forward)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, mask, output);
+
+    _input  = input;
+    _mask   = mask;
+    _output = output;
+    _kernel.clear_params();
+
+    std::set<std::string>                 build_opts;
+    std::string                           dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
+    std::string                           fporbp  = forward ? "FORWARD" : "BACKWARD";
+    std::random_device                    rd;
+    std::mt19937                          mt(rd());
+    std::uniform_real_distribution<float> dist(0.f, 1.f);
+
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+    build_opts.emplace("#define RATIO " + support::cpp11::to_string(ratio));
+    build_opts.emplace("#define SCALE " + support::cpp11::to_string(1. / (1. - ratio)));
+    build_opts.emplace("#define SEED " + support::cpp11::to_string(dist(mt)));
+    build_opts.emplace("#define " + dt_name);
+    build_opts.emplace("#define " + fporbp);
+
+    _num_elems_processed_per_iteration = 4 / input->info()->element_size();
+
+    // Create kernel
+    _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("dropout", build_opts));
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input->info(), Steps(_num_elems_processed_per_iteration));
+
+    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    // set shader params binding point
+    _kernel.set_shader_params_binding_point(0);
+    IGCKernel::configure(win);
+}
+
+void GCDropoutKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IGCKernel::window(), window);
+
+    _kernel.use();
+
+    Window slice = window.first_slice_window_3D();
+
+    do
+    {
+        unsigned int idx = 0;
+
+        add_3D_tensor_argument(idx, _input, BufferParam(1, 2), slice);
+        add_3D_tensor_argument(idx, _mask, BufferParam(2, 2), slice);
+        add_3D_tensor_argument(idx, _output, BufferParam(3, 2), slice);
+
+        _kernel.update_shader_params();
+        enqueue(*this, slice);
+    }
+    while(window.slide_window_slice_3D(slice));
+}
diff --git a/src/core/GLES_COMPUTE/kernels/GCFillBorderKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCFillBorderKernel.cpp
new file mode 100644
index 0000000..36742ef
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCFillBorderKernel.cpp
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cstdint>
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+GCFillBorderKernel::GCFillBorderKernel()
+    : IGCKernel(), _tensor(nullptr)
+{
+}
+
+bool GCFillBorderKernel::is_parallelisable() const
+{
+    return false;
+}
+
+template <class T>
+void GCFillBorderKernel::set_constant_border(unsigned int idx, const PixelValue &constant_border_value)
+{
+    T value;
+    constant_border_value.get(value);
+    _kernel.set_params(idx, static_cast<T>(value));
+}
+
+void GCFillBorderKernel::configure(const IGCTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+{
+    ARM_COMPUTE_ERROR_ON(tensor == nullptr);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(tensor, 1, DataType::F32, DataType::F16);
+    ARM_COMPUTE_ERROR_ON(tensor->info()->num_channels() != 1);
+
+    border_size.limit(tensor->info()->padding());
+
+    // If there is no border: early exit
+    if(border_size.empty() || border_mode == BorderMode::UNDEFINED)
+    {
+        return;
+    }
+
+    // Select appropriate kernel
+    std::string kernel_name = "fill_image_borders_" + lower_string(string_from_border_mode(border_mode));
+
+    // Define build options
+    std::set<std::string> build_opts;
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+    build_opts.emplace("#define BORDER_SIZE_TOP " + support::cpp11::to_string(border_size.top));
+    build_opts.emplace("#define BORDER_SIZE_BOTTOM " + support::cpp11::to_string(border_size.bottom));
+    build_opts.emplace("#define BORDER_SIZE_LEFT " + support::cpp11::to_string(border_size.left));
+    build_opts.emplace("#define BORDER_SIZE_RIGHT " + support::cpp11::to_string(border_size.right));
+
+    if(border_mode == BorderMode::REPLICATE)
+    {
+        build_opts.emplace("#define FILL_IMAGE_BORDERS_REPLICATE\n");
+    }
+    else
+    {
+        build_opts.emplace("#define FILL_IMAGE_BORDERS_CONSTANT\n");
+    }
+
+    switch(tensor->info()->data_type())
+    {
+        case DataType::F16:
+            build_opts.emplace("#define DATA_TYPE_FP16");
+            break;
+
+        case DataType::F32:
+            build_opts.emplace("#define DATA_TYPE_FP32");
+            break;
+
+        default:
+            ARM_COMPUTE_ERROR("Current data type is not supported");
+            break;
+    }
+
+    // Create kernel
+    _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel(kernel_name, build_opts));
+    _tensor = tensor;
+
+    _kernel.clear_params();
+
+    // Create static kernel arguments
+    const unsigned int valid_width       = tensor->info()->valid_region().shape[0];
+    const unsigned int valid_height      = tensor->info()->valid_region().shape[1];
+    const unsigned int total_valid_width = border_size.left + valid_width + border_size.right;
+
+    // Set static kernel arguments
+    unsigned int idx = num_arguments_per_3D_tensor(); // Skip the tensor parameters
+    _kernel.set_params(idx++, valid_width);
+    _kernel.set_params(idx++, valid_height);
+    _kernel.set_params(idx++, tensor->info()->valid_region().anchor[0]);
+    _kernel.set_params(idx++, tensor->info()->valid_region().anchor[1]);
+
+    if(BorderMode::CONSTANT == border_mode)
+    {
+        set_constant_border<float>(idx++, constant_border_value);
+    }
+
+    // Configure kernel window
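+    // Note: the X range spans total_valid_width + valid_height, suggesting the shader
+    // walks the horizontal borders (total_valid_width items) and the vertical borders
+    // (valid_height items) as a single flattened 1D range.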
+    Window win;
+    win.set(Window::DimX, Window::Dimension(0, total_valid_width + valid_height));
+    win.set(Window::DimY, Window::Dimension(0, 1, 1));
+    win.use_tensor_dimensions(tensor->info()->tensor_shape(), Window::DimZ);
+
+    _kernel.set_shader_params_binding_point(0);
+
+    IGCKernel::configure(win);
+}
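+
+// Illustrative usage (a sketch, not part of this patch; tensor names are hypothetical):
+//   GCFillBorderKernel fill;
+//   fill.configure(&tensor, BorderSize(1), BorderMode::CONSTANT, PixelValue(0.f));
+//   fill.run(fill.window());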
+
+void GCFillBorderKernel::run(const Window &window)
+{
+    // Border mode undefined or border width == 0
+    if(_kernel.get_program() == 0)
+    {
+        return;
+    }
+
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IGCKernel::window(), window);
+
+    _kernel.use();
+    Window slice = window.first_slice_window_3D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _tensor, 1, slice);
+
+        _kernel.update_shader_params();
+
+        enqueue(*this, slice);
+    }
+    while(window.slide_window_slice_3D(slice));
+}
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp
new file mode 100644
index 0000000..5e3788a
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+GCGEMMInterleave4x4Kernel::GCGEMMInterleave4x4Kernel()
+    : _input(nullptr), _output(nullptr)
+{
+}
+
+void GCGEMMInterleave4x4Kernel::configure(const IGCTensor *input, IGCTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+    TensorShape output_shape = input->info()->tensor_shape();
+    output_shape.set(0, input->info()->dimension(0) * 4);
+    output_shape.set(1, std::ceil(input->info()->dimension(1) / 4.0f));
+
+    // Auto-initialize the output if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+    _input  = input;
+    _output = output;
+
+    std::set<std::string> build_opts;
+    std::string           dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
+    build_opts.emplace(("#define " + dt_name));
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+
+    // Create kernel
+    build_opts.emplace("#define GEMM_INTERLEAVE4x4");
+    _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("gemm_interleave4x4", build_opts));
+
+    // Configure kernel window
+    const unsigned int     num_elems_processed_per_iteration_x = max_gc_vector_width / data_size_from_type(input->info()->data_type());
+    constexpr unsigned int num_elems_processed_per_iteration_y = 4;
+    const unsigned int     num_elems_written_per_iteration     = num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y;
+
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
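+    // The output access window is scaled by 4 in X and 0.25 in Y to mirror the
+    // interleave reshape: [W, H] -> [W * 4, ceil(H / 4)].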
+    AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+    AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, 1, 4.f, 0.25f);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region());
+
+    _kernel.clear_params();
+
+    // set shader params binding point
+    _kernel.set_shader_params_binding_point(0);
+
+    IGCKernel::configure(win);
+}
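+
+// Illustrative usage (a sketch, not part of this patch): reshapes matrix A into the
+// interleaved layout consumed by GCGEMMMatrixMultiplyKernel:
+//   GCGEMMInterleave4x4Kernel interleave;
+//   interleave.configure(&a, &a_interleaved);   // hypothetical tensors
+//   interleave.run(interleave.window());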
+
+void GCGEMMInterleave4x4Kernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window);
+
+    _kernel.use();
+
+    /*
+     * This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values)
+     *         |a00 a01 a02 a03|
+     *         |a10 a11 a12 a13|
+     *         |a20 a21 a22 a23| = | a00 a10 a20 a30 || a01 a11 a21 a31 || a02 a12 a22 a32 || a03 a13 a23 a33 |
+     *         |a30 a31 a32 a33|
+     *
+     * After this operation, the output matrix will have the following shape: [ height * 4, width / 4 ]
+     */
+    Window in_slice  = window.first_slice_window_2D();
+    Window out_slice = window.first_slice_window_2D();
+
+    // Change the x and y steps used to slide the output tensor
+    out_slice.scale(Window::DimX, 4.f);
+    out_slice.scale(Window::DimY, 0.25f);
+
+    do
+    {
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, _input, 1, in_slice);
+        add_2D_tensor_argument(idx, _output, 2, out_slice);
+
+        _kernel.update_shader_params();
+
+        enqueue(*this, in_slice);
+    }
+    while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+}
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp
new file mode 100644
index 0000000..434070a
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+GCGEMMMatrixAccumulateBiasesKernel::GCGEMMMatrixAccumulateBiasesKernel()
+    : _accum(nullptr), _biases(nullptr)
+{
+}
+
+void GCGEMMMatrixAccumulateBiasesKernel::configure(IGCTensor *accum, const IGCTensor *biases)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum);
+    ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() != 1);
+
+    _biases = biases;
+    _accum  = accum;
+
+    std::set<std::string> build_opts;
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+
+    // Create kernel
+    build_opts.emplace("#define GEMM_ACCUMULATE_BIASES");
+    std::string dt_name = (accum->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
+    build_opts.emplace(("#define " + dt_name));
+    _kernel = GCKernelLibrary::get().create_kernel("gemm_accumulate_biases", build_opts);
+
+    // Configure kernel window
+    unsigned int num_elems_processed_per_iteration = 1;
+
+    if(_accum->info()->data_type() == DataType::F32)
+    {
+        num_elems_processed_per_iteration = 16;
+    }
+    else if(_accum->info()->data_type() == DataType::F16)
+    {
+        num_elems_processed_per_iteration = 4;
+    }
+
+    Window win = calculate_max_window(*_accum->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowStatic     biases_access(biases->info(), 0, 0, ceil_to_multiple(biases->info()->dimension(0), num_elems_processed_per_iteration), biases->info()->dimension(1));
+    AccessWindowHorizontal accum_access(_accum->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win, biases_access, accum_access);
+
+    _kernel.clear_params();
+    // set shader params binding point
+    _kernel.set_shader_params_binding_point(0);
+
+    IGCKernel::configure(win);
+}
+
+void GCGEMMMatrixAccumulateBiasesKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IGCKernel::window(), window);
+
+    _kernel.use();
+
+    Window accum_slice = window.first_slice_window_2D();
+
+    Window biases_slice(accum_slice);
+    biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+    // Run kernel
+    do
+    {
+        // Set arguments
+        unsigned int idx = 0;
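+        // The F32 path uses the plain binding-point overload, while the F16 path
+        // passes BufferParam with shift 3, presumably because that shader addresses
+        // the buffers in 8-byte units.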
+        if(_accum->info()->data_type() == DataType::F32)
+        {
+            add_2D_tensor_argument(idx, _accum, 1, accum_slice);
+            add_1D_tensor_argument(idx, _biases, 2, biases_slice);
+        }
+        else if(_accum->info()->data_type() == DataType::F16)
+        {
+            add_2D_tensor_argument(idx, _accum, BufferParam(1, 3), accum_slice);
+            add_1D_tensor_argument(idx, _biases, BufferParam(2, 3), biases_slice);
+        }
+
+        _kernel.update_shader_params();
+
+        enqueue(*this, accum_slice);
+    }
+    while(window.slide_window_slice_2D(accum_slice));
+}
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp
new file mode 100644
index 0000000..fa04152
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+GCGEMMMatrixAdditionKernel::GCGEMMMatrixAdditionKernel()
+    : _input(nullptr), _output(nullptr)
+{
+}
+
+void GCGEMMMatrixAdditionKernel::configure(const IGCTensor *input, IGCTensor *output, float beta)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0));
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1));
+
+    _input                                               = input;
+    _output                                              = output;
+    const unsigned int num_elems_processed_per_iteration = max_gc_vector_width / data_size_from_type(input->info()->data_type());
+
+    std::set<std::string> build_opts;
+    std::string           dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
+    build_opts.emplace(("#define " + dt_name));
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+    build_opts.emplace("#define BETA " + float_to_string_with_full_precision(beta));
+
+    // Create kernel
+    build_opts.emplace("#define GEMM_MATRIXADDITION");
+    _kernel = GCKernelLibrary::get().create_kernel("gemm_ma", build_opts);
+
+    // Configure kernel window
+    Window win = calculate_max_window(*_input->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region());
+
+    _kernel.clear_params();
+    // set shader params binding point
+    _kernel.set_shader_params_binding_point(0);
+
+    IGCKernel::configure(win);
+}
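+
+// Illustrative usage (a sketch, not part of this patch): presumably accumulates
+// beta * input into output:
+//   GCGEMMMatrixAdditionKernel ma;
+//   ma.configure(&c_partial, &c, 0.5f);   // hypothetical tensors
+//   ma.run(ma.window());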
+
+void GCGEMMMatrixAdditionKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window);
+
+    _kernel.use();
+
+    Window slice = window.first_slice_window_2D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, _input, 1, slice);
+        add_2D_tensor_argument(idx, _output, 2, slice);
+
+        _kernel.update_shader_params();
+
+        enqueue(*this, slice);
+    }
+    while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
new file mode 100644
index 0000000..ea9b387
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+GCGEMMMatrixMultiplyKernel::GCGEMMMatrixMultiplyKernel()
+    : _input0(nullptr), _input1(nullptr), _output(nullptr)
+{
+}
+
+void GCGEMMMatrixMultiplyKernel::configure(const IGCTensor *input0, const IGCTensor *input1, IGCTensor *output, float alpha, bool is_interleaved_transposed)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32, DataType::F16);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+
+    if(!is_interleaved_transposed)
+    {
+        ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
+    }
+
+    _input0 = input0;
+    _input1 = input1;
+    _output = output;
+
+    std::set<std::string> build_opts;
+    Window                win;
+
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+    build_opts.emplace("#define COLS_A " + support::cpp11::to_string(input0->info()->dimension(0)));
+    build_opts.emplace("#define COLS_B " + support::cpp11::to_string(input1->info()->dimension(0)));
+    build_opts.emplace("#define ALPHA " + float_to_string_with_full_precision(alpha));
+
+    // Select the kernel variant: the interleaved/transposed path expects inputs already
+    // reshaped by the interleave and transpose1xW kernels, while the floating-point path
+    // also covers the vector-matrix case
+    if(is_interleaved_transposed)
+    {
+        switch(input0->info()->data_type())
+        {
+            case DataType::F16:
+                build_opts.emplace("#define DATA_TYPE_FP16");
+                break;
+
+            case DataType::F32:
+                build_opts.emplace("#define DATA_TYPE_FP32");
+                break;
+
+            default:
+                ARM_COMPUTE_ERROR("Current data type is not supported");
+                break;
+        }
+
+        build_opts.emplace("#define GEMM_MM_INTERLEAVED_TRANSPOSED");
+
+        // Create kernel
+        _kernel = GCKernelLibrary::get().create_kernel("gemm_mm_interleaved_transposed", build_opts);
+
+        // Configure kernel window
+        const unsigned int     num_elems_processed_per_iteration_x = max_gc_vector_width / data_size_from_type(input0->info()->data_type());
+        constexpr unsigned int num_elems_processed_per_iteration_y = 4;
+
+        win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+        AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f);
+        AccessWindowTranspose input1_access(input1->info(), 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f);
+        AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+        update_window_and_padding(win, input0_access, input1_access, output_access);
+
+        output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+    }
+    else
+    {
+        // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor
+        unsigned int num_elems_processed_per_iteration_x;
+        unsigned int num_elems_processed_per_iteration_y;
+
+        switch(input0->info()->data_type())
+        {
+            case DataType::F16:
+                num_elems_processed_per_iteration_x = 4;
+                num_elems_processed_per_iteration_y = 1;
+                build_opts.emplace("#define DATA_TYPE_FP16");
+                break;
+
+            case DataType::F32:
+                num_elems_processed_per_iteration_x = max_gc_vector_width / data_size_from_type(input0->info()->data_type());
+                num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->info()->dimension(1)), 4);
+                build_opts.emplace("#define DATA_TYPE_FP32");
+                break;
+
+            default:
+                ARM_COMPUTE_ERROR("Current data type is not supported");
+                break;
+        }
+
+        build_opts.emplace("#define GEMM_MM_FLOATING_POINT");
+        build_opts.emplace("#define NUM_ELEMS_PROCESSED_PER_THREAD_X " + support::cpp11::to_string(num_elems_processed_per_iteration_x));
+        build_opts.emplace("#define NUM_ELEMS_PROCESSED_PER_THREAD_Y " + support::cpp11::to_string(num_elems_processed_per_iteration_y));
+
+        // Create kernel
+        _kernel = GCKernelLibrary::get().create_kernel("gemm_mm_floating_point", build_opts);
+
+        win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+        AccessWindowStatic input0_access(input0->info(), 0, 0, ceil_to_multiple(input0->info()->dimension(0), num_elems_processed_per_iteration_x), ceil_to_multiple(input0->info()->dimension(1),
+                                         num_elems_processed_per_iteration_y));
+        AccessWindowStatic    input1_access(input1->info(), 0, 0, ceil_to_multiple(input1->info()->dimension(0), num_elems_processed_per_iteration_x), input1->info()->dimension(1));
+        AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+        update_window_and_padding(win, input0_access, input1_access, output_access);
+
+        Coordinates coord;
+        coord.set_num_dimensions(output->info()->num_dimensions());
+        output_access.set_valid_region(win, ValidRegion(coord, output->info()->tensor_shape()));
+    }
+
+    _kernel.clear_params();
+    _kernel.set_shader_params_binding_point(0);
+    IGCKernel::configure(win);
+}
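+
+// Illustrative usage (a sketch, not part of this patch): computes out = alpha * a * b.
+// With is_interleaved_transposed == true, the inputs are expected to have been
+// reshaped by the interleave and transpose1xW kernels first:
+//   GCGEMMMatrixMultiplyKernel mm;
+//   mm.configure(&a_interleaved, &b_transposed, &out, 1.f, true);   // hypothetical tensors
+//   mm.run(mm.window());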
+
+void GCGEMMMatrixMultiplyKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window);
+
+    _kernel.use();
+
+    Window slice          = window.first_slice_window_2D();
+    Window slice_matrix_b = slice;
+
+    slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
+    slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+    do
+    {
+        Window slice_b = slice;
+        // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+        if(_input1->info()->num_dimensions() < 3)
+        {
+            slice_b = slice_matrix_b;
+        }
+
+        unsigned int idx = 0;
+        switch(_input0->info()->data_type())
+        {
+            case DataType::F16:
+                add_2D_tensor_argument(idx, _input0, BufferParam(1, 2), slice);
+                add_2D_tensor_argument(idx, _input1, BufferParam(2, 3), slice_b);
+                add_2D_tensor_argument(idx, _output, BufferParam(3, 3), slice);
+                break;
+
+            case DataType::F32:
+                add_2D_tensor_argument(idx, _input0, BufferParam(1, 2), slice);
+                add_2D_tensor_argument(idx, _input1, BufferParam(2, 2), slice_b);
+                add_2D_tensor_argument(idx, _output, BufferParam(3, 2), slice);
+                break;
+
+            default:
+                ARM_COMPUTE_ERROR("Current data type is not supported");
+                break;
+        }
+
+        _kernel.update_shader_params();
+        enqueue(*this, slice);
+    }
+    while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.cpp
new file mode 100644
index 0000000..a1270b4
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.cpp
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h"
+
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cmath>
+
+using namespace arm_compute;
+
+void GCGEMMTranspose1xWKernel::configure(const IGCTensor *input, IGCTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+    TensorShape  output_shape{ input->info()->tensor_shape() };
+    const size_t transpose_w = 16 / input->info()->element_size();
+    output_shape.set(0, input->info()->dimension(1) * transpose_w);
+    output_shape.set(1, static_cast<size_t>(std::ceil((input->info()->dimension(0) / static_cast<float>(transpose_w)))));
+
+    // Auto-initialize the output tensor if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+
+    const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+    const int          scale_x                           = num_elems_processed_per_iteration;
+
+    _input  = input;
+    _output = output;
+
+    std::set<std::string> build_opts;
+    std::string           dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
+    build_opts.emplace(("#define " + dt_name));
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+    /*
+     * The following example shows how the transpose 1xW operation works when the input data type is F32
+     *
+     *         |a00 a01 a02 a03|
+     *         |a10 a11 a12 a13|
+     *         |a20 a21 a22 a23| = | a00 a01 a02 a03 || a10 a11 a12 a13 || a20 a21 a22 a23 || a30 a31 a32 a33 |
+     *         |a30 a31 a32 a33|
+     *
+     * The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor)
+     */
+    // Create kernel
+    build_opts.emplace("#define GEMM_TRANSPOSE1xW");
+    _kernel = GCKernelLibrary::get().create_kernel("gemm_transpose1x4", build_opts);
+
+    // Configure window
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+    ARM_COMPUTE_ERROR_ON_MSG((win.x().end() / scale_x) == 0, "Transposed shape would be 0 in the second dimension");
+
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowTranspose  output_access(output->info(), 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->info()->tensor_shape()));
+
+    _kernel.clear_params();
+    // set shader params binding point
+    _kernel.set_shader_params_binding_point(0);
+
+    IGCKernel::configure(win);
+}
+
+void GCGEMMTranspose1xWKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window);
+
+    _kernel.use();
+
+    // Output is transposed
+    Window out_window(window);
+    out_window.set(Window::DimX, window.y());
+    out_window.set(Window::DimY, window.x());
+
+    Window in_slice  = window.first_slice_window_2D();
+    Window out_slice = out_window.first_slice_window_2D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, _input, 1, in_slice);
+        add_2D_tensor_argument(idx, _output, 2, out_slice);
+
+        _kernel.update_shader_params();
+
+        enqueue(*this, in_slice);
+    }
+    while(window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice));
+}
diff --git a/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp
new file mode 100644
index 0000000..935d842
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp
@@ -0,0 +1,230 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
+
+#include <cmath>
+#include <tuple>
+
+using namespace arm_compute;
+
+GCIm2ColKernel::GCIm2ColKernel()
+    : _input(nullptr), _output(nullptr), _convolved_dims(), _num_elems_processed_per_iteration(1), _run_func(nullptr)
+{
+}
+
+void GCIm2ColKernel::configure(const IGCTensor *input, IGCTensor *output, std::pair<unsigned int, unsigned int> kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_UNUSED(kernel_dims);
+
+    _input  = input;
+    _output = output;
+    _kernel.clear_params();
+
+    std::set<std::string> build_opts;
+    std::string           dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+    build_opts.insert("#define " + dt_name);
+
+    if(has_bias)
+    {
+        build_opts.emplace("#define HAS_BIAS");
+    }
+
+    int pad_x    = 0;
+    int pad_y    = 0;
+    int stride_x = 0;
+    int stride_y = 0;
+    std::tie(pad_x, pad_y)       = conv_info.pad();
+    std::tie(stride_x, stride_y) = conv_info.stride();
+
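+    // The "reduced" path only applies when im2col degenerates to a flattening:
+    // the output width equals W * H * C of the input, any higher (batch) dimensions
+    // match, and the convolution uses a 1x1 stride with no padding.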
+    const bool run_img2col_reduced = (output->info()->dimension(0) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))) && (TensorShape::num_max_dimensions >= 4)
+                                     && (std::equal(input->info()->tensor_shape().cbegin() + 3,
+                                                    input->info()->tensor_shape().cend(),
+                                                    output->info()->tensor_shape().cbegin() + 1))
+                                     && ((stride_x == 1) && (stride_y == 1) && (pad_x == 0) && (pad_y == 0));
+
+    if(!run_img2col_reduced)
+    {
+        // This path is currently not used and not validated
+        build_opts.insert("#define IM2COL_GENERIC");
+        _convolved_dims = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1),
+                                            kernel_dims.first, kernel_dims.second,
+                                            conv_info);
+        _num_elems_processed_per_iteration = output->info()->dimension(0);
+
+        build_opts.emplace("#define KERNEL_WIDTH " + support::cpp11::to_string(kernel_dims.first));
+        build_opts.emplace("#define KERNEL_HEIGHT " + support::cpp11::to_string(kernel_dims.second));
+        build_opts.emplace("#define KERNEL_DEPTH " + support::cpp11::to_string(input->info()->dimension(2)));
+        build_opts.emplace("#define CONVOLVED_WIDTH " + support::cpp11::to_string(_convolved_dims.first));
+        build_opts.emplace("#define CONVOLVED_HEIGHT " + support::cpp11::to_string(_convolved_dims.second));
+        build_opts.emplace("#define STRIDE_X " + support::cpp11::to_string(conv_info.stride().first));
+        build_opts.emplace("#define STRIDE_Y " + support::cpp11::to_string(conv_info.stride().second));
+        build_opts.emplace("#define PAD_X " + support::cpp11::to_string(conv_info.pad().first));
+        build_opts.emplace("#define PAD_Y " + support::cpp11::to_string(conv_info.pad().second));
+        build_opts.emplace("#define SRC_WIDTH " + support::cpp11::to_string(input->info()->dimension(0)));
+        build_opts.emplace("#define SRC_HEIGHT " + support::cpp11::to_string(input->info()->dimension(1)));
+
+        // Create kernel
+        _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("im2col_generic", build_opts));
+
+        _run_func = &GCIm2ColKernel::run_generic;
+    }
+    else
+    {
+        build_opts.insert("#define IM2COL_REDUCED");
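+        // Process 4 bytes per iteration (one F32 value or two F16 values),
+        // presumably to match the shader's 32-bit loads.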
+        _num_elems_processed_per_iteration = 4 / input->info()->element_size();
+
+        // Create kernel
+        _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("im2col_reduced", build_opts));
+
+        _run_func = &GCIm2ColKernel::run_reduced;
+    }
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input->info(), Steps(_num_elems_processed_per_iteration));
+
+    if(input->info()->data_type() == DataType::F16)
+    {
+        // Calculate input right and bottom border
+        AccessWindowHorizontal input_access(input->info(), 0, _num_elems_processed_per_iteration);
+
+        // Calculate output right and bottom border
+        const int          output_width         = output->info()->dimension(0);
+        const int          output_height        = output->info()->dimension(1);
+        const int          output_padding_right = ceil_to_multiple(output_width, _num_elems_processed_per_iteration) - output_width;
+        AccessWindowStatic output_access(output->info(), 0, 0, output_width + output_padding_right, output_height);
+
+        update_window_and_padding(win, input_access, output_access);
+    }
+
+    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    if(!run_img2col_reduced)
+    {
+        // Set the Z dimension's step to the full extent of that dimension so the window cannot be split along Z
+        win.set_dimension_step(Window::DimZ, win[Window::DimZ].end() - win[Window::DimZ].start());
+    }
+
+    // set shader params binding point
+    _kernel.set_shader_params_binding_point(0);
+    IGCKernel::configure(win);
+}
+
+void GCIm2ColKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON(_run_func == nullptr);
+    (this->*_run_func)(window);
+}
+
+void GCIm2ColKernel::run_generic(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IGCKernel::window(), window);
+
+    // Get initial windows
+    Window window_collapsed = window.collapse_if_possible(IGCKernel::window(), Window::DimZ);
+    // Change the Z dimension's step back to 1
+    window_collapsed.set_dimension_step(Window::DimZ, 1);
+
+    Window slice     = window_collapsed.first_slice_window_3D();
+    Window slice_in  = window_collapsed.first_slice_window_3D();
+    Window slice_out = window_collapsed.first_slice_window_3D();
+
+    // Setup slice
+    slice.set(Window::DimX, Window::Dimension(0, static_cast<int>(_convolved_dims.first), 1));
+    slice.set(Window::DimY, Window::Dimension(0, static_cast<int>(_convolved_dims.second), 1));
+
+    // Setup input slice
+    // The first three dimensions of the input are increased by the inner loops
+    slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+    slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+    slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+    // Setup output slice
+    slice_out.set(Window::DimX, Window::Dimension(0, _output->info()->dimension(0), _num_elems_processed_per_iteration));
+    slice_out.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 1));
+    slice_out.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+    _kernel.use();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, 1, slice_in);
+        add_2D_tensor_argument(idx, _output, 2, slice_out);
+
+        _kernel.set_params(idx++, static_cast<unsigned int>(_input->info()->dimension(2)));
+        _kernel.set_params(idx++, static_cast<unsigned int>(_input->info()->strides_in_bytes()[3]));
+        _kernel.set_params(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[3]));
+        _kernel.update_shader_params();
+
+        enqueue(*this, slice);
+    }
+    while(window_collapsed.slide_window_slice_3D(slice) && window_collapsed.slide_window_slice_3D(slice_out) && window_collapsed.slide_window_slice_3D(slice_in));
+}
+
+void GCIm2ColKernel::run_reduced(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IGCKernel::window(), window);
+
+    Window out_window;
+    out_window.use_tensor_dimensions(_output->info()->tensor_shape());
+
+    Window out_slice = out_window.first_slice_window_1D();
+    Window in_slice  = window.first_slice_window_3D();
+
+    _kernel.use();
+
+    // Run kernel
+    do
+    {
+        // Set arguments
+        unsigned int idx = 0;
+
+        add_3D_tensor_argument(idx, _input, 1, in_slice);
+        add_1D_tensor_argument(idx, _output, 2, out_slice);
+        _kernel.set_params(idx++, _input->info()->dimension(0));
+        _kernel.set_params(idx++, _input->info()->dimension(1));
+        _kernel.update_shader_params();
+
+        enqueue(*this, in_slice);
+    }
+    while(window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_1D(out_slice));
+}
diff --git a/src/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.cpp
new file mode 100644
index 0000000..65e54f5
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.cpp
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.h"
+
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <string>
+
+using namespace arm_compute;
+
+GCNormalizationLayerKernel::GCNormalizationLayerKernel()
+    : _input(nullptr), _squared_input(nullptr), _output(nullptr), _border_size(0)
+{
+}
+
+BorderSize GCNormalizationLayerKernel::border_size() const
+{
+    return _border_size;
+}
+
+void GCNormalizationLayerKernel::configure(const IGCTensor *input, const IGCTensor *squared_input, IGCTensor *output, NormalizationLayerInfo norm_info)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
+    ARM_COMPUTE_ERROR_ON_MSG(norm_info.type() == NormType::IN_MAP_2D, "2D In-Map Normalization not implemented");
+
+    // Set build options
+    std::set<std::string> build_opts;
+
+    _input         = input;
+    _squared_input = squared_input;
+    _output        = output;
+
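+    // In-map normalization reads up to norm_size / 2 neighbours on each side, so it
+    // needs a horizontal border; the width is capped at 3, presumably the largest
+    // padding the shader supports. Cross-map normalization needs no border.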
+    const bool         is_in_map    = (norm_info.type() == NormType::IN_MAP_1D);
+    const unsigned int border_width = is_in_map ? std::min(norm_info.norm_size() / 2, 3U) : 0;
+    _border_size                    = BorderSize(0, border_width);
+
+    // Set kernel static arguments
+    std::string func_name = ((norm_info.type() == NormType::IN_MAP_1D) ? "IN_MAP_1D" : "CROSS_MAP");
+    build_opts.emplace(("#define " + func_name));
+    build_opts.emplace(("#define COEFF " + float_to_string_with_full_precision(norm_info.scale_coeff())));
+    build_opts.emplace(("#define BETA " + float_to_string_with_full_precision(norm_info.beta())));
+    build_opts.emplace(("#define KAPPA " + float_to_string_with_full_precision(norm_info.kappa())));
+    build_opts.emplace(("#define RADIUS " + support::cpp11::to_string(norm_info.norm_size() / 2)));
+    build_opts.emplace(("#define LOCAL_SIZE_X " + support::cpp11::to_string(1)));
+    build_opts.emplace(("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1)));
+    build_opts.emplace(("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1)));
+
+    // Create kernel
+    _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("normalization_layer", build_opts));
+
+    // Configure kernel window
+    const unsigned int num_elems_processed_per_iteration = 1;
+    const unsigned int num_elems_read_per_iteration      = num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2);
+
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input_access(input->info(), -_border_size.left, num_elems_read_per_iteration);
+    AccessWindowHorizontal squared_input_access(squared_input->info(), -_border_size.left, num_elems_read_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win, input_access, squared_input_access, output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region());
+
+    _kernel.clear_params();
+
+    _kernel.set_shader_params_binding_point(0);
+
+    IGCKernel::configure(win);
+}
+
+void GCNormalizationLayerKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window);
+
+    _kernel.use();
+
+    Window slice = window.first_slice_window_3D();
+
+    do
+    {
+        unsigned int idx     = 0;
+        unsigned int binding = 1;
+        add_3D_tensor_argument(idx, _input, binding++, slice);
+        add_3D_tensor_argument(idx, _squared_input, binding++, slice);
+        add_3D_tensor_argument(idx, _output, binding++, slice);
+
+        _kernel.update_shader_params();
+
+        enqueue(*this, slice);
+    }
+    while(window.slide_window_slice_3D(slice));
+}
diff --git a/src/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.cpp
new file mode 100644
index 0000000..2b5cee4
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.cpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cmath>
+#include <cstdlib>
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+GCPixelWiseMultiplicationKernel::GCPixelWiseMultiplicationKernel()
+    : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void GCPixelWiseMultiplicationKernel::configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output, float scale)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+
+    // Auto-initialize the output if not yet initialized
+    set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+    set_format_if_unknown(*output->info(), Format::F32);
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output);
+    ARM_COMPUTE_ERROR_ON_MSG(scale < 0, "Scale cannot be negative");
+
+    _input1 = input1;
+    _input2 = input2;
+    _output = output;
+
+    // Set kernel build options
+    std::set<std::string> build_opts;
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+
+    build_opts.emplace("#define SCALE " + support::cpp11::to_string(scale));
+
+    // Create kernel
+    _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("pixelwise_mul_float", build_opts));
+
+    _kernel.clear_params();
+
+    // Configure kernel window
+    constexpr unsigned int num_elems_processed_per_iteration = 1;
+
+    Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win, input1_access, input2_access, output_access);
+
+    ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
+                                                       input2->info()->valid_region());
+    output_access.set_valid_region(win, valid_region);
+
+    // set shader params binding point
+    _kernel.set_shader_params_binding_point(0);
+
+    IGCKernel::configure(win);
+}
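+
+// Illustrative usage (a sketch, not part of this patch): computes
+// out = in1 * in2 * scale element-wise:
+//   GCPixelWiseMultiplicationKernel mul;
+//   mul.configure(&in1, &in2, &out, 1.f);   // hypothetical tensors
+//   mul.run(mul.window());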
+
+void GCPixelWiseMultiplicationKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window);
+
+    _kernel.use();
+
+    Window slice = window.first_slice_window_3D();
+
+    do
+    {
+        unsigned int idx     = 0;
+        unsigned int binding = 1;
+        add_3D_tensor_argument(idx, _input1, binding++, slice);
+        add_3D_tensor_argument(idx, _input2, binding++, slice);
+        add_3D_tensor_argument(idx, _output, binding++, slice);
+
+        _kernel.update_shader_params();
+        enqueue(*this, slice);
+    }
+    while(window.slide_window_slice_3D(slice));
+}
diff --git a/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp
new file mode 100644
index 0000000..c877da3
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+#include <tuple>
+
+using namespace arm_compute;
+
+GCPoolingLayerKernel::GCPoolingLayerKernel()
+    : _input(nullptr), _output(nullptr), _pool_info(), _border_size(0), _num_elems_processed_per_iteration(1)
+{
+}
+
+BorderSize GCPoolingLayerKernel::border_size() const
+{
+    return _border_size;
+}
+
+void GCPoolingLayerKernel::configure(const IGCTensor *input, IGCTensor *output, const PoolingLayerInfo &pool_info)
+{
+    int                 pool_pad_x      = 0;
+    int                 pool_pad_y      = 0;
+    int                 pool_stride_x   = 0;
+    int                 pool_stride_y   = 0;
+    unsigned int        pooled_w        = 0;
+    unsigned int        pooled_h        = 0;
+    const PoolingType   pool_type       = pool_info.pool_type();
+    const int           pool_size       = pool_info.pool_size();
+    const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
+    std::tie(pool_pad_x, pool_pad_y)       = pad_stride_info.pad();
+    std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
+
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_ERROR_ON(pool_pad_x >= pool_size || pool_pad_y >= pool_size);
+    ARM_COMPUTE_ERROR_ON(pool_size > 7 && is_data_type_fixed_point(input->info()->data_type()));
+
+    // Check output dimensions
+    std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0),
+                                                     input->info()->dimension(1),
+                                                     pool_size,
+                                                     pool_size,
+                                                     pool_info.pad_stride_info());
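+    // e.g. a 7x7 input with pool_size 3, stride 2 and no padding gives
+    // pooled_w = pooled_h = (7 - 3) / 2 + 1 = 3.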
+
+    // Output auto initialization if not yet initialized
+    {
+        TensorShape output_shape{ input->info()->tensor_shape() };
+        output_shape.set(0, pooled_w);
+        output_shape.set(1, pooled_h);
+
+        auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+    }
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pooled_w) || (output->info()->dimension(1) != pooled_h));
+
+    const int input_width  = input->info()->dimension(0);
+    const int input_height = input->info()->dimension(1);
+
+    // Set instance variables
+    _input       = input;
+    _output      = output;
+    _pool_info   = pool_info;
+    _border_size = BorderSize(pool_pad_y, pool_pad_x);
+
+    // Set build options
+    std::set<std::string> build_opts;
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+    if(input->info()->data_type() == DataType::F32)
+    {
+        build_opts.insert("#define DATA_TYPE_FP32");
+    }
+    else
+    {
+        build_opts.insert("#define DATA_TYPE_FP16");
+    }
+    build_opts.emplace(("#define POOL_" + string_from_pooling_type(pool_type)));
+    build_opts.emplace(("#define STRIDE_X " + support::cpp11::to_string(pool_stride_x)));
+    build_opts.emplace(("#define MAX_WIDTH " + support::cpp11::to_string(input->info()->dimension(0) + pool_pad_x)));
+    build_opts.emplace(("#define MAX_HEIGHT " + support::cpp11::to_string(input->info()->dimension(1) + pool_pad_y)));
+    build_opts.emplace(("#define STRIDE_Y " + support::cpp11::to_string(pool_stride_y)));
+    build_opts.emplace(("#define PAD_X " + support::cpp11::to_string(pool_pad_x)));
+    build_opts.emplace(("#define PAD_Y " + support::cpp11::to_string(pool_pad_y)));
+
+    // Create kernel
+    if((pool_size == 2) || (pool_size == 3) || (pool_size == 7))
+    {
+        // Check if we have pool3x3 with stride_x less than or equal to 3. In these cases, run an optimized OpenGLES kernel where
+        // each thread computes 4 output elements.
+        const bool is_pool3x3_stride_le3 = (pool_size == 3) && (pool_stride_x <= 3) && !is_data_type_fixed_point(input->info()->data_type());
+
+        int num_elements_read_per_iteration = (pool_size == 7) ? 8 : pool_size;
+
+        if(input->info()->data_type() == DataType::F32)
+        {
+            if(is_pool3x3_stride_le3)
+            {
+                // Change the number of elements processed and read per iteration for 3x3 pooling with stride less than or equal to 3
+                _num_elems_processed_per_iteration = 4;
+                num_elements_read_per_iteration    = pool_size * (pool_stride_x + 1);
+            }
+        }
+        else
+        {
+            num_elements_read_per_iteration = pool_size;
+            if(is_pool3x3_stride_le3)
+            {
+                _num_elems_processed_per_iteration = 4;
+            }
+            else
+            {
+                _num_elems_processed_per_iteration = 2;
+            }
+        }
+
+        const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elements_read_per_iteration) - input_width;
+        const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
+
+        _border_size.right  = std::max(upper_bound_w, pool_pad_x);
+        _border_size.bottom = std::max(upper_bound_h, pool_pad_y);
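+        // upper_bound_* is how far the rightmost/bottom pooling window can read
+        // past the input edge; clamping to at least the pad size keeps the
+        // border large enough for both the padding and the vectorised reads.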
+
+        std::string kernel_name = "pooling_layer_" + support::cpp11::to_string(pool_size);
+        if(is_pool3x3_stride_le3)
+        {
+            build_opts.insert("#define POOLING_LAYER_3_OPTIMIZED");
+            _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel(kernel_name + "_optimized", build_opts));
+        }
+        else
+        {
+            build_opts.insert("#define POOLING_LAYER_" + support::cpp11::to_string(pool_size));
+            _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel(kernel_name, build_opts));
+        }
+    }
+    else // Run general case
+    {
+        if(input->info()->data_type() == DataType::F32)
+        {
+            _num_elems_processed_per_iteration = 1;
+        }
+        else
+        {
+            _num_elems_processed_per_iteration = 2;
+        }
+        const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + pool_size) - input_width;
+        const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
+
+        _border_size.right  = std::max(upper_bound_w, pool_pad_x);
+        _border_size.bottom = std::max(upper_bound_h, pool_pad_y);
+
+        build_opts.emplace(("#define POOL_SIZE " + support::cpp11::to_string(pool_size)));
+
+        build_opts.insert("#define POOLING_LAYER_N");
+        _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("pooling_layer_n", build_opts));
+    }
+
+    Window win = calculate_max_window(*output->info(), Steps(_num_elems_processed_per_iteration));
+
+    if(input->info()->data_type() == DataType::F32)
+    {
+        AccessWindowStatic     input_access(input->info(), -pool_pad_x, -pool_pad_y, input_width + _border_size.right, input_height + _border_size.bottom);
+        AccessWindowHorizontal output_access(output->info(), 0, _num_elems_processed_per_iteration);
+        update_window_and_padding(win, input_access, output_access);
+        output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+    }
+    else
+    {
+        // Calculate output right and bottom border
+        const int output_width          = output->info()->dimension(0);
+        const int output_height         = output->info()->dimension(1);
+        const int output_padding_right  = ceil_to_multiple(output_width, _num_elems_processed_per_iteration) - output_width;
+        const int output_padding_bottom = ceil_to_multiple(output_height, 1) - output_height;
+        const int input_padding_right   = ceil_to_multiple(input_width + 2 * _border_size.right, _num_elems_processed_per_iteration) - (input_width + 2 * _border_size.right);
+        const int input_padding_bottom  = ceil_to_multiple(input_height + 2 * _border_size.bottom, 1) - (input_height + 2 * _border_size.bottom);
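+        // The extra *_padding_right/bottom round both tensors up to a multiple
+        // of the processing step so the F16 path can always access whole vectors.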
+
+        // Configure kernel window
+        AccessWindowStatic input_access(input->info(), -pool_pad_x, -pool_pad_y, input_width + _border_size.right + input_padding_right, input_height + _border_size.bottom + input_padding_bottom);
+        AccessWindowStatic output_access(output->info(), 0, 0, output_width + output_padding_right, output_height + output_padding_bottom);
+        update_window_and_padding(win, input_access, output_access);
+        output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+    }
+
+    _kernel.clear_params();
+    _kernel.set_shader_params_binding_point(0);
+
+    IGCKernel::configure(win);
+}
+
+void GCPoolingLayerKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    unsigned int pool_pad_x    = 0;
+    unsigned int pool_pad_y    = 0;
+    unsigned int pool_stride_x = 0;
+    unsigned int pool_stride_y = 0;
+    std::tie(pool_pad_x, pool_pad_y)       = _pool_info.pad_stride_info().pad();
+    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+
+    _kernel.use();
+
+    Window window_collapsed = window.collapse_if_possible(IGCKernel::window(), Window::DimZ);
+    Window slice            = window_collapsed.first_slice_window_3D();
+
+    do
+    {
+        // Map the output window back onto the input: scale by the stride, shift back by the padding
+        Window in_slice(slice);
+        in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - pool_pad_x, in_slice.x().end() * pool_stride_x, pool_stride_x * _num_elems_processed_per_iteration));
+        in_slice.set(Window::DimY, Window::Dimension(in_slice.y().start() - pool_pad_y, in_slice.y().end() * pool_stride_y, pool_stride_y));
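+        // e.g. with stride 2 and pad 0, output element x reads input starting
+        // at 2 * x; each invocation then advances by stride times the number of
+        // output elements it produces along X.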
+
+        // Set inputs
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, 1, in_slice);
+        add_3D_tensor_argument(idx, _output, 2, slice);
+
+        _kernel.update_shader_params();
+        enqueue(*this, slice);
+    }
+    while(window_collapsed.slide_window_slice_3D(slice));
+}
diff --git a/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp
new file mode 100644
index 0000000..09a0f79
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp
@@ -0,0 +1,353 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
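+// The softmax is computed in three passes over each row:
+//   1) GCLogits1DMaxKernel:         m   = max_i(x_i)
+//   2) GCLogits1DShiftExpSumKernel: e_i = exp(x_i - m), s = sum_i(e_i)
+//   3) GCLogits1DNormKernel:        y_i = e_i / s
+// Subtracting the row maximum first keeps exp() in range for large logits.
+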
+void GCLogits1DMaxKernel::configure(const IGCTensor *input, IGCTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+    // Softmax across the x dimension
+    TensorShape output_shape{ input->info()->tensor_shape() };
+    output_shape.set(0, 1);
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+
+    _input  = input;
+    _output = output;
+
+    // Set build options
+    std::set<std::string> build_opts;
+    std::string           dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
+    build_opts.insert("#define " + dt_name);
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+    build_opts.insert("#define SOFTMAX_LAYER_MAX");
+
+    // Tell the kernel that the width is not a multiple of 4
+    if((input->info()->dimension(0) % 4) != 0)
+    {
+        build_opts.insert("#define NON_MULTIPLE_OF_4");
+    }
+
+    // Create kernel
+    _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("softmax_layer_max", build_opts));
+
+    _kernel.clear_params();
+
+    // Set fixed arguments
+    unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the input and output parameters
+    _kernel.set_params(idx++, input->info()->dimension(0));
+
+    // Configure kernel window
+    // The kernel loops over all elements in steps of 4
+    const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->info()->dimension(0), 4);
+    unsigned int       num_elems_written_per_iteration   = 1;
+    if(input->info()->data_type() == DataType::F16)
+    {
+        num_elems_written_per_iteration = 2;
+    }
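+    // e.g. a row width of 10 gives ceil_to_multiple(10, 4) = 12, so a single
+    // window step covers the whole row and the shader handles the 2-element
+    // tail via the NON_MULTIPLE_OF_4 define set above.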
+
+    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    // set shader params binding point
+    _kernel.set_shader_params_binding_point(0);
+
+    IGCKernel::configure(win);
+}
+
+void GCLogits1DMaxKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    Window slice = window.first_slice_window_3D();
+
+    _kernel.use();
+
+    do
+    {
+        unsigned int idx1 = 0;
+        switch(_input->info()->data_type())
+        {
+            case DataType::F16:
+            case DataType::F32:
+                add_3D_tensor_argument(idx1, _input, BufferParam(1, 2), slice);
+                add_3D_tensor_argument(idx1, _output, BufferParam(2, 2), slice);
+                break;
+
+            default:
+                ARM_COMPUTE_ERROR("Current data type is not supported");
+                break;
+        }
+
+        _kernel.update_shader_params();
+        enqueue(*this, slice);
+    }
+    while(window.slide_window_slice_3D(slice));
+}
+
+GCLogits1DShiftExpSumKernel::GCLogits1DShiftExpSumKernel()
+    : _input(nullptr), _max(nullptr), _output(nullptr), _sum(nullptr)
+{
+}
+
+void GCLogits1DShiftExpSumKernel::configure(const IGCTensor *input, const IGCTensor *max, IGCTensor *output, IGCTensor *sum)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(max, sum, output);
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*sum->info(), max->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, max, sum);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(max, sum);
+
+    _input  = input;
+    _max    = max;
+    _output = output;
+    _sum    = sum;
+
+    // Set build options
+    std::set<std::string> build_opts;
+    std::string           dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
+    build_opts.insert("#define " + dt_name);
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+    build_opts.insert("#define SOFTMAX_LAYER_SHIFT_EXP_SUM");
+
+    // Tell the kernel that the width is not a multiple of 4
+    if((input->info()->dimension(0) % 4) != 0)
+    {
+        build_opts.insert("#define NON_MULTIPLE_OF_4");
+    }
+
+    // Create kernel
+    _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("softmax_layer_shift_exp_sum", build_opts));
+
+    _kernel.clear_params();
+
+    // Set fixed arguments
+    unsigned int idx = 4 * num_arguments_per_3D_tensor(); // Skip the input, max, output and sum parameters
+    _kernel.set_params(idx++, input->info()->dimension(0));
+
+    // Configure window
+    // The kernel loops over all elements in steps of 4
+    const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->info()->dimension(0), 4);
+    unsigned int       num_elems_written_per_iteration   = 1;
+    if(input->info()->data_type() == DataType::F16)
+    {
+        num_elems_written_per_iteration = 2;
+    }
+
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal max_access(max->info(), 0, num_elems_written_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal sum_access(sum->info(), 0, num_elems_written_per_iteration);
+
+    update_window_and_padding(win, input_access, max_access, output_access, sum_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region());
+    sum_access.set_valid_region(win, ValidRegion(Coordinates(), sum->info()->tensor_shape()));
+
+    // set shader params binding point
+    _kernel.set_shader_params_binding_point(0);
+
+    IGCKernel::configure(win);
+}
+
+void GCLogits1DShiftExpSumKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    Window window_collapsed = window.collapse_if_possible(IGCKernel::window(), Window::DimZ);
+    Window slice            = window_collapsed.first_slice_window_3D();
+
+    _kernel.use();
+
+    do
+    {
+        unsigned int idx = 0;
+        switch(_input->info()->data_type())
+        {
+            case DataType::F16:
+            case DataType::F32:
+                add_3D_tensor_argument(idx, _input, BufferParam(1, 2), slice);
+                add_3D_tensor_argument(idx, _max, BufferParam(2, 2), slice);
+                add_3D_tensor_argument(idx, _output, BufferParam(3, 2), slice);
+                add_3D_tensor_argument(idx, _sum, BufferParam(4, 2), slice);
+                break;
+
+            default:
+                ARM_COMPUTE_ERROR("Current data type is not supported");
+                break;
+        }
+
+        _kernel.update_shader_params();
+        enqueue(*this, slice);
+    }
+    while(window_collapsed.slide_window_slice_3D(slice));
+}
+
+GCLogits1DNormKernel::GCLogits1DNormKernel()
+    : _input(nullptr), _sum(nullptr), _output(nullptr)
+{
+}
+
+void GCLogits1DNormKernel::configure(const IGCTensor *input, const IGCTensor *sum, IGCTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(sum, output);
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, sum, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
+    _input  = input;
+    _sum    = sum;
+    _output = output;
+
+    // Set build options
+    std::set<std::string> build_opts;
+    std::string           dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
+    build_opts.insert("#define " + dt_name);
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+    build_opts.insert("#define SOFTMAX_LAYER_NORM");
+
+    // Create kernel
+    _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("softmax_layer_norm", build_opts));
+
+    // Configure window
+    constexpr unsigned int num_elems_processed_per_iteration = 4;
+    unsigned int           num_elems_written_per_iteration   = 1;
+    if(input->info()->data_type() == DataType::F16)
+    {
+        num_elems_written_per_iteration = 2;
+    }
+
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowStatic     sum_access(sum->info(), 0, 0, num_elems_written_per_iteration, sum->info()->dimension(1));
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win, input_access, sum_access, output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region());
+
+    _kernel.clear_params();
+
+    // set shader params binding point
+    _kernel.set_shader_params_binding_point(0);
+
+    IGCKernel::configure(win);
+}
+
+void GCLogits1DNormKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    Window window_collapsed = window.collapse_if_possible(IGCKernel::window(), Window::DimZ);
+    Window slice            = window_collapsed.first_slice_window_3D();
+
+    _kernel.use();
+
+    do
+    {
+        Window sum_slice = slice;
+        sum_slice.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+        unsigned int idx1 = 0;
+        switch(_input->info()->data_type())
+        {
+            case DataType::F16:
+            case DataType::F32:
+                add_3D_tensor_argument(idx1, _input, BufferParam(1, 2), slice);
+                add_3D_tensor_argument(idx1, _sum, BufferParam(2, 2), sum_slice);
+                add_3D_tensor_argument(idx1, _output, BufferParam(3, 2), slice);
+                break;
+
+            default:
+                ARM_COMPUTE_ERROR("Current data type is not supported");
+                break;
+        }
+
+        _kernel.update_shader_params();
+        enqueue(*this, slice);
+    }
+    while(window_collapsed.slide_window_slice_3D(slice));
+}
diff --git a/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp
new file mode 100644
index 0000000..b891b42
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h"
+
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+void GCTransposeKernel::configure(const IGCTensor *input, IGCTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+    TensorShape  output_shape{ input->info()->tensor_shape() };
+    const size_t w_out = input->info()->dimension(1);
+    const size_t h_out = input->info()->dimension(0);
+    output_shape.set(0, w_out);
+    output_shape.set(1, h_out);
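+    // e.g. a 5x3 input produces a 3x5 output; dimensions above the first two
+    // are left unchanged.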
+
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+    _input  = input;
+    _output = output;
+
+    std::set<std::string> build_opts;
+    std::string           dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
+    build_opts.emplace(("#define " + dt_name));
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+
+    // Create kernel
+    _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("transpose", build_opts));
+
+    _kernel.clear_params();
+
+    // Configure kernel window
+    const unsigned int num_elems_processed_per_iteration = 4;
+
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration, num_elems_processed_per_iteration));
+
+    AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
+    AccessWindowTranspose output_access(output->info(), 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region());
+
+    // set shader params binding point
+    _kernel.set_shader_params_binding_point(0);
+
+    IGCKernel::configure(win);
+}
+
+void GCTransposeKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window);
+
+    _kernel.use();
+
+    Window slice = window.first_slice_window_2D();
+
+    do
+    {
+        unsigned int idx = 0;
+        if(_input->info()->data_type() == DataType::F32)
+        {
+            add_2D_tensor_argument(idx, _input, 1, slice);
+            add_2D_tensor_argument(idx, _output, 2, slice);
+        }
+        else if(_input->info()->data_type() == DataType::F16)
+        {
+            add_2D_tensor_argument(idx, _input, BufferParam(1, 3), slice);
+            add_2D_tensor_argument(idx, _output, BufferParam(2, 3), slice);
+        }
+
+        _kernel.update_shader_params();
+        enqueue(*this, slice);
+    }
+    while(window.slide_window_slice_2D(slice));
+}