/*
* Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"
#include "arm_compute/core/utils/misc/InfoHelpers.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "src/runtime/cpu/operators/CpuDepthwiseConv2d.h"
using namespace arm_compute::misc;
using namespace arm_compute::misc::shape_calculator;
namespace arm_compute
{
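// This runtime function is a thin wrapper around the cpu::CpuDepthwiseConv2d operator:
// configure() works on ITensorInfo descriptors, and run() re-packages the live tensors
// into an ITensorPack for the operator at execution time.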
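// Pimpl state for the optimized (assembly-dispatch) path. The SRC_*/DST_*/INT_*
// tags mirror the TensorType slots used to build the ITensorPack in run().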
struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::Impl
{
ITensor *src{ nullptr }; // SRC_0
ITensor *dst{ nullptr }; // DST_0
const ITensor *weights{ nullptr }; // SRC_1
const ITensor *biases{ nullptr }; // SRC_2
Tensor permuted_input{}; // INT_0
Tensor permuted_weights{}; // INT_1
Tensor permuted_output{}; // INT_2
Tensor workspace{}; // INT_3
Tensor packed_weights{}; // INT_4
std::shared_ptr<cpu::CpuDepthwiseConv2d> op{ nullptr };
bool is_prepared{ false };
bool permute{ false };
};
NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::NEDepthwiseConvolutionLayerOptimizedInternal(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(memory_manager), _impl(std::make_unique<Impl>())
{
}
void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure(ITensor *input,
const ITensor *weights,
const ITensor *biases,
ITensor *output, const PadStrideInfo &conv_info,
unsigned int depth_multiplier,
const ActivationLayerInfo &act_info,
const Size2D &dilation)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
// The assembly path works on NHWC data, so NCHW inputs and outputs must be permuted
bool is_nchw = input->info()->data_layout() == DataLayout::NCHW;
_impl->src = input;
_impl->weights = weights;
_impl->biases = biases;
_impl->dst = output;
_impl->permute = is_nchw;
_impl->op = std::make_unique<cpu::CpuDepthwiseConv2d>();
ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
_impl->op->configure(_impl->src->info(), _impl->weights->info(), _impl->biases == nullptr ? nullptr : _impl->biases->info(),
_impl->dst->info(), info);
// Configure pipeline
// ReLU and bounded ReLU can be fused into the assembly kernel; any other activation
// must run as a separate layer, so it is stripped from the info passed to the
// assembly dispatch below.
ActivationLayerInfo act_info_to_use = ActivationLayerInfo();
const bool is_relu = arm_compute::utils::info_helpers::is_relu(act_info);
const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(act_info);
const bool needs_separate_activation = act_info.enabled() && !(is_relu || is_relu6);
if(!needs_separate_activation)
{
act_info_to_use = act_info;
}
info = ConvolutionInfo{ conv_info, depth_multiplier, act_info_to_use, dilation };
auto dwc_optimized_func = std::make_unique<cpu::CpuDepthwiseConv2dAssemblyDispatch>();
if(is_nchw)
{
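// NCHW pipeline:
//   src (NCHW) -> permute(2,0,1) -> NHWC -> assembly depthwise conv -> permute(1,2,0) -> dst (NCHW)
// The buffer for the permuted weights is set up once in prepare().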
auto permute_input = std::make_unique<cpu::CpuPermute>();
auto permute_weights = std::make_unique<cpu::CpuPermute>();
auto permute_output = std::make_unique<cpu::CpuPermute>();
_memory_group.manage(&_impl->permuted_input);
_memory_group.manage(&_impl->permuted_weights);
_memory_group.manage(&_impl->permuted_output);
// Configure the function to transform the input tensor from NCHW -> NHWC
permute_input->configure(input->info(), _impl->permuted_input.info(), PermutationVector(2U, 0U, 1U));
_impl->permuted_input.info()->set_data_layout(DataLayout::NHWC);
// Configure the function to transform the weights tensor from IHW -> HWI
permute_weights->configure(weights->info(), _impl->permuted_weights.info(), PermutationVector(2U, 0U, 1U));
_impl->permuted_weights.info()->set_data_layout(DataLayout::NHWC);
_impl->permuted_output.info()->set_data_layout(DataLayout::NHWC);
_impl->permuted_output.info()->set_quantization_info(output->info()->quantization_info());
// Configure optimized depthwise
dwc_optimized_func->configure(_impl->permuted_input.info(), _impl->permuted_weights.info(), biases == nullptr ? nullptr : biases->info(), _impl->permuted_output.info(), info);
_impl->permuted_output.info()->set_data_layout(DataLayout::NHWC);
// Configure the function to transform the convolved output back to ACL's native NCHW layout
permute_output->configure(_impl->permuted_output.info(), output->info(), PermutationVector(1U, 2U, 0U));
_impl->permuted_input.allocator()->allocate();
_impl->permuted_output.allocator()->allocate();
}
else
{
dwc_optimized_func->configure(_impl->src->info(), _impl->weights->info(), biases == nullptr ? nullptr : biases->info(), _impl->dst->info(), info);
}
// Allocate memory based on the internal memory requirements
experimental::MemoryRequirements mem_req = dwc_optimized_func->workspace();
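// Each buffer is over-allocated by its alignment requirement so that an aligned
// start address always fits within the allocation.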
_impl->workspace.allocator()->init(TensorInfo(TensorShape{ mem_req[0].size + mem_req[0].alignment }, 1, DataType::S8), mem_req[0].alignment);
_impl->packed_weights.allocator()->init(TensorInfo(TensorShape{ mem_req[1].size + mem_req[1].alignment }, 1, DataType::S8), mem_req[1].alignment);
_memory_group.manage(&_impl->workspace);
_memory_group.manage(&_impl->packed_weights);
_impl->workspace.allocator()->allocate();
_impl->packed_weights.allocator()->allocate();
}
Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::validate(const ITensorInfo *input,
const ITensorInfo *weights,
const ITensorInfo *biases,
const ITensorInfo *output,
const PadStrideInfo &conv_info,
unsigned int depth_multiplier,
const ActivationLayerInfo &act_info,
const Size2D &dilation)
{
ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info);
}
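// All tensors are handed to the operator through an ITensorPack: the ACL_SRC_*/ACL_DST_*
// slots carry the caller's tensors, the ACL_INT_* slots the function-owned intermediates.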
void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::run()
{
prepare();
MemoryGroupResourceScope scope_mg(_memory_group);
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC_0, _impl->src);
pack.add_tensor(TensorType::ACL_SRC_1, _impl->weights);
pack.add_tensor(TensorType::ACL_SRC_2, _impl->biases);
pack.add_tensor(TensorType::ACL_INT_0, &_impl->permuted_input);
pack.add_tensor(TensorType::ACL_INT_1, &_impl->permuted_weights);
pack.add_tensor(TensorType::ACL_INT_2, &_impl->permuted_output);
pack.add_tensor(TensorType::ACL_INT_3, &_impl->workspace);
pack.add_tensor(TensorType::ACL_INT_4, &_impl->packed_weights);
pack.add_tensor(TensorType::ACL_DST_0, _impl->dst);
_impl->op->run(pack);
}
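// One-shot preparation: allocate the buffer for the permuted weights, then release
// it again if the operator did not mark it as used.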
void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::prepare()
{
if(!_impl->is_prepared)
{
// Permute weights
if(_impl->permute)
{
_impl->permuted_weights.allocator()->allocate();
}
if(!_impl->permuted_weights.is_used())
{
_impl->permuted_weights.allocator()->free();
}
_impl->is_prepared = true;
}
}
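// Generic (portable) path, selected when the assembly kernels do not support the
// configuration; see CpuDepthwiseConv2d::get_depthwiseconvolution_function() below.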
struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::Impl
{
Tensor permuted_input{};
Tensor permuted_weights{};
Tensor permuted_output{};
bool is_prepared{ false };
bool is_nchw{ false };
bool is_activationlayer_enabled{ false };
const ITensor *weights{ nullptr };
const ITensor *biases{ nullptr };
const ITensor *src{ nullptr };
ITensor *dst{ nullptr };
std::shared_ptr<cpu::CpuDepthwiseConv2d> op{ nullptr };
};
NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::NEDepthwiseConvolutionLayerGeneric()
: _impl(std::make_unique<Impl>())
{
}
void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayer::validate(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(),
output->info(), conv_info, depth_multiplier, act_info, dilation));
const ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
_impl->op = std::make_unique<cpu::CpuDepthwiseConv2d>();
_impl->op->configure(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), info);
_impl->src = input;
_impl->dst = output;
_impl->weights = weights;
_impl->biases = biases;
_impl->is_nchw = input->info()->data_layout() == DataLayout::NCHW;
_impl->is_prepared = !_impl->is_nchw;
ITensor *input_to_use = input;
const ITensor *weights_to_use = weights;
ITensor *output_to_use = output;
if(_impl->is_nchw)
{
auto permute_input = std::make_unique<cpu::CpuPermute>();
auto permute_weights = std::make_unique<cpu::CpuPermute>();
permute_input->configure(input->info(), _impl->permuted_input.info(), PermutationVector(2U, 0U, 1U));
_impl->permuted_input.info()->set_data_layout(DataLayout::NHWC);
input_to_use = &_impl->permuted_input;
permute_weights->configure(weights->info(), _impl->permuted_weights.info(), PermutationVector(2U, 0U, 1U));
_impl->permuted_weights.info()->set_data_layout(DataLayout::NHWC);
weights_to_use = &_impl->permuted_weights;
_impl->permuted_output.allocator()->init(output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape()));
output_to_use = &_impl->permuted_output;
}
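// In the NCHW case, configuring the native kernel also auto-initializes the
// deliberately empty tensor info of the permuted output set up above.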
auto depthwise_conv_kernel = std::make_unique<cpu::kernels::CpuDepthwiseConv2dNativeKernel>();
depthwise_conv_kernel->configure(input_to_use->info(), weights_to_use->info(), biases == nullptr ? nullptr : biases->info(), output_to_use->info(), info);
if(_impl->is_nchw)
{
auto permute_output = std::make_unique<cpu::CpuPermute>();
permute_output->configure(_impl->permuted_output.info(), output->info(), PermutationVector(1U, 2U, 0U));
_impl->permuted_output.info()->set_data_layout(DataLayout::NHWC);
_impl->permuted_input.allocator()->allocate();
_impl->permuted_weights.allocator()->allocate();
_impl->permuted_output.allocator()->allocate();
}
}
Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
const PadStrideInfo &conv_info,
unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
{
ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info);
}
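// As in the optimized path, run() marshals all tensors to the operator via an ITensorPack.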
void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::run()
{
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC_0, _impl->src);
pack.add_tensor(TensorType::ACL_SRC_1, _impl->weights);
pack.add_tensor(TensorType::ACL_SRC_2, _impl->biases);
pack.add_tensor(TensorType::ACL_INT_0, &_impl->permuted_input);
pack.add_tensor(TensorType::ACL_INT_1, &_impl->permuted_weights);
pack.add_tensor(TensorType::ACL_INT_2, &_impl->permuted_output);
pack.add_tensor(TensorType::ACL_DST_0, _impl->dst);
_impl->op->run(pack);
}
#ifndef DOXYGEN_SKIP_THIS
struct NEDepthwiseConvolutionLayer::Impl
{
DepthwiseConvolutionFunction depth_conv_func{ DepthwiseConvolutionFunction::OPTIMIZED };
NEDepthwiseConvolutionLayerOptimizedInternal func_optimized{ nullptr };
NEDepthwiseConvolutionLayerGeneric func_generic{};
std::shared_ptr<cpu::CpuDepthwiseConv2d> op{ nullptr };
};
#endif // DOXYGEN_SKIP_THIS
// Note: Impl must be complete before the constructor and destructor below are
// defined; std::make_unique<Impl> and std::unique_ptr<Impl>'s deleter both
// require the full type.
NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _impl(std::make_unique<Impl>())
{
}
NEDepthwiseConvolutionLayer::~NEDepthwiseConvolutionLayer() = default;
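// Typical usage (illustrative sketch; tensor setup and parameters are assumptions):
//   Tensor src, weights, biases, dst; // initialized and allocated by the caller
//   NEDepthwiseConvolutionLayer dwc;
//   dwc.configure(&src, &weights, &biases, &dst,
//                 PadStrideInfo(1, 1, 1, 1), 1 /* depth_multiplier */);
//   dwc.run(); // the optimized path runs prepare() itself on first execution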
void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
const ActivationLayerInfo &act_info, const Size2D &dilation)
{
const ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
_impl->op = std::make_shared<cpu::CpuDepthwiseConv2d>();
_impl->depth_conv_func = _impl->op->get_depthwiseconvolution_function(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(),
info);
switch(_impl->depth_conv_func)
{
case DepthwiseConvolutionFunction::OPTIMIZED:
_impl->func_optimized.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
break;
case DepthwiseConvolutionFunction::GENERIC:
_impl->func_generic.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
break;
default:
ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
}
}
Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
{
ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info);
}
void NEDepthwiseConvolutionLayer::run()
{
switch(_impl->depth_conv_func)
{
case DepthwiseConvolutionFunction::OPTIMIZED:
_impl->func_optimized.run();
break;
case DepthwiseConvolutionFunction::GENERIC:
_impl->func_generic.run();
break;
default:
ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");
}
}
void NEDepthwiseConvolutionLayer::prepare()
{
switch(_impl->depth_conv_func)
{
case DepthwiseConvolutionFunction::OPTIMIZED:
_impl->func_optimized.prepare();
break;
case DepthwiseConvolutionFunction::GENERIC:
_impl->func_generic.prepare();
break;
default:
ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");
}
}
} // namespace arm_compute