/*
* Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"
#include "arm_compute/core/utils/misc/InfoHelpers.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "src/runtime/cpu/operators/CpuDepthwiseConv2d.h"
using namespace arm_compute::misc;
using namespace arm_compute::misc::shape_calculator;
namespace arm_compute
{
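// This runtime function is a thin wrapper around the cpu::CpuDepthwiseConv2d operator:
// configure() works on ITensorInfo descriptors, and run() re-packages the live tensors
// into an ITensorPack for the operator at execution time.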
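// Pimpl state for the optimized (assembly-dispatch) path. The SRC_*/DST_*/INT_*
// tags mirror the TensorType slots used to build the ITensorPack in run().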
struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::Impl
{
ITensor *src{ nullptr }; // SRC_0
ITensor *dst{ nullptr }; // DST_0
const ITensor *weights{ nullptr }; // SRC_1
const ITensor *biases{ nullptr }; // SRC_2
Tensor permuted_input{}; // INT_0
Tensor permuted_weights{}; // INT_1
Tensor permuted_output{}; // INT_2
Tensor workspace{}; // INT_3
Tensor packed_weights{}; // INT_4
std::shared_ptr<cpu::CpuDepthwiseConv2d> op{ nullptr };
bool is_prepared{ false };
bool permute{ false };
};
NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::NEDepthwiseConvolutionLayerOptimizedInternal(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(memory_manager), _impl(std::make_unique<Impl>())
{
}
void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure(ITensor *input,
const ITensor *weights,
const ITensor *biases,
ITensor *output, const PadStrideInfo &conv_info,
unsigned int depth_multiplier,
const ActivationLayerInfo &act_info,
const Size2D &dilation)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
// The assembly path works on NHWC data, so NCHW inputs and outputs must be permuted
bool is_nchw = input->info()->data_layout() == DataLayout::NCHW;
_impl->src = input;
_impl->weights = weights;
_impl->biases = biases;
_impl->dst = output;
_impl->permute = is_nchw;
_impl->op = std::make_unique<cpu::CpuDepthwiseConv2d>();
ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
_impl->op->configure(_impl->src->info(), _impl->weights->info(), _impl->biases == nullptr ? nullptr : _impl->biases->info(),
_impl->dst->info(), info);
// Configure pipeline
// ReLU and bounded ReLU can be fused into the assembly kernel; any other activation
// must run as a separate layer, so it is stripped from the info passed to the
// assembly dispatch below.
ActivationLayerInfo act_info_to_use = ActivationLayerInfo();
const bool is_relu = arm_compute::utils::info_helpers::is_relu(act_info);
const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(act_info);
const bool needs_separate_activation = act_info.enabled() && !(is_relu || is_relu6);
if(!needs_separate_activation)
{
act_info_to_use = act_info;
}
info = ConvolutionInfo{ conv_info, depth_multiplier, act_info_to_use, dilation };
auto dwc_optimized_func = std::make_unique<cpu::CpuDepthwiseConv2dAssemblyDispatch>();
if(is_nchw)
{
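// NCHW pipeline:
//   src (NCHW) -> permute(2,0,1) -> NHWC -> assembly depthwise conv -> permute(1,2,0) -> dst (NCHW)
// The buffer for the permuted weights is set up once in prepare().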
auto permute_input = std::make_unique<cpu::CpuPermute>();
auto permute_weights = std::make_unique<cpu::CpuPermute>();
auto permute_output = std::make_unique<cpu::CpuPermute>();
_memory_group.manage(&_impl->permuted_input);
_memory_group.manage(&_impl->permuted_weights);
_memory_group.manage(&_impl->permuted_output);
// Configure the function to transform the input tensor from NCHW -> NHWC
permute_input->configure(input->info(), _impl->permuted_input.info(), PermutationVector(2U, 0U, 1U));
_impl->permuted_input.info()->set_data_layout(DataLayout::NHWC);
// Configure the function to transform the weights tensor from IHW -> HWI
permute_weights->configure(weights->info(), _impl->permuted_weights.info(), PermutationVector(2U, 0U, 1U));
_impl->permuted_weights.info()->set_data_layout(DataLayout::NHWC);
_impl->permuted_output.info()->set_data_layout(DataLayout::NHWC);
_impl->permuted_output.info()->set_quantization_info(output->info()->quantization_info());
// Configure optimized depthwise
dwc_optimized_func->configure(_impl->permuted_input.info(), _impl->permuted_weights.info(), biases == nullptr ? nullptr : biases->info(), _impl->permuted_output.info(), info);
_impl->permuted_output.info()->set_data_layout(DataLayout::NHWC);
// Configure the function to transform the convolved output back to ACL's native NCHW layout
permute_output->configure(_impl->permuted_output.info(), output->info(), PermutationVector(1U, 2U, 0U));
_impl->permuted_input.allocator()->allocate();
_impl->permuted_output.allocator()->allocate();
}
else
{
dwc_optimized_func->configure(_impl->src->info(), _impl->weights->info(), biases == nullptr ? nullptr : biases->info(), _impl->dst->info(), info);
}
// Allocate memory based on the internal memory requirements
experimental::MemoryRequirements mem_req = dwc_optimized_func->workspace();
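// Each buffer is over-allocated by its alignment requirement so that an aligned
// start address always fits within the allocation.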
_impl->workspace.allocator()->init(TensorInfo(TensorShape{ mem_req[0].size + mem_req[0].alignment }, 1, DataType::S8), mem_req[0].alignment);
_impl->packed_weights.allocator()->init(TensorInfo(TensorShape{ mem_req[1].size + mem_req[1].alignment }, 1, DataType::S8), mem_req[1].alignment);
_memory_group.manage(&_impl->workspace);
_memory_group.manage(&_impl->packed_weights);
_impl->workspace.allocator()->allocate();
_impl->packed_weights.allocator()->allocate();
}
Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::validate(const ITensorInfo *input,
const ITensorInfo *weights,
const ITensorInfo *biases,
const ITensorInfo *output,
const PadStrideInfo &conv_info,
unsigned int depth_multiplier,
const ActivationLayerInfo &act_info,
const Size2D &dilation)
{
ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info);
}
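// All tensors are handed to the operator through an ITensorPack: the ACL_SRC_*/ACL_DST_*
// slots carry the caller's tensors, the ACL_INT_* slots the function-owned intermediates.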
void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::run()
{
prepare();
MemoryGroupResourceScope scope_mg(_memory_group);
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC_0, _impl->src);
pack.add_tensor(TensorType::ACL_SRC_1, _impl->weights);
pack.add_tensor(TensorType::ACL_SRC_2, _impl->biases);
pack.add_tensor(TensorType::ACL_INT_0, &_impl->permuted_input);
pack.add_tensor(TensorType::ACL_INT_1, &_impl->permuted_weights);
pack.add_tensor(TensorType::ACL_INT_2, &_impl->permuted_output);
pack.add_tensor(TensorType::ACL_INT_3, &_impl->workspace);
pack.add_tensor(TensorType::ACL_INT_4, &_impl->packed_weights);
pack.add_tensor(TensorType::ACL_DST_0, _impl->dst);
_impl->op->run(pack);
}
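// One-shot preparation: allocate the buffer for the permuted weights, then release
// it again if the operator did not mark it as used.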
void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::prepare()
{
if(!_impl->is_prepared)
{
// Permute weights
if(_impl->permute)
{
_impl->permuted_weights.allocator()->allocate();
}
if(!_impl->permuted_weights.is_used())
{
_impl->permuted_weights.allocator()->free();
}
_impl->is_prepared = true;
}
}
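// Generic (portable) path, selected when the assembly kernels do not support the
// configuration; see CpuDepthwiseConv2d::get_depthwiseconvolution_function() below.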
struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::Impl
{
Tensor permuted_input{};
Tensor permuted_weights{};
Tensor permuted_output{};
bool is_prepared{ false };
bool is_nchw{ false };
bool is_activationlayer_enabled{ false };
const ITensor *weights{ nullptr };
const ITensor *biases{ nullptr };
const ITensor *src{ nullptr };
ITensor *dst{ nullptr };
std::shared_ptr<cpu::CpuDepthwiseConv2d> op{ nullptr };
};
NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::NEDepthwiseConvolutionLayerGeneric()
: _impl(std::make_unique<Impl>())
{
}
void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayer::validate(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(),
output->info(), conv_info, depth_multiplier, act_info, dilation));
const ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
_impl->op = std::make_unique<cpu::CpuDepthwiseConv2d>();
_impl->op->configure(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), info);
_impl->src = input;
_impl->dst = output;
_impl->weights = weights;
_impl->biases = biases;
_impl->is_nchw = input->info()->data_layout() == DataLayout::NCHW;
_impl->is_prepared = !_impl->is_nchw;
ITensor *input_to_use = input;
const ITensor *weights_to_use = weights;
ITensor *output_to_use = output;
if(_impl->is_nchw)
{
auto permute_input = std::make_unique<cpu::CpuPermute>();
auto permute_weights = std::make_unique<cpu::CpuPermute>();
permute_input->configure(input->info(), _impl->permuted_input.info(), PermutationVector(2U, 0U, 1U));
_impl->permuted_input.info()->set_data_layout(DataLayout::NHWC);
input_to_use = &_impl->permuted_input;
permute_weights->configure(weights->info(), _impl->permuted_weights.info(), PermutationVector(2U, 0U, 1U));
_impl->permuted_weights.info()->set_data_layout(DataLayout::NHWC);
weights_to_use = &_impl->permuted_weights;
_impl->permuted_output.allocator()->init(output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape()));
output_to_use = &_impl->permuted_output;
}
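// In the NCHW case, configuring the native kernel also auto-initializes the
// deliberately empty tensor info of the permuted output set up above.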
auto depthwise_conv_kernel = std::make_unique<cpu::kernels::CpuDepthwiseConv2dNativeKernel>();
depthwise_conv_kernel->configure(input_to_use->info(), weights_to_use->info(), biases == nullptr ? nullptr : biases->info(), output_to_use->info(), info);
if(_impl->is_nchw)
{
auto permute_output = std::make_unique<cpu::CpuPermute>();
permute_output->configure(_impl->permuted_output.info(), output->info(), PermutationVector(1U, 2U, 0U));
_impl->permuted_output.info()->set_data_layout(DataLayout::NHWC);
_impl->permuted_input.allocator()->allocate();
_impl->permuted_weights.allocator()->allocate();
_impl->permuted_output.allocator()->allocate();
}
}
Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
const PadStrideInfo &conv_info,
unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
{
ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info);
}
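// As in the optimized path, run() marshals all tensors to the operator via an ITensorPack.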
void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::run()
{
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC_0, _impl->src);
pack.add_tensor(TensorType::ACL_SRC_1, _impl->weights);
pack.add_tensor(TensorType::ACL_SRC_2, _impl->biases);
pack.add_tensor(TensorType::ACL_INT_0, &_impl->permuted_input);
pack.add_tensor(TensorType::ACL_INT_1, &_impl->permuted_weights);
pack.add_tensor(TensorType::ACL_INT_2, &_impl->permuted_output);
pack.add_tensor(TensorType::ACL_DST_0, _impl->dst);
_impl->op->run(pack);
}
#ifndef DOXYGEN_SKIP_THIS
struct NEDepthwiseConvolutionLayer::Impl
{
DepthwiseConvolutionFunction depth_conv_func{ DepthwiseConvolutionFunction::OPTIMIZED };
NEDepthwiseConvolutionLayerOptimizedInternal func_optimized{ nullptr };
NEDepthwiseConvolutionLayerGeneric func_generic{};
std::shared_ptr<cpu::CpuDepthwiseConv2d> op{ nullptr };
};
#endif // DOXYGEN_SKIP_THIS
// Note: Impl must be complete before the constructor and destructor below are
// defined; std::make_unique<Impl> and std::unique_ptr<Impl>'s deleter both
// require the full type.
NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _impl(std::make_unique<Impl>())
{
}
NEDepthwiseConvolutionLayer::~NEDepthwiseConvolutionLayer() = default;
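// Typical usage (illustrative sketch; tensor setup and parameters are assumptions):
//   Tensor src, weights, biases, dst; // initialized and allocated by the caller
//   NEDepthwiseConvolutionLayer dwc;
//   dwc.configure(&src, &weights, &biases, &dst,
//                 PadStrideInfo(1, 1, 1, 1), 1 /* depth_multiplier */);
//   dwc.run(); // the optimized path runs prepare() itself on first execution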
void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
const ActivationLayerInfo &act_info, const Size2D &dilation)
{
const ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
_impl->op = std::make_shared<cpu::CpuDepthwiseConv2d>();
_impl->depth_conv_func = _impl->op->get_depthwiseconvolution_function(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(),
info);
switch(_impl->depth_conv_func)
{
case DepthwiseConvolutionFunction::OPTIMIZED:
_impl->func_optimized.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
break;
case DepthwiseConvolutionFunction::GENERIC:
_impl->func_generic.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
break;
default:
ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
}
}
Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
{
ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info);
}
void NEDepthwiseConvolutionLayer::run()
{
switch(_impl->depth_conv_func)
{
case DepthwiseConvolutionFunction::OPTIMIZED:
_impl->func_optimized.run();
break;
case DepthwiseConvolutionFunction::GENERIC:
_impl->func_generic.run();
break;
default:
ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");
}
}
void NEDepthwiseConvolutionLayer::prepare()
{
switch(_impl->depth_conv_func)
{
case DepthwiseConvolutionFunction::OPTIMIZED:
_impl->func_optimized.prepare();
break;
case DepthwiseConvolutionFunction::GENERIC:
_impl->func_generic.prepare();
break;
default:
ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");
}
}
} // namespace arm_compute