/*
* Copyright (c) 2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once
#include <assert.h>
#include <algorithm>
#include "arm_gemm.hpp"
#include "utils.hpp"
#include "mergeresults.hpp"
#include "transform.hpp"
#ifdef CYCLE_PROFILING
#include "profiler.hpp"
#endif
namespace arm_gemm {
// Implementation of the GemmCommon abstract class.
//
// "Hybrid" GEMM kernels read A directly from the source array (no A
// transform), while B is pretransposed into blocked panels ahead of time.
template<typename strategy, typename To, typename Tr>
class GemmHybrid : public GemmCommon<To, Tr> {
typedef typename strategy::operand_type Toi;
typedef typename strategy::result_type Tri;
/* const properties set by constructor */
const CPUInfo * const _ci;
const unsigned int _Msize;
const unsigned int _Nsize;
const unsigned int _Ksize;
const unsigned int _nbatches;
const unsigned int _nmulti;
const bool _trB;
const Tr _beta;
/* Blocking info */
unsigned int _k_block=0;
unsigned int _x_block=0;
unsigned int _Mround=0;
/* Pretransposed buffer. */
const Toi *_B_transposed=nullptr;
    unsigned int _B_per_multi = 0; // size of one multi's worth of pretransposed B, in elements
/* We will need to walk through the blocks of B in a few contexts, so
* factor that out. */
class blockwalker {
private:
/* Size loops, etc. based on our parent's configuration */
const GemmHybrid<strategy, To, Tr> &_parent;
/* K, X and multi parameters for current iteration. */
unsigned int _k0=0, _x0=0;
unsigned int _index=0;
bool _done=false;
bool _newkblock=true;
public:
blockwalker(const GemmHybrid<strategy, To, Tr> &parent) : _parent(parent) { }
unsigned int xmax() {
return std::min(_x0 + _parent._x_block, _parent._Nsize);
}
unsigned int kmax() {
return std::min(_k0 + _parent._k_block, _parent._Ksize);
}
/* Advance to the next block, return false at the end. */
bool advance(void) {
if (_done) {
return false;
}
_newkblock=false;
_x0 += _parent._x_block;
if (_x0 >= _parent._Nsize) {
_x0=0;
_k0 += _parent._k_block;
if (_k0 >= _parent._Ksize) {
_done=true;
return false;
}
_newkblock=true;
}
_index++;
return true;
}
unsigned int k0(void) { return _k0; }
unsigned int x0(void) { return _x0; }
unsigned int index(void) { return _index; }
bool done(void) { return _done; }
bool newkblock(void) { return _newkblock; }
};
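    /* Worked example of the traversal order (a sketch, assuming the
     * hypothetical sizes _Nsize=100, _Ksize=60, _x_block=48, _k_block=32):
     *
     *   index 0: k0=0,  kmax=32, x0=0,  xmax=48,  newkblock=true
     *   index 1: k0=0,  kmax=32, x0=48, xmax=96,  newkblock=false
     *   index 2: k0=0,  kmax=32, x0=96, xmax=100, newkblock=false
     *   index 3: k0=32, kmax=60, x0=0,  xmax=48,  newkblock=true
     *   ...and so on until both X and K are exhausted.
     *
     * The walker covers all of N for one K block before moving to the next
     * K block, so the pretransposed B panels can be laid out in exactly this
     * order and consumed sequentially. */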
public:
GemmHybrid(GemmHybrid &) = delete;
GemmHybrid & operator= (GemmHybrid &) = delete;
/* Constructor */
GemmHybrid(const GemmArgs<Tr> &args)
: _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize), _nbatches(args._nbatches),
_nmulti(args._nmulti), _trB(args._trB), _beta(args._beta) {
const unsigned int L1_size = _ci->get_L1_cache_size();
const unsigned int L2_size = _ci->get_L2_cache_size();
_B_per_multi = (iceildiv(_Nsize, strategy::out_width()) * strategy::out_width()) *
(iceildiv(_Ksize, strategy::k_unroll()) * strategy::k_unroll());
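        // Worked example (a sketch, assuming a hypothetical strategy with
        // out_width()=12 and k_unroll()=1; no real kernel is implied):
        // _Nsize=1000 rounds up to 1008 columns (84 * 12) and _Ksize=700
        // stays at 700, so _B_per_multi = 1008 * 700 = 705600 elements.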
// Work out blocking parameters, or override from config.
if (args._cfg && args._cfg->inner_block_size) {
_k_block = args._cfg->inner_block_size;
} else {
            // k_block: Find out how much of the larger array can be loaded into half the L1 cache.
            // Using only half leaves headroom for the other operands and for associativity effects.
_k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
            // Needs to be a multiple of the K unroll level (at least one unroll's worth).
_k_block /= strategy::k_unroll();
_k_block = std::max(_k_block, 1U) * strategy::k_unroll();
            // Now tune to the presented problem size: this is how many blocks we need.
int num_k_blocks = iceildiv(_Ksize, _k_block);
// So divide the space equally into that many blocks.
_k_block = iceildiv(_Ksize, num_k_blocks);
// And round UP to the K unroll level required.
_k_block = iceildiv(_k_block, strategy::k_unroll());
_k_block *= strategy::k_unroll();
}
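        // Worked example of the heuristic above (continuing the sketch, with
        // the further assumptions out_height()=8, sizeof(Toi)=4 and
        // L1_size=32768; none of these come from a real strategy):
        //   _k_block = (32768 / 2) / (4 * 12) = 341
        //   rounding to k_unroll=1 leaves 341
        //   _Ksize=700: num_k_blocks = iceildiv(700, 341) = 3
        //   _k_block   = iceildiv(700, 3) = 234 (already a k_unroll multiple)
        // K is thus covered in three near-equal blocks of 234/234/232 instead
        // of 341/341/18.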
if (args._cfg && args._cfg->outer_block_size) {
_x_block = args._cfg->outer_block_size;
} else {
            // x_block: Work out how many rows (of length k_block) will fit in the L2.
            // Don't allocate more than 90% of the L2 (to allow for overheads), and subtract
            // off the k_block working set that is expected to stay resident in the L1.
_x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
(sizeof(Toi) * _k_block);
            // Needs to be a multiple of the kernel output width (at least one).
_x_block /= strategy::out_width();
_x_block = std::max(_x_block, 1U) * strategy::out_width();
// And tune to the presented problem size.
int num_x_blocks = iceildiv(_Nsize, _x_block);
_x_block = iceildiv(_Nsize, num_x_blocks);
_x_block = iceildiv(_x_block, strategy::out_width());
_x_block *= strategy::out_width();
}
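        // Continuing the worked example (further assuming L2_size=524288):
        //   90% of L2 = 471859 bytes; subtracting the L1-resident working
        //   set, 234 * 4 * (12 + 8) = 18720, leaves 453139 bytes for B.
        //   _x_block = 453139 / (4 * 234) = 484, rounded down to 480 (40 * 12)
        //   _Nsize=1000: num_x_blocks = iceildiv(1000, 480) = 3
        //   _x_block   = iceildiv(1000, 3) = 334, rounded up to 336 (28 * 12)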
        // Work out the rounded size of M - this drives the window size calculation below.
_Mround = iceildiv(_Msize, strategy::out_height());
_Mround *= strategy::out_height();
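        // For instance (still assuming out_height()=8): _Msize=100 gives
        // _Mround = iceildiv(100, 8) * 8 = 104.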
}
// Interface implementation - Compulsory functions
// Window size: Only the last thread should do a ragged block, so dole
// out work in units of out_height. Factor batches and multi into the
// window too.
unsigned int get_window_size() const override {
// _Mround is a multiple of out_height by definition.
return (_Mround / strategy::out_height()) * _nbatches * _nmulti;
}
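    // Continuing the example: _Mround=104 and out_height()=8 give 13 window
    // units per batch; with (assumed) _nbatches=2 and _nmulti=3 the total
    // window is 13 * 2 * 3 = 78 units.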
// Execute
void execute(unsigned int start, unsigned int end, int threadid) override {
#ifdef CYCLE_PROFILING
profiler prof;
#endif
strategy strat(_ci);
/* Make sure we've been set up correctly. */
assert(_B_transposed);
const unsigned int window_per_batch = iceildiv(_Msize, strategy::out_height());
const unsigned int window_per_multi = window_per_batch * _nbatches;
const unsigned int first_multi = start / window_per_multi;
const unsigned int last_multi = end / window_per_multi;
const unsigned int first_batch = (start - (first_multi * window_per_multi)) / window_per_batch;
const unsigned int last_batch = (end - (last_multi * window_per_multi)) / window_per_batch;
const unsigned int first_row = ((start - (first_multi * window_per_multi)) % window_per_batch) * strategy::out_height();
const unsigned int last_row = ((end - (last_multi * window_per_multi)) % window_per_batch) * strategy::out_height();
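        // Worked decomposition (same assumed sizes, so window_per_batch=13
        // and window_per_multi=26): a thread handed start=30, end=40 gets
        // first_multi=1, first_batch=0, first_row=32 and last_multi=1,
        // last_batch=1, last_row=8; it therefore processes rows 32..100 of
        // (multi 1, batch 0) and rows 0..8 of (multi 1, batch 1).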
        static_assert(std::is_same<To, Toi>::value, "gemm_hybrid: Operand types must be the same.");
        static_assert(std::is_same<Tr, Tri>::value, "gemm_hybrid: Result types must be the same.");
for (unsigned int multi = first_multi; multi <= last_multi; multi++) {
const unsigned int batch_0 = (multi == first_multi) ? first_batch : 0;
const unsigned int batch_max = (multi == last_multi) ? last_batch : (_nbatches - 1);
const Toi *b_panel = _B_transposed + (multi * _B_per_multi);
for (blockwalker current(*this); !current.done(); current.advance()) {
int kern_k = iceildiv(current.kmax() - current.k0(), strategy::k_unroll());
                kern_k *= strategy::k_unroll();
int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width());
for (unsigned int batch = batch_0; batch <= batch_max; batch++) {
const unsigned int m_start = ((multi == first_multi) && (batch == first_batch)) ? first_row : 0;
const unsigned int m_end = ((multi == last_multi) && (batch == last_batch) ) ? last_row : _Msize;
#ifdef CYCLE_PROFILING
auto p = prof.ScopedProfiler(PROFILE_KERNEL, (m_end - m_start) * kern_k * bblocks * strategy::out_width());
#endif
strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (m_start * this->_lda) + current.k0(), this->_lda,
b_panel,
this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + current.x0(), this->_ldc,
                                 (current.k0() == 0) ? _beta : static_cast<Tr>(1), // beta applies to the first K block only; later K blocks accumulate
(m_end - m_start), (current.xmax() - current.x0()), kern_k);
}
                b_panel += (bblocks * strategy::out_width() * kern_k); // advance to the next pretransposed B panel (same rounded stride as pretranspose_B_array writes)
}
}
}
// Interface implementation - pretransposed
bool B_is_pretransposed() const override {
return true;
}
bool B_pretranspose_required() const override {
return (_B_transposed==nullptr);
}
size_t get_B_pretransposed_array_size() const override {
return _B_per_multi * _nmulti * sizeof(Toi);
}
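    // Continuing the example: 705600 elements per multi, _nmulti=3 and
    // sizeof(Toi)=4 give 705600 * 3 * 4 = 8467200 bytes.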
void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
Toi *buffer = reinterpret_cast<Toi *>(in_buffer);
_B_transposed = buffer;
strategy strat(_ci);
for (unsigned int multi=0; multi < _nmulti; multi++) {
blockwalker current(*this);
do {
/* Figure out the size of each block. */
size_t x_size = (current.xmax() - current.x0());
size_t k_size = (current.kmax() - current.k0());
                /* Round sizes up as needed - whole panels are stored, and
                 * execute() advances b_panel by the same rounded amount. */
x_size = iceildiv(x_size, strategy::out_width());
x_size *= strategy::out_width();
k_size = iceildiv(k_size, strategy::k_unroll());
k_size *= strategy::k_unroll();
strat.transforms.PrepareB(
buffer, B + (multi * B_multi_stride), ldb,
current.x0(), current.xmax(), current.k0(), current.kmax(), _trB);
buffer += (x_size * k_size);
} while (current.advance());
}
}
void set_pretransposed_B_data(void *in_buffer) override {
_B_transposed = reinterpret_cast<Toi *>(in_buffer);
}
};
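// Typical lifecycle (a sketch, not a complete program; the strategy name is
// illustrative, and set_arrays() comes from the GemmCommon base class):
//
//   GemmHybrid<some_hybrid_strategy, float, float> gemm(args);
//   std::vector<char> b_buf(gemm.get_B_pretransposed_array_size());
//   gemm.pretranspose_B_array(b_buf.data(), B, ldb, B_multi_stride);
//   gemm.set_arrays(A, lda, A_batch_stride, A_multi_stride,
//                   B, ldb, B_multi_stride,
//                   C, ldc, C_batch_stride, C_multi_stride);
//   gemm.execute(0, gemm.get_window_size(), 0);  // single-threaded run
//
// For multi-threaded operation, split [0, get_window_size()) into contiguous
// ranges and call execute() on each range from a different thread.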
} // namespace arm_gemm