| /* |
| * Copyright (c) 2017-2021 Arm Limited. |
| * |
| * SPDX-License-Identifier: MIT |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a copy |
| * of this software and associated documentation files (the "Software"), to |
| * deal in the Software without restriction, including without limitation the |
| * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or |
| * sell copies of the Software, and to permit persons to whom the Software is |
| * furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice shall be included in all |
| * copies or substantial portions of the Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| * SOFTWARE. |
| */ |
| |
| #pragma once |
| |
| #include "arm_gemm.hpp" |
| |
#include <cstddef>
#include <cstdint>
#include <string>
| |
// Macro for unreachable code (e.g. impossible default cases in a switch statement)
| #define UNREACHABLE(why) __builtin_unreachable() |
| |
// Paranoid alternative to the above which asserts with the reason instead (requires <cassert>)
| // #define UNREACHABLE(why) assert(0 && why) |
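// For example (illustrative), in a switch over an enum where every value is handled:
//     default:
//         UNREACHABLE("Impossible vector length type");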
| |
| namespace arm_gemm { |
| |
// get_type_name<T>(): produce a readable name for type "T" by parsing __PRETTY_FUNCTION__.
// The name is expected to contain a "cls_" prefix (as the kernel strategy classes do);
// everything after that prefix is returned.  Returns "(unknown)" if the prefix is not
// found, or "(unsupported)" on compilers without __PRETTY_FUNCTION__.
template<typename T>
std::string get_type_name() {
| #ifdef __GNUC__ |
| std::string s = __PRETTY_FUNCTION__; |
| |
| auto start = s.find("cls_"); |
| |
| if (start==std::string::npos) { |
| return "(unknown)"; |
| } |
| |
| for(size_t x = start+4; x<s.size(); x++) { |
| if (s[x] == ';' || s[x] == ']') { |
| return s.substr(start+4, x-(start+4)); |
| } |
| } |
| |
| return "(unknown)"; |
| #else |
| return "(unsupported)"; |
| #endif |
| } |
| |
// iceildiv(): integer division of "a" by "b", rounding up rather than down.
template<typename T>
inline T iceildiv(const T a, const T b) {
| return (a + b - 1) / b; |
| } |
| |
// roundup(): round "a" up to the next multiple of "b" (returns "a" unchanged if it is
// already a multiple).
template <typename T>
inline T roundup(const T a, const T b) {
| T rem = a % b; |
| |
| if (rem) { |
| return a + b - rem; |
| } else { |
| return a; |
| } |
| } |
| |
// Vector length type of a kernel: fixed-length vectors (None) or SVE scalable vectors.
enum class VLType {
| None, |
| SVE, |
| }; |
| |
// Argument describing where a kernel should write its output: either a "direct" base
// pointer plus a stride, or an "indirect" array of output pointers plus an offset.
template<typename T>
struct IndirectOutputArg {
| struct { |
| T *base; |
| size_t stride; |
| } direct = {}; |
| struct { |
| T * const *ptr; |
| size_t offset; |
| } indirect = {}; |
| bool is_indirect; |
| |
| // Direct |
| IndirectOutputArg(T *base, size_t stride) : is_indirect(false) { |
| direct.base = base; |
| direct.stride = stride; |
| } |
| |
| // Indirect |
| IndirectOutputArg(T * const * ptr, size_t offset) : is_indirect(true) { |
| indirect.ptr = ptr; |
| indirect.offset = offset; |
| } |
| |
| IndirectOutputArg() : is_indirect(false) { |
| direct.base = nullptr; |
| direct.stride = 0; |
| } |
| }; |
| |
| // Check that the provided Requantize32 doesn't have a left shift. |
| inline bool quant_no_left_shift(const Requantize32 &qp) { |
| if (qp.per_channel_requant) { |
| return (qp.per_channel_left_shifts == nullptr); |
| } else { |
| return (qp.per_layer_left_shift == 0); |
| } |
| } |
| |
| // Check that the provided Requantize32 is compatible with the "symmetric" hybrid kernels. These don't include row |
| // sums, so the 'b_offset' has to be zero. |
| inline bool quant_hybrid_symmetric(const Requantize32 &qp) { |
| return quant_no_left_shift(qp) && qp.b_offset == 0; |
| } |
| |
// Check that the provided Requantize32 is compatible with the "asymmetric" hybrid kernels.  These don't support
// per-channel quantization.  Technically the b_offset==0 case would also work, but it would be wasteful to compute
// row sums only to multiply them by zero.
| inline bool quant_hybrid_asymmetric(const Requantize32 &qp) { |
| return quant_no_left_shift(qp) /* && qp.b_offset != 0 */ && qp.per_channel_requant==false; |
| } |
| |
// Argument describing where a kernel should read its input: either a "direct" base pointer
// plus a stride, or an "indirect" multi-level pointer table along with a starting row and
// column within that table.
template<typename T>
struct IndirectInputArg {
| struct { |
| const T *base; |
| size_t stride; |
| } direct = {}; |
| struct { |
| const T * const * const * ptr; |
| unsigned int start_row; |
| unsigned int start_col; |
| } indirect = {}; |
| bool is_indirect; |
| |
| // Direct |
| IndirectInputArg(const T *base, size_t stride) : is_indirect(false) { |
| direct.base = base; |
| direct.stride = stride; |
| } |
| |
| // Indirect |
| IndirectInputArg(const T * const * const *ptr, unsigned int start_row, unsigned int start_col) : is_indirect(true) { |
| indirect.ptr = ptr; |
| indirect.start_row = start_row; |
| indirect.start_col = start_col; |
| } |
| |
| IndirectInputArg() : is_indirect(false) { |
| direct.base = nullptr; |
| direct.stride = 0; |
| } |
| }; |
| |
| namespace utils { |
| |
| // get_vector_length(): Returns SVE vector length for type "T". |
| // |
// It is required that this can be compiled by a compiler in non-SVE mode, but it must be prevented from running (at
// runtime) if SVE is not enabled.  Typically it is used by switchyard/driver code which is built in normal mode and
// which only calls the SVE kernels (compiled accordingly) if SVE is detected at runtime.
| template <typename T> |
| inline unsigned long get_vector_length() { |
| #if defined(__aarch64__) |
| uint64_t vl; |
| |
| __asm __volatile ( |
| ".inst 0x0420e3e0\n" // CNTB X0, ALL, MUL #1 |
| "mov %0, X0\n" |
| : "=r" (vl) |
| : |
| : "x0" |
| ); |
| |
| return vl / sizeof(T); |
| #else // !defined(__aarch64__) |
| return 16 / sizeof(T); |
| #endif // defined(__aarch64__) |
| } |
| |
| // get_vector_length(VLType): Returns vector length for type "T". |
| // |
| // This has the same requirements and constraints as the SVE-only form above, so we call into that code for SVE. |
| |
| template <typename T> |
| inline unsigned long get_vector_length(VLType vl_type) { |
| switch (vl_type) { |
| case VLType::SVE: |
| return get_vector_length<T>(); |
| default: |
| return 16 / sizeof(T); |
| } |
| } |
| } // utils namespace |
| } // arm_gemm namespace |
| |
| using namespace arm_gemm::utils; |