// include/float_utils.h - tosa/serialization_lib - Gitiles

 // Copyright (c) 2024, ARM Limited.
 //
 //    Licensed under the Apache License, Version 2.0 (the "License");
 //    you may not use this file except in compliance with the License.
 //    You may obtain a copy of the License at
 //
 //         http://www.apache.org/licenses/LICENSE-2.0
 //
 //    Unless required by applicable law or agreed to in writing, software
 //    distributed under the License is distributed on an "AS IS" BASIS,
 //    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 //    See the License for the specific language governing permissions and
 //    limitations under the License.

 #ifndef TOSA_FLOAT_UTILS_H_
 #define TOSA_FLOAT_UTILS_H_

 #include <algorithm>
 #include <cstdint>
 #include <limits>
 #include <type_traits>
 #if defined(__cpp_lib_bit_cast)
 #include <bit>
 #endif    // defined(__cpp_lib_bit_cast)

 namespace tosa
 {

 namespace float_support
 {

 /// \brief Tag type taken by the float_t constructor that accepts a raw bit
 /// pattern.
 struct hidden
 {};

 #if defined(__cpp_lib_bit_cast)
 #define BITCAST_CONSTEXPR constexpr inline

 /// \brief Return the bit pattern of a 32-bit float as an int32_t.
 constexpr inline int32_t get_bits(const float& f)
 {
     return std::bit_cast<int32_t>(f);
 }
 /// \brief Return the 32-bit float whose bit pattern is `i`.
 constexpr inline float from_bits(const int32_t& i)
 {
     return std::bit_cast<float>(i);
 }

 #else
 #define BITCAST_CONSTEXPR inline

 /// \brief Overlay of a float and an int32_t, used when std::bit_cast is not
 /// available.
 ///
 /// NOTE(review): get_bits/from_bits below read the member that was not
 /// written, which relies on the compiler supporting union-based type punning.
 union ufloat32
 {
     constexpr ufloat32(const float& x)
         : f(x)
     {}
     constexpr ufloat32(const int32_t& x)
         : i(x)
     {}

     float f;
     int32_t i;
 };

 /// \brief Return the bit pattern of a 32-bit float as an int32_t.
 inline int32_t get_bits(const float& f)
 {
     return ufloat32(f).i;
 }
 /// \brief Return the 32-bit float whose bit pattern is `i`.
 inline float from_bits(const int32_t& i)
 {
     return ufloat32(i).f;
 }
 #endif

 }    // namespace float_support

 /// \brief Software floating-point value stored in an integer bit pattern.
 ///
 /// From the most significant bit down the layout is: one sign bit,
 /// `n_exp_bits` exponent bits, and the remaining bits of `storage_t` as the
 /// significand.
 ///
 /// \tparam storage_t     Integer type holding the bit pattern.
 /// \tparam n_exp_bits    Number of exponent bits.
 /// \tparam has_nan       Whether the format has NaN encodings.
 /// \tparam with_denorm   Whether subnormals are kept; when false,
 ///                       from_bits(sign, exponent, significand) drops the
 ///                       significand of values with a zero exponent field.
 /// \tparam with_infinity Whether the format has infinities. Without them the
 ///                       all-ones exponent holds finite values and NaN (if
 ///                       present) is the all-ones significand only.
 template <typename storage_t,
           size_t n_exp_bits,
           bool has_nan,
           bool with_denorm,
           bool with_infinity,
           std::enable_if_t<(n_exp_bits + 1 < sizeof(storage_t) * 8), bool> = true>
 class float_t
 {
     storage_t m_data = 0;

     using unsigned_storage_t = std::make_unsigned_t<storage_t>;

     /// \brief Width of the storage type in bits.
     static constexpr size_t n_storage_bits = sizeof(storage_t) * 8;

     /// \brief The stored bit pattern, zero-extended to 64 bits. Working on the
     /// unsigned pattern avoids shifting negative values.
     constexpr uint64_t raw_bits() const
     {
         return static_cast<uint64_t>(static_cast<unsigned_storage_t>(m_data));
     }

 public:
     static constexpr size_t n_exponent_bits    = n_exp_bits;
     static constexpr size_t n_significand_bits = sizeof(storage_t) * 8 - 1 - n_exp_bits;
     static constexpr int64_t exponent_bias     = (int64_t{ 1 } << (n_exp_bits - 1)) - 1;

     /// \brief Construct a floating point type with the given bit
     /// representation.
     static constexpr float_t from_bits(storage_t bits)
     {
         return float_t(float_support::hidden(), bits);
     }

     /// \brief Construct a float from the given sign, exponent and significand
     /// bits.
     ///
     /// \param pm Sign bit (true for negative).
     /// \param e  Exponent field.
     /// \param s  Significand field; ignored when `e` is zero and the format
     ///           has no subnormals.
     static constexpr float_t from_bits(bool pm, storage_t e, storage_t s)
     {
         storage_t bits = pm ? 1 : 0;

         bits <<= n_exp_bits;
         bits |= e;

         bits <<= n_significand_bits;
         if (with_denorm || e)
             bits |= s;

         return float_t(float_support::hidden(), bits);
     }

     /// \brief (Hidden) Construct a float type from a given bit pattern
     constexpr float_t(const float_support::hidden&, storage_t bits)
         : m_data(bits)
     {}

     /// \brief Construct positive zero.
     constexpr float_t()
         : m_data(0)
     {}
     constexpr float_t(const float_t& other)
         : m_data(other.m_data)
     {}

     /// \brief Cast to a different floating point representation.
     ///
     /// Finite values are rounded to nearest, ties to even. Values too large
     /// for the target become infinity, or the largest finite value when the
     /// target has no infinity. Values too small for the target round to a
     /// subnormal or to zero.
     template <typename other_storage_t,
               size_t other_n_exp_bits,
               bool other_has_nan,
               bool other_has_denorm,
               bool other_has_infinity>
     constexpr inline
         operator float_t<other_storage_t, other_n_exp_bits, other_has_nan, other_has_denorm, other_has_infinity>() const
     {
         using other_float_t =
             float_t<other_storage_t, other_n_exp_bits, other_has_nan, other_has_denorm, other_has_infinity>;

         constexpr uint64_t one       = 1;
         constexpr size_t other_n_sig = other_float_t::n_significand_bits;

         // Shortcut for types which are fundamentally similar (e.g., bf16 ->
         // fp32): same exponent width and same special-value conventions, so
         // the pattern only needs padding with zero significand bits.
         if constexpr (n_exp_bits == other_n_exp_bits && sizeof(other_storage_t) >= sizeof(storage_t) &&
                       has_nan == other_has_nan && with_infinity == other_has_infinity &&
                       with_denorm == other_has_denorm)
         {
             using other_unsigned_t    = std::make_unsigned_t<other_storage_t>;
             constexpr size_t widen_by = (sizeof(other_storage_t) - sizeof(storage_t)) * 8;
             // Shift in the unsigned domain; shifting a negative value left is
             // not defined before C++20.
             const other_unsigned_t widened = static_cast<other_unsigned_t>(
                 static_cast<other_unsigned_t>(static_cast<unsigned_storage_t>(m_data)) << widen_by);
             return other_float_t::from_bits(static_cast<other_storage_t>(widened));
         }

         // Get initial values for the new floating point type
         const bool sign_bit       = ((raw_bits() >> (n_storage_bits - 1)) & 1) != 0;
         int64_t new_exponent_bits = 0;
         uint64_t new_significand  = 0;

         // Significand of the largest finite value of a target without
         // infinity (its exponent field is all ones). The all-ones significand
         // is reserved for NaN when the target has NaN.
         constexpr uint64_t other_max_finite_significand = (one << other_n_sig) - (other_has_nan ? 2 : 1);

         if (is_nan() || is_infinity())
         {
             new_exponent_bits = (int64_t{ 1 } << other_n_exp_bits) - 1;

             if (is_nan())
             {
                 if constexpr (other_has_infinity)
                 {
                     // Copy the most significant significand bit (the quiet
                     // bit) into the same position of the target; set the LSB
                     // so the significand is never zero. Don't attempt to copy
                     // across any of the rest of the payload.
                     new_significand = 0x1 | (((significand() >> (n_significand_bits - 1)) & 1) << (other_n_sig - 1));
                 }
                 else
                 {
                     new_significand = (one << other_n_sig) - 1;
                 }
             }
             else if constexpr (!other_has_infinity)
             {
                 // Infinity saturates to the largest finite value.
                 new_significand = other_max_finite_significand;
             }
         }
         else if (!is_zero())
         {
             const int64_t this_exponent_bits = exponent_bits();
             {
                 constexpr int64_t exponent_rebias = other_float_t::exponent_bias - exponent_bias;
                 new_exponent_bits = std::max(this_exponent_bits + exponent_rebias, exponent_rebias + 1);
             }
             // Left-align the fraction bits in a 64-bit word.
             new_significand = this->significand() << (64 - n_significand_bits);

             // Normalise subnormals
             if (this_exponent_bits == 0)
             {
                 // Shift the most-significant 1 out of the magnitude to convert
                 // it to a significand. Modify the exponent accordingly. The
                 // significand is non-zero here because zero was excluded.
                 const int norm_shift = __builtin_clzll(new_significand) + 1;
                 new_exponent_bits -= norm_shift;
                 new_significand <<= norm_shift;
             }

             // Align the significand for the output type
             uint64_t shift                = 64 - other_n_sig;
             const bool other_is_subnormal = new_exponent_bits <= 0;
             if (other_is_subnormal)
             {
                 shift += static_cast<uint64_t>(1 - new_exponent_bits);
                 new_exponent_bits = 0;
             }

             bool round_up = false;
             if (shift > 64)
             {
                 // The implicit leading one itself is shifted out. With a
                 // shift of 65 the value lies in [0.5, 1) units of the
                 // smallest subnormal: exactly 0.5 ties to even (zero),
                 // anything above rounds up. Larger shifts round to zero.
                 round_up        = (shift == 65) && (new_significand != 0);
                 new_significand = 0;
             }
             else
             {
                 const uint64_t shift_out =
                     (shift == 64) ? new_significand : (new_significand & ((one << shift) - 1));
                 new_significand = (shift == 64) ? 0 : (new_significand >> shift);

                 // Reinsert the most-significant-one if this is a subnormal in
                 // the output type.
                 if (other_is_subnormal)
                     new_significand |= one << (64 - shift);

                 // Apply rounding based on the bits shifted out of the
                 // significand
                 const uint64_t shift_half = one << (shift - 1);
                 round_up = shift_out > shift_half || (shift_out == shift_half && (new_significand & 1));
             }

             if (round_up)
             {
                 new_significand += 1;

                 // Handle the case that the significand overflowed due to
                 // rounding
                 constexpr uint64_t max_significand = (one << other_n_sig) - 1;
                 if (new_significand > max_significand)
                 {
                     new_significand = 0;
                     new_exponent_bits++;
                 }
             }

             // Saturate if the value is larger than can be represented in the
             // output type. This can only occur if the size of the exponent of
             // the new type is not greater than the exponent of the old type.
             if constexpr (other_n_exp_bits <= n_exp_bits)
             {
                 constexpr int64_t inf_exp_bits = (int64_t{ 1 } << other_n_exp_bits) - 1;

                 bool overflow = false;
                 if constexpr (other_has_infinity)
                 {
                     overflow = new_exponent_bits >= inf_exp_bits;
                 }
                 else
                 {
                     // Without infinity the all-ones exponent still encodes
                     // finite values up to other_max_finite_significand.
                     overflow = new_exponent_bits > inf_exp_bits ||
                                (new_exponent_bits == inf_exp_bits && new_significand > other_max_finite_significand);
                 }

                 if (overflow)
                 {
                     new_exponent_bits = inf_exp_bits;
                     new_significand   = other_has_infinity ? 0 : other_max_finite_significand;
                 }
             }
         }

         return other_float_t::from_bits(sign_bit, static_cast<other_storage_t>(new_exponent_bits),
                                         static_cast<other_storage_t>(new_significand));
     }

     /// \brief Convert from a 32-bit floating point value
     BITCAST_CONSTEXPR
     float_t(const float& f)
     {
         // If this format exactly represents the binary32 format then get
         // the bits from the provided float; otherwise get a binary32
         // representation and then convert to this format.
         if constexpr (represents_binary32())
             m_data = float_support::get_bits(f);
         else
             m_data = static_cast<float_t<storage_t, n_exp_bits, has_nan, with_denorm, with_infinity>>(
                          static_cast<float_t<int32_t, 8, true, true, true>>(f))
                          .m_data;
     }

     /// \brief Cast to a 32-bit floating point value
     BITCAST_CONSTEXPR operator float() const
     {
         // If this format exactly represents the binary32 format then return
         // a float; otherwise get a binary32 representation and then return
         // a float.
         if constexpr (represents_binary32())
             return float_support::from_bits(m_data);
         else
             return static_cast<float>(this->operator float_t<int32_t, 8, true, true, true>());
     }

     /// \brief Return whether this type represents the IEEE754 binary32
     /// format
     constexpr static inline bool represents_binary32()
     {
         return std::is_same_v<storage_t, int32_t> && n_exp_bits == 8 && has_nan && with_denorm && with_infinity;
     }

     /// \brief Return the value with the sign bit flipped.
     constexpr auto operator-() const
     {
         return from_bits(static_cast<storage_t>(raw_bits() ^ (uint64_t{ 1 } << (n_storage_bits - 1))));
     }

     /// \brief Whether the exponent field is zero and the significand is not.
     constexpr bool is_subnormal() const
     {
         return exponent_bits() == 0 && significand() != 0;
     }

     /// \brief Whether the value is positive or negative zero.
     constexpr bool is_zero() const
     {
         return exponent_bits() == 0 && significand() == 0;
     }

     /// \brief Whether the value is a NaN. With infinities, any non-zero
     /// significand under an all-ones exponent is NaN; without them only the
     /// all-ones significand is.
     constexpr bool is_nan() const
     {
         return has_nan && (exponent_bits() == (uint64_t{ 1 } << n_exponent_bits) - 1) &&
                ((with_infinity && significand()) ||
                 (!with_infinity && significand() == (uint64_t{ 1 } << n_significand_bits) - 1));
     }

     /// \brief Whether the value is positive or negative infinity.
     constexpr bool is_infinity() const
     {
         return with_infinity && ((exponent_bits() == (uint64_t{ 1 } << n_exponent_bits) - 1) && !significand());
     }

     /// \brief Get the raw bit pattern.
     constexpr inline const storage_t& bits() const
     {
         return m_data;
     }

     /// \brief Get the exponent
     ///
     /// A zero exponent field is treated as 1 before the bias is removed.
     constexpr inline int64_t exponent() const
     {
         return std::max<int64_t>(static_cast<int64_t>(exponent_bits()), 1) - exponent_bias;
     }

     /// \brief Get the bits from the exponent
     constexpr inline uint64_t exponent_bits() const
     {
         constexpr uint64_t mask = (uint64_t{ 1 } << n_exp_bits) - 1;
         return (raw_bits() >> n_significand_bits) & mask;
     }

     /// \brief Get the bits from the significand (without any implicit one).
     constexpr inline uint64_t significand() const
     {
         return raw_bits() & ((uint64_t{ 1 } << n_significand_bits) - 1);
     }

     /// \brief Equality: NaN compares unequal to everything, the two zeros
     /// compare equal, everything else compares by bit pattern.
     constexpr inline bool operator==(const float_t& other) const
     {
         return !is_nan() && !other.is_nan() && ((is_zero() && other.is_zero()) || bits() == other.bits());
     }

     /// \brief Add `rhs` by converting both operands to float, adding, and
     /// converting the sum back to this format.
     constexpr inline float_t& operator+=(const float_t& rhs)
     {
         this->m_data = static_cast<float_t>(static_cast<float>(*this) + static_cast<float>(rhs)).bits();
         return *this;
     }
 };

 // This should probably be exported so we can use it elsewhere
 #undef BITCAST_CONSTEXPR

 namespace float_support
 {

 // Pre-C++23 these can't be computed as constexpr, so have to hardcode them.
 // Each table is keyed by the binary quantity it is derived from.

 template <int>
 struct digits10;    // floor(log10(2) * (digits - 1))
 template <int>
 struct max_digits10;    // ceil(log10(2) * digits + 1)
 template <int>
 struct min_exponent10;    // ceil(log10(2) * (min_exponent - 1))
 template <int>
 struct max_exponent10;    // floor(log10(2) * max_exponent)

 template <>
 struct digits10<8>
 {
     constexpr static inline int value = 2;
 };

 template <>
 struct max_digits10<8>
 {
     constexpr static inline int value = 4;
 };

 template <>
 struct digits10<10>
 {
     constexpr static inline int value = 2;
 };

 template <>
 struct max_digits10<10>
 {
     constexpr static inline int value = 5;
 };

 // 11 digits: a 10-bit significand plus the implicit leading one.
 template <>
 struct digits10<11>
 {
     constexpr static inline int value = 3;
 };

 template <>
 struct max_digits10<11>
 {
     constexpr static inline int value = 5;
 };

 template <>
 struct digits10<24>
 {
     constexpr static inline int value = 6;
 };

 template <>
 struct max_digits10<24>
 {
     constexpr static inline int value = 9;
 };

 // Smallest normal value is 2^-14 (about 6.1e-5), so 1e-4 is the smallest
 // normal power of ten.
 template <>
 struct min_exponent10<-13>
 {
     constexpr static inline int value = -4;
 };

 template <>
 struct max_exponent10<16>
 {
     constexpr static inline int value = 4;
 };

 template <>
 struct min_exponent10<-125>
 {
     constexpr static inline int value = -37;
 };

 template <>
 struct max_exponent10<128>
 {
     constexpr static inline int value = 38;
 };

 template <int d>
 inline constexpr int digits10_v = digits10<d>::value;
 template <int d>
 inline constexpr int max_digits10_v = max_digits10<d>::value;

 template <int e>
 inline constexpr int min_exponent10_v = min_exponent10<e>::value;

 template <int e>
 inline constexpr int max_exponent10_v = max_exponent10<e>::value;

 }    // namespace float_support

 }    // namespace tosa

 namespace std
 {

 /// Report tosa::float_t as a floating point type.
 ///
 /// NOTE(review): the C++ standard does not permit programs to specialise
 /// type-trait templates; kept because callers may depend on it.
 template <typename storage_t, size_t n_exp_bits, bool has_nan, bool has_denorm, bool has_inf>
 struct is_floating_point<tosa::float_t<storage_t, n_exp_bits, has_nan, has_denorm, has_inf>>
     : std::integral_constant<bool, true>
 {};

 /// Numeric limits for tosa::float_t.
 ///
 /// The decimal constants (digits10, max_digits10, min_exponent10,
 /// max_exponent10) come from the hard-coded tables in tosa::float_support and
 /// are only available for the values those tables list.
 template <typename storage_t, size_t n_exp_bits, bool has_nan, bool with_denorm, bool with_inf>
 class numeric_limits<tosa::float_t<storage_t, n_exp_bits, has_nan, with_denorm, with_inf>>
 {
     using this_float_t = tosa::float_t<storage_t, n_exp_bits, has_nan, with_denorm, with_inf>;

     // Field values with every bit set, computed in 64 bits so wide fields do
     // not overflow `int`.
     static constexpr storage_t exponent_all_ones =
         static_cast<storage_t>((uint64_t{ 1 } << this_float_t::n_exponent_bits) - 1);
     static constexpr storage_t significand_all_ones =
         static_cast<storage_t>((uint64_t{ 1 } << this_float_t::n_significand_bits) - 1);

 public:
     static constexpr bool is_specialized = true;

     /// Smallest positive normal value.
     static constexpr auto min() noexcept
     {
         return this_float_t::from_bits(false, 1, 0);
     }

     /// Largest finite value.
     static constexpr auto max() noexcept
     {
         if constexpr (with_inf)
         {
             // The all-ones exponent is reserved for infinity and NaN.
             return this_float_t::from_bits(false, static_cast<storage_t>(exponent_all_ones - 1),
                                            significand_all_ones);
         }
         else
         {
             // The all-ones exponent holds finite values; only the all-ones
             // significand is given up, and only when the format has NaN.
             return this_float_t::from_bits(false, exponent_all_ones,
                                            static_cast<storage_t>(significand_all_ones - (has_nan ? 1 : 0)));
         }
     }

     /// Most negative finite value.
     static constexpr auto lowest() noexcept
     {
         return -max();
     }

     static constexpr int digits       = this_float_t::n_significand_bits + 1;
     static constexpr int digits10     = tosa::float_support::digits10_v<digits>;
     static constexpr int max_digits10 = tosa::float_support::max_digits10_v<digits>;

     static constexpr bool is_signed  = true;
     static constexpr bool is_integer = false;
     static constexpr bool is_exact   = false;
     static constexpr int radix       = 2;

     /// Difference between 1 and the next representable value.
     static constexpr auto epsilon() noexcept
     {
         return this_float_t::from_bits(false, this_float_t::exponent_bias - this_float_t::n_significand_bits, 0);
     }

     /// Largest rounding error (0.5).
     static constexpr auto round_error() noexcept
     {
         return this_float_t::from_bits(0, this_float_t::exponent_bias - 1, 0);
     }

     static constexpr int min_exponent   = (1 - this_float_t::exponent_bias) + 1;
     static constexpr int min_exponent10 = tosa::float_support::min_exponent10_v<min_exponent>;
     // Without infinity the all-ones exponent is usable, giving one more
     // binade of finite values.
     static constexpr int max_exponent   = this_float_t::exponent_bias + (with_inf ? 1 : 2);
     static constexpr int max_exponent10 = tosa::float_support::max_exponent10_v<max_exponent>;

     static constexpr bool has_infinity             = with_inf;
     static constexpr bool has_quiet_NaN            = has_nan;
     // A format without infinity has a single NaN encoding, so it cannot
     // distinguish quiet from signaling.
     static constexpr bool has_signaling_NaN        = has_nan && with_inf;
     static constexpr float_denorm_style has_denorm = with_denorm ? denorm_present : denorm_absent;
     static constexpr bool has_denorm_loss          = false;

     /// Positive infinity, or zero when the format has no infinity.
     static constexpr auto infinity() noexcept
     {
         if constexpr (with_inf)
         {
             return this_float_t::from_bits(false, exponent_all_ones, 0);
         }
         else
         {
             return this_float_t::from_bits(false, 0, 0);
         }
     }

     /// A quiet NaN; only meaningful when has_quiet_NaN is true.
     static constexpr auto quiet_NaN() noexcept
     {
         if constexpr (with_inf)
         {
             // Most significant significand bit (quiet) plus the LSB.
             return this_float_t::from_bits(
                 false, exponent_all_ones,
                 static_cast<storage_t>((uint64_t{ 1 } << (this_float_t::n_significand_bits - 1)) | 1));
         }
         else
         {
             // The only NaN encoding is the all-ones significand.
             return this_float_t::from_bits(false, exponent_all_ones, significand_all_ones);
         }
     }

     /// A signaling NaN; only meaningful when has_signaling_NaN is true.
     static constexpr auto signaling_NaN() noexcept
     {
         if constexpr (with_inf)
         {
             return this_float_t::from_bits(false, exponent_all_ones, 1);
         }
         else
         {
             return this_float_t::from_bits(false, exponent_all_ones, significand_all_ones);
         }
     }

     /// Smallest positive subnormal value.
     static constexpr auto denorm_min() noexcept
     {
         return this_float_t::from_bits(false, 0, 1);
     }

     static constexpr bool is_iec559  = false;
     // The set of representable values is finite.
     static constexpr bool is_bounded = true;
     static constexpr bool is_modulo  = false;

     static constexpr bool traps                    = false;
     static constexpr bool tinyness_before          = false;
     static constexpr float_round_style round_style = round_to_nearest;
 };

 }    // namespace std

 #endif    //  TOSA_FLOAT_UTILS_H_
	// Copyright (c) 2024, ARM Limited.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	#ifndef TOSA_FLOAT_UTILS_H_
	#define TOSA_FLOAT_UTILS_H_

	#include <algorithm>
	#include <cstdint>
	#include <limits>
	#include <type_traits>
	#if defined(__cpp_lib_bit_cast)
	#include <bit>
	#endif // defined(__cpp_lib_bit_cast)

	namespace tosa
	{

	namespace float_support
	{

	/// \brief Tag type taken by the float_t constructor that accepts a raw bit
	/// pattern.
	struct hidden
	{};

	#if defined(__cpp_lib_bit_cast)
	#define BITCAST_CONSTEXPR constexpr inline

	/// \brief Return the bit pattern of a 32-bit float as an int32_t.
	constexpr inline int32_t get_bits(const float& f)
	{
	    return std::bit_cast<int32_t>(f);
	}
	/// \brief Return the 32-bit float whose bit pattern is `i`.
	constexpr inline float from_bits(const int32_t& i)
	{
	    return std::bit_cast<float>(i);
	}

	#else
	#define BITCAST_CONSTEXPR inline

	/// \brief Overlay of a float and an int32_t, used when std::bit_cast is not
	/// available.
	///
	/// NOTE(review): get_bits/from_bits below read the member that was not
	/// written, which relies on the compiler supporting union-based type punning.
	union ufloat32
	{
	    constexpr ufloat32(const float& x)
	        : f(x)
	    {}
	    constexpr ufloat32(const int32_t& x)
	        : i(x)
	    {}

	    float f;
	    int32_t i;
	};

	/// \brief Return the bit pattern of a 32-bit float as an int32_t.
	inline int32_t get_bits(const float& f)
	{
	    return ufloat32(f).i;
	}
	/// \brief Return the 32-bit float whose bit pattern is `i`.
	inline float from_bits(const int32_t& i)
	{
	    return ufloat32(i).f;
	}
	#endif

	}    // namespace float_support

	/// \brief Software floating-point value stored in an integer bit pattern.
	///
	/// From the most significant bit down the layout is: one sign bit,
	/// `n_exp_bits` exponent bits, and the remaining bits of `storage_t` as the
	/// significand.
	///
	/// \tparam storage_t     Integer type holding the bit pattern.
	/// \tparam n_exp_bits    Number of exponent bits.
	/// \tparam has_nan       Whether the format has NaN encodings.
	/// \tparam with_denorm   Whether subnormals are kept; when false,
	///                       from_bits(sign, exponent, significand) drops the
	///                       significand of values with a zero exponent field.
	/// \tparam with_infinity Whether the format has infinities. Without them the
	///                       all-ones exponent holds finite values and NaN (if
	///                       present) is the all-ones significand only.
	template <typename storage_t,
	          size_t n_exp_bits,
	          bool has_nan,
	          bool with_denorm,
	          bool with_infinity,
	          std::enable_if_t<(n_exp_bits + 1 < sizeof(storage_t) * 8), bool> = true>
	class float_t
	{
	    storage_t m_data = 0;

	    using unsigned_storage_t = std::make_unsigned_t<storage_t>;

	    /// \brief Width of the storage type in bits.
	    static constexpr size_t n_storage_bits = sizeof(storage_t) * 8;

	    /// \brief The stored bit pattern, zero-extended to 64 bits. Working on the
	    /// unsigned pattern avoids shifting negative values.
	    constexpr uint64_t raw_bits() const
	    {
	        return static_cast<uint64_t>(static_cast<unsigned_storage_t>(m_data));
	    }

	public:
	    static constexpr size_t n_exponent_bits    = n_exp_bits;
	    static constexpr size_t n_significand_bits = sizeof(storage_t) * 8 - 1 - n_exp_bits;
	    static constexpr int64_t exponent_bias     = (int64_t{ 1 } << (n_exp_bits - 1)) - 1;

	    /// \brief Construct a floating point type with the given bit
	    /// representation.
	    static constexpr float_t from_bits(storage_t bits)
	    {
	        return float_t(float_support::hidden(), bits);
	    }

	    /// \brief Construct a float from the given sign, exponent and significand
	    /// bits.
	    ///
	    /// \param pm Sign bit (true for negative).
	    /// \param e  Exponent field.
	    /// \param s  Significand field; ignored when `e` is zero and the format
	    ///           has no subnormals.
	    static constexpr float_t from_bits(bool pm, storage_t e, storage_t s)
	    {
	        storage_t bits = pm ? 1 : 0;

	        bits <<= n_exp_bits;
	        bits |= e;

	        bits <<= n_significand_bits;
	        if (with_denorm || e)
	            bits |= s;

	        return float_t(float_support::hidden(), bits);
	    }

	    /// \brief (Hidden) Construct a float type from a given bit pattern
	    constexpr float_t(const float_support::hidden&, storage_t bits)
	        : m_data(bits)
	    {}

	    /// \brief Construct positive zero.
	    constexpr float_t()
	        : m_data(0)
	    {}
	    constexpr float_t(const float_t& other)
	        : m_data(other.m_data)
	    {}

	    /// \brief Cast to a different floating point representation.
	    ///
	    /// Finite values are rounded to nearest, ties to even. Values too large
	    /// for the target become infinity, or the largest finite value when the
	    /// target has no infinity. Values too small for the target round to a
	    /// subnormal or to zero.
	    template <typename other_storage_t,
	              size_t other_n_exp_bits,
	              bool other_has_nan,
	              bool other_has_denorm,
	              bool other_has_infinity>
	    constexpr inline
	        operator float_t<other_storage_t, other_n_exp_bits, other_has_nan, other_has_denorm, other_has_infinity>() const
	    {
	        using other_float_t =
	            float_t<other_storage_t, other_n_exp_bits, other_has_nan, other_has_denorm, other_has_infinity>;

	        constexpr uint64_t one       = 1;
	        constexpr size_t other_n_sig = other_float_t::n_significand_bits;

	        // Shortcut for types which are fundamentally similar (e.g., bf16 ->
	        // fp32): same exponent width and same special-value conventions, so
	        // the pattern only needs padding with zero significand bits.
	        if constexpr (n_exp_bits == other_n_exp_bits && sizeof(other_storage_t) >= sizeof(storage_t) &&
	                      has_nan == other_has_nan && with_infinity == other_has_infinity &&
	                      with_denorm == other_has_denorm)
	        {
	            using other_unsigned_t    = std::make_unsigned_t<other_storage_t>;
	            constexpr size_t widen_by = (sizeof(other_storage_t) - sizeof(storage_t)) * 8;
	            // Shift in the unsigned domain; shifting a negative value left is
	            // not defined before C++20.
	            const other_unsigned_t widened = static_cast<other_unsigned_t>(
	                static_cast<other_unsigned_t>(static_cast<unsigned_storage_t>(m_data)) << widen_by);
	            return other_float_t::from_bits(static_cast<other_storage_t>(widened));
	        }

	        // Get initial values for the new floating point type
	        const bool sign_bit       = ((raw_bits() >> (n_storage_bits - 1)) & 1) != 0;
	        int64_t new_exponent_bits = 0;
	        uint64_t new_significand  = 0;

	        // Significand of the largest finite value of a target without
	        // infinity (its exponent field is all ones). The all-ones significand
	        // is reserved for NaN when the target has NaN.
	        constexpr uint64_t other_max_finite_significand = (one << other_n_sig) - (other_has_nan ? 2 : 1);

	        if (is_nan() || is_infinity())
	        {
	            new_exponent_bits = (int64_t{ 1 } << other_n_exp_bits) - 1;

	            if (is_nan())
	            {
	                if constexpr (other_has_infinity)
	                {
	                    // Copy the most significant significand bit (the quiet
	                    // bit) into the same position of the target; set the LSB
	                    // so the significand is never zero. Don't attempt to copy
	                    // across any of the rest of the payload.
	                    new_significand = 0x1 | (((significand() >> (n_significand_bits - 1)) & 1) << (other_n_sig - 1));
	                }
	                else
	                {
	                    new_significand = (one << other_n_sig) - 1;
	                }
	            }
	            else if constexpr (!other_has_infinity)
	            {
	                // Infinity saturates to the largest finite value.
	                new_significand = other_max_finite_significand;
	            }
	        }
	        else if (!is_zero())
	        {
	            const int64_t this_exponent_bits = exponent_bits();
	            {
	                constexpr int64_t exponent_rebias = other_float_t::exponent_bias - exponent_bias;
	                new_exponent_bits = std::max(this_exponent_bits + exponent_rebias, exponent_rebias + 1);
	            }
	            // Left-align the fraction bits in a 64-bit word.
	            new_significand = this->significand() << (64 - n_significand_bits);

	            // Normalise subnormals
	            if (this_exponent_bits == 0)
	            {
	                // Shift the most-significant 1 out of the magnitude to convert
	                // it to a significand. Modify the exponent accordingly. The
	                // significand is non-zero here because zero was excluded.
	                const int norm_shift = __builtin_clzll(new_significand) + 1;
	                new_exponent_bits -= norm_shift;
	                new_significand <<= norm_shift;
	            }

	            // Align the significand for the output type
	            uint64_t shift                = 64 - other_n_sig;
	            const bool other_is_subnormal = new_exponent_bits <= 0;
	            if (other_is_subnormal)
	            {
	                shift += static_cast<uint64_t>(1 - new_exponent_bits);
	                new_exponent_bits = 0;
	            }

	            bool round_up = false;
	            if (shift > 64)
	            {
	                // The implicit leading one itself is shifted out. With a
	                // shift of 65 the value lies in [0.5, 1) units of the
	                // smallest subnormal: exactly 0.5 ties to even (zero),
	                // anything above rounds up. Larger shifts round to zero.
	                round_up        = (shift == 65) && (new_significand != 0);
	                new_significand = 0;
	            }
	            else
	            {
	                const uint64_t shift_out =
	                    (shift == 64) ? new_significand : (new_significand & ((one << shift) - 1));
	                new_significand = (shift == 64) ? 0 : (new_significand >> shift);

	                // Reinsert the most-significant-one if this is a subnormal in
	                // the output type.
	                if (other_is_subnormal)
	                    new_significand |= one << (64 - shift);

	                // Apply rounding based on the bits shifted out of the
	                // significand
	                const uint64_t shift_half = one << (shift - 1);
	                round_up = shift_out > shift_half || (shift_out == shift_half && (new_significand & 1));
	            }

	            if (round_up)
	            {
	                new_significand += 1;

	                // Handle the case that the significand overflowed due to
	                // rounding
	                constexpr uint64_t max_significand = (one << other_n_sig) - 1;
	                if (new_significand > max_significand)
	                {
	                    new_significand = 0;
	                    new_exponent_bits++;
	                }
	            }

	            // Saturate if the value is larger than can be represented in the
	            // output type. This can only occur if the size of the exponent of
	            // the new type is not greater than the exponent of the old type.
	            if constexpr (other_n_exp_bits <= n_exp_bits)
	            {
	                constexpr int64_t inf_exp_bits = (int64_t{ 1 } << other_n_exp_bits) - 1;

	                bool overflow = false;
	                if constexpr (other_has_infinity)
	                {
	                    overflow = new_exponent_bits >= inf_exp_bits;
	                }
	                else
	                {
	                    // Without infinity the all-ones exponent still encodes
	                    // finite values up to other_max_finite_significand.
	                    overflow = new_exponent_bits > inf_exp_bits ||
	                               (new_exponent_bits == inf_exp_bits && new_significand > other_max_finite_significand);
	                }

	                if (overflow)
	                {
	                    new_exponent_bits = inf_exp_bits;
	                    new_significand   = other_has_infinity ? 0 : other_max_finite_significand;
	                }
	            }
	        }

	        return other_float_t::from_bits(sign_bit, static_cast<other_storage_t>(new_exponent_bits),
	                                        static_cast<other_storage_t>(new_significand));
	    }

	    /// \brief Convert from a 32-bit floating point value
	    BITCAST_CONSTEXPR
	    float_t(const float& f)
	    {
	        // If this format exactly represents the binary32 format then get
	        // the bits from the provided float; otherwise get a binary32
	        // representation and then convert to this format.
	        if constexpr (represents_binary32())
	            m_data = float_support::get_bits(f);
	        else
	            m_data = static_cast<float_t<storage_t, n_exp_bits, has_nan, with_denorm, with_infinity>>(
	                         static_cast<float_t<int32_t, 8, true, true, true>>(f))
	                         .m_data;
	    }

	    /// \brief Cast to a 32-bit floating point value
	    BITCAST_CONSTEXPR operator float() const
	    {
	        // If this format exactly represents the binary32 format then return
	        // a float; otherwise get a binary32 representation and then return
	        // a float.
	        if constexpr (represents_binary32())
	            return float_support::from_bits(m_data);
	        else
	            return static_cast<float>(this->operator float_t<int32_t, 8, true, true, true>());
	    }

	    /// \brief Return whether this type represents the IEEE754 binary32
	    /// format
	    constexpr static inline bool represents_binary32()
	    {
	        return std::is_same_v<storage_t, int32_t> && n_exp_bits == 8 && has_nan && with_denorm && with_infinity;
	    }

	    /// \brief Return the value with the sign bit flipped.
	    constexpr auto operator-() const
	    {
	        return from_bits(static_cast<storage_t>(raw_bits() ^ (uint64_t{ 1 } << (n_storage_bits - 1))));
	    }

	    /// \brief Whether the exponent field is zero and the significand is not.
	    constexpr bool is_subnormal() const
	    {
	        return exponent_bits() == 0 && significand() != 0;
	    }

	    /// \brief Whether the value is positive or negative zero.
	    constexpr bool is_zero() const
	    {
	        return exponent_bits() == 0 && significand() == 0;
	    }

	    /// \brief Whether the value is a NaN. With infinities, any non-zero
	    /// significand under an all-ones exponent is NaN; without them only the
	    /// all-ones significand is.
	    constexpr bool is_nan() const
	    {
	        return has_nan && (exponent_bits() == (uint64_t{ 1 } << n_exponent_bits) - 1) &&
	               ((with_infinity && significand()) ||
	                (!with_infinity && significand() == (uint64_t{ 1 } << n_significand_bits) - 1));
	    }

	    /// \brief Whether the value is positive or negative infinity.
	    constexpr bool is_infinity() const
	    {
	        return with_infinity && ((exponent_bits() == (uint64_t{ 1 } << n_exponent_bits) - 1) && !significand());
	    }

	    /// \brief Get the raw bit pattern.
	    constexpr inline const storage_t& bits() const
	    {
	        return m_data;
	    }

	    /// \brief Get the exponent
	    ///
	    /// A zero exponent field is treated as 1 before the bias is removed.
	    constexpr inline int64_t exponent() const
	    {
	        return std::max<int64_t>(static_cast<int64_t>(exponent_bits()), 1) - exponent_bias;
	    }

	    /// \brief Get the bits from the exponent
	    constexpr inline uint64_t exponent_bits() const
	    {
	        constexpr uint64_t mask = (uint64_t{ 1 } << n_exp_bits) - 1;
	        return (raw_bits() >> n_significand_bits) & mask;
	    }

	    /// \brief Get the bits from the significand (without any implicit one).
	    constexpr inline uint64_t significand() const
	    {
	        return raw_bits() & ((uint64_t{ 1 } << n_significand_bits) - 1);
	    }

	    /// \brief Equality: NaN compares unequal to everything, the two zeros
	    /// compare equal, everything else compares by bit pattern.
	    constexpr inline bool operator==(const float_t& other) const
	    {
	        return !is_nan() && !other.is_nan() && ((is_zero() && other.is_zero()) || bits() == other.bits());
	    }

	    /// \brief Add `rhs` by converting both operands to float, adding, and
	    /// converting the sum back to this format.
	    constexpr inline float_t& operator+=(const float_t& rhs)
	    {
	        this->m_data = static_cast<float_t>(static_cast<float>(*this) + static_cast<float>(rhs)).bits();
	        return *this;
	    }
	};

	// This should probably be exported so we can use it elsewhere
	#undef BITCAST_CONSTEXPR

	namespace float_support
	{

	// Pre-C++23 these can't be computed as constexpr, so have to hardcode them.
	// Each table is keyed by the binary quantity it is derived from.

	template <int>
	struct digits10;    // floor(log10(2) * (digits - 1))
	template <int>
	struct max_digits10;    // ceil(log10(2) * digits + 1)
	template <int>
	struct min_exponent10;    // ceil(log10(2) * (min_exponent - 1))
	template <int>
	struct max_exponent10;    // floor(log10(2) * max_exponent)

	template <>
	struct digits10<8>
	{
	    constexpr static inline int value = 2;
	};

	template <>
	struct max_digits10<8>
	{
	    constexpr static inline int value = 4;
	};

	template <>
	struct digits10<10>
	{
	    constexpr static inline int value = 2;
	};

	template <>
	struct max_digits10<10>
	{
	    constexpr static inline int value = 5;
	};

	// 11 digits: a 10-bit significand plus the implicit leading one.
	template <>
	struct digits10<11>
	{
	    constexpr static inline int value = 3;
	};

	template <>
	struct max_digits10<11>
	{
	    constexpr static inline int value = 5;
	};

	template <>
	struct digits10<24>
	{
	    constexpr static inline int value = 6;
	};

	template <>
	struct max_digits10<24>
	{
	    constexpr static inline int value = 9;
	};

	// Smallest normal value is 2^-14 (about 6.1e-5), so 1e-4 is the smallest
	// normal power of ten.
	template <>
	struct min_exponent10<-13>
	{
	    constexpr static inline int value = -4;
	};

	template <>
	struct max_exponent10<16>
	{
	    constexpr static inline int value = 4;
	};

	template <>
	struct min_exponent10<-125>
	{
	    constexpr static inline int value = -37;
	};

	template <>
	struct max_exponent10<128>
	{
	    constexpr static inline int value = 38;
	};

	template <int d>
	inline constexpr int digits10_v = digits10<d>::value;
	template <int d>
	inline constexpr int max_digits10_v = max_digits10<d>::value;

	template <int e>
	inline constexpr int min_exponent10_v = min_exponent10<e>::value;

	template <int e>
	inline constexpr int max_exponent10_v = max_exponent10<e>::value;

	}    // namespace float_support

	}    // namespace tosa

	namespace std
	{

	/// Report tosa::float_t as a floating point type.
	///
	/// NOTE(review): the C++ standard does not permit programs to specialise
	/// type-trait templates; kept because callers may depend on it.
	template <typename storage_t, size_t n_exp_bits, bool has_nan, bool has_denorm, bool has_inf>
	struct is_floating_point<tosa::float_t<storage_t, n_exp_bits, has_nan, has_denorm, has_inf>>
	    : std::integral_constant<bool, true>
	{};

	/// Numeric limits for tosa::float_t.
	///
	/// The decimal constants (digits10, max_digits10, min_exponent10,
	/// max_exponent10) come from the hard-coded tables in tosa::float_support and
	/// are only available for the values those tables list.
	template <typename storage_t, size_t n_exp_bits, bool has_nan, bool with_denorm, bool with_inf>
	class numeric_limits<tosa::float_t<storage_t, n_exp_bits, has_nan, with_denorm, with_inf>>
	{
	    using this_float_t = tosa::float_t<storage_t, n_exp_bits, has_nan, with_denorm, with_inf>;

	    // Field values with every bit set, computed in 64 bits so wide fields do
	    // not overflow `int`.
	    static constexpr storage_t exponent_all_ones =
	        static_cast<storage_t>((uint64_t{ 1 } << this_float_t::n_exponent_bits) - 1);
	    static constexpr storage_t significand_all_ones =
	        static_cast<storage_t>((uint64_t{ 1 } << this_float_t::n_significand_bits) - 1);

	public:
	    static constexpr bool is_specialized = true;

	    /// Smallest positive normal value.
	    static constexpr auto min() noexcept
	    {
	        return this_float_t::from_bits(false, 1, 0);
	    }

	    /// Largest finite value.
	    static constexpr auto max() noexcept
	    {
	        if constexpr (with_inf)
	        {
	            // The all-ones exponent is reserved for infinity and NaN.
	            return this_float_t::from_bits(false, static_cast<storage_t>(exponent_all_ones - 1),
	                                           significand_all_ones);
	        }
	        else
	        {
	            // The all-ones exponent holds finite values; only the all-ones
	            // significand is given up, and only when the format has NaN.
	            return this_float_t::from_bits(false, exponent_all_ones,
	                                           static_cast<storage_t>(significand_all_ones - (has_nan ? 1 : 0)));
	        }
	    }

	    /// Most negative finite value.
	    static constexpr auto lowest() noexcept
	    {
	        return -max();
	    }

	    static constexpr int digits       = this_float_t::n_significand_bits + 1;
	    static constexpr int digits10     = tosa::float_support::digits10_v<digits>;
	    static constexpr int max_digits10 = tosa::float_support::max_digits10_v<digits>;

	    static constexpr bool is_signed  = true;
	    static constexpr bool is_integer = false;
	    static constexpr bool is_exact   = false;
	    static constexpr int radix       = 2;

	    /// Difference between 1 and the next representable value.
	    static constexpr auto epsilon() noexcept
	    {
	        return this_float_t::from_bits(false, this_float_t::exponent_bias - this_float_t::n_significand_bits, 0);
	    }

	    /// Largest rounding error (0.5).
	    static constexpr auto round_error() noexcept
	    {
	        return this_float_t::from_bits(0, this_float_t::exponent_bias - 1, 0);
	    }

	    static constexpr int min_exponent   = (1 - this_float_t::exponent_bias) + 1;
	    static constexpr int min_exponent10 = tosa::float_support::min_exponent10_v<min_exponent>;
	    // Without infinity the all-ones exponent is usable, giving one more
	    // binade of finite values.
	    static constexpr int max_exponent   = this_float_t::exponent_bias + (with_inf ? 1 : 2);
	    static constexpr int max_exponent10 = tosa::float_support::max_exponent10_v<max_exponent>;

	    static constexpr bool has_infinity             = with_inf;
	    static constexpr bool has_quiet_NaN            = has_nan;
	    // A format without infinity has a single NaN encoding, so it cannot
	    // distinguish quiet from signaling.
	    static constexpr bool has_signaling_NaN        = has_nan && with_inf;
	    static constexpr float_denorm_style has_denorm = with_denorm ? denorm_present : denorm_absent;
	    static constexpr bool has_denorm_loss          = false;

	    /// Positive infinity, or zero when the format has no infinity.
	    static constexpr auto infinity() noexcept
	    {
	        if constexpr (with_inf)
	        {
	            return this_float_t::from_bits(false, exponent_all_ones, 0);
	        }
	        else
	        {
	            return this_float_t::from_bits(false, 0, 0);
	        }
	    }

	    /// A quiet NaN; only meaningful when has_quiet_NaN is true.
	    static constexpr auto quiet_NaN() noexcept
	    {
	        if constexpr (with_inf)
	        {
	            // Most significant significand bit (quiet) plus the LSB.
	            return this_float_t::from_bits(
	                false, exponent_all_ones,
	                static_cast<storage_t>((uint64_t{ 1 } << (this_float_t::n_significand_bits - 1)) | 1));
	        }
	        else
	        {
	            // The only NaN encoding is the all-ones significand.
	            return this_float_t::from_bits(false, exponent_all_ones, significand_all_ones);
	        }
	    }

	    /// A signaling NaN; only meaningful when has_signaling_NaN is true.
	    static constexpr auto signaling_NaN() noexcept
	    {
	        if constexpr (with_inf)
	        {
	            return this_float_t::from_bits(false, exponent_all_ones, 1);
	        }
	        else
	        {
	            return this_float_t::from_bits(false, exponent_all_ones, significand_all_ones);
	        }
	    }

	    /// Smallest positive subnormal value.
	    static constexpr auto denorm_min() noexcept
	    {
	        return this_float_t::from_bits(false, 0, 1);
	    }

	    static constexpr bool is_iec559  = false;
	    // The set of representable values is finite.
	    static constexpr bool is_bounded = true;
	    static constexpr bool is_modulo  = false;

	    static constexpr bool traps                    = false;
	    static constexpr bool tinyness_before          = false;
	    static constexpr float_round_style round_style = round_to_nearest;
	};

	}    // namespace std

	#endif // TOSA_FLOAT_UTILS_H_