Blame - src/armnnUtils/BFloat16.hpp - ml/armnn

blob: 965fc31c17d24023d11203deb65623dcba749647 [file] [log] [blame]

Narumol Prangnawarat	c3bf6ef	2020-02-28 12:45:21 +0000	[diff] [blame]	1	//
				2	// Copyright © 2020 Arm Ltd. All rights reserved.
				3	// SPDX-License-Identifier: MIT
				4	//
				5
				6	#pragma once
				7
#include <cmath>
#include <cstring>
#include <ostream>
#include <stdint.h>
				11
				12	namespace armnn
				13	{
/// 16-bit "brain floating point" value stored as a raw uint16_t bit pattern.
///
/// The stored bits are the upper 16 bits of an IEEE-754 binary32 value:
/// 1 sign bit, 8 exponent bits and 7 mantissa bits.
class BFloat16
{
    // The bit-level conversions below copy between float and uint32_t.
    static_assert(sizeof(float) == sizeof(uint32_t), "BFloat16 requires a 32-bit float");

public:
    /// Constructs a value whose bit pattern is all zeros (+0.0).
    BFloat16()
        : value(0)
    {}

    /// Constructs a value directly from a raw BFloat16 bit pattern.
    explicit BFloat16(uint16_t v)
        : value(v)
    {}

    /// Constructs a value by converting a float (see float32ToBFloat16).
    explicit BFloat16(float v)
        : value(float32ToBFloat16(v).val())
    {}

    /// Assigns the BFloat16 conversion of a float (see float32ToBFloat16).
    BFloat16& operator=(float v)
    {
        value = float32ToBFloat16(v).val();
        return *this;
    }

    /// Compares the raw bit patterns. Because this is a bitwise comparison,
    /// two NaNs with identical bits compare equal and +0.0 differs from -0.0.
    bool operator==(const BFloat16& r) const
    {
        return value == r.val();
    }

    /// Compares this value, widened to float, against a float.
    bool operator==(const float& r) const
    {
        return toFloat32() == r;
    }

    /// Converts a float to BFloat16 using round-to-nearest, ties-to-even.
    ///
    /// Any NaN input yields the single bit pattern returned by nan().
    static BFloat16 float32ToBFloat16(const float v)
    {
        if (std::isnan(v))
        {
            return nan();
        }

        // Round the value to the nearest even.
        // Float32:
        //   S EEEEEEEE MMMMMMLRMMMMMMMMMMMMMMM
        // BFloat16:
        //   S EEEEEEEE MMMMMML
        // L: least significant bit of the BFloat16 mantissa
        // R: rounding bit (most significant of the 16 discarded bits)
        //   R = 0                          -> round down
        //   R = 1, remaining bits != 0     -> round up
        //   R = 1, remaining bits = 0, L=0 -> round down (tie, already even)
        //   R = 1, remaining bits = 0, L=1 -> round up   (tie, make even)

        // Copy the bytes rather than casting pointers: reading a float through
        // a uint32_t pointer violates the strict aliasing rule.
        uint32_t bits = 0;
        std::memcpy(&bits, &v, sizeof(bits));

        uint16_t upper = static_cast<uint16_t>(bits >> 16u);
        // The least significant bit that survives the truncation.
        const uint16_t lsb = static_cast<uint16_t>(upper & 0x0001);
        // The 16 low-order bits of the float that are about to be discarded.
        const uint16_t discarded = static_cast<uint16_t>(bits & 0x0000FFFF);

        if (discarded > 0x8000 || (discarded == 0x8000 && lsb == 1))
        {
            // The increment may carry from the mantissa into the exponent, so
            // e.g. a bit pattern of 0x7F7F rounds up to 0x7F80.
            upper++;
        }
        return BFloat16(upper);
    }

    /// Widens the value to float by placing the stored bits in the upper half
    /// of a 32-bit pattern and zero-filling the lower half.
    float toFloat32() const
    {
        // Widen before shifting so the shift is done on an unsigned 32-bit
        // value instead of a promoted signed int.
        const uint32_t bits = static_cast<uint32_t>(value) << 16u;
        float result = 0.0f;
        std::memcpy(&result, &bits, sizeof(result));
        return result;
    }

    /// Returns the raw 16-bit pattern.
    uint16_t val() const
    {
        return value;
    }

    /// Returns the value with bit pattern 0x7F7F.
    static BFloat16 max()
    {
        return BFloat16(static_cast<uint16_t>(0x7F7F));
    }

    /// Returns the value with bit pattern 0x7FC0.
    static BFloat16 nan()
    {
        return BFloat16(static_cast<uint16_t>(0x7FC0));
    }

    /// Returns the value with bit pattern 0x7F80.
    static BFloat16 inf()
    {
        return BFloat16(static_cast<uint16_t>(0x7F80));
    }

private:
    uint16_t value;
};
				114
				115	inline std::ostream& operator<<(std::ostream& os, const BFloat16& b)
				116	{
				117	os << b.toFloat32() << "(0x" << std::hex << b.val() << ")";
				118	return os;
				119	}
				120
Narumol Prangnawarat	c3bf6ef	2020-02-28 12:45:21 +0000	[diff] [blame]	121	} //namespace armnn