Arcane  v3.15.0.0
Documentation utilisateur
Chargement...
Recherche...
Aucune correspondance
FloatConversion.h
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2024 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*---------------------------------------------------------------------------*/
8/* FloatConversion.h (C) 2000-2024 */
9/* */
10/* Opérations de conversion entre 'float' et 'Float16' et 'BFloat16'. */
11/*---------------------------------------------------------------------------*/
12#ifndef ARCCORE_BASE_FLOATCONVERSION_H
13#define ARCCORE_BASE_FLOATCONVERSION_H
14/*---------------------------------------------------------------------------*/
15/*---------------------------------------------------------------------------*/
16
17#include "arccore/base/ArccoreGlobal.h"
18
19#include <cstring>
20
21// TODO: Utiliser 'std::endian' lorsqu'on sera en C++20.
22// TODO: Utiliser 'std::bit_cast' au lieu de 'std::memcpy' pour rendre les fonctions 'constexpr'
23
24/*---------------------------------------------------------------------------*/
25/*---------------------------------------------------------------------------*/
26
27namespace Arccore::impl
28{
29
30// The following Float16_t conversions are based on the code from
31// Eigen library.
32
33// The conversion routines are Copyright (c) Fabian Giesen, 2016.
34// The original license follows:
35//
36// Copyright (c) Fabian Giesen, 2016
37// All rights reserved.
38// Redistribution and use in source and binary forms, with or without
39// modification, are permitted.
40// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
41// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
42// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
43// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
44// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
45// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
46// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
47// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
48// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
49// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
50// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
51
namespace detail
{
  /*!
   * \brief Overlay giving access to the bit pattern of a 32-bit 'float'.
   *
   * 'u' is the first member so that brace-initialization
   * (for instance `{ 255 << 23 }`) sets the raw bits.
   */
  union float32_bits
  {
    unsigned int u;
    float f;
  };
} // namespace detail

/*!
 * \brief Converts a 'float' to the 16-bit pattern of a half-precision value.
 *
 * Layout of the result: 1 sign bit, 5 exponent bits, 10 mantissa bits.
 * - NaN inputs give the quiet NaN 0x7e00 (plus the sign bit).
 * - Infinities and magnitudes >= 2^16 give infinity 0x7c00 (plus the sign bit).
 * - Other values are rounded to nearest, ties to even; results too small
 *   for a normalized half are encoded as subnormals or zero.
 *
 * \param v value to convert.
 * \return the bit pattern of the converted value.
 *
 * \note The implementation reads the member of detail::float32_bits that
 * was not the last one written. The function is declared 'constexpr' but
 * such a read is not permitted during constant evaluation, so it can only
 * be evaluated at run time.
 */
inline constexpr uint16_t
convertFloat16ToUint16Impl(float v)
{
  // Value-initialized: a 'constexpr' function may not declare an
  // uninitialized variable before C++20.
  detail::float32_bits f{};
  f.f = v;

  constexpr detail::float32_bits f32infty = { 255 << 23 };
  constexpr detail::float32_bits f16max = { (127 + 16) << 23 };
  constexpr detail::float32_bits denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
  constexpr unsigned int sign_mask = 0x80000000u;
  uint16_t val = static_cast<uint16_t>(0x0u);

  // Work on the magnitude only; the sign is added back at the end.
  unsigned int sign = f.u & sign_mask;
  f.u ^= sign;

  // NOTE all the integer compares in this function can be safely
  // compiled into signed compares since all operands are below
  // 0x80000000. Important if you want fast straight SSE2 code
  // (since there's no unsigned PCMPGTD).

  if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set)
    val = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
  }
  else { // (De)normalized number or zero
    if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero
      // use a magic value to align our 10 mantissa bits at the bottom of
      // the float. as long as FP addition is round-to-nearest-even this
      // just works.
      f.f += denorm_magic.f;

      // and one integer subtract of the bias later, we have our final float!
      val = static_cast<uint16_t>(f.u - denorm_magic.u);
    }
    else {
      unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd

      // update exponent, rounding bias part 1
      // Equivalent to `f.u += ((unsigned int)(15 - 127) << 23) + 0xfff`, but
      // without arithmetic overflow.
      f.u += 0xc8000fffU;
      // rounding bias part 2
      f.u += mant_odd;
      // take the bits!
      val = static_cast<uint16_t>(f.u >> 13);
    }
  }

  val |= static_cast<uint16_t>(sign >> 16);
  return val;
}
111
112/*---------------------------------------------------------------------------*/
113/*---------------------------------------------------------------------------*/
114
/*!
 * \brief Converts the 16-bit pattern of a half-precision value to 'float'.
 *
 * \a val is read as 1 sign bit, 5 exponent bits and 10 mantissa bits.
 * Normalized values, subnormals, zeros, infinities and NaN are all handled;
 * the sign bit is carried over unchanged.
 *
 * The bit pattern is moved between integer and 'float' with std::memcpy,
 * which is well defined, instead of reading an inactive union member.
 *
 * \param val bit pattern of the half-precision value.
 * \return the corresponding 'float'.
 */
inline float
convertToFloat16Impl(uint16_t val)
{
  static_assert(sizeof(float) == sizeof(uint32_t), "'float' must be 32 bits wide");

  constexpr uint32_t magic_bits = 113u << 23; // bits of 2^-14, the smallest normalized half
  constexpr uint32_t shifted_exp = 0x7c00u << 13; // exponent mask after shift

  uint32_t bits = (val & 0x7fffu) << 13; // exponent/mantissa bits
  const uint32_t exp = shifted_exp & bits; // just the exponent
  bits += (127u - 15u) << 23; // exponent adjust

  // handle exponent special cases
  if (exp == shifted_exp) { // Inf/NaN?
    bits += (128u - 16u) << 23; // extra exp adjust
  }
  else if (exp == 0) { // Zero/Denormal?
    bits += 1u << 23; // extra exp adjust

    // Re-normalize: the subtraction has to be done on floating-point
    // values, so go through 'float' and come back to the bit pattern.
    float value = 0.0f;
    float magic = 0.0f;
    std::memcpy(&value, &bits, sizeof(float));
    std::memcpy(&magic, &magic_bits, sizeof(float));
    value -= magic;
    std::memcpy(&bits, &value, sizeof(float));
  }

  bits |= (val & 0x8000u) << 16; // sign bit

  float result = 0.0f;
  std::memcpy(&result, &bits, sizeof(float));
  return result;
}
140
141/*---------------------------------------------------------------------------*/
142/*---------------------------------------------------------------------------*/
143
/*!
 * \brief Converts a 'float' to the 16-bit pattern of a 'BFloat16'.
 *
 * The result holds the 16 most significant bits of the 'float' after
 * rounding to nearest, ties to even. A finite value whose rounding
 * overflows becomes an infinity of the same sign.
 *
 * NaN inputs are handled separately: their upper 16 bits are kept and the
 * quiet bit is set. Adding the rounding bias to a NaN could otherwise clear
 * the mantissa (giving an infinity) or carry into the sign bit.
 *
 * \param v value to convert.
 * \return the bit pattern of the converted value.
 */
inline uint16_t
convertBFloat16ToUint16Impl(float v)
{
  static_assert(sizeof(float) == sizeof(uint32_t), "'float' must be 32 bits wide");

  uint32_t input = 0;
  std::memcpy(&input, &v, sizeof(uint32_t));

  // NaN: all exponent bits set and a non-zero mantissa.
  if ((input & 0x7fffffffu) > 0x7f800000u)
    return static_cast<uint16_t>((input >> 16) | 0x0040u);

  // Least significant bit of resulting bfloat.
  const uint32_t lsb = (input >> 16) & 1u;
  // 0x7fff rounds ties down, 0x8000 rounds them up: the result is always even.
  const uint32_t rounding_bias = 0x7fffu + lsb;
  input += rounding_bias;
  return static_cast<uint16_t>(input >> 16);
}
157
158/*---------------------------------------------------------------------------*/
159/*---------------------------------------------------------------------------*/
160
/*!
 * \brief Converts the 16-bit pattern of a 'BFloat16' to 'float'.
 *
 * \a val becomes the 16 most significant bits of the result and the 16
 * least significant bits are set to zero.
 *
 * The bits are positioned with an integer shift, as in
 * convertBFloat16ToUint16Impl(), so no compiler-specific byte-order macro
 * is needed. This only assumes that 'float' and 'uint32_t' use the same
 * byte order.
 *
 * \param val bit pattern of the 'BFloat16' value.
 * \return the corresponding 'float'.
 */
inline float
convertToBFloat16Impl(uint16_t val)
{
  static_assert(sizeof(float) == sizeof(uint32_t), "'float' must be 32 bits wide");

  const uint32_t bits = static_cast<uint32_t>(val) << 16;
  float result = 0.0f;
  std::memcpy(&result, &bits, sizeof(float));
  return result;
}
179
180/*---------------------------------------------------------------------------*/
181/*---------------------------------------------------------------------------*/
182
183} // namespace Arccore::impl
184
185/*---------------------------------------------------------------------------*/
186/*---------------------------------------------------------------------------*/
187
188#endif
__host__ __device__ double exp(double v)
Exponentielle de v.
Definition Math.h:116