#include #include #include #include #include #include #define BIN32_SIGN __BIT(31) #define BIN32_EXP __BITS(23,31) #define BIN32_EXP_MIN -126 #define BIN32_EXP_MAX 127 #define BIN32_EXP_BIAS BIN32_EXP_MAX #define BIN32_EXP_SUBNORMAL (BIN32_EXP_MIN - 1) #define BIN32_EXP_INFNAN (BIN32_EXP_MAX + 1) #define BIN32_TS __BITS(0,22) /* trailing significand field */ #define BIN32_PREC 23 #define BIN64_SIGN __BIT(63) #define BIN64_EXP __BITS(52,62) #define BIN64_EXP_MIN -1022 #define BIN64_EXP_MAX 1023 #define BIN64_EXP_BIAS BIN64_EXP_MAX #define BIN64_EXP_SUBNORMAL (BIN64_EXP_MIN - 1) #define BIN64_EXP_INFNAN (BIN64_EXP_MAX + 1) #define BIN64_TS __BITS(0,51) /* trailing significand field */ #define BIN64_PREC 52 static int32_t convert32(uint32_t f) { int32_t s; uint32_t t; uint64_t u; int e; /* map subnormal/nan to zero */ e = (int)__SHIFTOUT(f, BIN32_EXP) - BIN32_EXP_BIAS; if (e == BIN32_EXP_SUBNORMAL || (e == BIN32_EXP_INFNAN && __SHIFTOUT(f, BIN32_TS) != 0)) return 0; /* copy sign */ s = (f & BIN32_SIGN ? -1 : +1); f &= ~BIN32_SIGN; /* saturate at 1.0 */ f = MIN(f, 0x3f800000u); /* f = 2^e (1 + t/2^p) */ e = (int)__SHIFTOUT(f, BIN32_EXP) - BIN32_EXP_BIAS; assert(e <= 0); t = __SHIFTOUT(f, BIN32_TS); /* * INT_MAX * f = INT_MAX * 2^e (1 + t/2^p) * = INT_MAX * 2^{e - p} (2^p + t) * = (INT_MAX * (2^p + t)) / 2^{p - e} */ u = (uint32_t)1 << BIN32_PREC; u |= t; u *= INT32_MAX; u >>= BIN32_PREC - e; return s * (int32_t)u; } static int32_t convert64(uint64_t f) { int32_t s; uint64_t t; uint64_t u; int e; /* map subnormal/nan to zero */ e = (int)__SHIFTOUT(f, BIN64_EXP) - BIN64_EXP_BIAS; if (e == BIN64_EXP_SUBNORMAL || (e == BIN64_EXP_INFNAN && __SHIFTOUT(f, BIN64_TS) != 0)) return 0; /* copy sign */ s = (f & BIN64_SIGN ? -1 : +1); f &= ~BIN64_SIGN; /* saturate at 1.0 */ f = MIN(f, (uint64_t)0x3ff0000000000000); /* f = 2^e (1 + t/2^p) */ e = (int)__SHIFTOUT(f, BIN64_EXP) - BIN64_EXP_BIAS; assert(e <= 0); t = __SHIFTOUT(f, BIN64_TS); /* * INT_MAX * f = INT_MAX * 2^e (1 + t/2^p) * = INT_MAX * 2^{e - p} (2^p + t) * = (INT_MAX * (2^p + t)) / 2^{p - e} * ~ (INT_MAX * floor((2^p + t)/2^21)) / 2^{p - 21 - e} * * Dividing by 2^21 avoids overflow in INT_MAX multiply, since * 2^p + t < 2^53 so floor((2^p + t)/2^21) < 2^32. We lose * some accuracy this way in order to avoid requiring >64-bit * integer arithmetic (as a floating-point unit would use * internally), but not likely a big deal. */ u = (uint64_t)1 << BIN64_PREC; u |= t; u >>= 21; u *= INT32_MAX; u >>= BIN64_PREC - 21 - e; return s * (int32_t)u; } static size_t float32_to_linear32(const void *inbuf, void *outbuf, size_t len) { const uint8_t *inbuf8 = inbuf, *end = inbuf8 + len; uint8_t *outbuf8 = outbuf; while (inbuf8 + 4 <= end) { le32enc(outbuf8, convert32(le32dec(inbuf8))); inbuf8 += 4; outbuf8 += 4; } } static size_t float64_to_linear32(const void *inbuf, void *outbuf, size_t len) { const uint8_t *inbuf8 = inbuf, *end = inbuf8 + len; uint8_t *outbuf8 = outbuf; while (inbuf8 + 8 <= end) { le64enc(outbuf8, convert64(le64dec(inbuf8))); inbuf8 += 4; outbuf8 += 4; } }