Merge pull request #480 from lioncash/info
common/fp/info: Make half-precision info struct functions return correctly sized types
This commit is contained in:
commit
2d3aa9b8fb
10 changed files with 44 additions and 33 deletions
|
@ -20,21 +20,32 @@ struct FPInfo<u16> {
|
||||||
static constexpr size_t exponent_width = 5;
|
static constexpr size_t exponent_width = 5;
|
||||||
static constexpr size_t explicit_mantissa_width = 10;
|
static constexpr size_t explicit_mantissa_width = 10;
|
||||||
static constexpr size_t mantissa_width = explicit_mantissa_width + 1;
|
static constexpr size_t mantissa_width = explicit_mantissa_width + 1;
|
||||||
|
|
||||||
static constexpr u32 implicit_leading_bit = u32(1) << explicit_mantissa_width;
|
static constexpr u32 implicit_leading_bit = u32(1) << explicit_mantissa_width;
|
||||||
static constexpr u32 sign_mask = 0x8000;
|
static constexpr u32 sign_mask = 0x8000;
|
||||||
static constexpr u32 exponent_mask = 0x7C00;
|
static constexpr u32 exponent_mask = 0x7C00;
|
||||||
static constexpr u32 mantissa_mask = 0x3FF;
|
static constexpr u32 mantissa_mask = 0x3FF;
|
||||||
static constexpr u32 mantissa_msb = 0x200;
|
static constexpr u32 mantissa_msb = 0x200;
|
||||||
|
|
||||||
static constexpr int exponent_min = -14;
|
static constexpr int exponent_min = -14;
|
||||||
static constexpr int exponent_max = 15;
|
static constexpr int exponent_max = 15;
|
||||||
static constexpr int exponent_bias = 15;
|
static constexpr int exponent_bias = 15;
|
||||||
|
|
||||||
static constexpr u32 Zero(bool sign) { return sign ? sign_mask : 0; }
|
static constexpr u16 Zero(bool sign) {
|
||||||
static constexpr u32 Infinity(bool sign) { return exponent_mask | Zero(sign); }
|
return sign ? static_cast<u16>(sign_mask) : u16{0};
|
||||||
static constexpr u32 MaxNormal(bool sign) { return (exponent_mask - 1) | Zero(sign); }
|
}
|
||||||
static constexpr u32 DefaultNaN() { return exponent_mask | (u32(1) << (explicit_mantissa_width - 1)); }
|
|
||||||
|
static constexpr u16 Infinity(bool sign) {
|
||||||
|
return static_cast<u16>(exponent_mask | Zero(sign));
|
||||||
|
}
|
||||||
|
|
||||||
|
static constexpr u16 MaxNormal(bool sign) {
|
||||||
|
return static_cast<u16>((exponent_mask - 1) | Zero(sign));
|
||||||
|
}
|
||||||
|
|
||||||
|
static constexpr u16 DefaultNaN() {
|
||||||
|
return static_cast<u16>(exponent_mask | (u32(1) << (explicit_mantissa_width - 1)));
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
|
@ -88,7 +99,7 @@ struct FPInfo<u64> {
|
||||||
template<typename FPT, bool sign, int exponent, FPT value>
|
template<typename FPT, bool sign, int exponent, FPT value>
|
||||||
constexpr FPT FPValue() {
|
constexpr FPT FPValue() {
|
||||||
if constexpr (value == 0) {
|
if constexpr (value == 0) {
|
||||||
return FPT(FPInfo<FPT>::Zero(sign));
|
return FPInfo<FPT>::Zero(sign);
|
||||||
}
|
}
|
||||||
|
|
||||||
constexpr int point_position = static_cast<int>(FPInfo<FPT>::explicit_mantissa_width);
|
constexpr int point_position = static_cast<int>(FPInfo<FPT>::explicit_mantissa_width);
|
||||||
|
|
|
@ -35,7 +35,7 @@ FPT FPMulAdd(FPT addend, FPT op1, FPT op2, FPCR fpcr, FPSR& fpsr) {
|
||||||
|
|
||||||
if (typeA == FPType::QNaN && ((inf1 && zero2) || (zero1 && inf2))) {
|
if (typeA == FPType::QNaN && ((inf1 && zero2) || (zero1 && inf2))) {
|
||||||
FPProcessException(FPExc::InvalidOp, fpcr, fpsr);
|
FPProcessException(FPExc::InvalidOp, fpcr, fpsr);
|
||||||
return FPT(FPInfo<FPT>::DefaultNaN());
|
return FPInfo<FPT>::DefaultNaN();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (maybe_nan) {
|
if (maybe_nan) {
|
||||||
|
@ -50,25 +50,25 @@ FPT FPMulAdd(FPT addend, FPT op1, FPT op2, FPCR fpcr, FPSR& fpsr) {
|
||||||
// Raise NaN on (inf * inf) of opposite signs or (inf * zero).
|
// Raise NaN on (inf * inf) of opposite signs or (inf * zero).
|
||||||
if ((inf1 && zero2) || (zero1 && inf2) || (infA && infP && signA != signP)) {
|
if ((inf1 && zero2) || (zero1 && inf2) || (infA && infP && signA != signP)) {
|
||||||
FPProcessException(FPExc::InvalidOp, fpcr, fpsr);
|
FPProcessException(FPExc::InvalidOp, fpcr, fpsr);
|
||||||
return FPT(FPInfo<FPT>::DefaultNaN());
|
return FPInfo<FPT>::DefaultNaN();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Handle infinities
|
// Handle infinities
|
||||||
if ((infA && !signA) || (infP && !signP)) {
|
if ((infA && !signA) || (infP && !signP)) {
|
||||||
return FPT(FPInfo<FPT>::Infinity(false));
|
return FPInfo<FPT>::Infinity(false);
|
||||||
}
|
}
|
||||||
if ((infA && signA) || (infP && signP)) {
|
if ((infA && signA) || (infP && signP)) {
|
||||||
return FPT(FPInfo<FPT>::Infinity(true));
|
return FPInfo<FPT>::Infinity(true);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Result is exactly zero
|
// Result is exactly zero
|
||||||
if (zeroA && zeroP && signA == signP) {
|
if (zeroA && zeroP && signA == signP) {
|
||||||
return FPT(FPInfo<FPT>::Zero(signA));
|
return FPInfo<FPT>::Zero(signA);
|
||||||
}
|
}
|
||||||
|
|
||||||
const FPUnpacked result_value = FusedMulAdd(valueA, value1, value2);
|
const FPUnpacked result_value = FusedMulAdd(valueA, value1, value2);
|
||||||
if (result_value.mantissa == 0) {
|
if (result_value.mantissa == 0) {
|
||||||
return FPT(FPInfo<FPT>::Zero(rounding == RoundingMode::TowardsMinusInfinity));
|
return FPInfo<FPT>::Zero(rounding == RoundingMode::TowardsMinusInfinity);
|
||||||
}
|
}
|
||||||
return FPRound<FPT>(result_value, fpcr, fpsr);
|
return FPRound<FPT>(result_value, fpcr, fpsr);
|
||||||
}
|
}
|
||||||
|
|
|
@ -27,16 +27,16 @@ FPT FPRSqrtEstimate(FPT op, FPCR fpcr, FPSR& fpsr) {
|
||||||
|
|
||||||
if (type == FPType::Zero) {
|
if (type == FPType::Zero) {
|
||||||
FPProcessException(FPExc::DivideByZero, fpcr, fpsr);
|
FPProcessException(FPExc::DivideByZero, fpcr, fpsr);
|
||||||
return FPT(FPInfo<FPT>::Infinity(sign));
|
return FPInfo<FPT>::Infinity(sign);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (sign) {
|
if (sign) {
|
||||||
FPProcessException(FPExc::InvalidOp, fpcr, fpsr);
|
FPProcessException(FPExc::InvalidOp, fpcr, fpsr);
|
||||||
return FPT(FPInfo<FPT>::DefaultNaN());
|
return FPInfo<FPT>::DefaultNaN();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (type == FPType::Infinity) {
|
if (type == FPType::Infinity) {
|
||||||
return FPT(FPInfo<FPT>::Zero(false));
|
return FPInfo<FPT>::Zero(false);
|
||||||
}
|
}
|
||||||
|
|
||||||
const int result_exponent = (-(value.exponent + 1)) >> 1;
|
const int result_exponent = (-(value.exponent + 1)) >> 1;
|
||||||
|
|
|
@ -37,7 +37,7 @@ FPT FPRSqrtStepFused(FPT op1, FPT op2, FPCR fpcr, FPSR& fpsr) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (inf1 || inf2) {
|
if (inf1 || inf2) {
|
||||||
return FPT(FPInfo<FPT>::Infinity(sign1 != sign2));
|
return FPInfo<FPT>::Infinity(sign1 != sign2);
|
||||||
}
|
}
|
||||||
|
|
||||||
// result_value = (3.0 + (value1 * value2)) / 2.0
|
// result_value = (3.0 + (value1 * value2)) / 2.0
|
||||||
|
@ -45,7 +45,7 @@ FPT FPRSqrtStepFused(FPT op1, FPT op2, FPCR fpcr, FPSR& fpsr) {
|
||||||
result_value.exponent--;
|
result_value.exponent--;
|
||||||
|
|
||||||
if (result_value.mantissa == 0) {
|
if (result_value.mantissa == 0) {
|
||||||
return FPT(FPInfo<FPT>::Zero(fpcr.RMode() == RoundingMode::TowardsMinusInfinity));
|
return FPInfo<FPT>::Zero(fpcr.RMode() == RoundingMode::TowardsMinusInfinity);
|
||||||
}
|
}
|
||||||
return FPRound<FPT>(result_value, fpcr, fpsr);
|
return FPRound<FPT>(result_value, fpcr, fpsr);
|
||||||
}
|
}
|
||||||
|
|
|
@ -31,12 +31,12 @@ FPT FPRecipEstimate(FPT op, FPCR fpcr, FPSR& fpsr) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (type == FPType::Infinity) {
|
if (type == FPType::Infinity) {
|
||||||
return FPT(FPInfo<FPT>::Zero(sign));
|
return FPInfo<FPT>::Zero(sign);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (type == FPType::Zero) {
|
if (type == FPType::Zero) {
|
||||||
FPProcessException(FPExc::DivideByZero, fpcr, fpsr);
|
FPProcessException(FPExc::DivideByZero, fpcr, fpsr);
|
||||||
return FPT(FPInfo<FPT>::Infinity(sign));
|
return FPInfo<FPT>::Infinity(sign);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (value.exponent < FPInfo<FPT>::exponent_min - 2) {
|
if (value.exponent < FPInfo<FPT>::exponent_min - 2) {
|
||||||
|
@ -58,13 +58,13 @@ FPT FPRecipEstimate(FPT op, FPCR fpcr, FPSR& fpsr) {
|
||||||
|
|
||||||
FPProcessException(FPExc::Overflow, fpcr, fpsr);
|
FPProcessException(FPExc::Overflow, fpcr, fpsr);
|
||||||
FPProcessException(FPExc::Inexact, fpcr, fpsr);
|
FPProcessException(FPExc::Inexact, fpcr, fpsr);
|
||||||
return overflow_to_inf ? FPT(FPInfo<FPT>::Infinity(sign)) : FPT(FPInfo<FPT>::MaxNormal(sign));
|
return overflow_to_inf ? FPInfo<FPT>::Infinity(sign) : FPInfo<FPT>::MaxNormal(sign);
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((fpcr.FZ() && !std::is_same_v<FPT, u16>) || (fpcr.FZ16() && std::is_same_v<FPT, u16>)) {
|
if ((fpcr.FZ() && !std::is_same_v<FPT, u16>) || (fpcr.FZ16() && std::is_same_v<FPT, u16>)) {
|
||||||
if (value.exponent >= -FPInfo<FPT>::exponent_min) {
|
if (value.exponent >= -FPInfo<FPT>::exponent_min) {
|
||||||
fpsr.UFC(true);
|
fpsr.UFC(true);
|
||||||
return FPT(FPInfo<FPT>::Zero(sign));
|
return FPInfo<FPT>::Zero(sign);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -87,7 +87,7 @@ FPT FPRecipEstimate(FPT op, FPCR fpcr, FPSR& fpsr) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const FPT bits_sign = FPT(FPInfo<FPT>::Zero(sign));
|
const FPT bits_sign = FPInfo<FPT>::Zero(sign);
|
||||||
const FPT bits_exponent = static_cast<FPT>(result_exponent + FPInfo<FPT>::exponent_bias);
|
const FPT bits_exponent = static_cast<FPT>(result_exponent + FPInfo<FPT>::exponent_bias);
|
||||||
const FPT bits_mantissa = static_cast<FPT>(estimate);
|
const FPT bits_mantissa = static_cast<FPT>(estimate);
|
||||||
return FPT((bits_exponent << FPInfo<FPT>::explicit_mantissa_width) | (bits_mantissa & FPInfo<FPT>::mantissa_mask) | bits_sign);
|
return FPT((bits_exponent << FPInfo<FPT>::explicit_mantissa_width) | (bits_mantissa & FPInfo<FPT>::mantissa_mask) | bits_sign);
|
||||||
|
|
|
@ -38,7 +38,7 @@ FPT FPRecipExponent(FPT op, FPCR fpcr, FPSR& fpsr) {
|
||||||
return FPProcessNaN(type, op, fpcr, fpsr);
|
return FPProcessNaN(type, op, fpcr, fpsr);
|
||||||
}
|
}
|
||||||
|
|
||||||
const FPT sign_bits = FPT(FPInfo<FPT>::Zero(sign));
|
const FPT sign_bits = FPInfo<FPT>::Zero(sign);
|
||||||
const FPT exponent = DetermineExponentValue<FPT>(op);
|
const FPT exponent = DetermineExponentValue<FPT>(op);
|
||||||
|
|
||||||
// Zero and denormals
|
// Zero and denormals
|
||||||
|
|
|
@ -37,14 +37,14 @@ FPT FPRecipStepFused(FPT op1, FPT op2, FPCR fpcr, FPSR& fpsr) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (inf1 || inf2) {
|
if (inf1 || inf2) {
|
||||||
return FPT(FPInfo<FPT>::Infinity(sign1 != sign2));
|
return FPInfo<FPT>::Infinity(sign1 != sign2);
|
||||||
}
|
}
|
||||||
|
|
||||||
// result_value = 2.0 + (value1 * value2)
|
// result_value = 2.0 + (value1 * value2)
|
||||||
const FPUnpacked result_value = FusedMulAdd(ToNormalized(false, 0, 2), value1, value2);
|
const FPUnpacked result_value = FusedMulAdd(ToNormalized(false, 0, 2), value1, value2);
|
||||||
|
|
||||||
if (result_value.mantissa == 0) {
|
if (result_value.mantissa == 0) {
|
||||||
return FPT(FPInfo<FPT>::Zero(fpcr.RMode() == RoundingMode::TowardsMinusInfinity));
|
return FPInfo<FPT>::Zero(fpcr.RMode() == RoundingMode::TowardsMinusInfinity);
|
||||||
}
|
}
|
||||||
return FPRound<FPT>(result_value, fpcr, fpsr);
|
return FPRound<FPT>(result_value, fpcr, fpsr);
|
||||||
}
|
}
|
||||||
|
|
|
@ -31,11 +31,11 @@ u64 FPRoundInt(FPT op, FPCR fpcr, RoundingMode rounding, bool exact, FPSR& fpsr)
|
||||||
}
|
}
|
||||||
|
|
||||||
if (type == FPType::Infinity) {
|
if (type == FPType::Infinity) {
|
||||||
return FPT(FPInfo<FPT>::Infinity(sign));
|
return FPInfo<FPT>::Infinity(sign);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (type == FPType::Zero) {
|
if (type == FPType::Zero) {
|
||||||
return FPT(FPInfo<FPT>::Zero(sign));
|
return FPInfo<FPT>::Zero(sign);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Reshift decimal point back to bit zero.
|
// Reshift decimal point back to bit zero.
|
||||||
|
@ -79,7 +79,7 @@ u64 FPRoundInt(FPT op, FPCR fpcr, RoundingMode rounding, bool exact, FPSR& fpsr)
|
||||||
const u64 abs_int_result = new_sign ? Safe::Negate<u64>(int_result) : static_cast<u64>(int_result);
|
const u64 abs_int_result = new_sign ? Safe::Negate<u64>(int_result) : static_cast<u64>(int_result);
|
||||||
|
|
||||||
const FPT result = int_result == 0
|
const FPT result = int_result == 0
|
||||||
? FPT(FPInfo<FPT>::Zero(sign))
|
? FPInfo<FPT>::Zero(sign)
|
||||||
: FPRound<FPT>(FPUnpacked{new_sign, normalized_point_position, abs_int_result}, fpcr, RoundingMode::TowardsZero, fpsr);
|
: FPRound<FPT>(FPUnpacked{new_sign, normalized_point_position, abs_int_result}, fpcr, RoundingMode::TowardsZero, fpsr);
|
||||||
|
|
||||||
if (error != ResidualError::Zero && exact) {
|
if (error != ResidualError::Zero && exact) {
|
||||||
|
|
|
@ -31,7 +31,7 @@ FPT FPProcessNaN(FPType type, FPT op, FPCR fpcr, FPSR& fpsr) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (fpcr.DN()) {
|
if (fpcr.DN()) {
|
||||||
result = FPT(FPInfo<FPT>::DefaultNaN());
|
result = FPInfo<FPT>::DefaultNaN();
|
||||||
}
|
}
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
|
|
|
@ -90,7 +90,7 @@ FPT FPRoundBase(FPUnpacked op, FPCR fpcr, RoundingMode rounding, FPSR& fpsr) {
|
||||||
|
|
||||||
if (((!isFP16 && fpcr.FZ()) || (isFP16 && fpcr.FZ16())) && exponent < minimum_exp) {
|
if (((!isFP16 && fpcr.FZ()) || (isFP16 && fpcr.FZ16())) && exponent < minimum_exp) {
|
||||||
fpsr.UFC(true);
|
fpsr.UFC(true);
|
||||||
return FPT(FPInfo<FPT>::Zero(sign));
|
return FPInfo<FPT>::Zero(sign);
|
||||||
}
|
}
|
||||||
|
|
||||||
int biased_exp = std::max<int>(exponent - minimum_exp + 1, 0);
|
int biased_exp = std::max<int>(exponent - minimum_exp + 1, 0);
|
||||||
|
@ -153,7 +153,7 @@ FPT FPRoundBase(FPUnpacked op, FPCR fpcr, RoundingMode rounding, FPSR& fpsr) {
|
||||||
#endif
|
#endif
|
||||||
constexpr int max_biased_exp = (1 << E) - 1;
|
constexpr int max_biased_exp = (1 << E) - 1;
|
||||||
if (biased_exp >= max_biased_exp) {
|
if (biased_exp >= max_biased_exp) {
|
||||||
result = overflow_to_inf ? FPT(FPInfo<FPT>::Infinity(sign)) : FPT(FPInfo<FPT>::MaxNormal(sign));
|
result = overflow_to_inf ? FPInfo<FPT>::Infinity(sign) : FPInfo<FPT>::MaxNormal(sign);
|
||||||
FPProcessException(FPExc::Overflow, fpcr, fpsr);
|
FPProcessException(FPExc::Overflow, fpcr, fpsr);
|
||||||
FPProcessException(FPExc::Inexact, fpcr, fpsr);
|
FPProcessException(FPExc::Inexact, fpcr, fpsr);
|
||||||
} else {
|
} else {
|
||||||
|
|
Loading…
Reference in a new issue