Skip to content

Instantly share code, notes, and snippets.

@rjeschke
Created June 22, 2012 09:37

Revisions

  1. rjeschke created this gist Jun 22, 2012.
    407 changes: 407 additions & 0 deletions Float128.java
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,407 @@
    /*
    * Copyright (C) 2012 René Jeschke <[email protected]>
    *
    * Licensed under the Apache License, Version 2.0 (the "License");
    * you may not use this file except in compliance with the License.
    * You may obtain a copy of the License at
    *
    * http://www.apache.org/licenses/LICENSE-2.0
    *
    * Unless required by applicable law or agreed to in writing, software
    * distributed under the License is distributed on an "AS IS" BASIS,
    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    * See the License for the specific language governing permissions and
    * limitations under the License.
    */
    package jfloat;

    import java.math.BigDecimal;
    import java.math.BigInteger;

    /* This thing is WIP. Currently basic mul/div is working, without
    * proper rounding, though. Everything is quick'n'dirty ...
    *
    * Proof-of-concept and playground.
    *
    * 128 Bit: (4 * UI32)
    * 1 sign
    * 15 bit exponent, bias: 16383
    * 112 mantissa
    */

    public class Float128
    {
    private final static BigDecimal BD_2 = new BigDecimal(2);
    private final static BigDecimal MAN_DIV = new BigDecimal(new BigInteger("10000000000000000000000000000", 16));

    public static int getSignBit(int[] num)
    {
    return num[0] & 0x80000000;
    }

    public static int getExponent(int[] num)
    {
    return ((num[0] >> 16) & 0x7fff) - 16383;
    }

    private static void carryAdd4(int[] a, int carry)
    {
    int c = carry;
    if(c == 0)
    return;
    for(int i = 3; i >= 0; i--)
    {
    long t = (a[i] & 0xffffffffL) + c;
    a[i] = (int)t;
    c = (int)(t >> 32);
    if(c == 0)
    return;
    }
    }

    private static int getSetBit8(int t)
    {
    if((t & 0x80) != 0)
    return 8;
    if((t & 0x40) != 0)
    return 7;
    if((t & 0x20) != 0)
    return 6;
    if((t & 0x10) != 0)
    return 5;
    if((t & 0x08) != 0)
    return 4;
    if((t & 0x04) != 0)
    return 3;
    if((t & 0x02) != 0)
    return 2;
    if((t & 0x01) != 0)
    return 1;
    return 0;
    }

    private static int getSetBit(int t)
    {
    if(t == 0)
    return 0;
    if((t & 0xff000000) != 0)
    return getSetBit8(t >> 24) + 24;
    if((t & 0x00ff0000) != 0)
    return getSetBit8(t >> 16) + 16;
    if((t & 0x0000ff00) != 0)
    return getSetBit8(t >> 8) + 8;
    if((t & 0x000000ff) != 0)
    return getSetBit8(t);
    return 0;
    }

    public static int[] div(int[] a, int[] b, int[] r)
    {
    final int sign = (a[0] ^ b[0]) & 0x80000000;
    final int e0 = (a[0] >> 16) & 0x7fff;
    final int e1 = (b[0] >> 16) & 0x7fff;

    // We flush denormals automatically
    if(e0 == 0)
    {
    r[0] = r[1] = r[2] = r[3] = 0;
    return r;
    }

    // For now we throw up
    if(e1 == 0)
    {
    throw new ArithmeticException("The mighty Neet did not yet define division by zero");
    }

    // k is either 7 or 8 (2^k bits of precision, so 7 should be enough)
    // N = mant(a) >> 1
    // D = mand(b) >> 1
    // e = 1 - D
    // Q = N
    // for(i=0, k-1)
    // Q = Q + Q * e
    // e = e * e
    // end

    int[] e = new int[4];
    int[] q = new int[4];

    // 8.120 bit fixed point
    e[0] = ((((b[0] & 0xffff) | 0x10000) << 7) | (b[1] >>> 25)) ^ 0xffffff;
    e[1] = ((b[1] << 7) | (b[2] >>> 25)) ^ -1;
    e[2] = ((b[2] << 7) | (b[3] >>> 25)) ^ -1;
    e[3] = (b[3] << 7) ^ -1;
    carryAdd4(e, 1);

    q[0] = (((a[0] & 0xffff) | 0x10000) << 7) | (a[1] >>> 25);
    q[1] = (a[1] << 7) | (a[2] >>> 25);
    q[2] = (a[2] << 7) | (a[3] >>> 25);
    q[3] = (a[3] << 7);

    int[] ret = new int[8];
    for(int n = 0; n < 7; n++)
    {
    long t;
    mul8_120(q, e, ret);

    ret[0] = (ret[0] << 8) | (ret[1] >>> 24);
    ret[1] = (ret[1] << 8) | (ret[2] >>> 24);
    ret[2] = (ret[2] << 8) | (ret[3] >>> 24);
    ret[3] = (ret[3] << 8) | (ret[4] >>> 24);

    t = (q[3] & 0xffffffffL) + (ret[3] & 0xffffffffL);
    q[3] = (int)t;
    t = (q[2] & 0xffffffffL) + (ret[2] & 0xffffffffL) + (t >>> 32);
    q[2] = (int)t;
    t = (q[1] & 0xffffffffL) + (ret[1] & 0xffffffffL) + (t >>> 32);
    q[1] = (int)t;
    t = (q[0] & 0xffffffffL) + (ret[0] & 0xffffffffL) + (t >>> 32);
    q[0] = (int)t;

    mul8_120(e, e, ret);
    e[0] = (ret[0] << 8) | (ret[1] >>> 24);
    e[1] = (ret[1] << 8) | (ret[2] >>> 24);
    e[2] = (ret[2] << 8) | (ret[3] >>> 24);
    e[3] = (ret[3] << 8) | (ret[4] >>> 24);
    }

    final int sb = getSetBit(q[0]);
    final int exp = e0 - e1 + sb + 16383 - 25;
    if(exp <= 0)
    {
    r[0] = r[1] = r[2] = r[3] = 0;
    return r;
    }
    // TODO +/- inf
    final int s0 = sb - 17;
    final int s1 = 32 - s0;
    r[0] = sign | (exp << 16) | ((q[0] >> s0) & 0xffff);
    r[1] = (q[0] << s1) | (q[1] >>> s0);
    r[2] = (q[1] << s1) | (q[2] >>> s0);
    r[3] = (q[2] << s1) | (q[3] >>> s0);

    return r;
    }

    public static int[] mul8_120(int[] a, int[] b, int[] ret)
    {
    long t, carry, sum;

    t = (a[3] & 0xffffffffL) * (b[3] & 0xffffffffL);
    carry = t >>> 32;
    ret[7] = (int)t;

    t = (a[2] & 0xffffffffL) * (b[3] & 0xffffffffL);
    sum = (t & 0xffffffffL) + carry;
    carry = t >>> 32;
    t = (a[3] & 0xffffffffL) * (b[2] & 0xffffffffL);
    carry += t >>> 32;
    sum += t & 0xffffffffL;
    carry += sum >>> 32;
    ret[6] = (int)sum;

    t = (a[1] & 0xffffffffL) * (b[3] & 0xffffffffL);
    sum = (t & 0xffffffffL) + carry;
    carry = t >>> 32;
    t = (a[2] & 0xffffffffL) * (b[2] & 0xffffffffL);
    carry += t >>> 32;
    sum += t & 0xffffffffL;
    t = (a[3] & 0xffffffffL) * (b[1] & 0xffffffffL);
    sum += t & 0xffffffffL;
    carry += (t >>> 32) + (sum >>> 32);
    ret[5] = (int)sum;

    t = (a[0] & 0xffffffffL) * (b[3] & 0xffffffffL);
    sum = (t & 0xffffffffL) + carry;
    carry = t >>> 32;
    t = (a[1] & 0xffffffffL) * (b[2] & 0xffffffffL);
    carry += t >>> 32;
    sum += t & 0xffffffffL;
    t = (a[2] & 0xffffffffL) * (b[1] & 0xffffffffL);
    carry += t >>> 32;
    sum += t & 0xffffffffL;
    t = (a[3] & 0xffffffffL) * (b[0] & 0xffffffffL);
    sum += t & 0xffffffffL;
    carry += (t >>> 32) + (sum >>> 32);
    ret[4] = (int)sum;

    t = (a[0] & 0xffffffffL) * (b[2] & 0xffffffffL);
    sum = (t & 0xffffffffL) + carry;
    carry = t >>> 32;
    t = (a[1] & 0xffffffffL) * (b[1] & 0xffffffffL);
    carry += t >>> 32;
    sum += t & 0xffffffffL;
    t = (a[2] & 0xffffffffL) * (b[0] & 0xffffffffL);
    sum += t & 0xffffffffL;
    carry += (t >>> 32) + (sum >>> 32);
    ret[3] = (int)sum;

    t = (a[0] & 0xffffffffL) * (b[1] & 0xffffffffL);
    sum = (t & 0xffffffffL) + carry;
    carry = t >>> 32;
    t = (a[1] & 0xffffffffL) * (b[0] & 0xffffffffL);
    sum += t & 0xffffffffL;
    carry += (t >>> 32) + (sum >>> 32);
    ret[2] = (int)sum;

    t = (a[0] & 0xffffffffL) * (b[0] & 0xffffffffL);
    sum = (t & 0xffffffffL) + carry;
    ret[1] = (int)sum;
    ret[0] = (int)((t >>> 32) + (sum >>> 32));

    return ret;
    }


    public static int[] mul(int[] a, int[] b, int[] r)
    {
    final int sign = (a[0] ^ b[0]) & 0x80000000;
    final int e0 = (a[0] >> 16) & 0x7fff;
    final int e1 = (b[0] >> 16) & 0x7fff;

    // We flush denormals automatically
    if(e0 == 0 || e1 == 0)
    {
    r[0] = r[1] = r[2] = r[3] = 0;
    return r;
    }

    final int[] temp = mantMul(a, b, new int[8]);

    final int s = (temp[0] >> 1);
    final int exp = e0 + e1 - 16383 + s;
    if(exp <= 0)
    {
    r[0] = r[1] = r[2] = r[3] = 0;
    return r;
    }
    // TODO +/- inf
    final int s0 = 16 - s;
    final int s1 = 32 - s0;
    r[0] = sign | (exp << 16) | (((temp[0] << s0) | (temp[1] >>> s1)) & 0xffff);
    r[1] = (temp[1] << s0) | (temp[2] >>> s1);
    r[2] = (temp[2] << s0) | (temp[3] >>> s1);
    r[3] = (temp[3] << s0) | (temp[4] >>> s1);

    return r;
    }

    private static int[] mantMul(int[] a, int[] b, int[] ret)
    {
    long t, carry, sum;

    final int a0 = (a[0] & 0xffff) | 0x10000;
    final int b0 = (b[0] & 0xffff) | 0x10000;

    t = (a[3] & 0xffffffffL) * (b[3] & 0xffffffffL);
    carry = t >>> 32;
    ret[7] = (int)t;

    t = (a[2] & 0xffffffffL) * (b[3] & 0xffffffffL);
    sum = (t & 0xffffffffL) + carry;
    carry = t >>> 32;
    t = (a[3] & 0xffffffffL) * (b[2] & 0xffffffffL);
    carry += t >>> 32;
    sum += t & 0xffffffffL;
    carry += sum >>> 32;
    ret[6] = (int)sum;

    t = (a[1] & 0xffffffffL) * (b[3] & 0xffffffffL);
    sum = (t & 0xffffffffL) + carry;
    carry = t >>> 32;
    t = (a[2] & 0xffffffffL) * (b[2] & 0xffffffffL);
    carry += t >>> 32;
    sum += t & 0xffffffffL;
    t = (a[3] & 0xffffffffL) * (b[1] & 0xffffffffL);
    sum += t & 0xffffffffL;
    carry += (t >>> 32) + (sum >>> 32);
    ret[5] = (int)sum;

    t = a0 * (b[3] & 0xffffffffL);
    sum = (t & 0xffffffffL) + carry;
    carry = t >>> 32;
    t = (a[1] & 0xffffffffL) * (b[2] & 0xffffffffL);
    carry += t >>> 32;
    sum += t & 0xffffffffL;
    t = (a[2] & 0xffffffffL) * (b[1] & 0xffffffffL);
    carry += t >>> 32;
    sum += t & 0xffffffffL;
    t = (a[3] & 0xffffffffL) * b0;
    sum += t & 0xffffffffL;
    carry += (t >>> 32) + (sum >>> 32);
    ret[4] = (int)sum;

    t = a0 * (b[2] & 0xffffffffL);
    sum = (t & 0xffffffffL) + carry;
    carry = t >>> 32;
    t = (a[1] & 0xffffffffL) * (b[1] & 0xffffffffL);
    carry += t >>> 32;
    sum += t & 0xffffffffL;
    t = (a[2] & 0xffffffffL) * b0;
    sum += t & 0xffffffffL;
    carry += (t >>> 32) + (sum >>> 32);
    ret[3] = (int)sum;

    t = a0 * (b[1] & 0xffffffffL);
    sum = (t & 0xffffffffL) + carry;
    carry = t >>> 32;
    t = (a[1] & 0xffffffffL) * b0;
    sum += t & 0xffffffffL;
    carry += (t >>> 32) + (sum >>> 32);
    ret[2] = (int)sum;

    t = (long)a0 * (long)b0;
    sum = (t & 0xffffffffL) + carry;
    ret[1] = (int)sum;
    ret[0] = (int)((t >>> 32) + (sum >>> 32));

    return ret;
    }

    public static int[] fromDouble(double v, int[] n)
    {
    long bits = Double.doubleToRawLongBits(v);

    int sign = (int)((bits >> 63) & 1);
    int exp = ((int)(bits >> 52) & 2047) - 1023;
    int m0 = (int)(bits >> 36) & 0xffff;
    int m1 = (int)(bits >> 4);
    int m2 = (int)(bits & 15) << 28;

    n[0] = (sign << 31) | ((exp + 16383) << 16) | m0;
    n[1] = m1;
    n[2] = m2;
    n[3] = 0;

    return n;
    }

    private static String mantissaToHex(int[] n)
    {
    return String.format(n[0] < 0 ? "-%x%08x%08x%08x" : "%x%08x%08x%08x", (n[0] & 0xffff) | 0x10000, n[1], n[2], n[3]);
    }

    public static BigDecimal toBigDecimal(int[] n)
    {
    final BigDecimal mant = new BigDecimal(new BigInteger(mantissaToHex(n), 16)).divide(MAN_DIV);
    int exp = getExponent(n);
    final BigDecimal em = exp < 0 ? BigDecimal.ONE.divide(BD_2.pow(-exp)) : BD_2.pow(exp);
    return mant.multiply(em);
    }

    public static String toString(int[] n)
    {
    if((n[0] & 0x7fff0000) == 0)
    return "0";
    return toBigDecimal(n).toString();
    }

    public static String toHexString(int[] n)
    {
    StringBuilder sb = new StringBuilder();
    for(int i = 0; i < n.length; i++)
    sb.append(String.format(i == 0 ? "%08x" : "%08x", n[i]));
    return sb.toString();
    }
    }