rjeschke · June 22, 2012 09:37 · Jun 22, 2012
diff --git a/Float128.java b/Float128.java
@@ -0,0 +1,407 @@
+/*
+ * Copyright (C) 2012 René Jeschke <[email protected]>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package jfloat;
+
+import java.math.BigDecimal;
+import java.math.BigInteger;
+
+/* This thing is WIP. Currently basic mul/div is working, without
+ * proper rounding, though. Everything is quick'n'dirty ...
+ * 
+ * Proof-of-concept and playground.
+ * 
+ * 128 Bit: (4 * UI32)
+ *     1 sign
+ *     15 bit exponent, bias: 16383
+ *     112 mantissa
+ */
+
+public class Float128
+{
+    private final static BigDecimal BD_2 = new BigDecimal(2);
+    private final static BigDecimal MAN_DIV = new BigDecimal(new BigInteger("10000000000000000000000000000", 16));
+
+    public static int getSignBit(int[] num)
+    {
+        return num[0] & 0x80000000;
+    }
+
+    public static int getExponent(int[] num)
+    {
+        return ((num[0] >> 16) & 0x7fff) - 16383;
+    }
+
+    private static void carryAdd4(int[] a, int carry)
+    {
+        int c = carry;
+        if(c == 0)
+            return;
+        for(int i = 3; i >= 0; i--)
+        {
+            long t = (a[i] & 0xffffffffL) + c;
+            a[i] = (int)t;
+            c = (int)(t >> 32);
+            if(c == 0)
+                return;
+        }
+    }
+
+    private static int getSetBit8(int t)
+    {
+        if((t & 0x80) != 0)
+            return 8;
+        if((t & 0x40) != 0)
+            return 7;
+        if((t & 0x20) != 0)
+            return 6;
+        if((t & 0x10) != 0)
+            return 5;
+        if((t & 0x08) != 0)
+            return 4;
+        if((t & 0x04) != 0)
+            return 3;
+        if((t & 0x02) != 0)
+            return 2;
+        if((t & 0x01) != 0)
+            return 1;
+        return 0;
+    }
+
+    private static int getSetBit(int t)
+    {
+        if(t == 0)
+            return 0;
+        if((t & 0xff000000) != 0)
+            return getSetBit8(t >> 24) + 24;
+        if((t & 0x00ff0000) != 0)
+            return getSetBit8(t >> 16) + 16;
+        if((t & 0x0000ff00) != 0)
+            return getSetBit8(t >> 8) + 8;
+        if((t & 0x000000ff) != 0)
+            return getSetBit8(t);
+        return 0;
+    }
+
+    public static int[] div(int[] a, int[] b, int[] r)
+    {
+        final int sign = (a[0] ^ b[0]) & 0x80000000;
+        final int e0 = (a[0] >> 16) & 0x7fff;
+        final int e1 = (b[0] >> 16) & 0x7fff;
+
+        // We flush denormals automatically
+        if(e0 == 0)
+        {
+            r[0] = r[1] = r[2] = r[3] = 0;
+            return r;
+        }
+
+        // For now we throw up
+        if(e1 == 0)
+        {
+            throw new ArithmeticException("The mighty Neet did not yet define division by zero");
+        }
+
+        // k is either 7 or 8 (2^k bits of precision, so 7 should be enough) 
+        // N = mant(a) >> 1
+        // D = mand(b) >> 1
+        // e = 1 - D
+        // Q = N
+        // for(i=0, k-1)
+        //  Q = Q + Q * e
+        //  e = e * e
+        // end
+
+        int[] e = new int[4];
+        int[] q = new int[4];
+
+        // 8.120 bit fixed point
+        e[0] = ((((b[0] & 0xffff) | 0x10000) << 7) | (b[1] >>> 25)) ^ 0xffffff;
+        e[1] = ((b[1] << 7) | (b[2] >>> 25)) ^ -1; 
+        e[2] = ((b[2] << 7) | (b[3] >>> 25)) ^ -1; 
+        e[3] = (b[3] << 7) ^ -1;
+        carryAdd4(e, 1);
+
+        q[0] = (((a[0] & 0xffff) | 0x10000) << 7) | (a[1] >>> 25);
+        q[1] = (a[1] << 7) | (a[2] >>> 25); 
+        q[2] = (a[2] << 7) | (a[3] >>> 25); 
+        q[3] = (a[3] << 7);
+
+        int[] ret = new int[8];
+        for(int n = 0; n < 7; n++)
+        {
+            long t;
+            mul8_120(q, e, ret);
+
+            ret[0] = (ret[0] << 8) | (ret[1] >>> 24);
+            ret[1] = (ret[1] << 8) | (ret[2] >>> 24);
+            ret[2] = (ret[2] << 8) | (ret[3] >>> 24);
+            ret[3] = (ret[3] << 8) | (ret[4] >>> 24);
+
+            t = (q[3] & 0xffffffffL) + (ret[3] & 0xffffffffL);
+            q[3] = (int)t;
+            t = (q[2] & 0xffffffffL) + (ret[2] & 0xffffffffL) + (t >>> 32);
+            q[2] = (int)t;
+            t = (q[1] & 0xffffffffL) + (ret[1] & 0xffffffffL) + (t >>> 32);
+            q[1] = (int)t;
+            t = (q[0] & 0xffffffffL) + (ret[0] & 0xffffffffL) + (t >>> 32);
+            q[0] = (int)t;
+
+            mul8_120(e, e, ret);
+            e[0] = (ret[0] << 8) | (ret[1] >>> 24);
+            e[1] = (ret[1] << 8) | (ret[2] >>> 24);
+            e[2] = (ret[2] << 8) | (ret[3] >>> 24);
+            e[3] = (ret[3] << 8) | (ret[4] >>> 24);
+        }
+
+        final int sb = getSetBit(q[0]);
+        final int exp = e0 - e1 + sb + 16383 - 25;
+        if(exp <= 0)
+        {
+            r[0] = r[1] = r[2] = r[3] = 0;
+            return r;
+        }
+        // TODO +/- inf
+        final int s0 = sb - 17;
+        final int s1 = 32 - s0;
+        r[0] = sign | (exp << 16) | ((q[0] >> s0) & 0xffff);
+        r[1] = (q[0] << s1) | (q[1] >>> s0);
+        r[2] = (q[1] << s1) | (q[2] >>> s0);
+        r[3] = (q[2] << s1) | (q[3] >>> s0);
+
+        return r;
+    }
+
+    public static int[] mul8_120(int[] a, int[] b, int[] ret)
+    {
+        long t, carry, sum;
+
+        t       = (a[3] & 0xffffffffL) * (b[3] & 0xffffffffL);
+        carry   = t >>> 32;
+        ret[7]  = (int)t;
+
+        t       = (a[2] & 0xffffffffL) * (b[3] & 0xffffffffL);
+        sum     = (t & 0xffffffffL) + carry;
+        carry   = t >>> 32;
+        t       = (a[3] & 0xffffffffL) * (b[2] & 0xffffffffL);
+        carry  += t >>> 32;
+        sum    += t & 0xffffffffL;
+        carry  += sum >>> 32;
+        ret[6]  = (int)sum;
+
+        t       = (a[1] & 0xffffffffL) * (b[3] & 0xffffffffL);
+        sum     = (t & 0xffffffffL) + carry;
+        carry   = t >>> 32;
+        t       = (a[2] & 0xffffffffL) * (b[2] & 0xffffffffL);
+        carry  += t >>> 32;
+        sum    += t & 0xffffffffL;
+        t       = (a[3] & 0xffffffffL) * (b[1] & 0xffffffffL);
+        sum    += t & 0xffffffffL;
+        carry  += (t >>> 32) + (sum >>> 32);
+        ret[5]  = (int)sum;
+
+        t       = (a[0] & 0xffffffffL) * (b[3] & 0xffffffffL);
+        sum     = (t & 0xffffffffL) + carry;
+        carry   = t >>> 32;
+        t       = (a[1] & 0xffffffffL) * (b[2] & 0xffffffffL);
+        carry  += t >>> 32;
+        sum    += t & 0xffffffffL;
+        t       = (a[2] & 0xffffffffL) * (b[1] & 0xffffffffL);
+        carry  += t >>> 32;
+        sum    += t & 0xffffffffL;
+        t       = (a[3] & 0xffffffffL) * (b[0] & 0xffffffffL);
+        sum    += t & 0xffffffffL;
+        carry  += (t >>> 32) + (sum >>> 32);
+        ret[4]  = (int)sum;
+
+        t       = (a[0] & 0xffffffffL) * (b[2] & 0xffffffffL);
+        sum     = (t & 0xffffffffL) + carry;
+        carry   = t >>> 32;
+        t       = (a[1] & 0xffffffffL) * (b[1] & 0xffffffffL);
+        carry  += t >>> 32;
+        sum    += t & 0xffffffffL;
+        t       = (a[2] & 0xffffffffL) * (b[0] & 0xffffffffL);
+        sum    += t & 0xffffffffL;
+        carry  += (t >>> 32) + (sum >>> 32);
+        ret[3]  = (int)sum;
+
+        t       = (a[0] & 0xffffffffL) * (b[1] & 0xffffffffL);
+        sum     = (t & 0xffffffffL) + carry;
+        carry   = t >>> 32;
+        t       = (a[1] & 0xffffffffL) * (b[0] & 0xffffffffL);
+        sum    += t & 0xffffffffL;
+        carry  += (t >>> 32) + (sum >>> 32);
+        ret[2]  = (int)sum;
+
+        t       = (a[0] & 0xffffffffL) * (b[0] & 0xffffffffL);
+        sum     = (t & 0xffffffffL) + carry;
+        ret[1]  = (int)sum;
+        ret[0]  = (int)((t >>> 32) + (sum >>> 32));
+
+        return ret;
+    }
+
+
+    public static int[] mul(int[] a, int[] b, int[] r)
+    {
+        final int sign = (a[0] ^ b[0]) & 0x80000000;
+        final int e0 = (a[0] >> 16) & 0x7fff;
+        final int e1 = (b[0] >> 16) & 0x7fff;
+
+        // We flush denormals automatically
+        if(e0 == 0 || e1 == 0)
+        {
+            r[0] = r[1] = r[2] = r[3] = 0;
+            return r;
+        }
+
+        final int[] temp = mantMul(a, b, new int[8]);
+
+        final int s = (temp[0] >> 1);
+        final int exp = e0 + e1 - 16383 + s;
+        if(exp <= 0)
+        {
+            r[0] = r[1] = r[2] = r[3] = 0;
+            return r;
+        }
+        // TODO +/- inf
+        final int s0 = 16 - s;
+        final int s1 = 32 - s0;
+        r[0] = sign | (exp << 16) | (((temp[0] << s0) | (temp[1] >>> s1)) & 0xffff);
+        r[1] = (temp[1] << s0) | (temp[2] >>> s1);
+        r[2] = (temp[2] << s0) | (temp[3] >>> s1);
+        r[3] = (temp[3] << s0) | (temp[4] >>> s1);
+
+        return r;
+    }
+
+    private static int[] mantMul(int[] a, int[] b, int[] ret)
+    {
+        long t, carry, sum;
+
+        final int a0 = (a[0] & 0xffff) | 0x10000;
+        final int b0 = (b[0] & 0xffff) | 0x10000;
+
+        t       = (a[3] & 0xffffffffL) * (b[3] & 0xffffffffL);
+        carry   = t >>> 32;
+        ret[7]  = (int)t;
+
+        t       = (a[2] & 0xffffffffL) * (b[3] & 0xffffffffL);
+        sum     = (t & 0xffffffffL) + carry;
+        carry   = t >>> 32;
+        t       = (a[3] & 0xffffffffL) * (b[2] & 0xffffffffL);
+        carry  += t >>> 32;
+        sum    += t & 0xffffffffL;
+        carry  += sum >>> 32;
+        ret[6]  = (int)sum;
+
+        t       = (a[1] & 0xffffffffL) * (b[3] & 0xffffffffL);
+        sum     = (t & 0xffffffffL) + carry;
+        carry   = t >>> 32;
+        t       = (a[2] & 0xffffffffL) * (b[2] & 0xffffffffL);
+        carry  += t >>> 32;
+        sum    += t & 0xffffffffL;
+        t       = (a[3] & 0xffffffffL) * (b[1] & 0xffffffffL);
+        sum    += t & 0xffffffffL;
+        carry  += (t >>> 32) + (sum >>> 32);
+        ret[5]  = (int)sum;
+
+        t       = a0 * (b[3] & 0xffffffffL);
+        sum     = (t & 0xffffffffL) + carry;
+        carry   = t >>> 32;
+        t       = (a[1] & 0xffffffffL) * (b[2] & 0xffffffffL);
+        carry  += t >>> 32;
+        sum    += t & 0xffffffffL;
+        t       = (a[2] & 0xffffffffL) * (b[1] & 0xffffffffL);
+        carry  += t >>> 32;
+        sum    += t & 0xffffffffL;
+        t       = (a[3] & 0xffffffffL) * b0;
+        sum    += t & 0xffffffffL;
+        carry  += (t >>> 32) + (sum >>> 32);
+        ret[4]  = (int)sum;
+
+        t       = a0 * (b[2] & 0xffffffffL);
+        sum     = (t & 0xffffffffL) + carry;
+        carry   = t >>> 32;
+        t       = (a[1] & 0xffffffffL) * (b[1] & 0xffffffffL);
+        carry  += t >>> 32;
+        sum    += t & 0xffffffffL;
+        t       = (a[2] & 0xffffffffL) * b0;
+        sum    += t & 0xffffffffL;
+        carry  += (t >>> 32) + (sum >>> 32);
+        ret[3]  = (int)sum;
+
+        t       = a0 * (b[1] & 0xffffffffL);
+        sum     = (t & 0xffffffffL) + carry;
+        carry   = t >>> 32;
+        t       = (a[1] & 0xffffffffL) * b0;
+        sum    += t & 0xffffffffL;
+        carry  += (t >>> 32) + (sum >>> 32);
+        ret[2]  = (int)sum;
+
+        t       = (long)a0 * (long)b0;
+        sum     = (t & 0xffffffffL) + carry;
+        ret[1]  = (int)sum;
+        ret[0]  = (int)((t >>> 32) + (sum >>> 32));
+
+        return ret;
+    }
+
+    public static int[] fromDouble(double v, int[] n)
+    {
+        long bits = Double.doubleToRawLongBits(v);
+
+        int sign = (int)((bits >> 63) & 1);
+        int exp = ((int)(bits >> 52) & 2047) - 1023;
+        int m0 = (int)(bits >> 36) & 0xffff;
+        int m1 = (int)(bits >> 4);
+        int m2 = (int)(bits & 15) << 28;
+
+        n[0] = (sign << 31) | ((exp + 16383) << 16) | m0;
+        n[1] = m1;
+        n[2] = m2;
+        n[3] = 0;
+
+        return n;
+    }
+
+    private static String mantissaToHex(int[] n)
+    {
+        return String.format(n[0] < 0 ? "-%x%08x%08x%08x" : "%x%08x%08x%08x", (n[0] & 0xffff) | 0x10000, n[1], n[2], n[3]);    
+    }
+
+    public static BigDecimal toBigDecimal(int[] n)
+    {
+        final BigDecimal mant = new BigDecimal(new BigInteger(mantissaToHex(n), 16)).divide(MAN_DIV);
+        int exp = getExponent(n);
+        final BigDecimal em = exp < 0 ? BigDecimal.ONE.divide(BD_2.pow(-exp)) : BD_2.pow(exp);
+        return mant.multiply(em);
+    }
+
+    public static String toString(int[] n)
+    {
+        if((n[0] & 0x7fff0000) == 0)
+            return "0";
+        return toBigDecimal(n).toString();
+    }
+
+    public static String toHexString(int[] n)
+    {
+        StringBuilder sb = new StringBuilder();
+        for(int i = 0; i < n.length; i++)
+            sb.append(String.format(i == 0 ? "%08x" : "%08x", n[i]));
+        return sb.toString();
+    }
+}