sh1boot · December 18, 2015 05:29 · Jun 11, 2013 · Jun 10, 2013 · Jun 10, 2013 · Jun 10, 2013
diff --git a/gistfile1.c b/gistfile1.c
@@ -45,24 +45,14 @@ uint32_t divfn(uint16_t i, uint16_t j)
      * it's within bounds.  In this case, we'll use the fractional part (k) for
      * linear interpolation, and the integer part for the index.
      */
-#if 1
     t = (uint32_t)j << 8;
-    while (t >= ((uint32_t)256 << TABBITS))
+    while (t >= ((uint32_t)1 << (TABBITS + 8)))
     {
         t >>= 1;
         shift++;
     }
     j = t >> 8;
     k = (uint8_t)t;
-#else
-    k = 0;
-    while (j >= (1 << TABBITS))
-    {
-        k = ((j << 7) & 0x80) | (k >> 1);
-        j >>= 1;
-        shift++;
-    }
-#endif
 
     /* look up (0x80000000 / j) */
     t = table2[j];
@@ -86,14 +76,48 @@ uint32_t divfn(uint16_t i, uint16_t j)
         t += m;
     }
 
-    /* compute i * (0x80000000/j) >> 15 for a 16.16 fixed-point value for i/j.
+    /* compute i * (0x100000000/j) >> 16 for a 16.16 fixed-point value i/j.
      * If we had to shift j right, earlier, then we'll shift the result right
      * as well, because larger should give us a smaller result.
      */
-    if (shift == 0)
-        return ((uint64_t)i * t + 0x80000) >> 16;
+    {
+        uint16_t tmp;
+        uint8_t b[4];
+        uint16_t hi;
+#if 0 /* correct, but ineffective */
+        tmp = ((uint16_t)((uint8_t)i) * (uint8_t)t) >> 8;
+#else
+        tmp = 0x80;
+#endif
+        tmp += (uint16_t)((uint8_t)i) * (uint8_t)(t >> 8);
+        hi = (uint16_t)((uint8_t)(i >> 8)) * (uint8_t)t;
+        if (shift < 1)
+            hi += 0x80;
+        tmp += (uint8_t)hi;
+        tmp >>= 8;
+        tmp += hi >> 8;
 
-    t = (uint64_t)i * t >> 16;
+        tmp += ((uint8_t)i) * (uint8_t)(t >> 16);
+        b[0] = (uint8_t)tmp;
+        tmp >>= 8;
+        tmp += ((uint8_t)i) * (uint8_t)(t >> 24);
+        b[1] = (uint8_t)tmp;
+        b[2] = (uint8_t)(tmp >> 8);
+
+        tmp = b[0] + (uint16_t)((uint8_t)(i >> 8)) * (uint8_t)(t >> 8);
+        b[0] = (uint8_t)tmp;
+        tmp >>= 8;
+        tmp += b[1] + (uint16_t)((uint8_t)(i >> 8)) * (uint8_t)(t >> 16);
+        b[1] = (uint8_t)tmp;
+        tmp >>= 8;
+        tmp += b[2] + (uint16_t)((uint8_t)(i >> 8)) * (uint8_t)(t >> 24);
+        b[2] = (uint8_t)tmp;
+        b[3] = (uint8_t)(tmp >> 8);
+
+        t = b[0] | (uint32_t)b[1] << 8 | (uint32_t)b[2] << 16 | (uint32_t)b[3] << 24;
+    }
+    if (shift == 0)
+        return t;
 
     while (shift-- > 1)
         t >>= 1;
@@ -117,11 +141,11 @@ int main(void)
     printf("\n}\n");
 #endif
 
-    for (j = 1; j < 65536; j += 7)
-        for (i = 0; i < 65536; i += 101)
+    for (j = 1; j < 65536; j += 3)
+        for (i = 0; i < 65536; i += 3)
         {
             uint32_t ref = (uint32_t)(((uint32_t)i << 16) + (j / 2)) / j;
-#if 1
+#if 0
             uint32_t tabent = (0x80000000 + (j / 2)) / j;
             uint32_t test = ((uint64_t)i * tabent + 0x4000) >> 15;
 #else

diff --git a/gistfile1.c b/gistfile1.c
@@ -30,6 +30,22 @@ static const uint32_t table2[256 + 1] __attribute__((progmem)) = {
 0x01111111, 0x010fef01, 0x010ecf57, 0x010db20b, 0x010c9715, 0x010b7e6f, 0x010a6811, 0x010953f4, 0x01084211, 0x01073261, 0x010624dd, 0x0105197f, 0x01041041, 0x0103091b, 0x01020408, 0x01010101,
 0x01000000, 
 };
+#endif/*}}}*/
+
+uint32_t divfn(uint16_t i, uint16_t j)
+{
+    uint32_t t;
+    uint8_t shift = 0;
+    uint8_t k;
+
+    if (j <= 1)
+        return (int32_t)i << 16;
+
+    /* if j is too large to index the table, then we just shift it right until
+     * it's within bounds.  In this case, we'll use the fractional part (k) for
+     * linear interpolation, and the integer part for the index.
+     */
+#if 1
     t = (uint32_t)j << 8;
     while (t >= ((uint32_t)256 << TABBITS))
     {

diff --git a/gistfile1.c b/gistfile1.c
@@ -1,37 +1,52 @@
-
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>
 
-uint32_t table[65536];
-
 #define TABBITS 8
 
-uint32_t table2[(1 << TABBITS) + 1];
-
-uint32_t divfn(uint32_t i, uint32_t j, int v)
-{
-    uint32_t t;
-    uint32_t r;
-    uint8_t shift = 0;
-    uint16_t k;
-
-    /* convert j to a 16.8 fixed-point number */
-    j <<= 8;
+#if 0
+extern uint32_t divfn(uint16_t i, uint16_t j);
+#else
 
-    /* if j is too large to index the table, then we just shift it right until
-     * it's within bounds.
-     */
-    while (j >= (256 << TABBITS))
+#if TABBITS != 8
+uint32_t table2[(1 << TABBITS) + 1];
+#else/*{{{*/
+static const uint32_t table2[256 + 1] __attribute__((progmem)) = {
+0xffffffff, 0x00000000, 0x80000000, 0x55555555, 0x40000000, 0x33333333, 0x2aaaaaab, 0x24924925, 0x20000000, 0x1c71c71c, 0x1999999a, 0x1745d174, 0x15555555, 0x13b13b14, 0x12492492, 0x11111111,
+0x10000000, 0x0f0f0f0f, 0x0e38e38e, 0x0d79435e, 0x0ccccccd, 0x0c30c30c, 0x0ba2e8ba, 0x0b21642d, 0x0aaaaaab, 0x0a3d70a4, 0x09d89d8a, 0x097b425f, 0x09249249, 0x08d3dcb1, 0x08888889, 0x08421084,
+0x08000000, 0x07c1f07c, 0x07878788, 0x07507507, 0x071c71c7, 0x06eb3e45, 0x06bca1af, 0x06906907, 0x06666666, 0x063e7064, 0x06186186, 0x05f417d0, 0x05d1745d, 0x05b05b06, 0x0590b216, 0x0572620b,
+0x05555555, 0x0539782a, 0x051eb852, 0x05050505, 0x04ec4ec5, 0x04d4873f, 0x04bda12f, 0x04a7904a, 0x04924925, 0x047dc11f, 0x0469ee58, 0x0456c798, 0x04444444, 0x04325c54, 0x04210842, 0x04104104,
+0x04000000, 0x03f03f04, 0x03e0f83e, 0x03d22635, 0x03c3c3c4, 0x03b5cc0f, 0x03a83a84, 0x039b0ad1, 0x038e38e4, 0x0381c0e0, 0x03759f23, 0x0369d037, 0x035e50d8, 0x03531dec, 0x03483483, 0x033d91d3,
+0x03333333, 0x03291620, 0x031f3832, 0x03159722, 0x030c30c3, 0x03030303, 0x02fa0be8, 0x02f14990, 0x02e8ba2f, 0x02e05c0c, 0x02d82d83, 0x02d02d03, 0x02c8590b, 0x02c0b02c, 0x02b93105, 0x02b1da46,
+0x02aaaaab, 0x02a3a0fd, 0x029cbc15, 0x0295fad4, 0x028f5c29, 0x0288df0d, 0x02828283, 0x027c4598, 0x02762762, 0x02702702, 0x026a439f, 0x02647c69, 0x025ed098, 0x02593f6a, 0x0253c825, 0x024e6a17,
+0x02492492, 0x0243f6f0, 0x023ee090, 0x0239e0d6, 0x0234f72c, 0x02302302, 0x022b63cc, 0x0226b902, 0x02222222, 0x021d9ead, 0x02192e2a, 0x0214d021, 0x02108421, 0x020c49ba, 0x02082082, 0x02040810,
+0x02000000, 0x01fc07f0, 0x01f81f82, 0x01f4465a, 0x01f07c1f, 0x01ecc07b, 0x01e9131b, 0x01e573ad, 0x01e1e1e2, 0x01de5d6e, 0x01dae607, 0x01d77b65, 0x01d41d42, 0x01d0cb59, 0x01cd8569, 0x01ca4b30,
+0x01c71c72, 0x01c3f8f0, 0x01c0e070, 0x01bdd2b9, 0x01bacf91, 0x01b7d6c4, 0x01b4e81b, 0x01b20364, 0x01af286c, 0x01ac5702, 0x01a98ef6, 0x01a6d01a, 0x01a41a42, 0x01a16d40, 0x019ec8e9, 0x019c2d15,
+0x0199999a, 0x01970e50, 0x01948b10, 0x01920fb5, 0x018f9c19, 0x018d3019, 0x018acb91, 0x01886e5f, 0x01861862, 0x0183c978, 0x01818182, 0x017f4060, 0x017d05f4, 0x017ad221, 0x0178a4c8, 0x01767dce,
+0x01745d17, 0x01724288, 0x01702e06, 0x016e1f77, 0x016c16c1, 0x016a13cd, 0x01681681, 0x01661ec7, 0x01642c86, 0x01623fa7, 0x01605816, 0x015e75bc, 0x015c9883, 0x015ac057, 0x0158ed23, 0x01571ed4,
+0x01555555, 0x01539095, 0x0151d07f, 0x01501501, 0x014e5e0a, 0x014cab88, 0x014afd6a, 0x0149539e, 0x0147ae14, 0x01460cbc, 0x01446f86, 0x0142d662, 0x01414141, 0x013fb014, 0x013e22cc, 0x013c995a,
+0x013b13b1, 0x013991c3, 0x01381381, 0x013698df, 0x013521d0, 0x0133ae46, 0x01323e35, 0x0130d190, 0x012f684c, 0x012e025c, 0x012c9fb5, 0x012b404b, 0x0129e413, 0x01288b01, 0x0127350c, 0x0125e227,
+0x01249249, 0x01234568, 0x0121fb78, 0x0120b471, 0x011f7048, 0x011e2ef4, 0x011cf06b, 0x011bb4a4, 0x011a7b96, 0x01194538, 0x01181181, 0x0116e069, 0x0115b1e6, 0x011485f1, 0x01135c81, 0x0112358e,
+0x01111111, 0x010fef01, 0x010ecf57, 0x010db20b, 0x010c9715, 0x010b7e6f, 0x010a6811, 0x010953f4, 0x01084211, 0x01073261, 0x010624dd, 0x0105197f, 0x01041041, 0x0103091b, 0x01020408, 0x01010101,
+0x01000000, 
+};
+    t = (uint32_t)j << 8;
+    while (t >= ((uint32_t)256 << TABBITS))
+    {
+        t >>= 1;
+        shift++;
+    }
+    j = t >> 8;
+    k = (uint8_t)t;
+#else
+    k = 0;
+    while (j >= (1 << TABBITS))
     {
+        k = ((j << 7) & 0x80) | (k >> 1);
         j >>= 1;
         shift++;
     }
-    /* If j has been shifted right, we'll use the fractional part for linear
-     * interpolation, and the integer part for the index.
-     */
-    k = j & 255;
-    j >>= 8;
+#endif
 
     /* look up (0x80000000 / j) */
     t = table2[j];
@@ -42,57 +57,64 @@ uint32_t divfn(uint32_t i, uint32_t j, int v)
          * shifted down and interpolated represent an especially linear part of
          * the table.
          */
-        t += (uint64_t)(table2[j + 1] - t) * k >> 8;
+        uint16_t tmp;
+        uint32_t n = table2[j + 1] - t;
+        uint32_t m;
+        tmp = (uint16_t)((uint8_t)(n >> 0)) * k;
+        tmp = (tmp >> 8) + (uint16_t)((uint8_t)(n >> 8)) * k;
+        m = (uint8_t)tmp;
+        tmp = (tmp >> 8) + (uint16_t)((uint8_t)(n >> 16)) * k;
+        m |= (uint32_t)((uint8_t)tmp) << 8;
+        tmp = (tmp >> 8) + (uint16_t)((uint8_t)(n >> 24)) * k;
+        m |= (uint32_t)((uint8_t)tmp) << 16;
+        t += m;
     }
 
-    /* rounding factor to properly balance the right shift.  In assembly this can
-     * be replaced with an addition of the resulting carry bit after the last
-     * right shift.
-     */
-    r = 0x4000 << shift;
-
-    /* debug */
-    if (v)
-        printf("r=%04x, j=%04x, t=%08x t>>s=%08x, (T=%08x), shift=%d\n", r, j, t, t >> shift, table[v], shift);
-
     /* compute i * (0x80000000/j) >> 15 for a 16.16 fixed-point value for i/j.
      * If we had to shift j right, earlier, then we'll shift the result right
      * as well, because larger should give us a smaller result.
      */
-    return ((uint64_t)i * t + r) >> (15 + shift);
+    if (shift == 0)
+        return ((uint64_t)i * t + 0x80000) >> 16;
+
+    t = (uint64_t)i * t >> 16;
+
+    while (shift-- > 1)
+        t >>= 1;
+    t++;
+    return t >> 1;
 }
+#endif
 
 int main(void)
 {
-    int i, j;
+    unsigned long i, j;
     uint64_t err = 0;
 
-    for (j = 1; j < (1 << TABBITS) + 3; j++)
+#if TABBITS != 8
+    printf("static const uint32_t table2[%d + 1] __attribute__((progmem)) = {\n0xffffffff, ", 1 << TABBITS);
+    for (j = 1; j < (1 << TABBITS) + 1; j++)
     {
-        table2[j] = (0x80000000u + (j / 2)) / j;
+        table2[j] = (0x100000000u + (j / 2)) / j;
+        printf("0x%08x,%c", table2[j], ((j & 15) == 15) ? '\n' : ' ');
     }
+    printf("\n}\n");
+#endif
 
-    for (j = 1; j < 65536; j += 15)
-    {
-        table[j] = (0x80000000u + (j / 2)) / j;
-
-        for (i = 0; i < 65536; i += 1)
+    for (j = 1; j < 65536; j += 7)
+        for (i = 0; i < 65536; i += 101)
         {
-            uint32_t ref = (uint32_t)((i << 16) + (j / 2)) / j;
-#if 0
-            uint32_t test = ((uint64_t)i * table[j] + 0x4000) >> 15;
+            uint32_t ref = (uint32_t)(((uint32_t)i << 16) + (j / 2)) / j;
+#if 1
+            uint32_t tabent = (0x80000000 + (j / 2)) / j;
+            uint32_t test = ((uint64_t)i * tabent + 0x4000) >> 15;
 #else
-            uint32_t test = divfn(i, j, 0);
+            uint32_t test = divfn(i, j);
 #endif
             int32_t e = (int32_t)(ref - test);
             if (e > 1)
-            {
-                (void)divfn(i, j, j);
-                printf("%d/%d==%u (0x%x) (not %u (0x%x)) error: %d\n", i, j, ref, ref, test, test, e);
-            }
+                printf("%u/%u==%lu (0x%lx) (not %lu (0x%lx)) error: %ld\n", (unsigned)i, (unsigned)j, ref, ref, test, test, e);
             err += abs(e);
-
         }
-    }
     return 0;
 }
diff --git a/gistfile1.c b/gistfile1.c
@@ -1,3 +1,4 @@
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>
@@ -15,21 +16,49 @@ uint32_t divfn(uint32_t i, uint32_t j, int v)
     uint8_t shift = 0;
     uint16_t k;
 
-    k = 0;
-    while (j >= (1 << TABBITS))
+    /* convert j to a 16.8 fixed-point number */
+    j <<= 8;
+
+    /* if j is too large to index the table, then we just shift it right until
+     * it's within bounds.
+     */
+    while (j >= (256 << TABBITS))
     {
-        k |= (j & 1) << shift;
         j >>= 1;
         shift++;
     }
+    /* If j has been shifted right, we'll use the fractional part for linear
+     * interpolation, and the integer part for the index.
+     */
+    k = j & 255;
+    j >>= 8;
 
+    /* look up (0x80000000 / j) */
     t = table2[j];
-    t += (uint64_t)(table2[j + 1] - t) * k >> shift;
+    if (k)
+    {
+        /* Linearly interpolate between adjacent table entries.  Note that this
+         * works well because the set of values which are large and need to be
+         * shifted down and interpolated represent an especially linear part of
+         * the table.
+         */
+        t += (uint64_t)(table2[j + 1] - t) * k >> 8;
+    }
+
+    /* rounding factor to properly balance the right shift.  In assembly this can
+     * be replaced with an addition of the resulting carry bit after the last
+     * right shift.
+     */
     r = 0x4000 << shift;
 
+    /* debug */
     if (v)
         printf("r=%04x, j=%04x, t=%08x t>>s=%08x, (T=%08x), shift=%d\n", r, j, t, t >> shift, table[v], shift);
 
+    /* compute i * (0x80000000/j) >> 15 for a 16.16 fixed-point value for i/j.
+     * If we had to shift j right, earlier, then we'll shift the result right
+     * as well, because larger should give us a smaller result.
+     */
     return ((uint64_t)i * t + r) >> (15 + shift);
 }
 

diff --git a/gistfile1.c b/gistfile1.c
@@ -0,0 +1,69 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+
+uint32_t table[65536];
+
+#define TABBITS 8
+
+uint32_t table2[(1 << TABBITS) + 1];
+
+uint32_t divfn(uint32_t i, uint32_t j, int v)
+{
+    uint32_t t;
+    uint32_t r;
+    uint8_t shift = 0;
+    uint16_t k;
+
+    k = 0;
+    while (j >= (1 << TABBITS))
+    {
+        k |= (j & 1) << shift;
+        j >>= 1;
+        shift++;
+    }
+
+    t = table2[j];
+    t += (uint64_t)(table2[j + 1] - t) * k >> shift;
+    r = 0x4000 << shift;
+
+    if (v)
+        printf("r=%04x, j=%04x, t=%08x t>>s=%08x, (T=%08x), shift=%d\n", r, j, t, t >> shift, table[v], shift);
+
+    return ((uint64_t)i * t + r) >> (15 + shift);
+}
+
+int main(void)
+{
+    int i, j;
+    uint64_t err = 0;
+
+    for (j = 1; j < (1 << TABBITS) + 3; j++)
+    {
+        table2[j] = (0x80000000u + (j / 2)) / j;
+    }
+
+    for (j = 1; j < 65536; j += 15)
+    {
+        table[j] = (0x80000000u + (j / 2)) / j;
+
+        for (i = 0; i < 65536; i += 1)
+        {
+            uint32_t ref = (uint32_t)((i << 16) + (j / 2)) / j;
+#if 0
+            uint32_t test = ((uint64_t)i * table[j] + 0x4000) >> 15;
+#else
+            uint32_t test = divfn(i, j, 0);
+#endif
+            int32_t e = (int32_t)(ref - test);
+            if (e > 1)
+            {
+                (void)divfn(i, j, j);
+                printf("%d/%d==%u (0x%x) (not %u (0x%x)) error: %d\n", i, j, ref, ref, test, test, e);
+            }
+            err += abs(e);
+
+        }
+    }
+    return 0;
+}