-
-
Save pps83/3210a2f980fd02bb2ba2e5a1fc4a2ef0 to your computer and use it in GitHub Desktop.
// Note: bsf/bsr are used by default.
// Enable /arch:AVX2 compilation for better optimizations.
#if defined(_MSC_VER) && !defined(__clang__)
#include <intrin.h>
#include <limits.h>
// C++20 (or MSVC /std:c++20 via _MSVC_LANG): std::is_constant_evaluated()
// lets the portable bit-loop fallback run during constant evaluation, so
// the shims below can be declared constexpr like the real GCC builtins.
#if (defined(__cplusplus) && (__cplusplus >= 202002L)) || \
    (defined(_MSVC_LANG) && (_MSVC_LANG >= 202002L))
#include <type_traits>
#define CONSTEVAL_ (std::is_constant_evaluated())
#define CONSTEXPR_ constexpr
#else
#define CONSTEXPR_
#endif
// Count trailing zero bits of x. Result is undefined for x == 0,
// matching the contract of the GCC/Clang builtin being emulated.
static CONSTEXPR_ __forceinline int __builtin_ctz(unsigned x)
{
#ifdef CONSTEVAL_
    // During constant evaluation intrinsics are unusable; fall back to a
    // plain bit loop (only ever taken at compile time).
    if CONSTEVAL_
    {
        // Cast avoids signed/unsigned comparison (C4018) — sizeof yields size_t.
        for (int i = 0; i < (int)(sizeof(x) * CHAR_BIT); ++i)
        {
            if ((x >> i) & 1)
                return i;
        }
        return (int)(sizeof(x) * CHAR_BIT);
    }
#endif
#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC)
    return (int)_CountTrailingZeros(x);
#elif defined(__AVX2__) || defined(__BMI__)
    return (int)_tzcnt_u32(x);
#else
    unsigned long r;
    _BitScanForward(&r, x); // r is indeterminate when x == 0, as is the builtin
    return (int)r;
#endif
}
// Count trailing zero bits of x (64-bit). Result is undefined for x == 0,
// matching the contract of the GCC/Clang builtin being emulated.
static CONSTEXPR_ __forceinline int __builtin_ctzll(unsigned long long x)
{
#ifdef CONSTEVAL_
    // Compile-time fallback: intrinsics are unusable in constant evaluation.
    if CONSTEVAL_
    {
        // Cast avoids signed/unsigned comparison (C4018) — sizeof yields size_t.
        for (int i = 0; i < (int)(sizeof(x) * CHAR_BIT); ++i)
        {
            if ((x >> i) & 1)
                return i;
        }
        return (int)(sizeof(x) * CHAR_BIT);
    }
#endif
#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC)
    return (int)_CountTrailingZeros64(x);
#elif defined(_WIN64)
#if defined(__AVX2__) || defined(__BMI__)
    return (int)_tzcnt_u64(x);
#else
    unsigned long r;
    _BitScanForward64(&r, x);
    return (int)r;
#endif
#else
    // 32-bit target: combine two 32-bit scans. Scan each half only when it
    // is needed — the previous form evaluated both halves unconditionally,
    // which ran a scan on 0 (indeterminate result) whenever one half was 0.
    return (unsigned)x ? __builtin_ctz((unsigned)x)
                       : __builtin_ctz((unsigned)(x >> 32)) + 32;
#endif
}
// unsigned long is 32-bit on all Windows targets, but dispatch on its
// actual width so the shim stays correct elsewhere. The comparison is a
// compile-time constant, so the dead branch folds away.
static CONSTEXPR_ __forceinline int __builtin_ctzl(unsigned long x)
{
    if (sizeof(x) == 8)
        return __builtin_ctzll(x);
    return __builtin_ctz((unsigned)x);
}
// Count leading zero bits of x. Result is undefined for x == 0,
// matching the contract of the GCC/Clang builtin being emulated.
static CONSTEXPR_ __forceinline int __builtin_clz(unsigned x)
{
#ifdef CONSTEVAL_
    // Compile-time fallback: probe from the most-significant bit downward.
    if CONSTEVAL_
    {
        // Cast avoids signed/unsigned comparison (C4018) — sizeof yields size_t.
        for (int i = 0; i < (int)(sizeof(x) * CHAR_BIT); ++i)
        {
            if (x >> (sizeof(x) * CHAR_BIT - 1 - i))
                return i;
        }
        return (int)(sizeof(x) * CHAR_BIT);
    }
#endif
#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC)
    return (int)_CountLeadingZeros(x);
#elif defined(__AVX2__) || defined(__LZCNT__)
    return (int)_lzcnt_u32(x);
#else
    unsigned long r;
    _BitScanReverse(&r, x); // r = index of highest set bit; clz = 31 - r
    return (int)(r ^ 31);
#endif
}
// Count leading zero bits of x (64-bit). Result is undefined for x == 0,
// matching the contract of the GCC/Clang builtin being emulated.
static CONSTEXPR_ __forceinline int __builtin_clzll(unsigned long long x)
{
#ifdef CONSTEVAL_
    // Compile-time fallback: probe from the most-significant bit downward.
    if CONSTEVAL_
    {
        // Cast avoids signed/unsigned comparison (C4018) — sizeof yields size_t.
        for (int i = 0; i < (int)(sizeof(x) * CHAR_BIT); ++i)
        {
            if (x >> (sizeof(x) * CHAR_BIT - 1 - i))
                return i;
        }
        return (int)(sizeof(x) * CHAR_BIT);
    }
#endif
#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC)
    return (int)_CountLeadingZeros64(x);
#elif defined(_WIN64)
#if defined(__AVX2__) || defined(__LZCNT__)
    return (int)_lzcnt_u64(x);
#else
    unsigned long r;
    _BitScanReverse64(&r, x); // r = index of highest set bit; clz = 63 - r
    return (int)(r ^ 63);
#endif
#else
    // 32-bit target: combine two 32-bit scans. Scan each half only when it
    // is needed — the previous form evaluated both halves unconditionally,
    // which ran a scan on 0 (indeterminate result) whenever one half was 0.
    return (unsigned)(x >> 32) ? __builtin_clz((unsigned)(x >> 32))
                               : __builtin_clz((unsigned)x) + 32;
#endif
}
// unsigned long is 32-bit on all Windows targets, but dispatch on its
// actual width so the shim stays correct elsewhere. The comparison is a
// compile-time constant, so the dead branch folds away.
static CONSTEXPR_ __forceinline int __builtin_clzl(unsigned long x)
{
    if (sizeof(x) == 8)
        return __builtin_clzll(x);
    return __builtin_clz((unsigned)x);
}
// Keep the helper macros private to this header.
#undef CONSTEVAL_
#undef CONSTEXPR_
#endif // defined(_MSC_VER) && !defined(__clang__)
@pps83 thanks see my updates too, still digging up between targets ; that's funny or not, this simple thing is @$##^&&% to get right ; just for getting a bit_width
, all I want that's a portable bit_width
operator, jeez; soft emulation takes no ROM; maybe with a small lookup table I could shrink down the number of ops... whatever, I just want bit_width — can I get that easily? It seems not. ICC has _lzcnt_u64 and _lzcnt_u32, and IBM XL has __cntlz4 and __cntlz8, but they now ship a clang front-end, so: clang forever. ```update ARMCC has __rbit() and __clz(), however that's a clang fork, so... just logging for posterity.
For a soft emulation fallback (according to my CPU benchmarks) the following is the best
player:
__mu0_static_inline__
int __mu0_cntlz_i__(const unsigned int __x)
{
	/* De Bruijn-style multiply+shift lookup: after smearing the top set  */
	/* bit downward, the table maps the hash of the smeared value to the  */
	/* index of the most significant set bit.                             */
	__mu0_static__
	const unsigned char s_table[32] =
	{
		  0,  9,  1, 10, 13, 21,  2, 29
		, 11, 14, 16, 18, 22, 25,  3, 30
		,  8, 12, 20, 28, 15, 17, 24,  7
		, 19, 27, 23,  6, 26,  5,  4, 31
	};
	const unsigned int d = __mu0_bit_digits_i__();
	unsigned int v       = __x;
	if (v == 0U) {
		/* All bits clear: every digit is a leading zero. */
		return d;
	}
	/* Smear: propagate the highest set bit into all lower positions. */
	v |= v >> 1U;
	v |= v >> 2U;
	v |= v >> 4U;
	v |= v >> 8U;
	v |= v >> 16U;
	/* clz = (digits - 1) - position of most significant set bit. */
	return (d - 1) - s_table[((v * 0x07C4ACDD) >> 27U) % d];
}
# if 0
__mu0_static_inline__
int __mu0_cntlz_i__(const unsigned int __x)
{
unsigned int x, d, y;
if (__x) {
x = __x;
d = __mu0_bit_digits_i__();
y = x >> 16U; if (y != 0U) { d = d - 16U; x = y; }
y = x >> 8U; if (y != 0U) { d = d - 8U; x = y; }
y = x >> 4U; if (y != 0U) { d = d - 4U; x = y; }
y = x >> 2U; if (y != 0U) { d = d - 2U; x = y; }
y = x >> 1U;
return __mu0_cast__(int, ((y != 0U) ? d - 2U : d - x));
}
return __mu0_bit_digits_i__();
}
# endif
__mu0_static_inline__
int __mu0_cntlz_ll__(const unsigned long long __x)
{
	/* Split the 64-bit value into two 32-bit halves: if anything is set  */
	/* in the high half, its clz is the answer; otherwise the low half's  */
	/* clz plus a full word of leading zeros.                             */
	const unsigned int d = __mu0_bit_digits_i__();
# if MU0_HAVE_C99 || MU0_HAVE_CPP11
	if (__x & 0xFFFFFFFF00000000ULL) {
		return __mu0_cntlz_i__(__x >> d);
	}
	return d + __mu0_cntlz_i__(__x & 0xFFFFFFFFULL);
# else
	/* Pre-C99: no ULL suffix available. */
	if (__x & 0xFFFFFFFF00000000U) {
		return __mu0_cntlz_i__(__x >> d);
	}
	return d + __mu0_cntlz_i__(__x & 0xFFFFFFFFU);
# endif
}
/* clz for unsigned long: 64-bit only on LP64 ABIs, 32-bit otherwise.     */
/* NOTE(review): "__LP64" (no trailing underscores) is nonstandard —      */
/* presumably a legacy-compiler spelling; verify it is actually needed.   */
__mu0_static_inline__
int __mu0_cntlz_l__(const unsigned long __x)
{
# if defined(__LP64__) || defined(__LP64)
	return __mu0_cntlz_ll__(__x);
# else
	return __mu0_cntlz_i__(__x);
# endif
}
Hi Pavel,
This is really useful! I'd like to include this as a support file in my C container library.
https://github.com/nickpelling/C_STD
Is there a copyright or licence notice you'd like this published under? I use the MIT license because it's most permissive (I think):
https://github.com/nickpelling/C_STD?tab=MIT-1-ov-file
Hi Pavel,
This is really useful! I'd like to include this as a support file in my C container library. https://github.com/nickpelling/C_STD
Is there a copyright or licence notice you'd like this published under? I use the MIT license because it's most permissive (I think): https://github.com/nickpelling/C_STD?tab=MIT-1-ov-file
OK to take it as is. Hopefully it's useful for users of C_STD.
Note that the code uses _tzcnt_u32, _tzcnt_u64, _lzcnt_u32 and _lzcnt_u64. AFAIK, these are BMI (or BMI2), so it might be a good idea to use bsf variants instead for generic targets.
Thanks very much, Pavel, that's perfect, you're a star! :-)
Thanks very much, Pavel, that's perfect, you're a star! :-)
Note, I updated it to:
- check for
__AVX2__
builds to use lzcnt/tzcnt vs bsf/bsr - add code to handle arm/arm64
- add guard for clang-cl
The code was tested to verify that it's bit exact with the code that clang/gcc emits for these builtins.
I updated the code to support 32/64-bit builds. It's BMI+ though (doesn't try to use bsr): https://godbolt.org/z/G766KEM6j