#include <stddef.h>
#include <stdint.h>

/*
 * Count how many of the eight bytes packed in x are equal to '\n' (0x0a).
 *
 * All eight bytes are examined at once with word-wide bit tricks; the
 * result is in the range 0..8 and does not depend on byte order, since
 * every byte of the word is treated the same way.
 */
static inline size_t countnl8(uint64_t x)
{
    const uint64_t high_bits = UINT64_C(0x8080808080808080);
    const uint64_t low_ones = UINT64_C(0x0101010101010101);
    const uint64_t newlines = UINT64_C(0x0a0a0a0a0a0a0a0a);

    /* Bytes of x that already have their top bit set cannot be '\n'. */
    uint64_t y = x & high_bits;

    /* After the XOR a byte is zero exactly where x holds '\n'. */
    uint64_t z = x ^ newlines;

    /*
     * Force every top bit on so the per-byte decrement never borrows from
     * the neighbouring byte.  The top bit survives the decrement iff the
     * low seven bits of the byte were non-zero.
     */
    z |= high_bits;
    z -= low_ones;
    z &= high_bits;

    /* Top bit of each byte is now set iff that byte of x is NOT '\n'. */
    z |= y;

    /*
     * Flip the indicator so a set top bit marks a newline.  Without this
     * step the sum below would count the non-newline bytes instead.
     */
    z ^= high_bits;

    /* Move each indicator to bit 0 of its byte, then add the eight bytes. */
    z >>= 7;
    z += z >> 8;
    z += z >> 16;
    z += z >> 32;

    /* The total (at most 8) sits in the low nibble of the lowest byte. */
    return (size_t)(z & 0xf);
}

/*
 * Count the '\n' bytes contained in the n 64-bit words starting at p.
 *
 * p: pointer to the first word to scan (only read, never written).
 * n: number of 64-bit words to scan; 0 yields 0.
 *
 * Returns the total number of bytes equal to 0x0a across all n words.
 */
size_t countnl(const uint64_t *p, size_t n)
{
    size_t nl = 0;
    size_t i = 0;

    /* Manually unrolled main loop; change to "#if 0" to use only the tail loop. */
#if 1
    for (; i < 4 * (n / 4); i += 4) {
        size_t z0 = countnl8(p[i + 0]);
        size_t z1 = countnl8(p[i + 1]);
        size_t z2 = countnl8(p[i + 2]);
        size_t z3 = countnl8(p[i + 3]);
        nl += z0 + z1 + z2 + z3;
    }
#endif

    /* Remaining words (all of them when the unrolled loop is disabled). */
    for (; i < n; i++) {
        nl += countnl8(p[i]);
    }

    return nl;
}