Moderator: Andres Valverde
Reinhard Scharnagl wrote:Here both routines are again separatedly:
- Code: Select all
/* BitBoard population count */
/* (R. Scharnagl, 2005/Jul/20) */
#define msk1 0xEEEEEEEEUL
#define msk2 0xCCCCCCCCUL
#define msk3 0x88888888UL
#define msk4 0x0F0F0F0FUL
unsigned popCount2(const U64 b)
{
unsigned buf;
unsigned acc;
buf = (unsigned)b;
acc = buf;
acc -= ((buf &= msk1)>>1);
acc -= ((buf &= msk2)>>2);
acc -= ((buf &= msk3)>>3);
buf = (unsigned)(b>>32);
acc += buf;
acc -= ((buf &= msk1)>>1);
acc -= ((buf &= msk2)>>2);
acc -= ((buf &= msk3)>>3);
acc = (acc & msk4) + ((acc >> 4) & msk4);
acc = (acc & 0xFFFF) + (acc >> 16);
acc = (acc & 0xFF) + (acc >> 8);
return acc;
}
Reinhard.
unsigned int PopCount8bit(uint64 b) {
unsigned int count;
unsigned char * c = (char *) &b;
count = bitcount_tb[*c];
count += bitcount_tb[*(c+1)];
count += bitcount_tb[*(c+2)];
count += bitcount_tb[*(c+3)];
count += bitcount_tb[*(c+4)];
count += bitcount_tb[*(c+5)];
count += bitcount_tb[*(c+6)];
count += bitcount_tb[*(c+7)];
}
Constants::Constants()
{
m_SetBits[0] = 0;
for ( UInt_32 l=1; l<=0xffff; l++ )
m_SetBits[l] = m_SetBits[l>>1] + (int)(l&1);
for ( int i=0; i<0x100; i++ )
{
m_ElementIndizes[i] = -1;
}
for ( int i=0; i<64; i++ )
{
UInt_64 ull = (I64CONST( 1u ) << i) - 1; // all bits < i set
UInt_32 ul = UInt_32(ull ^ (ull>>32)); // gives a sequence of 0s, then 1s and then 0s again
ul ^= 0x01040426; // make sum of bytes unique
ul += ul >> 16; // add bytes
ul += ul >> 8;
ul &= 0xff;
ASSERT( m_ElementIndizes[ul] == -1 ); // verify it is unique
m_ElementIndizes[ul] = iFeld; // save index
}
}
inline int Constants::GetElementIndex( UInt_64* pBitboard ) const
{
ASSERT( *pBitboard != 0 );
UInt_64 ull = *pBitboard - 1;
*pBitboard &= ull; // remove lowest 1-bit from bitboard
ull ^= *pBitboard; // has all bits up to the least significant 1-Bit from *pBitboard set
UInt_32 ul = UInt_32(ull ^ (ull>>32)); // gives a sequence of 0s, then 1s and then 0s again
ul ^= 0x01040426; // make sum of bytes unique
ul += ul >> 16; // add bytes
ASSERT( m_ElementIndizes[(ul + (ul>>8)) & 0xff] >= 0 );
return m_ElementIndizes[(ul + (ul>>8)) & 0xff]; // add bytes and return index
}
inline int Constants::GetElementAndIndex( UInt_64* pBitboard, UInt_64* pElem ) const
{
ASSERT( *pBitboard != 0 );
UInt_64 ull = *pBitboard - 1;
*pElem = *pBitboard & ~ull;
*pBitboard &= ull; // remove lowest 1-bit from bitboard
ull ^= *pBitboard; // has all bits up to the least significant 1-Bit from *pBitboard set
UInt_32 ul = UInt_32(ull ^ (ull>>32)); // gives a sequence of 0s, then 1s and then 0s again
ul ^= 0x01040426; // make sum of bytes unique
ul += ul >> 16; // add bytes
ASSERT( m_ElementIndizes[(ul + (ul>>8)) & 0xff] >= 0 );
return m_ElementIndizes[(ul + (ul>>8)) & 0xff]; // add bytes and return index
}
inline int Constants::Log2( UInt_64 m ) const
{
ASSERT( m != 0 );
m--; // has all bits up to the least significant 1-Bit from *pBitboard set
UInt_32 ul = UInt_32(m ^ (m>>32)); // gives a sequence of 0s, then 1s and then 0s again
ul ^= 0x01040426; // make sum of bytes unique
ul += ul >> 16; // add bytes
ASSERT( m_ElementIndizes[(ul + (ul>>8)) & 0xff] >= 0 && m_ElementIndizes[(ul + (ul>>8)) & 0xff] <= 63 );
return m_ElementIndizes[(ul + (ul>>8)) & 0xff]; // add bytes and return index
}
inline int Constants::CountElements( UInt_64 x ) const
{
UInt_32 ul1 = UInt_32(x);
UInt_32 ul2 = UInt_32(x>>32);
return m_SetBits[ul1&0xffff]
+ m_SetBits[ul1>>16]
+ m_SetBits[ul2&0xffff]
+ m_SetBits[ul2>>16];
}
Leen Ammeraal wrote:Hello Reinhard again,
Here is the assembly routine which does badly in your test program, but
which is the faster one in my chess engine. I think I shamelessly 'borrowed' it from Crafty a long time ago.
Perhaps this routine is especially fast for sparsely populated bitboards,
as the comment below seems to indicate:
FORCEINLINE int popCount(BITBOARD a)
{
// ** This stuff was taken by Dann Corbit directly from Crafty's
// ** VcInline.h include file.
// Because Crafty bitboards are typically sparsely populated, we use a
// streamlined version of the boolean.c algorithm instead of the one in x86.s
#pragma warning(disable : 4035)
__asm {
mov ecx, dword ptr a
xor eax, eax
test ecx, ecx
jz l1
l0:
lea edx, [ecx - 1]
inc eax
and ecx, edx
jnz l0
l1:
mov ecx, dword ptr a + 4
test ecx, ecx
jz l3
l2:
lea edx, [ecx - 1]
inc eax
and ecx, edx
jnz l2
l3:
}
}
Dieter B?r?ner wrote:Leen Ammeraal wrote:Hello Reinhard again,
Here is the assembly routine which does badly in your test program, but
which is the faster one in my chess engine. I think I shamelessly 'borrowed' it from Crafty a long time ago.
Perhaps this routine is especially fast for sparsely populated bitboards,
as the comment below seems to indicate:
FORCEINLINE int popCount(BITBOARD a)
{
// ** This stuff was taken by Dann Corbit directly from Crafty's
// ** VcInline.h include file.
// Because Crafty bitboards are typically sparsely populated, we use a
// streamlined version of the boolean.c algorithm instead of the one in x86.s
#pragma warning(disable : 4035)
__asm {
mov ecx, dword ptr a
xor eax, eax
test ecx, ecx
jz l1
l0:
lea edx, [ecx - 1]
inc eax
and ecx, edx
jnz l0
l1:
mov ecx, dword ptr a + 4
test ecx, ecx
jz l3
l2:
lea edx, [ecx - 1]
inc eax
and ecx, edx
jnz l2
l3:
}
}
Hi Leen and others. I have seen this routine in assembly many times. I even translated it to GCC inline assembly. All my tests have shown, that it is not faster, than the equivilent C implementation. The C implemention does need a little bit of micro optimization, sure. But it will work well under many more environments than the assembly.
See for example http://chessprogramming.org/cccsearch/c ... _id=350039. Click also on view thread.
There are (depending on conditions) better implementations. But using assembly for the above, doesn't make any sense to me.
BTW. Crafty earlier used the "magic bit fiddling" (non iterative) popcount earlier. Also coded in assembly. Again, pure C code is faster, for the same algorithm (at least when one gets aware, that the last rounds of the algorithm can be done with 32 bit words, while Crafty's assembly implementation worked on 32 bit words the whole time). No need to say, that it is much easier, portable, less prone to errors, ... to code it in C.
Walter Faxon's routine (and other routines) will probably do even better in general. Also no need to code them in assembly. One could go on to discuss many very technical details to argue, why the C code will be better. For example for MSVC, inline assembly will give much less choices to the optimizer for efficient register allocation (with Gcc this is a different), especially relevant, when inlining.
I did some sort of careful tests: My conclusion - don't use assembly for popcount - use a maybe slightly mircooptimized C version (on 64-bit environments, you'd probably want to use a different implementation, than on 32 bit environments).
For first_one and last_one routines, things are a bit different. Because there we have specific upcodes in assembly in some environments (for example on x86), that are not generally available in high language. Same may be true for popcount upcodes in some other environments (but certainly not on x86, and I doubt, that they are available on any environments for end users).
Cheers,
Dieter
UCHAR onebits[0xFFFF];
void setOnebits() // To be called during initialization
{ onebits[0] = 0;
for (int i=1; i<0xFFFF; ++i)
onebits[i] = onebits[i >> 1] + (i & 1);
}
inline int popCount(BITBOARD x)
{ UINT u1 = UINT(x), u2 = UINT(x>>32);
return onebits[u1&0xFFFF] + onebits[u1>>16]
+ onebits[u2&0xFFFF] + onebits[u2>>16];
}
Sune Fischer wrote:Hi Leen
This is interesting :)
16 bit tables are fast indeed if they don't kill the cache :)
Have you tried the 8bit version I posted?
Reinhards version seems quite fast in my engine, even for sparsely populated boards.
-S
ps.
shouldn't you declare the onebit array one larger (UCHAR onebits[0x010000]) to include the case where all bits are set?
Leen Ammeraal wrote:Hi Sune,
I remember having tried an 8-bit version of myself a long time ago,
when it seemed wasteful to me to use an array of 2^16 bytes for this
purpose. If I remember well, that version was slower than the
assembly version.
Leen Ammeraal wrote:As for your question about the array length, I think you are making a
mistake. If all 64 bits of a bitboard are set, there are sixteen 1-bits in
the array subscript values, so in that case these subscript values are
2^16 - 1, for which my array 'onebit' is just large enough, right?
Leen
UCHAR onebits[0x10000];
void setOnebits() // used during initialization
{ onebits[0] = 0;
for (int i=1; i<0x10000; ++i)
onebits[i] = onebits[i >> 1] + (i & 1);
}
inline int popCount(BITBOARD x)
{ UINT u1 = UINT(x), u2 = UINT(x>>32);
return onebits[(unsigned short)u1] + onebits[u1>>16]
+ onebits[(unsigned short)u2] + onebits[u2>>16];
}
Eduardo Waghabi wrote:I'm new to the board and I didn't read any rules (I should, sorry) yet , but I assume we can use the code posted here (with acknowledgment), right?
By the way, I also assume UINT should be the compiler's equivalent of (unsigned int), and UINT() is a macro that casts an (unsigned int) cast.
Eduardo Waghabi wrote:I'm new to the board and I didn't read any rules (I should, sorry) yet , but I assume we can use the code posted here (with acknowledgment), right?
Eduardo Waghabi wrote:I'm new to the board and I didn't read any rules (I should, sorry) yet , but I assume we can use the code posted here (with acknowledgment), right?...
inline int popCount(BITBOARD x)
{ UINT u1 = UINT(x), u2 = UINT(x>>32);
return onebits[(unsigned short)u1] + onebits[u1>>16]
+ onebits[(unsigned short)u2] + onebits[u2>>16];
}
return onebits[u1 & 0xFFFF] + onebits[u1>>16]
+ onebits[u2 & 0xFFFF] + onebits[u2>>16];
Initialized rand Table: 1250 ms
Average population = 50%
bitScan2: 906 ms (9999221)
bitScan: 1344 ms (9999221)
Initialized rand Table: 2484 ms
Average population = 25%
bitScan2: 1078 ms (29997211)
bitScan: 1344 ms (29997211)
Initialized rand Table: 2485 ms
Average population = 12%
bitScan2: 1937 ms (69852941)
bitScan: 1360 ms (69852941)
Initialized rand Table: 2485 ms
Average population = 6%
bitScan2: 2953 ms (134895445)
bitScan: 1344 ms (134895445)
Initialized rand Table: 2484 ms
Average population = 3%
bitScan2: 3828 ms (185222843)
bitScan: 1344 ms (185222843)
Zero table
bitScan2(0) = -64
bitScan(0) = 0
Initialized rand Table: 1266 ms
Average population = 50%
popCount2: 2156 ms (320017476)
popCount: 4328 ms (320017476)
Initialized rand Table: 5485 ms
Average population = 25%
popCount2: 2172 ms (160006818)
popCount: 4312 ms (160006818)
Initialized rand Table: 5468 ms
Average population = 12%
popCount2: 2172 ms (80002405)
popCount: 4313 ms (80002405)
Initialized rand Table: 5469 ms
Average population = 6%
popCount2: 2172 ms (40733263)
popCount: 4312 ms (40733263)
Initialized rand Table: 5469 ms
Average population = 3%
popCount2: 2187 ms (20209492)
popCount: 4328 ms (20209492)
Zero table
popCount2(0) = 0
popCount(0) = 0
Return to Programming and Technical Discussions
Users browsing this forum: No registered users and 3 guests