Moderator: Andres Valverde
Initialized rand Table: 2188 ms
bitScan2: 1094 ms (749927600)
bitScan: 1343 ms (749927600)
bitScan2(0) = -64
bitScan(0) = 0
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/timeb.h>
typedef unsigned long long U64;
#define rand64() (((U64)rand())^(((U64)rand())<<15)^(((U64)rand())<<30)^(((U64)rand())<<45)^(((U64)rand())<<60))
#define TABLE_SIZE 10000000
U64 randtable[TABLE_SIZE];
int getms()
{
struct timeb timebuffer;
ftime(&timebuffer);
return (timebuffer.time * 1000) + timebuffer.millitm;
}
/* de Bruijn */
const int lsz64_tbl[64] =
{
0, 31, 4, 33, 60, 15, 12, 34,
61, 25, 51, 10, 56, 20, 22, 35,
62, 30, 3, 54, 52, 24, 42, 19,
57, 29, 2, 44, 47, 28, 1, 36,
63, 32, 59, 5, 6, 50, 55, 7,
16, 53, 13, 41, 8, 43, 46, 17,
26, 58, 49, 14, 11, 40, 9, 45,
21, 48, 39, 23, 18, 38, 37, 27,
};
int bitScan (const U64 bb)
{
const U64 lsb = (bb & -(long long) bb) - 1;
const unsigned int foldedLSB = ((int) lsb) ^ ((int) (lsb >> 32));
return lsz64_tbl[foldedLSB * 0x78291ACF >> 26];
}
/* Trial (R. Scharnagl, may be improvable) */
/* (returns -64 for zero argument) */
int lsbRS[256] = {
-120, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
};
int bitScan2(const U64 b)
{
unsigned buf;
int acc = 0;
if ((buf = (unsigned)b) == 0) {
buf = (unsigned)(b >> 32);
acc = 32;
}
if ((unsigned short)buf == 0) {
buf >>= 16;
acc += 16;
}
if ((unsigned char)buf == 0) {
buf >>= 8;
acc += 8;
}
return acc + lsbRS[buf & 0xff];
}
int main(int argc, char *argv[])
{
int i, loop, sum;
volatile int time=getms();
for(i=0;i<TABLE_SIZE;i++)
randtable[i]=rand64() & rand64();
printf("Initialized rand Table: %d ms\n",getms()-time);
time=getms();
sum= 0;
for(loop = 25; --loop >=0; )
for(i=0;i<TABLE_SIZE;i++)
sum += bitScan2(randtable[i]) & 63;
printf("bitScan2: %d ms (%d)\n",getms()-time, sum);
time=getms();
sum= 0;
for(loop = 25; --loop >=0; )
for(i=0;i<TABLE_SIZE;i++)
sum += bitScan(randtable[i]) & 63;
printf("bitScan: %d ms (%d)\n",getms()-time, sum);
printf("bitScan2(0) = %d\n", bitScan2(0));
printf("bitScan(0) = %d\n", bitScan(0));
system("PAUSE");
return 0;
}
Initialized rand Table: 2234 ms
popCount2: 2203 ms (-294859621)
popCount: 4313 ms (-294859621)
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/timeb.h>
typedef unsigned long long U64;
#define rand64() (((U64)rand())^(((U64)rand())<<15)^(((U64)rand())<<30)^(((U64)rand())<<45)^(((U64)rand())<<60))
#define TABLE_SIZE 10000000
U64 randtable[TABLE_SIZE];
int getms()
{
struct timeb timebuffer;
ftime(&timebuffer);
return (timebuffer.time * 1000) + timebuffer.millitm;
}
#define m1 0x5555555555555555ULL
#define m2 0x3333333333333333ULL
int popCount(const U64 b)
{
int n;
const U64 a = b - ((b >> 1) & m1);
const U64 c = (a & m2) + ((a >> 2) & m2);
n = ((int) c) + ((int) (c >> 32));
n = (n & 0x0F0F0F0F) + ((n >> 4) & 0x0F0F0F0F);
n = (n & 0xFFFF) + (n >> 16);
n = (n & 0xFF) + (n >> 8);
return n;
}
/* Trial (R. Scharnagl, second idea) */
/* (endian independent form) */
#define msk1 0xEEEEEEEEUL
#define msk2 0xCCCCCCCCUL
#define msk3 0x88888888UL
#define msk4 0x0F0F0F0FUL
unsigned popCount2(const U64 b)
{
unsigned buf;
unsigned acc;
buf = (unsigned)b;
acc = buf;
acc -= ((buf &= msk1)>>1);
acc -= ((buf &= msk2)>>2);
acc -= ((buf &= msk3)>>3);
buf = (unsigned)(b>>32);
acc += buf;
acc -= ((buf &= msk1)>>1);
acc -= ((buf &= msk2)>>2);
acc -= ((buf &= msk3)>>3);
acc = (acc & msk4) + ((acc >> 4) & msk4);
acc = (acc & 0xFFFF) + (acc >> 16);
acc = (acc & 0xFF) + (acc >> 8);
return acc;
}
int main(int argc, char *argv[])
{
int i, loop, sum;
volatile int time=getms();
for(i=0;i<TABLE_SIZE;i++)
randtable[i]=rand64() & rand64();
printf("Initialized rand Table: %d ms\n",getms()-time);
time=getms();
sum= 0;
for(loop = 25; --loop >=0; )
for(i=0;i<TABLE_SIZE;i++)
sum += popCount2(randtable[i]);
printf("popCount2: %d ms (%d)\n",getms()-time, sum);
time=getms();
sum= 0;
for(loop = 25; --loop >=0; )
for(i=0;i<TABLE_SIZE;i++)
sum += popCount(randtable[i]);
printf("popCount: %d ms (%d)\n",getms()-time, sum);
system("PAUSE");
return 0;
}
Leen Ammeraal wrote:On my machine (Athlon64-3000) and compiled with VC 6.0, this bitScan2 function
is slightly faster than both an assembly version and a version by
Gerd Isenberg that I have used before, so I think I will use it in the
next version of my engine (Queen).
Thank you very much Reinhard!
Leen Ammeraal
INLINE uint LsbClr(BitBoard &bb) {
BitBoard b = bb & -bb;
assert(bb);
bb ^= b; // clear the first bit too
if (b<<48) return (first_one[b]); // is bit among first 16?
if (b<<32) return (first_one[b>>16]+16); // is bit among first 32?
if (b<<16) return (first_one[b>>32]+32); // is bit among first 48?
return (first_one[b>>48]+48); // must be among last 16 bits
}
Reinhard Scharnagl wrote:Sune,
does it work for zero Bitboards too?
Reinhard.
// loop the rooks
BitBoard b = white_rooks;
while (b) {
square = LsbClr(b);
score += rooks_table[square]; // or whatever
}
By the way, some minor modifications were required for VC 6.0:
- long long should be __int64
/* BitBoard population count */
/* (R. Scharnagl, 2005/Jul/20) */
#define msk1 0xEEEEEEEEUL
#define msk2 0xCCCCCCCCUL
#define msk3 0x88888888UL
#define msk4 0x0F0F0F0FUL
unsigned popCount2(const U64 b)
{
unsigned buf;
unsigned acc;
buf = (unsigned)b;
acc = buf;
acc -= ((buf &= msk1)>>1);
acc -= ((buf &= msk2)>>2);
acc -= ((buf &= msk3)>>3);
buf = (unsigned)(b>>32);
acc += buf;
acc -= ((buf &= msk1)>>1);
acc -= ((buf &= msk2)>>2);
acc -= ((buf &= msk3)>>3);
acc = (acc & msk4) + ((acc >> 4) & msk4);
acc = (acc & 0xFFFF) + (acc >> 16);
acc = (acc & 0xFF) + (acc >> 8);
return acc;
}
/* BitBoard LSB number search */
/* (R. Scharnagl, 2005/Jul/20) */
/* (zero board will cause -64) */
int lsbRS[256] = {
-120, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
};
int bitScan2(const U64 b)
{
unsigned buf;
int acc = 0;
if ((buf = (unsigned)b) == 0) {
buf = (unsigned)(b >> 32);
acc = 32;
}
if ((unsigned short)buf == 0) {
buf >>= 16;
acc += 16;
}
if ((unsigned char)buf == 0) {
buf >>= 8;
acc += 8;
}
return acc + lsbRS[buf & 0xff];
}
Sune Fischer wrote:Leen Ammeraal wrote:On my machine (Athlon64-3000) and compiled with VC 6.0, this bitScan2 function
is slightly faster than both an assembly version and a version by
Gerd Isenberg that I have used before, so I think I will use it in the
next version of my engine (Queen).
Thank you very much Reinhard!
Leen Ammeraal
This may be possible, I know Gerd only optimizes for 64 bit chips now so they could be a tad slower on 32 bit.
I remember seeing lots of variations on the bitscan2, Eugene Nalimov himself has posted something IIRC.
Strangely the fastest for me is the following which takes advantage of two things:
1 - a big nasty 16 KB look up table
2 - there is only 1 bit set
- Code: Select all
INLINE uint LsbClr(BitBoard &bb) {
BitBoard b = bb & -bb;
assert(bb);
bb ^= b; // clear the first bit too
if (b<<48) return (first_one[b]); // is bit among first 16?
if (b<<32) return (first_one[b>>16]+16); // is bit among first 32?
if (b<<16) return (first_one[b>>32]+32); // is bit among first 48?
return (first_one[b>>48]+48); // must be among last 16 bits
}
char LogTable[65536];
void generateLogTable()
{
int i,j;
LogTable[0]=-1;
for(i=1;i<65536;i++)
{
for(j=15;j>=0 && !((1<<j)&i);j--);
LogTable[i]=j;
}
}
int Log2(const U64 x)
{
unsigned int temp,temp2;
if(temp=(x>>32))
{
if(temp2=(temp>>16))
return 48+LogTable[temp2];
return 32+LogTable[temp];
}
if(temp=(x>>16))
return 16+LogTable[temp];
return LogTable[x];
}
#define FirstOne(X) Log2((X) & -(X))
#define LastOne(X) Log2(X)
Pradu wrote:I use something like this and this can work without condition 2 and can also work for MSB if you initialize the table differently:
- Code: Select all
LogTable[65536]
void generateLogTable()
{
int i,j;
LogTable[0]=-1;
for(i=1;i<65536;i++)
{
for(j=15;j>=0 && !((1<<j)&i);j--);
LogTable[i]=j;
}
}
int Log2(U64 x)
{
unsigned int temp,temp2;
if(temp=(x>>32))
{
if(temp2=(temp>>16))
return 48+LogTable[temp2];
return 32+LogTable[temp];
}
else
{
if(temp=(x>>16))
return 16+LogTable[temp];
return LogTable[x];
}
}
#define FirstOne(i) Log2((X) & -(X))
#define LastOne(i) Log2(i)
INLINE uint Msb(BitBoard b) {
uint msb = 0;
uint sh = 32;
do {
if (b>>sh) {
msb += sh;
b >>= sh;
}
sh >>= 1;
} while (sh);
return msb;
}
INLINE uint LsbClr(BitBoard &bb) {
BitBoard b = bb & -bb;
assert(bb);
bb ^= b;
if (b&0x000000000000ffff) return (first_one[b]);
if (b&0x00000000ffff0000) {
unsigned short s = *(((unsigned short *) &b) + 1);
return (first_one[s]+16);
}
if (b&0x0000ffff00000000) {
unsigned short s = *(((unsigned short *) &b) + 2);
return (first_one[s]+32);
}
unsigned short s = *(((unsigned short *) &b) + 3);
return (first_one[s]+48);
}
Reinhard Scharnagl wrote:Hi Leen,
once I had a similiar behaviour beside of this routines. I really simplified some routines and had been sure, that things must speed up. But it slows down. I found the reason in VStudio's optimizer, which reorganisized inline code parts (not those which had been declared as such) by suddenly changing unfolding of function calls.
Reinhard.
Return to Programming and Technical Discussions
Users browsing this forum: No registered users and 43 guests