GCC Inline asm problem

Programming Topics (Computer Chess) and technical aspects as test techniques, book building, program tuning etc

Moderator: Andres Valverde

Re: GCC Inline asm problem

Postby Reinhard Scharnagl » 20 Jul 2005, 00:53

Pradu,

at my Pentium 4 my last version seems to be 35% faster, don't know when done with customized optimizations.

I never have used bitboards, just rewrote the routine.

Reinhard.
Reinhard Scharnagl
 
Posts: 608
Joined: 01 Oct 2004, 08:36
Location: Klein-Gerau, Germany

Re: GCC Inline asm problem

Postby Pradu » 20 Jul 2005, 00:58

Dann Corbit wrote:An important thing to think about is that almost all the time your bitboards will be very sparse. Even a full board with no chessmen removed is only half full of bits.

Your random function creates dense bitboards.

It would be good to experiment with both kinds.


Thanks for the advice Dan, I'll do the test for myself when I get back to the dorm :-).
User avatar
Pradu
 
Posts: 343
Joined: 12 Jan 2005, 19:17
Location: Chandler, Arizona, USA

Re: GCC Inline asm problem

Postby Pradu » 20 Jul 2005, 01:00

Reinhard Scharnagl wrote:Pradu,

at my Pentium 4 my last version seems to be 35% faster, don't know when done with customized optimizations.

I never have used bitboards, just rewrote the routine.

Reinhard.


Thanks for your help Reinhard, I appreciate it.
User avatar
Pradu
 
Posts: 343
Joined: 12 Jan 2005, 19:17
Location: Chandler, Arizona, USA

Re: GCC Inline asm problem

Postby Reinhard Scharnagl » 20 Jul 2005, 01:03

Well Pradu,

better try the next approach, seems to have double speed:

Code: Select all
/* Trial (R. Scharnagl, second idea) */
/*       (endian independent form)   */

#define msk1 0xEEEEEEEEUL
#define msk2 0xCCCCCCCCUL
#define msk3 0x88888888UL
#define msk4 0x0F0F0F0FUL

/*
 * popCount2() - count the 1 bits of a 64-bit word.
 *
 * Each 32-bit half is reduced to per-nibble bit counts using
 *   x - (x >> 1) - (x >> 2) - (x >> 3)     (evaluated per nibble),
 * the two halves are accumulated into the same word, and the nibble
 * counts are then folded into a single total.
 *
 * The accumulator is unsigned: adding the raw upper half can carry across
 * nibbles (and out of the word) before the subtractions undo it, so the
 * arithmetic has to wrap.  With a signed accumulator those steps depend on
 * implementation-defined conversions and on the right shift of a negative
 * value.
 *
 * NOTE(review): relies on unsigned being 32 bits wide - confirm for the
 * target compiler.
 *
 * @param b  the 64-bit word to be counted
 * @returns  the number of 1 bits in b (0..64)
 */
int popCount2(const U64 b)
{
  unsigned buf;
  unsigned acc;

  /* lower 32 bits */
  buf  = (unsigned)b;
  acc  = buf;
  acc -= ((buf &= msk1)>>1);
  acc -= ((buf &= msk2)>>2);
  acc -= ((buf &= msk3)>>3);

  /* upper 32 bits, taken by shifting so byte order does not matter */
  buf  = (unsigned)(b>>32);
  acc += buf;
  acc -= ((buf &= msk1)>>1);
  acc -= ((buf &= msk2)>>2);
  acc -= ((buf &= msk3)>>3);

  /* nibble counts (<= 8) -> byte counts -> 16-bit counts -> total */
  acc = (acc & msk4)   + ((acc >> 4) & msk4);
  acc = (acc & 0xFFFF) + (acc >> 16);
  acc = (acc & 0xFF)   + (acc >> 8);
  return (int)acc;
}

Reinhard.
Code: Select all
Initialized rand Table: 1156 ms
popCount2: 2141 ms (-589497692)
popCount: 4328 ms (-589497692)
Reinhard Scharnagl
 
Posts: 608
Joined: 01 Oct 2004, 08:36
Location: Klein-Gerau, Germany

Re: GCC Inline asm problem

Postby Dann Corbit » 20 Jul 2005, 01:25

I get excellent results for RS's versions.
I expect that the parallel bitcount would be faster with a vectorizing compiler like intel's and if given the vectorizer hints.
I expect that some of the 64 bit versions will be faster on a 64 bit OS with a 64 bit compiler...

Try this version:

Code: Select all
#include <assert.h>


/* A bitboard is one 64-bit word; every counter below takes one by value. */
typedef unsigned long long bitboard;

/*
 * Prototypes of all population-count variants defined in this file.
 * Each returns the number of 1 bits in its argument.
 */
extern unsigned int t0_and_masks(bitboard x);
extern unsigned int t1_and_masks(bitboard i);
extern unsigned int t2_and_masks(bitboard b);
extern unsigned int t3_rmlsbsub(bitboard n);
extern unsigned int t4_rmlsbmask(bitboard n);
extern unsigned int t5_testlsb(bitboard n);
extern unsigned int t6_testmsb(bitboard n);
extern unsigned int t7_testsignandshift(bitboard n);
extern unsigned int t8_testeachbit(bitboard n);
extern unsigned int t9_testeachbit1shl(bitboard n);
extern unsigned int tA_tableshift(bitboard n);
extern unsigned int tB_tableuchar(bitboard n);
extern unsigned int tC_tableshiftcast(bitboard n);
extern unsigned int tD_itableshift(bitboard n);
extern unsigned int tE_itableuchar(bitboard n);
extern unsigned int tF_itableshiftcast(bitboard n);
extern unsigned int tG_parallel(bitboard n);
extern unsigned int tH_hamming(bitboard w);
extern unsigned int tI_Scharnagl(bitboard b);
extern unsigned int tJ_Scharnagl(bitboard b);

/*
 * t0_and_masks() - population count by repeated masked folding.
 *
 * Adjacent fields are added pairwise (1-bit, 2-bit, 4-bit, ... fields)
 * until the low byte holds the total.
 *
 * @param x  the word to be counted; must be non-zero (asserted)
 * @returns  the number of 1 bits in x
 */
unsigned        t0_and_masks(bitboard x)
{
    bitboard        sum;

    assert(x);
    sum = (x & 0x5555555555555555ULL) + ((x >> 1) & 0x5555555555555555ULL);
    sum = (sum & 0x3333333333333333ULL) + ((sum >> 2) & 0x3333333333333333ULL);
    /* from here on a field cannot overflow, so mask once after adding */
    sum = (sum + (sum >> 4)) & 0x0f0f0f0f0f0f0f0fULL;
    sum = (sum + (sum >> 8)) & 0x00ff00ff00ff00ffULL;
    sum = (sum + (sum >> 16)) & 0x0000ffff0000ffffULL;
    sum += sum >> 32;
    return (unsigned) (sum & 0xff);
}
/*
 * t1_and_masks() - masked folding down to byte counts in 64 bits, then the
 * remaining folds are done in a 32-bit unsigned.
 *
 * @param i  the word to be counted; must be non-zero (asserted)
 * @returns  the number of 1 bits in i
 */
unsigned        t1_and_masks(bitboard i)
{
    bitboard        fields;
    unsigned        folded;

    assert(i);
    fields = ((i >> 1) & 0x5555555555555555ULL) + (i & 0x5555555555555555ULL);
    fields = ((fields >> 2) & 0x3333333333333333ULL) + (fields & 0x3333333333333333ULL);
    fields = (fields + (fields >> 4)) & 0x0f0f0f0f0f0f0f0fULL;

    /* add the upper half onto the lower half and keep only 32 bits */
    folded = (unsigned) ((fields >> 32) + fields);
    folded = folded + (folded >> 16);
    folded = folded + (folded >> 8);
    return folded & 0xff;
}

/*
 * t2_and_masks() - loop-free population count of the 1 bits in a quadword.
 *
 * Pair counts are formed with a subtraction, nibble counts with one masked
 * add, and the two 32-bit halves are then merged so that the remaining
 * folds run on a 32-bit value.
 *
 * @param b  the quadword to be counted
 * @returns  the number of 1 bits in b
 */
#define m1 0x5555555555555555ULL
#define m2 0x3333333333333333ULL
unsigned        t2_and_masks(bitboard b)
{
    bitboard        pairs;
    bitboard        nibbles;
    unsigned        folded;

    pairs = b - ((b >> 1) & m1);
    nibbles = ((pairs >> 2) & m2) + (pairs & m2);

    /* each nibble holds at most 4, so adding both halves cannot overflow */
    folded = (unsigned) (nibbles >> 32) + (unsigned) nibbles;
    folded = ((folded >> 4) & 0x0F0F0F0F) + (folded & 0x0F0F0F0F);
    folded = (folded >> 16) + (folded & 0xFFFF);
    folded = (folded >> 8) + (folded & 0xFF);
    return folded;
}

/*
 * t3_rmlsbsub() - count the bits in a long long.
 *
 * Repeatedly isolates the lowest set bit with (n & -n), subtracts it from
 * n, and counts how many rounds it takes for n to reach zero.
 *
 * @param n  the word to be counted; must be non-zero (asserted)
 * @returns  the number of 1 bits in n
 */
unsigned        t3_rmlsbsub(bitboard n)
{
    unsigned        removed = 0;

    assert(n);
    while (n != 0) {
        const bitboard  lowest = n & -n;

        n -= lowest;
        ++removed;
    }
    return removed;
}

/*
 * t4_rmlsbmask() - count the bits in a long long.
 *
 * Clears the lowest set bit with n &= n - 1 until nothing is left; the
 * number of rounds is the bit count.
 *
 * @param n  the word to be counted; must be non-zero (asserted)
 * @returns  the number of 1 bits in n
 */
unsigned        t4_rmlsbmask(bitboard n)
{
    unsigned        cleared = 0;

    assert(n);
    while (n != 0) {
        n = n & (n - 1);        /* drop the lowest set bit */
        ++cleared;
    }
    return cleared;
}

/*
 * t5_testlsb() - count the bits in a long long.
 *
 * Shifts the value right one position at a time and inspects the bottom
 * bit on every step; stops as soon as no set bits remain.
 *
 * @param n  the word to be counted; must be non-zero (asserted)
 * @returns  the number of 1 bits in n
 */
unsigned        t5_testlsb(bitboard n)
{
    unsigned        ones = 0;

    assert(n);
    while (n != 0) {
        if (n & 1)
            ++ones;
        n >>= 1;
    }
    return ones;
}

/*
 * t6_testmsb() - count the bits in a long long.
 *
 * Inspects the top bit and then moves the value one position to the left.
 * The left move is written as an addition (n + n) rather than a shift.
 *
 * @param n  the word to be counted; must be non-zero (asserted)
 * @returns  the number of 1 bits in n
 */
unsigned        t6_testmsb(bitboard n)
{
    /* only the most significant bit of a bitboard is set in this mask */
    const bitboard  top_bit = ~(~(bitboard) 0 >> 1);
    unsigned        ones = 0;

    assert(n);
    while (n != 0) {
        if ((n & top_bit) != 0)
            ++ones;
        n = n + n;
    }
    return ones;
}

/*
 * t7_testsignandshift() - count the bits in a long long.
 *
 * Reinterprets the value as signed to test the top bit (negative means the
 * top bit is set), then shifts left by one; repeats until the value is zero.
 *
 * @param n  the word to be counted; must be non-zero (asserted)
 * @returns  the number of 1 bits in n
 */
unsigned        t7_testsignandshift(bitboard n)
{
    unsigned        ones = 0;

    assert(n);
    while (n != 0) {
        const long long as_signed = (long long) n;

        if (as_signed < 0)
            ++ones;
        n <<= 1;
    }
    return ones;
}

/*
 * t8_testeachbit() - count the bits in a long long.
 *
 * Walks a single-bit mask from the lowest position upwards and tests the
 * value against it.  The walk ends when the mask bit is shifted out, so no
 * bit width is hard-coded.
 *
 * @param n  the word to be counted; must be non-zero (asserted)
 * @returns  the number of 1 bits in n
 */
unsigned        t8_testeachbit(bitboard n)
{
    bitboard        probe = 1;
    unsigned        ones = 0;

    assert(n);
    do {
        if ((n & probe) != 0)
            ++ones;
        probe <<= 1;
    } while (probe != 0);
    return ones;
}

/*
 * t9_testeachbit1shl() - count the bits in a long long.
 *
 * Tests every bit position in turn.  The number of positions (64) is
 * hard-coded here; it could instead be derived from
 * sizeof(long long) * CHAR_BIT.
 *
 * @param n  the word to be counted; must be non-zero (asserted)
 * @returns  the number of 1 bits in n
 */
unsigned        t9_testeachbit1shl(bitboard n)
{
    unsigned        ones = 0;
    unsigned        position = 0;

    assert(n);
    while (position < 64) {
        const bitboard  probe = (bitboard) 1 << position;

        if ((n & probe) != 0)
            ++ones;
        ++position;
    }
    return ones;
}


/*
 * Lookup table for the table-driven counters below:
 * bits_in_byte[v] is the number of 1 bits in the byte value v (0..255).
 * Each row of the initializer covers 16 consecutive byte values.
 */
static const char bits_in_byte[256] =
{
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
};

/*
 * tA_tableshift() - table-driven count: each of the eight bytes is
 * extracted with a shift and a 0xff mask and looked up in bits_in_byte.
 *
 * @param n  the word to be counted; must be non-zero (asserted)
 * @returns  the number of 1 bits in n
 */
unsigned        tA_tableshift(bitboard n)
{
    unsigned        total = 0;
    unsigned        shift;

    assert(n);
    for (shift = 0; shift < 64; shift += 8)
        total += bits_in_byte[(n >> shift) & 0xff];
    return total;
}

/*
 * tB_tableuchar() - table-driven count: the argument's storage is read as
 * eight unsigned chars and each one is looked up in bits_in_byte.
 *
 * @param n  the word to be counted; must be non-zero (asserted)
 * @returns  the number of 1 bits in n
 */
unsigned        tB_tableuchar(bitboard n)
{
    unsigned char  *bytes = (unsigned char *) &n;
    unsigned        total = 0;
    unsigned        k;

    assert(n);
    for (k = 0; k < 8; ++k)
        total += bits_in_byte[bytes[k]];
    return total;
}

/*
 * tC_tableshiftcast() - table-driven count: each byte is extracted with a
 * shift and a cast to unsigned char (instead of a mask) and looked up in
 * bits_in_byte.
 *
 * @param n  the word to be counted; must be non-zero (asserted)
 * @returns  the number of 1 bits in n
 */
unsigned        tC_tableshiftcast(bitboard n)
{
    unsigned        total;

    assert(n);
    total = bits_in_byte[(unsigned char) n];
    total += bits_in_byte[(unsigned char) (n >> 8)];
    total += bits_in_byte[(unsigned char) (n >> 16)];
    total += bits_in_byte[(unsigned char) (n >> 24)];
    total += bits_in_byte[(unsigned char) (n >> 32)];
    total += bits_in_byte[(unsigned char) (n >> 40)];
    total += bits_in_byte[(unsigned char) (n >> 48)];
    total += bits_in_byte[(unsigned char) (n >> 56)];
    return total;
}

/*
 * tD_itableshift() - table-driven count using shift and mask; it performs
 * the same computation as tA_tableshift.
 *
 * @param n  the word to be counted; must be non-zero (asserted)
 * @returns  the number of 1 bits in n
 */
unsigned        tD_itableshift(bitboard n)
{
    unsigned        total = 0;
    int             remaining;

    assert(n);
    /* consume the value one byte at a time, lowest byte first */
    for (remaining = 8; remaining > 0; --remaining) {
        total += bits_in_byte[n & 0xff];
        n >>= 8;
    }
    return total;
}

/*
 * tE_itableuchar() - table-driven count reading the argument's storage as
 * eight unsigned chars; it performs the same computation as tB_tableuchar.
 *
 * @param n  the word to be counted; must be non-zero (asserted)
 * @returns  the number of 1 bits in n
 */
unsigned        tE_itableuchar(bitboard n)
{
    unsigned char  *cursor = (unsigned char *) &n;
    unsigned char  *const stop = cursor + 8;
    unsigned        total = 0;

    assert(n);
    while (cursor != stop)
        total += bits_in_byte[*cursor++];
    return total;
}

/*
 * tF_itableshiftcast() - table-driven count using shift and a cast to
 * unsigned char; it performs the same computation as tC_tableshiftcast.
 *
 * @param n  the word to be counted; must be non-zero (asserted)
 * @returns  the number of 1 bits in n
 */
unsigned        tF_itableshiftcast(bitboard n)
{
    unsigned        total = 0;
    unsigned        shift;

    assert(n);
    for (shift = 0; shift < 64; shift += 8)
        total += bits_in_byte[(unsigned char) (n >> shift)];
    return total;
}

/*
 * Building blocks for tG_parallel().  Step k merges neighbouring fields of
 * TG_SHIFT(k) bits each:
 *   TG_SHIFT(k)  field width in bits: 1, 2, 4, 8, 16, 32 for k = 0..5
 *   TG_MASK(k)   selects every other field: 0x5555..., 0x3333..., ...
 *   TG_FOLD(v,k) adds each selected field to its upper neighbour
 *
 * The divisor of TG_MASK is formed in unsigned long long.  Written with a
 * plain int 1, step 5 would evaluate 1 << 32, which is undefined when int
 * is 32 bits wide.
 */
#define TG_SHIFT(k)   (1u << (k))
#define TG_MASK(k)    (0xffffffffffffffffULL / ((1ULL << TG_SHIFT(k)) + 1ULL))
#define TG_FOLD(v, k) (((v) & TG_MASK(k)) + (((v) >> TG_SHIFT(k)) & TG_MASK(k)))

/*
 * tG_parallel() - population count by six parallel folding steps whose
 * masks are computed from the step number.
 *
 * @param n  the word to be counted; must be non-zero (asserted)
 * @returns  the number of 1 bits in n
 */
unsigned        tG_parallel(bitboard n)
{
    assert(n);
    n = TG_FOLD(n, 0);
    n = TG_FOLD(n, 1);
    n = TG_FOLD(n, 2);
    n = TG_FOLD(n, 3);
    n = TG_FOLD(n, 4);
    n = TG_FOLD(n, 5);
    return (unsigned) n;
}

#undef TG_FOLD
#undef TG_MASK
#undef TG_SHIFT

/*
 * tH_hamming() - population count by six masked folds, masking both
 * operands at every step, including the last one.
 *
 * @param w  the word to be counted; must be non-zero (asserted)
 * @returns  the number of 1 bits in w
 */
unsigned        tH_hamming(bitboard w)
{
    const bitboard  every_bit = 0x5555555555555555ULL;
    const bitboard  every_pair = 0x3333333333333333ULL;
    const bitboard  every_nibble = 0x0F0F0F0F0F0F0F0FULL;
    const bitboard  every_byte = 0x00FF00FF00FF00FFULL;
    const bitboard  every_half = 0x0000FFFF0000FFFFULL;
    const bitboard  low_word = 0x00000000FFFFFFFFULL;
    bitboard        acc;

    assert(w);
    acc = ((w >> 1) & every_bit) + (w & every_bit);
    acc = ((acc >> 2) & every_pair) + (acc & every_pair);
    acc = ((acc >> 4) & every_nibble) + (acc & every_nibble);
    acc = ((acc >> 8) & every_byte) + (acc & every_byte);
    acc = ((acc >> 16) & every_half) + (acc & every_half);
    acc = ((acc >> 32) & low_word) + (acc & low_word);
    return (unsigned) acc;
}

/* Trial (R. Scharnagl, first idea, not optimized) */

#define msk1 0xEEEEEEEE
#define msk2 0xCCCCCCCC
#define msk3 0x88888888
#define msk4 0x0F0F0F0F

/*
 * tI_Scharnagl() - population count via per-nibble subtraction.
 *
 * For every nibble x of a 32-bit half, x - (x >> 1) - (x >> 2) - (x >> 3)
 * (with the shifted-in neighbour bits masked off) is the number of 1 bits
 * in that nibble.  The nibble counts of both halves are added and then
 * folded into one total.
 *
 * The upper half is obtained with a shift.  Reading it through an
 * (unsigned *) alias of the argument would only work for little-endian
 * storage and would access a long long through an incompatible pointer type.
 *
 * NOTE(review): relies on unsigned being 32 bits wide - confirm for the
 * target compiler.
 *
 * @param b  the word to be counted
 * @returns  the number of 1 bits in b
 */
unsigned        tI_Scharnagl(bitboard b)
{
    unsigned        buf;
    unsigned        acc;

    /* lower 32 bits */
    buf = (unsigned) b;
    acc = buf - ((buf & msk1) >> 1)
        - ((buf & msk2) >> 2)
        - ((buf & msk3) >> 3);

    /* upper 32 bits, independent of byte order */
    buf = (unsigned) (b >> 32);
    acc += buf - ((buf & msk1) >> 1)
        - ((buf & msk2) >> 2)
        - ((buf & msk3) >> 3);

    /* nibble counts (<= 8) -> byte counts -> 16-bit counts -> total */
    acc = (acc & msk4) + ((acc >> 4) & msk4);
    acc = (acc & 0xFFFF) + (acc >> 16);
    acc = (acc & 0xFF) + (acc >> 8);
    return acc;
}

/*
 * tJ_Scharnagl() - population count via per-nibble subtraction, using the
 * msk1..msk4 masks defined earlier in this file.
 *
 * For every nibble x of a 32-bit half, x - (x >> 1) - (x >> 2) - (x >> 3)
 * (with the shifted-in neighbour bits masked off) is the number of 1 bits
 * in that nibble.  The nibble counts of both halves are added and then
 * folded into one total.
 *
 * The upper half is obtained with a shift.  Reading it through an
 * (unsigned *) alias of the argument would only work for little-endian
 * storage and would access a long long through an incompatible pointer type.
 *
 * NOTE(review): relies on unsigned being 32 bits wide - confirm for the
 * target compiler.
 *
 * @param b  the word to be counted
 * @returns  the number of 1 bits in b
 */
unsigned tJ_Scharnagl(bitboard b) /* P.S.: might be unsigned */
{
   unsigned buf;
   unsigned acc;

   /* lower 32 bits */
   buf  = (unsigned)b;
   acc  = buf - ((buf & msk1)>>1)
           - ((buf & msk2)>>2)
           - ((buf & msk3)>>3);

   /* upper 32 bits, independent of byte order */
   buf  = (unsigned)(b>>32);
   acc += buf - ((buf & msk1)>>1)
           - ((buf & msk2)>>2)
           - ((buf & msk3)>>3);

   /* nibble counts (<= 8) -> byte counts -> 16-bit counts -> total */
   acc = (acc & msk4) + ((acc >> 4) & msk4);
   acc = (acc & 0xFFFF) + (acc >> 16);
   acc = (acc & 0xFF) + (acc >> 8);
   return acc;
}


#ifdef UNIT_TEST

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <sys/timeb.h>

/*
 * getms() - current time from ftime(), expressed in milliseconds.
 *
 * The product is formed in long long so that seconds * 1000 cannot overflow
 * a narrower time_t.  The value is then truncated to int, so only the
 * difference between two nearby calls is meaningful.
 *
 * @returns  seconds * 1000 + milliseconds, truncated to int
 */
int             getms(void)
{
    struct timeb    timebuffer;
    long long       ms;

    ftime(&timebuffer);
    ms = (long long) timebuffer.time * 1000 + timebuffer.millitm;
    return (int) ms;
}

/*
 * rand64() - XOR of five rand() results shifted left by 0, 15, 30, 45 and
 * 60 bits, each widened to bitboard first.
 * NOTE(review): the 15-bit spacing presumably targets a 15-bit RAND_MAX -
 * confirm for the C library in use.
 */
#define rand64() (((bitboard)rand())^(((bitboard)rand())<<15)^(((bitboard)rand())<<30)^(((bitboard)rand())<<45)^(((bitboard)rand())<<60))
/* Number of bitboards fed to each counter in one timing run. */
#define TABLE_SIZE 10000000
/* Shared input table; filled by main() and randomize_nbits(), read by time_it(). */
bitboard        randtable[TABLE_SIZE];

/* Signature shared by all bit-count functions under test. */
typedef unsigned (*bc_func) (bitboard w);

/*
 * time_it() - run one counter over the whole of randtable and print the
 * elapsed time as ": <ms> ms" followed by a newline.
 *
 * @param b  the bit-count function to be timed
 * @returns  the sum of all counts, converted to unsigned
 */
unsigned        time_it(bc_func b)
{
    long            total = 0;
    const bitboard *cursor = randtable;
    const bitboard *const stop = randtable + TABLE_SIZE;
    const int       started = getms();

    while (cursor != stop)
        total += b(*cursor++);
    printf(": %d ms\n", getms() - started);
    return (unsigned) total;
}
void randomize_nbits(unsigned nbits)
{
    unsigned i, j;
    int time = getms();
    for (i = 0; i < TABLE_SIZE; i++) {
        randtable[i] = 0;
        for(j = 0; j < nbits; j++)
            randtable[i] |= 1 << (rand() % 64);
    }
    printf("Initialized rand Table for %u bits: %d ms\n", nbits, getms() - time);
}

void runtests(void)
{
    unsigned        result0;
    unsigned        result1;
    unsigned        result2;
    unsigned        result3;
    unsigned        result4;
    unsigned        result5;
    unsigned        result6;
    unsigned        result7;
    unsigned        result8;
    unsigned        result9;
    unsigned        resultA;
    unsigned        resultB;
    unsigned        resultC;
    unsigned        resultD;
    unsigned        resultE;
    unsigned        resultF;
    unsigned        resultG;
    unsigned        resultH;
    unsigned        resultI;

    printf("t0_and_masks");
    result0 = time_it(t0_and_masks);
    printf("t1_and_masks");
    result1 = time_it(t1_and_masks);
    printf("t2_and_masks");
    result2 = time_it(t2_and_masks);
    printf("t3_rmlsbsub");
    result3 = time_it(t3_rmlsbsub);
    printf("t4_rmlsbmask");
    result4 = time_it(t4_rmlsbmask);
    printf("t5_testlsb");
    result5 = time_it(t5_testlsb);
    printf("t6_testmsb");
    result6 = time_it(t6_testmsb);
    printf("t7_testsignandshift");
    result7 = time_it(t7_testsignandshift);
    printf("t8_testeachbit");
    result8 = time_it(t8_testeachbit);
    printf("t9_testeachbit1shl");
    result9 = time_it(t9_testeachbit1shl);
    printf("tA_tableshift");
    resultA = time_it(tA_tableshift);
    printf("tB_tableuchar");
    resultB = time_it(tB_tableuchar);
    printf("tC_tableshiftcast");
    resultC = time_it(tC_tableshiftcast);
    printf("tD_itableshift");
    resultD = time_it(tD_itableshift);
    printf("tE_itableuchar");
    resultE = time_it(tE_itableuchar);
    printf("tF_itableshiftcast");
    resultF = time_it(tF_itableshiftcast);
    printf("tG_parallel");
    resultG = time_it(tG_parallel);
    printf("tH_hamming");
    resultH = time_it(tH_hamming);
    printf("tI_Scharnagl");
    resultI = time_it(tI_Scharnagl);
    printf("tJ_Scharnagl");
    resultI = time_it(tJ_Scharnagl);

}
int             main(void)
{
    unsigned        ut0_and_masks = 0;
    unsigned        ut1_and_masks = 0;
    unsigned        ut2_and_masks = 0;
    unsigned        ut3_rmlsbsub = 0;
    unsigned        ut4_rmlsbmask = 0;
    unsigned        ut5_testlsb = 0;
    unsigned        ut6_testmsb = 0;
    unsigned        ut7_testsignandshift = 0;
    unsigned        ut8_testeachbit = 0;
    unsigned        ut9_testeachbit1shl = 0;
    unsigned        utA_tableshift = 0;
    unsigned        utB_tableuchar = 0;
    unsigned        utC_tableshiftcast = 0;
    unsigned        utD_itableshift = 0;
    unsigned        utE_itableuchar = 0;
    unsigned        utF_itableshiftcast = 0;
    unsigned        utG_parallel = 0;
    unsigned        utH_hamming = 0;
    unsigned        utI_Scharnagl = 0;
    unsigned        utJ_Scharnagl = 0;

    bitboard        b;
    int             i;

    int time = getms();
    for (i = 0; i < TABLE_SIZE; i++)
        randtable[i] = rand64();
    printf("Initialized rand Table: %d ms\n", getms() - time);

    runtests();

    for (i = 1; i < 33; i++)
    {
        randomize_nbits(i);
        runtests();
    }
    for (i = 0; i < 1000000; i++) {
#ifdef DENSE
        b = rand();
        b <<= 32;
        b |= rand();
#else

        b = (1 << (rand() % 32));
        b <<= 32;
        b |= (1 << (rand() % 64));
#endif
        ut0_and_masks = t0_and_masks(b);
        ut1_and_masks = t1_and_masks(b);
        ut2_and_masks = t2_and_masks(b);
        ut3_rmlsbsub = t3_rmlsbsub(b);
        ut4_rmlsbmask = t4_rmlsbmask(b);
        ut5_testlsb = t5_testlsb(b);
        ut6_testmsb = t6_testmsb(b);
        ut7_testsignandshift = t7_testsignandshift(b);
        ut8_testeachbit = t8_testeachbit(b);
        ut9_testeachbit1shl = t9_testeachbit1shl(b);
        utA_tableshift = tA_tableshift(b);
        utB_tableuchar = tB_tableuchar(b);
        utC_tableshiftcast = tC_tableshiftcast(b);
        utD_itableshift = tD_itableshift(b);
        utE_itableuchar = tE_itableuchar(b);
        utF_itableshiftcast = tF_itableshiftcast(b);
        utG_parallel = tG_parallel(b);
        utH_hamming = tH_hamming(b);
        utI_Scharnagl = tI_Scharnagl(b);
        utJ_Scharnagl = tJ_Scharnagl(b);
        if ((ut0_and_masks != ut1_and_masks) ||
            (ut0_and_masks != ut2_and_masks) ||
            (ut0_and_masks != ut3_rmlsbsub) ||
            (ut0_and_masks != ut4_rmlsbmask) ||
            (ut0_and_masks != ut5_testlsb) ||
            (ut0_and_masks != ut6_testmsb) ||
            (ut0_and_masks != ut7_testsignandshift) ||
            (ut0_and_masks != ut8_testeachbit) ||
            (ut0_and_masks != ut9_testeachbit1shl) ||
            (ut0_and_masks != utA_tableshift) ||
            (ut0_and_masks != utB_tableuchar) ||
            (ut0_and_masks != utC_tableshiftcast) ||
            (ut0_and_masks != utD_itableshift) ||
            (ut0_and_masks != utE_itableuchar) ||
            (ut0_and_masks != utF_itableshiftcast) ||
            (ut0_and_masks != utG_parallel) ||
            (ut0_and_masks != utH_hamming) ||
            (ut0_and_masks != utI_Scharnagl) ||
            (ut0_and_masks != utJ_Scharnagl)
            ) {
            printf("0: %u 1: %u\n", ut0_and_masks, ut1_and_masks);
            printf("0: %u 2: %u\n", ut0_and_masks, ut2_and_masks);
            printf("0: %u 3: %u\n", ut0_and_masks, ut3_rmlsbsub);
            printf("0: %u 4: %u\n", ut0_and_masks, ut4_rmlsbmask);
            printf("0: %u 5: %u\n", ut0_and_masks, ut5_testlsb);
            printf("0: %u 6: %u\n", ut0_and_masks, ut6_testmsb);
            printf("0: %u 7: %u\n", ut0_and_masks, ut7_testsignandshift);
            printf("0: %u 8: %u\n", ut0_and_masks, ut8_testeachbit);
            printf("0: %u 9: %u\n", ut0_and_masks, ut9_testeachbit1shl);
            printf("0: %u A: %u\n", ut0_and_masks, utA_tableshift);
            printf("0: %u B: %u\n", ut0_and_masks, utB_tableuchar);
            printf("0: %u C: %u\n", ut0_and_masks, utC_tableshiftcast);
            printf("0: %u D: %u\n", ut0_and_masks, utD_itableshift);
            printf("0: %u E: %u\n", ut0_and_masks, utE_itableuchar);
            printf("0: %u F: %u\n", ut0_and_masks, utF_itableshiftcast);
            printf("0: %u G: %u\n", ut0_and_masks, utG_parallel);
            printf("0: %u H: %u\n", ut0_and_masks, utH_hamming);
            printf("0: %u I: %u\n", ut0_and_masks, utI_Scharnagl);
            printf("0: %u J: %u\n", ut0_and_masks, utJ_Scharnagl);
        }
        b = 0xffffffffffffffff;
        ut0_and_masks = t0_and_masks(b);
        ut1_and_masks = t1_and_masks(b);
        ut2_and_masks = t2_and_masks(b);
        ut3_rmlsbsub = t3_rmlsbsub(b);
        ut4_rmlsbmask = t4_rmlsbmask(b);
        ut5_testlsb = t5_testlsb(b);
        ut6_testmsb = t6_testmsb(b);
        ut7_testsignandshift = t7_testsignandshift(b);
        ut8_testeachbit = t8_testeachbit(b);
        ut9_testeachbit1shl = t9_testeachbit1shl(b);
        utA_tableshift = tA_tableshift(b);
        utB_tableuchar = tB_tableuchar(b);
        utC_tableshiftcast = tC_tableshiftcast(b);
        utD_itableshift = tD_itableshift(b);
        utE_itableuchar = tE_itableuchar(b);
        utF_itableshiftcast = tF_itableshiftcast(b);
        utG_parallel = tG_parallel(b);
        utH_hamming = tH_hamming(b);
        utI_Scharnagl = tI_Scharnagl(b);
        utJ_Scharnagl = tJ_Scharnagl(b);
        if ((ut0_and_masks != ut1_and_masks) ||
            (ut0_and_masks != ut2_and_masks) ||
            (ut0_and_masks != ut3_rmlsbsub) ||
            (ut0_and_masks != ut4_rmlsbmask) ||
            (ut0_and_masks != ut5_testlsb) ||
            (ut0_and_masks != ut6_testmsb) ||
            (ut0_and_masks != ut7_testsignandshift) ||
            (ut0_and_masks != ut8_testeachbit) ||
            (ut0_and_masks != ut9_testeachbit1shl) ||
            (ut0_and_masks != utA_tableshift) ||
            (ut0_and_masks != utB_tableuchar) ||
            (ut0_and_masks != utC_tableshiftcast) ||
            (ut0_and_masks != utD_itableshift) ||
            (ut0_and_masks != utE_itableuchar) ||
            (ut0_and_masks != utF_itableshiftcast) ||
            (ut0_and_masks != utG_parallel) ||
            (ut0_and_masks != utH_hamming) ||
            (ut0_and_masks != utI_Scharnagl) ||
            (ut0_and_masks != utJ_Scharnagl)
            ) {
            printf("0: %u 1: %u\n", ut0_and_masks, ut1_and_masks);
            printf("0: %u 2: %u\n", ut0_and_masks, ut2_and_masks);
            printf("0: %u 3: %u\n", ut0_and_masks, ut3_rmlsbsub);
            printf("0: %u 4: %u\n", ut0_and_masks, ut4_rmlsbmask);
            printf("0: %u 5: %u\n", ut0_and_masks, ut5_testlsb);
            printf("0: %u 6: %u\n", ut0_and_masks, ut6_testmsb);
            printf("0: %u 7: %u\n", ut0_and_masks, ut7_testsignandshift);
            printf("0: %u 8: %u\n", ut0_and_masks, ut8_testeachbit);
            printf("0: %u 9: %u\n", ut0_and_masks, ut9_testeachbit1shl);
            printf("0: %u A: %u\n", ut0_and_masks, utA_tableshift);
            printf("0: %u B: %u\n", ut0_and_masks, utB_tableuchar);
            printf("0: %u C: %u\n", ut0_and_masks, utC_tableshiftcast);
            printf("0: %u D: %u\n", ut0_and_masks, utD_itableshift);
            printf("0: %u E: %u\n", ut0_and_masks, utE_itableuchar);
            printf("0: %u F: %u\n", ut0_and_masks, utF_itableshiftcast);
            printf("0: %u G: %u\n", ut0_and_masks, utG_parallel);
            printf("0: %u H: %u\n", ut0_and_masks, utH_hamming);
            printf("0: %u I: %u\n", ut0_and_masks, utI_Scharnagl);
            printf("0: %u J: %u\n", ut0_and_masks, utJ_Scharnagl);
        }
        b = 1;
        ut0_and_masks = t0_and_masks(b);
        ut1_and_masks = t1_and_masks(b);
        ut2_and_masks = t2_and_masks(b);
        ut3_rmlsbsub = t3_rmlsbsub(b);
        ut4_rmlsbmask = t4_rmlsbmask(b);
        ut5_testlsb = t5_testlsb(b);
        ut6_testmsb = t6_testmsb(b);
        ut7_testsignandshift = t7_testsignandshift(b);
        ut8_testeachbit = t8_testeachbit(b);
        ut9_testeachbit1shl = t9_testeachbit1shl(b);
        utA_tableshift = tA_tableshift(b);
        utB_tableuchar = tB_tableuchar(b);
        utC_tableshiftcast = tC_tableshiftcast(b);
        utD_itableshift = tD_itableshift(b);
        utE_itableuchar = tE_itableuchar(b);
        utF_itableshiftcast = tF_itableshiftcast(b);
        utG_parallel = tG_parallel(b);
        utH_hamming = tH_hamming(b);
        utI_Scharnagl = tI_Scharnagl(b);
        utJ_Scharnagl = tJ_Scharnagl(b);
        if ((ut0_and_masks != ut1_and_masks) ||
            (ut0_and_masks != ut2_and_masks) ||
            (ut0_and_masks != ut3_rmlsbsub) ||
            (ut0_and_masks != ut4_rmlsbmask) ||
            (ut0_and_masks != ut5_testlsb) ||
            (ut0_and_masks != ut6_testmsb) ||
            (ut0_and_masks != ut7_testsignandshift) ||
            (ut0_and_masks != ut8_testeachbit) ||
            (ut0_and_masks != ut9_testeachbit1shl) ||
            (ut0_and_masks != utA_tableshift) ||
            (ut0_and_masks != utB_tableuchar) ||
            (ut0_and_masks != utC_tableshiftcast) ||
            (ut0_and_masks != utD_itableshift) ||
            (ut0_and_masks != utE_itableuchar) ||
            (ut0_and_masks != utF_itableshiftcast) ||
            (ut0_and_masks != utG_parallel) ||
            (ut0_and_masks != utH_hamming) ||
            (ut0_and_masks != utI_Scharnagl) ||
            (ut0_and_masks != utJ_Scharnagl)
            ) {
            printf("0: %u 1: %u\n", ut0_and_masks, ut1_and_masks);
            printf("0: %u 2: %u\n", ut0_and_masks, ut2_and_masks);
            printf("0: %u 3: %u\n", ut0_and_masks, ut3_rmlsbsub);
            printf("0: %u 4: %u\n", ut0_and_masks, ut4_rmlsbmask);
            printf("0: %u 5: %u\n", ut0_and_masks, ut5_testlsb);
            printf("0: %u 6: %u\n", ut0_and_masks, ut6_testmsb);
            printf("0: %u 7: %u\n", ut0_and_masks, ut7_testsignandshift);
            printf("0: %u 8: %u\n", ut0_and_masks, ut8_testeachbit);
            printf("0: %u 9: %u\n", ut0_and_masks, ut9_testeachbit1shl);
            printf("0: %u A: %u\n", ut0_and_masks, utA_tableshift);
            printf("0: %u B: %u\n", ut0_and_masks, utB_tableuchar);
            printf("0: %u C: %u\n", ut0_and_masks, utC_tableshiftcast);
            printf("0: %u D: %u\n", ut0_and_masks, utD_itableshift);
            printf("0: %u E: %u\n", ut0_and_masks, utE_itableuchar);
            printf("0: %u F: %u\n", ut0_and_masks, utF_itableshiftcast);
            printf("0: %u G: %u\n", ut0_and_masks, utG_parallel);
            printf("0: %u H: %u\n", ut0_and_masks, utH_hamming);
            printf("0: %u I: %u\n", ut0_and_masks, utI_Scharnagl);
            printf("0: %u J: %u\n", ut0_and_masks, utJ_Scharnagl);
        }
    }

    return 0;
}
#endif
Dann Corbit
 

Re: GCC Inline asm problem

Postby Dann Corbit » 20 Jul 2005, 01:46

Here is my result on a 32 bit machine:

Code: Select all
Initialized rand Table: 500 ms
t0_and_masks: 234 ms
t1_and_masks: 204 ms
t2_and_masks: 140 ms
t3_rmlsbsub: 969 ms
t4_rmlsbmask: 750 ms
t5_testlsb: 1672 ms
t6_testmsb: 8797 ms
t7_testsignandshift: 6860 ms
t8_testeachbit: 8156 ms
t9_testeachbit1shl: 6531 ms
tA_tableshift: 172 ms
tB_tableuchar: 203 ms
tC_tableshiftcast: 172 ms
tD_itableshift: 172 ms
tE_itableuchar: 188 ms
tF_itableshiftcast: 172 ms
tG_parallel: 250 ms
tH_hamming: 234 ms
tI_Scharnagl: 125 ms
tJ_Scharnagl: 125 ms
Initialized rand Table for 1 bits: 141 ms
t0_and_masks: 234 ms
t1_and_masks: 172 ms
t2_and_masks: 140 ms
t3_rmlsbsub: 125 ms
t4_rmlsbmask: 94 ms
t5_testlsb: 563 ms
t6_testmsb: 4265 ms
t7_testsignandshift: 2657 ms
t8_testeachbit: 5766 ms
t9_testeachbit1shl: 3672 ms
tA_tableshift: 171 ms
tB_tableuchar: 204 ms
tC_tableshiftcast: 171 ms
tD_itableshift: 172 ms
tE_itableuchar: 188 ms
tF_itableshiftcast: 172 ms
tG_parallel: 250 ms
tH_hamming: 234 ms
tI_Scharnagl: 141 ms
tJ_Scharnagl: 125 ms
Initialized rand Table for 2 bits: 218 ms
t0_and_masks: 235 ms
t1_and_masks: 172 ms
t2_and_masks: 156 ms
t3_rmlsbsub: 172 ms
t4_rmlsbmask: 156 ms
t5_testlsb: 703 ms
t6_testmsb: 4844 ms
t7_testsignandshift: 1875 ms
t8_testeachbit: 6735 ms
t9_testeachbit1shl: 4281 ms
tA_tableshift: 172 ms
tB_tableuchar: 187 ms
tC_tableshiftcast: 188 ms
tD_itableshift: 422 ms
tE_itableuchar: 515 ms
tF_itableshiftcast: 188 ms
tG_parallel: 250 ms
tH_hamming: 234 ms
tI_Scharnagl: 125 ms
tJ_Scharnagl: 141 ms
Initialized rand Table for 3 bits: 1016 ms
t0_and_masks: 1078 ms
t1_and_masks: 328 ms
t2_and_masks: 344 ms
t3_rmlsbsub: 234 ms
t4_rmlsbmask: 203 ms
t5_testlsb: 797 ms
t6_testmsb: 5485 ms
t7_testsignandshift: 1984 ms
t8_testeachbit: 6406 ms
t9_testeachbit1shl: 4469 ms
tA_tableshift: 172 ms
tB_tableuchar: 188 ms
tC_tableshiftcast: 187 ms
tD_itableshift: 172 ms
tE_itableuchar: 188 ms
tF_itableshiftcast: 218 ms
tG_parallel: 282 ms
tH_hamming: 250 ms
tI_Scharnagl: 156 ms
tJ_Scharnagl: 125 ms
Initialized rand Table for 4 bits: 625 ms
t0_and_masks: 266 ms
t1_and_masks: 187 ms
t2_and_masks: 156 ms
t3_rmlsbsub: 328 ms
t4_rmlsbmask: 266 ms
t5_testlsb: 859 ms
t6_testmsb: 7001 ms
t7_testsignandshift: 3797 ms
t8_testeachbit: 8281 ms
t9_testeachbit1shl: 3906 ms
tA_tableshift: 172 ms
tB_tableuchar: 188 ms
tC_tableshiftcast: 172 ms
tD_itableshift: 172 ms
tE_itableuchar: 203 ms
tF_itableshiftcast: 172 ms
tG_parallel: 250 ms
tH_hamming: 250 ms
tI_Scharnagl: 140 ms
tJ_Scharnagl: 141 ms
Initialized rand Table for 5 bits: 547 ms
t0_and_masks: 250 ms
t1_and_masks: 172 ms
t2_and_masks: 171 ms
t3_rmlsbsub: 344 ms
t4_rmlsbmask: 297 ms
t5_testlsb: 906 ms
t6_testmsb: 5735 ms
t7_testsignandshift: 2219 ms
t8_testeachbit: 7062 ms
t9_testeachbit1shl: 4079 ms
tA_tableshift: 218 ms
tB_tableuchar: 313 ms
tC_tableshiftcast: 172 ms
tD_itableshift: 187 ms
tE_itableuchar: 188 ms
tF_itableshiftcast: 187 ms
tG_parallel: 266 ms
tH_hamming: 297 ms
tI_Scharnagl: 125 ms
tJ_Scharnagl: 125 ms
Initialized rand Table for 6 bits: 625 ms
t0_and_masks: 250 ms
t1_and_masks: 187 ms
t2_and_masks: 157 ms
t3_rmlsbsub: 421 ms
t4_rmlsbmask: 344 ms
t5_testlsb: 1594 ms
t6_testmsb: 6797 ms
t7_testsignandshift: 2485 ms
t8_testeachbit: 5468 ms
t9_testeachbit1shl: 4797 ms
tA_tableshift: 172 ms
tB_tableuchar: 219 ms
tC_tableshiftcast: 188 ms
tD_itableshift: 171 ms
tE_itableuchar: 204 ms
tF_itableshiftcast: 187 ms
tG_parallel: 281 ms
tH_hamming: 266 ms
tI_Scharnagl: 125 ms
tJ_Scharnagl: 297 ms
Initialized rand Table for 7 bits: 1000 ms
t0_and_masks: 265 ms
t1_and_masks: 188 ms
t2_and_masks: 156 ms
t3_rmlsbsub: 485 ms
t4_rmlsbmask: 390 ms
t5_testlsb: 1453 ms
t6_testmsb: 6079 ms
t7_testsignandshift: 2578 ms
t8_testeachbit: 6156 ms
t9_testeachbit1shl: 4375 ms
tA_tableshift: 188 ms
tB_tableuchar: 187 ms
tC_tableshiftcast: 172 ms
tD_itableshift: 172 ms
tE_itableuchar: 266 ms
tF_itableshiftcast: 172 ms
tG_parallel: 250 ms
tH_hamming: 265 ms
tI_Scharnagl: 125 ms
tJ_Scharnagl: 157 ms
Initialized rand Table for 8 bits: 968 ms
t0_and_masks: 282 ms
t1_and_masks: 156 ms
t2_and_masks: 172 ms
t3_rmlsbsub: 515 ms
t4_rmlsbmask: 422 ms
t5_testlsb: 1000 ms
t6_testmsb: 7376 ms
t7_testsignandshift: 3281 ms
t8_testeachbit: 6047 ms
t9_testeachbit1shl: 4610 ms
tA_tableshift: 203 ms
tB_tableuchar: 328 ms
tC_tableshiftcast: 281 ms
tD_itableshift: 250 ms
tE_itableuchar: 188 ms
tF_itableshiftcast: 203 ms
tG_parallel: 250 ms
tH_hamming: 250 ms
tI_Scharnagl: 140 ms
tJ_Scharnagl: 141 ms
Initialized rand Table for 9 bits: 953 ms
t0_and_masks: 250 ms
t1_and_masks: 172 ms
t2_and_masks: 156 ms
t3_rmlsbsub: 563 ms
t4_rmlsbmask: 469 ms
t5_testlsb: 1046 ms
t6_testmsb: 9032 ms
t7_testsignandshift: 3906 ms
t8_testeachbit: 6063 ms
t9_testeachbit1shl: 4859 ms
tA_tableshift: 172 ms
tB_tableuchar: 188 ms
tC_tableshiftcast: 219 ms
tD_itableshift: 187 ms
tE_itableuchar: 203 ms
tF_itableshiftcast: 188 ms
tG_parallel: 250 ms
tH_hamming: 265 ms
tI_Scharnagl: 125 ms
tJ_Scharnagl: 453 ms
Initialized rand Table for 10 bits: 1344 ms
t0_and_masks: 250 ms
t1_and_masks: 860 ms
t2_and_masks: 406 ms
t3_rmlsbsub: 797 ms
t4_rmlsbmask: 734 ms
t5_testlsb: 1125 ms
t6_testmsb: 7250 ms
t7_testsignandshift: 3313 ms
t8_testeachbit: 8094 ms
t9_testeachbit1shl: 6016 ms
tA_tableshift: 187 ms
tB_tableuchar: 563 ms
tC_tableshiftcast: 172 ms
tD_itableshift: 187 ms
tE_itableuchar: 203 ms
tF_itableshiftcast: 172 ms
tG_parallel: 281 ms
tH_hamming: 250 ms
tI_Scharnagl: 157 ms
tJ_Scharnagl: 125 ms
Initialized rand Table for 11 bits: 1172 ms
t0_and_masks: 250 ms
t1_and_masks: 171 ms
t2_and_masks: 157 ms
t3_rmlsbsub: 687 ms
t4_rmlsbmask: 1500 ms
t5_testlsb: 2047 ms
t6_testmsb: 7594 ms
t7_testsignandshift: 3578 ms
t8_testeachbit: 6188 ms
t9_testeachbit1shl: 4844 ms
tA_tableshift: 172 ms
tB_tableuchar: 187 ms
tC_tableshiftcast: 172 ms
tD_itableshift: 172 ms
tE_itableuchar: 859 ms
tF_itableshiftcast: 157 ms
tG_parallel: 281 ms
tH_hamming: 250 ms
tI_Scharnagl: 125 ms
tJ_Scharnagl: 125 ms
Initialized rand Table for 12 bits: 1250 ms
t0_and_masks: 578 ms
t1_and_masks: 359 ms
t2_and_masks: 157 ms
t3_rmlsbsub: 609 ms
t4_rmlsbmask: 531 ms
t5_testlsb: 1360 ms
t6_testmsb: 7672 ms
t7_testsignandshift: 3344 ms
t8_testeachbit: 6687 ms
t9_testeachbit1shl: 5157 ms
tA_tableshift: 156 ms
tB_tableuchar: 203 ms
tC_tableshiftcast: 188 ms
tD_itableshift: 172 ms
tE_itableuchar: 187 ms
tF_itableshiftcast: 172 ms
tG_parallel: 234 ms
tH_hamming: 250 ms
tI_Scharnagl: 141 ms
tJ_Scharnagl: 125 ms
Initialized rand Table for 13 bits: 1344 ms
t0_and_masks: 234 ms
t1_and_masks: 234 ms
t2_and_masks: 157 ms
t3_rmlsbsub: 656 ms
t4_rmlsbmask: 563 ms
t5_testlsb: 1078 ms
t6_testmsb: 6484 ms
t7_testsignandshift: 2922 ms
t8_testeachbit: 6813 ms
t9_testeachbit1shl: 4844 ms
tA_tableshift: 172 ms
tB_tableuchar: 187 ms
tC_tableshiftcast: 188 ms
tD_itableshift: 171 ms
tE_itableuchar: 188 ms
tF_itableshiftcast: 172 ms
tG_parallel: 234 ms
tH_hamming: 250 ms
tI_Scharnagl: 125 ms
tJ_Scharnagl: 125 ms
Initialized rand Table for 14 bits: 1438 ms
t0_and_masks: 234 ms
t1_and_masks: 172 ms
t2_and_masks: 156 ms
t3_rmlsbsub: 703 ms
t4_rmlsbmask: 563 ms
t5_testlsb: 1203 ms
t6_testmsb: 8313 ms
t7_testsignandshift: 3531 ms
t8_testeachbit: 9063 ms
t9_testeachbit1shl: 5016 ms
tA_tableshift: 218 ms
tB_tableuchar: 188 ms
tC_tableshiftcast: 625 ms
tD_itableshift: 312 ms
tE_itableuchar: 204 ms
tF_itableshiftcast: 171 ms
tG_parallel: 250 ms
tH_hamming: 250 ms
tI_Scharnagl: 125 ms
tJ_Scharnagl: 141 ms
Initialized rand Table for 15 bits: 1719 ms
t0_and_masks: 266 ms
t1_and_masks: 187 ms
t2_and_masks: 156 ms
t3_rmlsbsub: 782 ms
t4_rmlsbmask: 609 ms
t5_testlsb: 1234 ms
t6_testmsb: 7219 ms
t7_testsignandshift: 3078 ms
t8_testeachbit: 6813 ms
t9_testeachbit1shl: 4438 ms
tA_tableshift: 172 ms
tB_tableuchar: 187 ms
tC_tableshiftcast: 172 ms
tD_itableshift: 172 ms
tE_itableuchar: 187 ms
tF_itableshiftcast: 172 ms
tG_parallel: 250 ms
tH_hamming: 235 ms
tI_Scharnagl: 140 ms
tJ_Scharnagl: 125 ms
Initialized rand Table for 16 bits: 1625 ms
t0_and_masks: 235 ms
t1_and_masks: 172 ms
t2_and_masks: 156 ms
t3_rmlsbsub: 734 ms
t4_rmlsbmask: 625 ms
t5_testlsb: 1172 ms
t6_testmsb: 7766 ms
t7_testsignandshift: 3313 ms
t8_testeachbit: 8922 ms
t9_testeachbit1shl: 4953 ms
tA_tableshift: 203 ms
tB_tableuchar: 203 ms
tC_tableshiftcast: 188 ms
tD_itableshift: 188 ms
tE_itableuchar: 203 ms
tF_itableshiftcast: 203 ms
tG_parallel: 266 ms
tH_hamming: 234 ms
tI_Scharnagl: 141 ms
tJ_Scharnagl: 140 ms
Initialized rand Table for 17 bits: 2000 ms
t0_and_masks: 360 ms
t1_and_masks: 172 ms
t2_and_masks: 156 ms
t3_rmlsbsub: 797 ms
t4_rmlsbmask: 781 ms
t5_testlsb: 1188 ms
t6_testmsb: 6843 ms
t7_testsignandshift: 3157 ms
t8_testeachbit: 6469 ms
t9_testeachbit1shl: 5328 ms
tA_tableshift: 187 ms
tB_tableuchar: 188 ms
tC_tableshiftcast: 172 ms
tD_itableshift: 172 ms
tE_itableuchar: 203 ms
tF_itableshiftcast: 172 ms
tG_parallel: 250 ms
tH_hamming: 234 ms
tI_Scharnagl: 125 ms
tJ_Scharnagl: 125 ms
Initialized rand Table for 18 bits: 1844 ms
t0_and_masks: 234 ms
t1_and_masks: 172 ms
t2_and_masks: 156 ms
t3_rmlsbsub: 797 ms
t4_rmlsbmask: 657 ms
t5_testlsb: 1187 ms
t6_testmsb: 7813 ms
t7_testsignandshift: 3766 ms
t8_testeachbit: 6343 ms
t9_testeachbit1shl: 4813 ms
tA_tableshift: 203 ms
tB_tableuchar: 235 ms
tC_tableshiftcast: 187 ms
tD_itableshift: 188 ms
tE_itableuchar: 250 ms
tF_itableshiftcast: 297 ms
tG_parallel: 375 ms
tH_hamming: 250 ms
tI_Scharnagl: 203 ms
tJ_Scharnagl: 140 ms
Initialized rand Table for 19 bits: 2391 ms
t0_and_masks: 234 ms
t1_and_masks: 172 ms
t2_and_masks: 156 ms
t3_rmlsbsub: 922 ms
t4_rmlsbmask: 704 ms
t5_testlsb: 1250 ms
t6_testmsb: 6937 ms
t7_testsignandshift: 5203 ms
t8_testeachbit: 7438 ms
t9_testeachbit1shl: 4641 ms
tA_tableshift: 156 ms
tB_tableuchar: 203 ms
tC_tableshiftcast: 172 ms
tD_itableshift: 156 ms
tE_itableuchar: 204 ms
tF_itableshiftcast: 171 ms
tG_parallel: 250 ms
tH_hamming: 250 ms
tI_Scharnagl: 125 ms
tJ_Scharnagl: 141 ms
Initialized rand Table for 20 bits: 2016 ms
t0_and_masks: 234 ms
t1_and_masks: 156 ms
t2_and_masks: 157 ms
t3_rmlsbsub: 843 ms
t4_rmlsbmask: 688 ms
t5_testlsb: 1219 ms
t6_testmsb: 6703 ms
t7_testsignandshift: 3453 ms
t8_testeachbit: 6563 ms
t9_testeachbit1shl: 5906 ms
tA_tableshift: 172 ms
tB_tableuchar: 219 ms
tC_tableshiftcast: 172 ms
tD_itableshift: 172 ms
tE_itableuchar: 187 ms
tF_itableshiftcast: 172 ms
tG_parallel: 266 ms
tH_hamming: 281 ms
tI_Scharnagl: 125 ms
tJ_Scharnagl: 141 ms
Initialized rand Table for 21 bits: 2515 ms
t0_and_masks: 235 ms
t1_and_masks: 172 ms
t2_and_masks: 140 ms
t3_rmlsbsub: 953 ms
t4_rmlsbmask: 766 ms
t5_testlsb: 1563 ms
t6_testmsb: 7531 ms
t7_testsignandshift: 3781 ms
t8_testeachbit: 6782 ms
t9_testeachbit1shl: 5000 ms
tA_tableshift: 172 ms
tB_tableuchar: 203 ms
tC_tableshiftcast: 172 ms
tD_itableshift: 187 ms
tE_itableuchar: 188 ms
tF_itableshiftcast: 172 ms
tG_parallel: 250 ms
tH_hamming: 250 ms
tI_Scharnagl: 125 ms
tJ_Scharnagl: 125 ms
Initialized rand Table for 22 bits: 2297 ms
t0_and_masks: 250 ms
t1_and_masks: 171 ms
t2_and_masks: 157 ms
t3_rmlsbsub: 937 ms
t4_rmlsbmask: 766 ms
t5_testlsb: 1875 ms
t6_testmsb: 7297 ms
t7_testsignandshift: 3359 ms
t8_testeachbit: 7048 ms
t9_testeachbit1shl: 4859 ms
tA_tableshift: 156 ms
tB_tableuchar: 375 ms
tC_tableshiftcast: 203 ms
tD_itableshift: 172 ms
tE_itableuchar: 188 ms
tF_itableshiftcast: 203 ms
tG_parallel: 250 ms
tH_hamming: 266 ms
tI_Scharnagl: 140 ms
tJ_Scharnagl: 125 ms
Initialized rand Table for 23 bits: 2469 ms
t0_and_masks: 266 ms
t1_and_masks: 203 ms
t2_and_masks: 203 ms
t3_rmlsbsub: 969 ms
t4_rmlsbmask: 1125 ms
t5_testlsb: 1406 ms
t6_testmsb: 6953 ms
t7_testsignandshift: 3266 ms
t8_testeachbit: 6110 ms
t9_testeachbit1shl: 4859 ms
tA_tableshift: 172 ms
tB_tableuchar: 187 ms
tC_tableshiftcast: 172 ms
tD_itableshift: 172 ms
tE_itableuchar: 203 ms
tF_itableshiftcast: 157 ms
tG_parallel: 250 ms
tH_hamming: 250 ms
tI_Scharnagl: 125 ms
tJ_Scharnagl: 125 ms
Initialized rand Table for 24 bits: 2375 ms
t0_and_masks: 234 ms
t1_and_masks: 172 ms
t2_and_masks: 156 ms
t3_rmlsbsub: 938 ms
t4_rmlsbmask: 1156 ms
t5_testlsb: 1328 ms
t6_testmsb: 7313 ms
t7_testsignandshift: 3422 ms
t8_testeachbit: 6766 ms
t9_testeachbit1shl: 6406 ms
tA_tableshift: 172 ms
tB_tableuchar: 187 ms
tC_tableshiftcast: 172 ms
tD_itableshift: 172 ms
tE_itableuchar: 188 ms
tF_itableshiftcast: 172 ms
tG_parallel: 250 ms
tH_hamming: 234 ms
tI_Scharnagl: 125 ms
tJ_Scharnagl: 125 ms
Initialized rand Table for 25 bits: 2500 ms
t0_and_masks: 250 ms
t1_and_masks: 156 ms
t2_and_masks: 156 ms
t3_rmlsbsub: 969 ms
t4_rmlsbmask: 797 ms
t5_testlsb: 1265 ms
t6_testmsb: 6688 ms
t7_testsignandshift: 3235 ms
t8_testeachbit: 6062 ms
t9_testeachbit1shl: 4703 ms
tA_tableshift: 360 ms
tB_tableuchar: 203 ms
tC_tableshiftcast: 172 ms
tD_itableshift: 187 ms
tE_itableuchar: 188 ms
tF_itableshiftcast: 187 ms
tG_parallel: 266 ms
tH_hamming: 250 ms
tI_Scharnagl: 125 ms
tJ_Scharnagl: 125 ms
Initialized rand Table for 26 bits: 2781 ms
t0_and_masks: 250 ms
t1_and_masks: 188 ms
t2_and_masks: 156 ms
t3_rmlsbsub: 1110 ms
t4_rmlsbmask: 1015 ms
t5_testlsb: 1422 ms
t6_testmsb: 6969 ms
t7_testsignandshift: 3922 ms
t8_testeachbit: 8141 ms
t9_testeachbit1shl: 6703 ms
tA_tableshift: 188 ms
tB_tableuchar: 406 ms
tC_tableshiftcast: 328 ms
tD_itableshift: 328 ms
tE_itableuchar: 203 ms
tF_itableshiftcast: 172 ms
tG_parallel: 250 ms
tH_hamming: 250 ms
tI_Scharnagl: 125 ms
tJ_Scharnagl: 141 ms
Initialized rand Table for 27 bits: 3172 ms
t0_and_masks: 250 ms
t1_and_masks: 172 ms
t2_and_masks: 140 ms
t3_rmlsbsub: 1078 ms
t4_rmlsbmask: 829 ms
t5_testlsb: 1734 ms
t6_testmsb: 7844 ms
t7_testsignandshift: 3234 ms
t8_testeachbit: 6219 ms
t9_testeachbit1shl: 4704 ms
tA_tableshift: 187 ms
tB_tableuchar: 188 ms
tC_tableshiftcast: 171 ms
tD_itableshift: 172 ms
tE_itableuchar: 188 ms
tF_itableshiftcast: 187 ms
tG_parallel: 250 ms
tH_hamming: 235 ms
tI_Scharnagl: 125 ms
tJ_Scharnagl: 125 ms
Initialized rand Table for 28 bits: 3844 ms
t0_and_masks: 250 ms
t1_and_masks: 171 ms
t2_and_masks: 250 ms
t3_rmlsbsub: 1188 ms
t4_rmlsbmask: 875 ms
t5_testlsb: 1344 ms
t6_testmsb: 8109 ms
t7_testsignandshift: 4594 ms
t8_testeachbit: 7657 ms
t9_testeachbit1shl: 5265 ms
tA_tableshift: 157 ms
tB_tableuchar: 203 ms
tC_tableshiftcast: 172 ms
tD_itableshift: 187 ms
tE_itableuchar: 188 ms
tF_itableshiftcast: 172 ms
tG_parallel: 250 ms
tH_hamming: 250 ms
tI_Scharnagl: 125 ms
tJ_Scharnagl: 125 ms
Initialized rand Table for 29 bits: 3562 ms
t0_and_masks: 266 ms
t1_and_masks: 437 ms
t2_and_masks: 141 ms
t3_rmlsbsub: 2391 ms
t4_rmlsbmask: 1672 ms
t5_testlsb: 3359 ms
t6_testmsb: 7985 ms
t7_testsignandshift: 3484 ms
t8_testeachbit: 6578 ms
t9_testeachbit1shl: 6860 ms
tA_tableshift: 781 ms
tB_tableuchar: 203 ms
tC_tableshiftcast: 172 ms
tD_itableshift: 375 ms
tE_itableuchar: 188 ms
tF_itableshiftcast: 187 ms
tG_parallel: 235 ms
tH_hamming: 765 ms
tI_Scharnagl: 141 ms
tJ_Scharnagl: 141 ms
Initialized rand Table for 30 bits: 3375 ms
t0_and_masks: 765 ms
t1_and_masks: 422 ms
t2_and_masks: 156 ms
t3_rmlsbsub: 1813 ms
t4_rmlsbmask: 1062 ms
t5_testlsb: 1391 ms
t6_testmsb: 6844 ms
t7_testsignandshift: 3172 ms
t8_testeachbit: 5953 ms
t9_testeachbit1shl: 5047 ms
tA_tableshift: 157 ms
tB_tableuchar: 500 ms
tC_tableshiftcast: 484 ms
tD_itableshift: 172 ms
tE_itableuchar: 187 ms
tF_itableshiftcast: 172 ms
tG_parallel: 250 ms
tH_hamming: 235 ms
tI_Scharnagl: 140 ms
tJ_Scharnagl: 125 ms
Initialized rand Table for 31 bits: 3063 ms
t0_and_masks: 250 ms
t1_and_masks: 156 ms
t2_and_masks: 156 ms
t3_rmlsbsub: 1063 ms
t4_rmlsbmask: 890 ms
t5_testlsb: 1344 ms
t6_testmsb: 8266 ms
t7_testsignandshift: 6344 ms
t8_testeachbit: 7235 ms
t9_testeachbit1shl: 5125 ms
tA_tableshift: 172 ms
tB_tableuchar: 187 ms
tC_tableshiftcast: 172 ms
tD_itableshift: 250 ms
tE_itableuchar: 187 ms
tF_itableshiftcast: 188 ms
tG_parallel: 250 ms
tH_hamming: 250 ms
tI_Scharnagl: 125 ms
tJ_Scharnagl: 125 ms
Initialized rand Table for 32 bits: 3172 ms
t0_and_masks: 359 ms
t1_and_masks: 172 ms
t2_and_masks: 156 ms
t3_rmlsbsub: 1329 ms
t4_rmlsbmask: 984 ms
t5_testlsb: 1453 ms
t6_testmsb: 6672 ms
t7_testsignandshift: 3219 ms
t8_testeachbit: 5969 ms
t9_testeachbit1shl: 4547 ms
tA_tableshift: 172 ms
tB_tableuchar: 187 ms
tC_tableshiftcast: 172 ms
tD_itableshift: 172 ms
tE_itableuchar: 203 ms
tF_itableshiftcast: 172 ms
tG_parallel: 250 ms
tH_hamming: 235 ms
tI_Scharnagl: 140 ms
tJ_Scharnagl: 125 ms
Dann Corbit
 

Re: GCC Inline asm problem

Postby Reinhard Scharnagl » 20 Jul 2005, 01:53

Hi Dann,

it seems you have not tested my last proposal. There is some speed up.

Could that rewritten routine be helpful?

Reinhard.
Reinhard Scharnagl
 
Posts: 608
Joined: 01 Oct 2004, 08:36
Location: Klein-Gerau, Germany

Re: GCC Inline asm problem

Postby Pradu » 20 Jul 2005, 02:05

Reinhard

That last routine was a speed up. Your version of the popCount works better.
User avatar
Pradu
 
Posts: 343
Joined: 12 Jan 2005, 19:17
Location: Chandler, Arizona, USA

Re: GCC Inline asm problem

Postby Dann Corbit » 20 Jul 2005, 02:24

The last version is not any faster for me.

Added this:

Code: Select all
/* Trial (R. Scharnagl, second idea), endian independent form.       */
/* Counts the set bits of a bitboard.  The two 32-bit halves are     */
/* obtained with a cast and a shift, never through a pointer.        */

unsigned tK_Scharnagl(bitboard b)
{
  unsigned half;    /* the 32-bit half that is being narrowed down   */
  unsigned total;   /* packed per-nibble counts, finally the sum     */
  unsigned even;    /* counts taken from the low nibble of each byte */
  unsigned odd;     /* counts taken from the high nibble of each byte */

  /* Lower half: every nibble n becomes n - (n>>1) - (n>>2) - (n>>3). */
  /* The mask is tightened in place before each shift, so no bit can  */
  /* leak into the neighbouring nibble.                               */
  half   = (unsigned)b;
  total  = half;
  half  &= msk1;
  total -= half >> 1;
  half  &= msk2;
  total -= half >> 2;
  half  &= msk3;
  total -= half >> 3;

  /* Upper half: same reduction, accumulated on top of the lower one. */
  half   = (unsigned)(b >> 32);
  total += half;
  half  &= msk1;
  total -= half >> 1;
  half  &= msk2;
  total -= half >> 2;
  half  &= msk3;
  total -= half >> 3;

  /* Fold the nibble counts into bytes, then into a single number. */
  even  = total & msk4;
  odd   = (total >> 4) & msk4;
  total = even + odd;
  total = (total >> 16) + (total & 0xFFFF);
  return (total >> 8) + (total & 0xFF);
}


Got this:

Initialized rand Table: 500 ms
t0_and_masks: 234 ms
t1_and_masks: 172 ms
t2_and_masks: 156 ms
t3_rmlsbsub: 906 ms
t4_rmlsbmask: 735 ms
t5_testlsb: 1625 ms
t6_testmsb: 7860 ms
t7_testsignandshift: 4266 ms
t8_testeachbit: 7407 ms
t9_testeachbit1shl: 5860 ms
tA_tableshift: 172 ms
tB_tableuchar: 203 ms
tC_tableshiftcast: 187 ms
tD_itableshift: 156 ms
tE_itableuchar: 219 ms
tF_itableshiftcast: 172 ms
tG_parallel: 250 ms
tH_hamming: 250 ms
tI_Scharnagl: 141 ms
tJ_Scharnagl: 156 ms
tK_Scharnagl: 141 ms
Initialized rand Table for 1 bits: 125 ms
t0_and_masks: 312 ms
t1_and_masks: 188 ms
t2_and_masks: 531 ms
t3_rmlsbsub: 141 ms
t4_rmlsbmask: 109 ms
t5_testlsb: 594 ms
t6_testmsb: 4063 ms
t7_testsignandshift: 1719 ms
t8_testeachbit: 5438 ms
t9_testeachbit1shl: 3750 ms
tA_tableshift: 172 ms
tB_tableuchar: 187 ms
tC_tableshiftcast: 172 ms
tD_itableshift: 172 ms
tE_itableuchar: 188 ms
tF_itableshiftcast: 171 ms
tG_parallel: 251 ms
tH_hamming: 250 ms
tI_Scharnagl: 125 ms
tJ_Scharnagl: 125 ms
tK_Scharnagl: 140 ms
Initialized rand Table for 2 bits: 219 ms
t0_and_masks: 250 ms
t1_and_masks: 156 ms
t2_and_masks: 157 ms
t3_rmlsbsub: 187 ms
t4_rmlsbmask: 156 ms
t5_testlsb: 703 ms
t6_testmsb: 5501 ms
t7_testsignandshift: 2766 ms
t8_testeachbit: 6250 ms
t9_testeachbit1shl: 4391 ms
tA_tableshift: 172 ms
tB_tableuchar: 187 ms
tC_tableshiftcast: 204 ms
tD_itableshift: 187 ms
tE_itableuchar: 219 ms
tF_itableshiftcast: 172 ms
tG_parallel: 265 ms
tH_hamming: 250 ms
tI_Scharnagl: 188 ms
tJ_Scharnagl: 141 ms
tK_Scharnagl: 140 ms
Initialized rand Table for 3 bits: 313 ms
t0_and_masks: 250 ms
t1_and_masks: 172 ms
t2_and_masks: 156 ms
t3_rmlsbsub: 234 ms
t4_rmlsbmask: 203 ms
t5_testlsb: 797 ms
t6_testmsb: 5204 ms
t7_testsignandshift: 2000 ms
t8_testeachbit: 5204 ms
t9_testeachbit1shl: 3703 ms
tA_tableshift: 171 ms
tB_tableuchar: 203 ms
tC_tableshiftcast: 187 ms
tD_itableshift: 171 ms
tE_itableuchar: 187 ms
tF_itableshiftcast: 188 ms
tG_parallel: 250 ms
tH_hamming: 235 ms
tI_Scharnagl: 141 ms
tJ_Scharnagl: 141 ms
tK_Scharnagl: 141 ms
Initialized rand Table for 4 bits: 422 ms
t0_and_masks: 250 ms
t1_and_masks: 171 ms
t2_and_masks: 140 ms
t3_rmlsbsub: 282 ms
t4_rmlsbmask: 250 ms
t5_testlsb: 844 ms
t6_testmsb: 5641 ms
t7_testsignandshift
Dann Corbit
 

Re: GCC Inline asm problem

Postby Reinhard Scharnagl » 20 Jul 2005, 02:29

Dann,

maybe it fits better with the default optimization options of Visual Studio, so the differences in speed could vanish with well-chosen optimization settings.

P.S.: Have you updated the format of the constants? Does it matter?

Reinhard.
Reinhard Scharnagl
 
Posts: 608
Joined: 01 Oct 2004, 08:36
Location: Klein-Gerau, Germany

Re: GCC Inline asm problem

Postby Dann Corbit » 20 Jul 2005, 02:31

Here is the generated assembly for the three routines
Code: Select all
; tI_Scharnagl: compiler listing of the bit-count routine for the 8-byte
; argument b (see "_b$ = 8 ; size = 8" below).  The low dword is read from
; _b$[esp-4] and the high dword from _b$[esp].  Each dword is reduced to
; packed per-nibble counts, the two results are added, and the nibbles are
; then folded into one sum, which is in eax when the routine returns.
PUBLIC   @tI_Scharnagl@8
; Function compile flags: /Ogty
;   COMDAT @tI_Scharnagl@8
_TEXT   SEGMENT
_b$ = 8                     ; size = 8
@tI_Scharnagl@8 PROC NEAR            ; COMDAT

; 292  :     unsigned        buf;
; 293  :     unsigned        acc;
; 294  :
; 295  :     buf = (unsigned) b;
; 296  :     acc = buf - ((buf & msk1) >> 1)
; 297  :         - ((buf & msk2) >> 2)
; 298  :         - ((buf & msk3) >> 3);

; The compiler shifts first and masks afterwards, so the constants below
; (11111111H, 33333333H, 77777777H) are the shifted forms of the masks
; named in the source lines above.  eax accumulates acc for the low dword.
   mov   ecx, DWORD PTR _b$[esp-4]
   mov   edx, ecx
   shr   edx, 3
   and   edx, 286331153            ; 11111111H
   mov   eax, ecx
   sub   eax, edx
   mov   edx, ecx
   shr   edx, 2
   and   edx, 858993459            ; 33333333H
   sub   eax, edx
   shr   ecx, 1
   and   ecx, 2004318071            ; 77777777H
   sub   eax, ecx

; 299  :     buf = ((unsigned *) &b)[1]; /* Intel format */
; 300  :     acc += buf - ((buf & msk1) >> 1)
; 301  :         - ((buf & msk2) >> 2)
; 302  :         - ((buf & msk3) >> 3);

; Same reduction for the high dword.  The partial result is built in esi,
; which is pushed before its first use and popped again before ret.
   mov   ecx, DWORD PTR _b$[esp]
   mov   edx, ecx
   shr   edx, 3
   and   edx, 286331153            ; 11111111H
   push   esi
   mov   esi, ecx
   sub   esi, edx
   mov   edx, ecx
   shr   edx, 2
   shr   ecx, 1
   and   edx, 858993459            ; 33333333H
   sub   esi, edx
   and   ecx, 2004318071            ; 77777777H
   sub   esi, ecx
   add   eax, esi

; 303  :     acc = (acc & msk4) + ((acc >> 4) & msk4);

   mov   ecx, eax
   shr   ecx, 4
   and   eax, 252645135            ; 0f0f0f0fH
   and   ecx, 252645135            ; 0f0f0f0fH
   add   ecx, eax

; 304  :     acc = (acc & 0xFFFF) + (acc >> 16);

   mov   edx, ecx
   shr   edx, 16               ; 00000010H
   and   ecx, 65535            ; 0000ffffH
   add   edx, ecx

; 305  :     acc = (acc & 0xFF) + (acc >> 8);

   mov   eax, edx
   and   eax, 255            ; 000000ffH
   shr   edx, 8
   add   eax, edx
   pop   esi

; 306  :     return acc;
; 307  : }

; ret 8 returns and also removes the 8 argument bytes from the stack.
   ret   8
@tI_Scharnagl@8 ENDP
_TEXT   ENDS
; tJ_Scharnagl: compiler listing of the bit-count routine for the 8-byte
; argument b (see "_b$ = 8 ; size = 8" below).  The low dword is read from
; _b$[esp-4] and the high dword from _b$[esp].  Each dword is reduced to
; packed per-nibble counts, the two results are added, and the nibbles are
; then folded into one sum, which is in eax when the routine returns.
PUBLIC   @tJ_Scharnagl@8
; Function compile flags: /Ogty
;   COMDAT @tJ_Scharnagl@8
_TEXT   SEGMENT
_b$ = 8                     ; size = 8
@tJ_Scharnagl@8 PROC NEAR            ; COMDAT

; 311  :    unsigned buf;
; 312  :    unsigned acc;
; 313  :
; 314  :    buf  = (unsigned)b;
; 315  :    acc  = buf - ((buf & msk1)>>1)
; 316  :            - ((buf & msk2)>>2)
; 317  :            - ((buf & msk3)>>3);

; The compiler shifts first and masks afterwards, so the constants below
; (11111111H, 33333333H, 77777777H) are the shifted forms of the masks
; named in the source lines above.  eax accumulates acc for the low dword.
   mov   ecx, DWORD PTR _b$[esp-4]
   mov   edx, ecx
   shr   edx, 3
   and   edx, 286331153            ; 11111111H
   mov   eax, ecx
   sub   eax, edx
   mov   edx, ecx
   shr   edx, 2
   and   edx, 858993459            ; 33333333H
   sub   eax, edx
   shr   ecx, 1
   and   ecx, 2004318071            ; 77777777H
   sub   eax, ecx

; 318  :    buf  = ((unsigned *)&b)[1];  /* Intel format */
; 319  :    acc += buf - ((buf & msk1)>>1)
; 320  :            - ((buf & msk2)>>2)
; 321  :            - ((buf & msk3)>>3);

; Same reduction for the high dword.  The partial result is built in esi,
; which is pushed before its first use and popped again before ret.
   mov   ecx, DWORD PTR _b$[esp]
   mov   edx, ecx
   shr   edx, 3
   and   edx, 286331153            ; 11111111H
   push   esi
   mov   esi, ecx
   sub   esi, edx
   mov   edx, ecx
   shr   edx, 2
   shr   ecx, 1
   and   edx, 858993459            ; 33333333H
   sub   esi, edx
   and   ecx, 2004318071            ; 77777777H
   sub   esi, ecx
   add   eax, esi

; 322  :    acc = (acc & msk4) + ((acc >> 4) & msk4);

   mov   ecx, eax
   shr   ecx, 4
   and   eax, 252645135            ; 0f0f0f0fH
   and   ecx, 252645135            ; 0f0f0f0fH
   add   ecx, eax

; 323  :    acc = (acc & 0xFFFF) + (acc >> 16);

   mov   edx, ecx
   shr   edx, 16               ; 00000010H
   and   ecx, 65535            ; 0000ffffH
   add   edx, ecx

; 324  :    acc = (acc & 0xFF) + (acc >> 8);

   mov   eax, edx
   and   eax, 255            ; 000000ffH
   shr   edx, 8
   add   eax, edx
   pop   esi

; 325  :    return acc;
; 326  : }

; ret 8 returns and also removes the 8 argument bytes from the stack.
   ret   8
@tJ_Scharnagl@8 ENDP
_TEXT   ENDS
; tK_Scharnagl: compiler listing of the variant that narrows buf in place
; (buf &= msk1, then msk2, then msk3).  Only eax, ecx and edx are used.
; ecx carries the running acc through both halves; the folded result is in
; eax when the routine returns.
PUBLIC   @tK_Scharnagl@8
; Function compile flags: /Ogty
;   COMDAT @tK_Scharnagl@8
_TEXT   SEGMENT
_b$ = 8                     ; size = 8
@tK_Scharnagl@8 PROC NEAR            ; COMDAT

; 333  :   unsigned buf;
; 334  :   unsigned acc;
; 335  :
; 336  :   buf  = (unsigned)b;
; 337  :   acc  = buf;
; 338  :   acc -= ((buf &= msk1)>>1);

; ecx = acc (starts as the low dword), eax = buf after the msk1 mask,
; edx = copy of the masked value that is shifted and subtracted below.
   mov   ecx, DWORD PTR _b$[esp-4]
   mov   eax, ecx
   and   eax, -286331154            ; eeeeeeeeH
   mov   edx, eax

; 339  :   acc -= ((buf &= msk2)>>2);

; The shr/sub pair here finishes source line 338; only the and belongs to
; line 339.
   and   eax, -858993460            ; ccccccccH
   shr   edx, 1
   sub   ecx, edx

; 340  :   acc -= ((buf &= msk3)>>3);

; The >>2 and >>3 terms are merged: edx = ((buf >> 3) & 11111111H) +
; (buf >> 2), with buf already masked by eeeeeeeeH and ccccccccH.  The and
; with 11111111H after the shift takes the place of "buf &= msk3".
   mov   edx, eax
   shr   eax, 2
   shr   edx, 3
   and   edx, 286331153            ; 11111111H
   add   edx, eax

; 341  :   buf  = (unsigned)(b>>32);

; The load of the high dword comes first; the sub after it still applies
; the merged term computed for lines 339-340.
   mov   eax, DWORD PTR _b$[esp]
   sub   ecx, edx

; 342  :   acc += buf;

   add   ecx, eax

; 343  :   acc -= ((buf &= msk1)>>1);

   and   eax, -286331154            ; eeeeeeeeH
   mov   edx, eax

; 344  :   acc -= ((buf &= msk2)>>2);

   and   eax, -858993460            ; ccccccccH
   shr   edx, 1
   sub   ecx, edx

; 345  :   acc -= ((buf &= msk3)>>3);

; Same merged >>2 / >>3 subtraction as for the low dword.
   mov   edx, eax
   shr   eax, 2
   shr   edx, 3
   and   edx, 286331153            ; 11111111H
   add   edx, eax
   sub   ecx, edx

; 346  :   acc = (acc & msk4)   + ((acc >> 4) & msk4);

   mov   eax, ecx
   shr   eax, 4
   and   eax, 252645135            ; 0f0f0f0fH
   and   ecx, 252645135            ; 0f0f0f0fH
   add   eax, ecx

; 347  :   acc = (acc & 0xFFFF) + (acc >> 16);

   mov   ecx, eax
   and   eax, 65535            ; 0000ffffH
   shr   ecx, 16               ; 00000010H
   add   ecx, eax

; 348  :   acc = (acc & 0xFF)   + (acc >> 8);

   mov   eax, ecx
   and   eax, 255            ; 000000ffH
   shr   ecx, 8
   add   eax, ecx

; 349  :   return acc;
; 350  : }

; ret 8 returns and also removes the 8 argument bytes from the stack.
   ret   8
@tK_Scharnagl@8 ENDP
_TEXT   ENDS
Dann Corbit
 

Re: GCC Inline asm problem

Postby Dann Corbit » 20 Jul 2005, 02:34

Reinhard Scharnagl wrote:P.S.: Have you updated the format of the constants? Does it matter?


Yes, it makes no difference.
Dann Corbit
 

Re: GCC Inline asm problem

Postby Reinhard Scharnagl » 20 Jul 2005, 02:48

Dann,

then I have exhausted my ideas for the moment.

I assume the other critical routines have already been optimized?

Reinhard.
Reinhard Scharnagl
 
Posts: 608
Joined: 01 Oct 2004, 08:36
Location: Klein-Gerau, Germany

Re: GCC Inline asm problem

Postby Dann Corbit » 20 Jul 2005, 02:51

Got a bit more speed with better compiler settings. This is the generated Assembly.
Code: Select all
; tI_Scharnagl: compiler listing of the bit-count routine for the 8-byte
; argument b (see "_b$ = 8 ; size = 8" below).  The low dword is read from
; _b$[esp-4] and the high dword from _b$[esp].  Each dword is reduced to
; packed per-nibble counts, the two results are added, and the nibbles are
; then folded into one sum, which is in eax when the routine returns.
PUBLIC   @tI_Scharnagl@8
; Function compile flags: /Ogty
;   COMDAT @tI_Scharnagl@8
_TEXT   SEGMENT
_b$ = 8                     ; size = 8
@tI_Scharnagl@8 PROC NEAR            ; COMDAT

; 292  :     unsigned        buf;
; 293  :     unsigned        acc;
; 294  :
; 295  :     buf = (unsigned) b;
; 296  :     acc = buf - ((buf & msk1) >> 1)
; 297  :         - ((buf & msk2) >> 2)
; 298  :         - ((buf & msk3) >> 3);

; The compiler shifts first and masks afterwards, so the constants below
; (11111111H, 33333333H, 77777777H) are the shifted forms of the masks
; named in the source lines above.  eax accumulates acc for the low dword.
   mov   ecx, DWORD PTR _b$[esp-4]
   mov   edx, ecx
   shr   edx, 3
   and   edx, 286331153            ; 11111111H
   mov   eax, ecx
   sub   eax, edx
   mov   edx, ecx
   shr   edx, 2
   and   edx, 858993459            ; 33333333H
   sub   eax, edx
   shr   ecx, 1
   and   ecx, 2004318071            ; 77777777H
   sub   eax, ecx

; 299  :     buf = ((unsigned *) &b)[1]; /* Intel format */
; 300  :     acc += buf - ((buf & msk1) >> 1)
; 301  :         - ((buf & msk2) >> 2)
; 302  :         - ((buf & msk3) >> 3);

; Same reduction for the high dword.  The partial result is built in esi,
; which is pushed before its first use and popped again before ret.
   mov   ecx, DWORD PTR _b$[esp]
   mov   edx, ecx
   shr   edx, 3
   and   edx, 286331153            ; 11111111H
   push   esi
   mov   esi, ecx
   sub   esi, edx
   mov   edx, ecx
   shr   edx, 2
   shr   ecx, 1
   and   edx, 858993459            ; 33333333H
   sub   esi, edx
   and   ecx, 2004318071            ; 77777777H
   sub   esi, ecx
   add   eax, esi

; 303  :     acc = (acc & msk4) + ((acc >> 4) & msk4);

   mov   ecx, eax
   shr   ecx, 4
   and   eax, 252645135            ; 0f0f0f0fH
   and   ecx, 252645135            ; 0f0f0f0fH
   add   ecx, eax

; 304  :     acc = (acc & 0xFFFF) + (acc >> 16);

   mov   edx, ecx
   shr   edx, 16               ; 00000010H
   and   ecx, 65535            ; 0000ffffH
   add   edx, ecx

; 305  :     acc = (acc & 0xFF) + (acc >> 8);

   mov   eax, edx
   and   eax, 255            ; 000000ffH
   shr   edx, 8
   add   eax, edx
   pop   esi

; 306  :     return acc;
; 307  : }

; ret 8 returns and also removes the 8 argument bytes from the stack.
   ret   8
@tI_Scharnagl@8 ENDP
_TEXT   ENDS
; tJ_Scharnagl: compiler listing of the bit-count routine for the 8-byte
; argument b (see "_b$ = 8 ; size = 8" below).  The low dword is read from
; _b$[esp-4] and the high dword from _b$[esp].  Each dword is reduced to
; packed per-nibble counts, the two results are added, and the nibbles are
; then folded into one sum, which is in eax when the routine returns.
PUBLIC   @tJ_Scharnagl@8
; Function compile flags: /Ogty
;   COMDAT @tJ_Scharnagl@8
_TEXT   SEGMENT
_b$ = 8                     ; size = 8
@tJ_Scharnagl@8 PROC NEAR            ; COMDAT

; 311  :    unsigned buf;
; 312  :    unsigned acc;
; 313  :
; 314  :    buf  = (unsigned)b;
; 315  :    acc  = buf - ((buf & msk1)>>1)
; 316  :            - ((buf & msk2)>>2)
; 317  :            - ((buf & msk3)>>3);

; The compiler shifts first and masks afterwards, so the constants below
; (11111111H, 33333333H, 77777777H) are the shifted forms of the masks
; named in the source lines above.  eax accumulates acc for the low dword.
   mov   ecx, DWORD PTR _b$[esp-4]
   mov   edx, ecx
   shr   edx, 3
   and   edx, 286331153            ; 11111111H
   mov   eax, ecx
   sub   eax, edx
   mov   edx, ecx
   shr   edx, 2
   and   edx, 858993459            ; 33333333H
   sub   eax, edx
   shr   ecx, 1
   and   ecx, 2004318071            ; 77777777H
   sub   eax, ecx

; 318  :    buf  = ((unsigned *)&b)[1];  /* Intel format */
; 319  :    acc += buf - ((buf & msk1)>>1)
; 320  :            - ((buf & msk2)>>2)
; 321  :            - ((buf & msk3)>>3);

; Same reduction for the high dword.  The partial result is built in esi,
; which is pushed before its first use and popped again before ret.
   mov   ecx, DWORD PTR _b$[esp]
   mov   edx, ecx
   shr   edx, 3
   and   edx, 286331153            ; 11111111H
   push   esi
   mov   esi, ecx
   sub   esi, edx
   mov   edx, ecx
   shr   edx, 2
   shr   ecx, 1
   and   edx, 858993459            ; 33333333H
   sub   esi, edx
   and   ecx, 2004318071            ; 77777777H
   sub   esi, ecx
   add   eax, esi

; 322  :    acc = (acc & msk4) + ((acc >> 4) & msk4);

   mov   ecx, eax
   shr   ecx, 4
   and   eax, 252645135            ; 0f0f0f0fH
   and   ecx, 252645135            ; 0f0f0f0fH
   add   ecx, eax

; 323  :    acc = (acc & 0xFFFF) + (acc >> 16);

   mov   edx, ecx
   shr   edx, 16               ; 00000010H
   and   ecx, 65535            ; 0000ffffH
   add   edx, ecx

; 324  :    acc = (acc & 0xFF) + (acc >> 8);

   mov   eax, edx
   and   eax, 255            ; 000000ffH
   shr   edx, 8
   add   eax, edx
   pop   esi

; 325  :    return acc;
; 326  : }

; ret 8 returns and also removes the 8 argument bytes from the stack.
   ret   8
@tJ_Scharnagl@8 ENDP
_TEXT   ENDS
; tK_Scharnagl: compiler listing of the variant that narrows buf in place
; (buf &= msk1, then msk2, then msk3).  Only eax, ecx and edx are used.
; ecx carries the running acc through both halves; the folded result is in
; eax when the routine returns.
PUBLIC   @tK_Scharnagl@8
; Function compile flags: /Ogty
;   COMDAT @tK_Scharnagl@8
_TEXT   SEGMENT
_b$ = 8                     ; size = 8
@tK_Scharnagl@8 PROC NEAR            ; COMDAT

; 333  :   unsigned buf;
; 334  :   unsigned acc;
; 335  :
; 336  :   buf  = (unsigned)b;
; 337  :   acc  = buf;
; 338  :   acc -= ((buf &= msk1)>>1);

; ecx = acc (starts as the low dword), eax = buf after the msk1 mask,
; edx = copy of the masked value that is shifted and subtracted below.
   mov   ecx, DWORD PTR _b$[esp-4]
   mov   eax, ecx
   and   eax, -286331154            ; eeeeeeeeH
   mov   edx, eax

; 339  :   acc -= ((buf &= msk2)>>2);

; The shr/sub pair here finishes source line 338; only the and belongs to
; line 339.
   and   eax, -858993460            ; ccccccccH
   shr   edx, 1
   sub   ecx, edx

; 340  :   acc -= ((buf &= msk3)>>3);

; The >>2 and >>3 terms are merged: edx = ((buf >> 3) & 11111111H) +
; (buf >> 2), with buf already masked by eeeeeeeeH and ccccccccH.  The and
; with 11111111H after the shift takes the place of "buf &= msk3".
   mov   edx, eax
   shr   eax, 2
   shr   edx, 3
   and   edx, 286331153            ; 11111111H
   add   edx, eax

; 341  :   buf  = (unsigned)(b>>32);

; The load of the high dword comes first; the sub after it still applies
; the merged term computed for lines 339-340.
   mov   eax, DWORD PTR _b$[esp]
   sub   ecx, edx

; 342  :   acc += buf;

   add   ecx, eax

; 343  :   acc -= ((buf &= msk1)>>1);

   and   eax, -286331154            ; eeeeeeeeH
   mov   edx, eax

; 344  :   acc -= ((buf &= msk2)>>2);

   and   eax, -858993460            ; ccccccccH
   shr   edx, 1
   sub   ecx, edx

; 345  :   acc -= ((buf &= msk3)>>3);

; Same merged >>2 / >>3 subtraction as for the low dword.
   mov   edx, eax
   shr   eax, 2
   shr   edx, 3
   and   edx, 286331153            ; 11111111H
   add   edx, eax
   sub   ecx, edx

; 346  :   acc = (acc & msk4)   + ((acc >> 4) & msk4);

   mov   eax, ecx
   shr   eax, 4
   and   eax, 252645135            ; 0f0f0f0fH
   and   ecx, 252645135            ; 0f0f0f0fH
   add   eax, ecx

; 347  :   acc = (acc & 0xFFFF) + (acc >> 16);

   mov   ecx, eax
   and   eax, 65535            ; 0000ffffH
   shr   ecx, 16               ; 00000010H
   add   ecx, eax

; 348  :   acc = (acc & 0xFF)   + (acc >> 8);

   mov   eax, ecx
   and   eax, 255            ; 000000ffH
   shr   ecx, 8
   add   eax, ecx

; 349  :   return acc;
; 350  : }

; ret 8 returns and also removes the 8 argument bytes from the stack.
   ret   8
@tK_Scharnagl@8 ENDP
_TEXT   ENDS
Dann Corbit
 

Re: GCC Inline asm problem

Postby Reinhard Scharnagl » 20 Jul 2005, 02:54

Dann,

thank you for helping me evaluate my proposals.
It is late now here in Germany, so I have to close.

Best regards,

Reinhard.
Reinhard Scharnagl
 
Posts: 608
Joined: 01 Oct 2004, 08:36
Location: Klein-Gerau, Germany

Previous

Return to Programming and Technical Discussions

Who is online

Users browsing this forum: No registered users and 9 guests