Re: [Csnd] scons MSVC report

classic Classic list List threaded Threaded
3 messages Options
Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate
star

Re: [Csnd] scons MSVC report

Istvan Varga
Victor Lazzarini wrote:
> What's the assembly code then for truncation
>
> i = (int) f;
>
> for double & single precision?
>

Well, compiling this C code with gcc 4.0:

/* -------- start test.c -------- */

#define _ISOC99_SOURCE
#include <math.h>

/* Convert a float to int with a plain C cast (the fractional part is dropped). */
int f2i_cast(float x)
{
     int truncated = (int) x;

     return truncated;
}

/* Convert a double to int with a plain C cast (the fractional part is dropped). */
int d2i_cast(double x)
{
     int truncated = (int) x;

     return truncated;
}

int f2i_lrint(float x)
{
     return (int) lrintf(x);
}

int d2i_lrint(double x)
{
     return (int) lrint(x);
}

/* -------- end test.c -------- */

For a generic x86 CPU (irrelevant lines filtered out):

Compiler flags: -Wall -O2 -fomit-frame-pointer -S -masm=intel

f2i_cast:
         sub     %esp, 8
         fnstcw  WORD PTR [%esp+6]
         fld     DWORD PTR [%esp+12]
         movzx   %eax, WORD PTR [%esp+6]
         or      %ax, 3072
         mov     WORD PTR [%esp+4], %ax
         fldcw   WORD PTR [%esp+4]
         fistp   DWORD PTR [%esp]
         fldcw   WORD PTR [%esp+6]
         mov     %eax, DWORD PTR [%esp]
         add     %esp, 8
         ret

d2i_cast:
         sub     %esp, 8
         fnstcw  WORD PTR [%esp+6]
         fld     QWORD PTR [%esp+12]
         movzx   %eax, WORD PTR [%esp+6]
         or      %ax, 3072
         mov     WORD PTR [%esp+4], %ax
         fldcw   WORD PTR [%esp+4]
         fistp   DWORD PTR [%esp]
         fldcw   WORD PTR [%esp+6]
         mov     %eax, DWORD PTR [%esp]
         add     %esp, 8
         ret

f2i_lrint:
         sub     %esp, 16
         fld     DWORD PTR [%esp+20]
#APP
         fistpl DWORD PTR [%esp+12]
#NO_APP
         mov     %eax, DWORD PTR [%esp+12]
         add     %esp, 16
         ret

d2i_lrint:
         sub     %esp, 16
         fld     QWORD PTR [%esp+20]
#APP
         fistpl DWORD PTR [%esp+12]
#NO_APP
         mov     %eax, DWORD PTR [%esp+12]
         add     %esp, 16
         ret

Now for Pentium III, using SSE 1 instructions:
Compiler flags: -Wall -O3 -march=pentium3 -fomit-frame-pointer -ffast-math -S -masm=intel

f2i_cast:
         cvttss2si       %eax, DWORD PTR [%esp+4]
         ret

(the other functions do not change significantly)

Now try Pentium 4, with SSE 2:
Compiler flags: -Wall -O3 -march=pentium4 -fomit-frame-pointer -ffast-math -S -masm=intel

f2i_cast:
         cvttss2si       %eax, DWORD PTR [%esp+4]
         ret

d2i_cast:
         cvttsd2si       %eax, QWORD PTR [%esp+4]
         ret

(again, the lrint based functions remain effectively the same)

As you can see, enabling the use of SSE (with -march) can significantly
speed up float to integer casts.


-------------------------------------------------------
SF.Net email is sponsored by: GoToMeeting - the easiest way to collaborate
online with coworkers and clients while avoiding the high cost of travel and
communications. There is no equipment to buy and you can meet as often as
you want. Try it free.http://ads.osdn.com/?ad_id=7402&alloc_id=16135&op=click
_______________________________________________
Csound-devel mailing list
[hidden email]
https://lists.sourceforge.net/lists/listinfo/csound-devel
Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate
star

Re: [Csnd] scons MSVC report

Istvan Varga
Victor Lazzarini wrote:

> That code is for rounding isn't it? What about a version for truncation,
> which is used a lot.

For truncating floats (not doubles) to 32 bit integers, you can try this
(perhaps it may be improved further, but first tests show that it is faster
than casting - at least with my CPU - even if not inlined):

/* uncomment these for MSVC */
/*
typedef unsigned char uint8_t;
typedef int int32_t;
typedef unsigned int uint32_t;
#define inline __inline
*/

/*
 * Truncate a float toward zero to a 32 bit integer using integer operations
 * on the bit pattern only (no floating point conversion instruction).
 *
 * Assumes 'float' is an IEEE 754 single precision value that can be
 * reinterpreted through the union below.
 *
 * Returns 0 when |x| < 1 and also when the biased exponent is above 158
 * (|x| >= 2^32, infinities, NaN). For |x| in [2^31, 2^32) only x == -2^31
 * has a representable result; the other values in that range go through an
 * out-of-range unsigned to signed conversion and are not meaningful.
 */
static inline int32_t float2int32(float x)
{
     union {
       float   f;
       int32_t i;
     } u;
     uint32_t  tmp;
     uint8_t   tmp2;

     u.f = x;
     /* 158 = 127 (exponent bias) + 31: tmp2 is how far the mantissa, once */
     /* its leading 1 sits in bit 31, must be shifted right to leave only  */
     /* the integer part */
     tmp2 = (uint8_t) 158 - (uint8_t) (((int) u.i & 0x7F800000) >> 23);
     /* tmp2 >= 32: either |x| < 1, or the subtraction wrapped around */
     /* because the exponent is too large */
     if (tmp2 & (uint8_t) 0xE0)
       return (int32_t) 0;
     /* set the implicit leading 1 (bit 23); the sign and exponent bits */
     /* that get set as well are discarded by the left shift            */
     tmp = (uint32_t) u.i | (uint32_t) 0xFF800000UL;
     tmp = (tmp << 8) >> tmp2;
     /* negate in unsigned arithmetic: negating (int32_t) 0x80000000, */
     /* which occurs for x == -2^31, would be a signed overflow       */
     return (int32_t) (u.i < (int32_t) 0 ? (uint32_t) 0 - tmp : tmp);
}

Some benchmark results with the following test code:

volatile int32_t foo;

/*
 * Benchmark driver: sweeps a float from -12345678 up to 12345678 and stores
 * each converted value in 'foo'.
 */
int main(int argc, char **argv)
{
     float x;

     for (x = -12345678; x < 12345678; x += 0.7071)
       foo = float2int32(x);
     return 0;
}

cast, no SSE:             0.996
cast, SSE:                0.164
float2int32 (inline):     0.356
float2int32 (not inline): 0.550


Here is the assembly listing of float2int32 generated with GCC for a
Pentium Pro or newer CPU (as it uses cmov):

float2int32:
         push    %ebx
         mov     %al, -98
         mov     %ebx, DWORD PTR [%esp+8]
         mov     %edx, %ebx
         shr     %edx, 23
         sub     %al, %dl
         movzx   %edx, %al
         xor     %eax, %eax
         test    %dl, -32
         jne     .L4
         mov     %eax, %ebx
         mov     %cl, %dl
         or      %eax, -8388608
         sal     %eax, 8
         shr     %eax, %cl
         mov     %edx, %eax
         neg     %edx
         inc     %ebx
         cmovle  %eax, %edx
.L4:
         pop     %ebx
         ret


-------------------------------------------------------
This SF.Net email is sponsored by Yahoo.
Introducing Yahoo! Search Developer Network - Create apps using Yahoo!
Search APIs Find out how you can build Yahoo! directly into your own
Applications - visit http://developer.yahoo.net/?fr=offad-ysdn-ostg-q22005
_______________________________________________
Csound-devel mailing list
[hidden email]
https://lists.sourceforge.net/lists/listinfo/csound-devel
Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate
star

Re: [Csnd] scons MSVC report

Istvan Varga
Istvan Varga wrote:

> cast, no SSE:             0.996
> cast, SSE:                0.164
> float2int32 (inline):     0.356
> float2int32 (not inline): 0.550

Another test using lrintf, but with a hack to make the conversion truncate:

static inline int32_t float2int32(float x)
{
     return (int32_t) lrintf(x + (x < 0.0f ? 0.5f : -0.5f));
}

inline:          0.277
if not inlined:  0.502
(compare with above)

assembly (probably needs to be converted to MSVC format if you want to use it)
for Pentium Pro or newer:

.LC1:
         .long   1056964608
.LC2:
         .long   -1090519040

float2int32:
         sub     %esp, 16
         fld     DWORD PTR [%esp+20]
         fldz
         fcomip  %st, %st(1)
         fld     DWORD PTR .LC2
         fld     DWORD PTR .LC1
         fcmovbe %st, %st(1)
         fstp    %st(1)
         faddp   %st(1), %st
#APP
         fistpl DWORD PTR [%esp+12]
#NO_APP
         mov     %eax, DWORD PTR [%esp+12]
         add     %esp, 16
         ret


-------------------------------------------------------
This SF.Net email is sponsored by Yahoo.
Introducing Yahoo! Search Developer Network - Create apps using Yahoo!
Search APIs Find out how you can build Yahoo! directly into your own
Applications - visit http://developer.yahoo.net/?fr=offad-ysdn-ostg-q22005
_______________________________________________
Csound-devel mailing list
[hidden email]
https://lists.sourceforge.net/lists/listinfo/csound-devel
Loading...