Re: ATLAS developer release 3.3.1 is out
Clint,
>                   axpy   copy   scal   nrm2   asum   amax   dotc
>                  =====  =====  =====  =====  =====  =====  =====
>
> F77 d Athlon600   16.6   12.1   21.1   17.2   47.4   25.6   24.5
> ATL d Athlon600   26.4   13.2   23.0   61.3  159.6  175.7   44.9
Maybe you recall that I sent you a mail with an Athlon-optimized STREAM a few weeks ago.
In its sources you will find a vector copy routine (dassign.asm) that copies a vector
at ~920 MB/s on my Athlon classic 600/PC133 (I don't know how you calculate MFLOPS for a
copy routine, but I think 920 MB/s is about 58 MFLOPS). It uses MMX/3dnow! instructions and
bypasses the caches via movntq. MMX/3dnow! instructions provide better memory performance than
FPU instructions and CAN also be used in double precision routines (take a look at the other
routines in my STREAM).
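Just to spell out that guess: a double copy moves 16 bytes per element (8 read + 8 written),
so 920 MB/s works out to roughly 58 million elements per second. A tiny C sketch of that
arithmetic (assuming one "flop" is charged per copied element and MB means 10^6 bytes; that
may not be how your timers count):

#include <stdio.h>

int main(void)
{
    const double mb_per_s       = 920.0;      /* measured copy bandwidth       */
    const double bytes_per_elem = 8.0 + 8.0;  /* one double read + one written */

    /* millions of elements per second == "MFLOPS" at one flop per element */
    printf("%.0f MB/s -> about %.1f MFLOPS\n", mb_per_s, mb_per_s / bytes_per_elem);
    return 0;
}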
> Along the same lines, I'm already considering adding support for
> atlas_set (set a vector to a constant)
dfill() from my STREAM fills a vector with zeros at an amazing 1020 MB/s on my machine. It can
easily be modified for all precisions.
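Semantically such a fill is nothing more than the loop below; the name and the Fortran-style
pointer arguments are only my sketch of an interface, not the real routine, whose speed
obviously does not come from a loop like this:

/* Reference-only fill: set all *size elements of x to the constant *alpha.
   The signature is hypothetical and just mirrors dscale() below. */
void dfill_ref(double *x, double *alpha, int *size)
{
    const double a = *alpha;
    int i, n = *size;

    for (i = 0; i < n; i++)
        x[i] = a;
}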
As an example of mixing 3dnow! and FPU instructions to achieve better performance, here is the
source code of dscale() (b[] = c*a[]):
;PROC: void dscale(double *src,double *dest,double *factor,int *size)
;(c)2001 Julian Ruhe ruheejih@linux.zrz.tu-berlin.de
section .text progbits alloc exec nowrite align=16
global dscale
dscale:
push ebp
mov ebp,esp
push ecx
push edi
push ebx
push esi
push eax
femms
mov esi,[ebp+8] ;&src->esi
mov ebx,[ebp+12] ;&dest->ebx
mov ecx,[ebp+16] ;&factor->ecx
mov eax,[ebp+20]
mov eax,[eax] ;size->eax
fld qword [ecx] ;*factor->st0
fst qword [factor] ;keep a copy in [factor] for the loops below
cmp eax,0
jle near cleanup ;nothing to do for size<=0
arrange64: test esi,63 ;scale single elements until src is aligned to a 64-byte cache line
jz short continue
fld qword [esi]
fmul st1
fstp qword [ebx]
add esi,byte 8
add ebx,byte 8
dec eax
cmp eax,0
jz near cleanup
jmp short arrange64
continue:
mov edx,0
mov ecx,24
div ecx ;(eax div ecx)->eax rem(m)->edx
mov ecx,eax ;number of 24-element blocks->ecx
mov eax,esi ;src->eax
mov esi,work ;&work->esi (scratch buffer)
add eax,byte 15*8 ;bias pointers so block offsets run from -15*8 to +8*8
add ebx,byte 15*8
add esi,byte 15*8
cmp ecx,0
jz near rem ;fewer than 24 elements left: scale them in the FPU tail loop
femms
align 16
main_loop: ;each iteration scales a block of 24 doubles (three 64-byte cache lines)
;the following loads only touch the source block so that it is pulled into the
;caches; the values in mm0-mm2 are overwritten and never used
movq mm0,[eax-15*8]
movq mm1,[eax-14*8]
movq mm2,[eax-13*8]
movq mm0,[eax-12*8]
movq mm1,[eax-11*8]
movq mm2,[eax-10*8]
movq mm0,[eax-9*8]
movq mm1,[eax-8*8]
movq mm2,[eax-7*8]
movq mm0,[eax-6*8]
movq mm1,[eax-5*8]
movq mm2,[eax-4*8]
movq mm0,[eax-3*8]
movq mm1,[eax-2*8]
movq mm2,[eax-1*8]
movq mm0,[eax-0*8]
movq mm1,[eax+1*8]
movq mm2,[eax+2*8]
movq mm0,[eax+3*8]
movq mm1,[eax+4*8]
movq mm2,[eax+5*8]
movq mm0,[eax+6*8]
movq mm1,[eax+7*8]
movq mm2,[eax+8*8]
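;prefetch source cache lines for the next iterations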
prefetch [eax+3*64-15*8]
prefetch [eax+4*64-15*8]
prefetch [eax+5*64-15*8]
prefetch [eax+6*64-15*8]
prefetch [eax+7*64-15*8]
prefetch [eax+8*64-15*8]
femms
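;keep four copies of the factor on the FPU stack (st0-st3) and scale the 24
;elements of this block into the work buffer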
fld qword [factor]
fld st0
fld st0
fld st0
fld qword [eax-15*8]
fmul st4
fstp qword [esi-15*8]
fld qword [eax-14*8]
fmul st3
fstp qword [esi-14*8]
fld qword [eax-13*8]
fmul st2
fstp qword [esi-13*8]
fld qword [eax-12*8]
fmul st1
fstp qword [esi-12*8]
fld qword [eax-11*8]
fmul st4
fstp qword [esi-11*8]
fld qword [eax-10*8]
fmul st3
fstp qword [esi-10*8]
fld qword [eax-9*8]
fmul st2
fstp qword [esi-9*8]
fld qword [eax-8*8]
fmul st1
fstp qword [esi-8*8]
fld qword [eax-7*8]
fmul st4
fstp qword [esi-7*8]
fld qword [eax-6*8]
fmul st3
fstp qword [esi-6*8]
fld qword [eax-5*8]
fmul st2
fstp qword [esi-5*8]
fld qword [eax-4*8]
fmul st1
fstp qword [esi-4*8]
fld qword [eax-3*8]
fmul st4
fstp qword [esi-3*8]
fld qword [eax-2*8]
fmul st3
fstp qword [esi-2*8]
fld qword [eax-1*8]
fmul st2
fstp qword [esi-1*8]
fld qword [eax-0*8]
fmul st1
fstp qword [esi-0*8]
fld qword [eax+1*8]
fmul st4
fstp qword [esi+1*8]
fld qword [eax+2*8]
fmul st3
fstp qword [esi+2*8]
fld qword [eax+3*8]
fmul st2
fstp qword [esi+3*8]
fld qword [eax+4*8]
fmul st1
fstp qword [esi+4*8]
fld qword [eax+5*8]
fmul st4
fstp qword [esi+5*8]
fld qword [eax+6*8]
fmul st3
fstp qword [esi+6*8]
fld qword [eax+7*8]
fmul st2
fstp qword [esi+7*8]
fld qword [eax+8*8]
fmul st1
fstp qword [esi+8*8]
femms
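;stream the 24 scaled values from the work buffer to dest with movntq,
;bypassing the caches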
movq mm0,[esi-15*8]
movq mm1,[esi-14*8]
movq mm2,[esi-13*8]
movntq [ebx-15*8],mm0
movq mm3,[esi-12*8]
movntq [ebx-14*8],mm1
movq mm4,[esi-11*8]
movntq [ebx-13*8],mm2
movq mm5,[esi-10*8]
movntq [ebx-12*8],mm3
movq mm0,[esi-9*8]
movntq [ebx-11*8],mm4
movq mm1,[esi-8*8]
movntq [ebx-10*8],mm5
movq mm2,[esi-7*8]
movntq [ebx-9*8],mm0
movq mm3,[esi-6*8]
movntq [ebx-8*8],mm1
movq mm4,[esi-5*8]
movntq [ebx-7*8],mm2
movq mm5,[esi-4*8]
movntq [ebx-6*8],mm3
movq mm0,[esi-3*8]
movntq [ebx-5*8],mm4
movq mm1,[esi-2*8]
movntq [ebx-4*8],mm5
movq mm2,[esi-1*8]
movntq [ebx-3*8],mm0
movq mm3,[esi-0*8]
movntq [ebx-2*8],mm1
movq mm4,[esi+1*8]
movntq [ebx-1*8],mm2
movq mm5,[esi+2*8]
movntq [ebx-0*8],mm3
movq mm0,[esi+3*8]
movntq [ebx+1*8],mm4
movq mm1,[esi+4*8]
movntq [ebx+2*8],mm5
movq mm2,[esi+5*8]
movntq [ebx+3*8],mm0
movq mm3,[esi+6*8]
movntq [ebx+4*8],mm1
movq mm4,[esi+7*8]
movntq [ebx+5*8],mm2
movq mm5,[esi+8*8]
movntq [ebx+6*8],mm3
movntq [ebx+7*8],mm4
movntq [ebx+8*8],mm5
add eax,24*8 ;advance src by 24 doubles
add ebx,24*8 ;advance dest by 24 doubles
add edi,24*8
dec ecx
jnz near main_loop
rem: ;scale the remaining (size mod 24) elements with the FPU only
cmp edx,0
jz short cleanup
femms
fld qword [factor]
rem_loop:
fld qword [eax-15*8]
fmul st1
fstp qword [ebx-15*8]
add eax,byte 8
add ebx,byte 8
dec edx
jnz short rem_loop
cleanup:
femms
pop eax
pop esi
pop ebx
pop edi
pop ecx
leave ;mov esp,ebp / pop ebp
ret
section .data progbits alloc noexec write align=16
times 16 dq 0.0 ;unlabeled padding, not referenced by this routine
work times 48 dq 0.0 ;scratch buffer; one 24-double block is staged here per iteration
factor dq 1.2 ;overwritten with *factor at run time
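
For reference, everything dscale() computes fits in a few lines of plain C (same arguments as
in the PROC comment above; the _ref suffix is mine, this is not the assembler routine):

/* Plain C equivalent of dscale(): dest[i] = (*factor)*src[i] for 0 <= i < *size.
   No alignment handling, prefetching or cache-bypassing stores, only the semantics. */
void dscale_ref(double *src, double *dest, double *factor, int *size)
{
    const double c = *factor;
    int i, n = *size;

    for (i = 0; i < n; i++)
        dest[i] = c * src[i];
}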
Regards
Julian