[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: ATLAS developer release 3.3.1 is out



Clint,

>                        axpy   copy   scal   nrm2   asum   amax   dotc
>                       =====  =====  =====  =====  =====  =====  =====

> 
> F77 d Athlon600        16.6   12.1   21.1   17.2   47.4   25.6   24.5
> ATL d Athlon600        26.4   13.2   23.0   61.3  159.6  175.7   44.9

Maybe you recall that I sent you a mail with an Athlon-optimized STREAM some weeks
ago. In the sources you will find a vector copy routine (dassign.asm) that copies a vector
at ~920MB/s on my Athlon classic 600/PC133 (I don't know how you calculate MFLOPS for a
copy routine, but I think 920MB/s=58 MFLOPS). It uses MMX/3DNow! instructions and bypasses the
caches via movntq. MMX/3DNow! instructions provide better memory performance than FPU instructions
and CAN also be used in double precision routines (take a look at the other routines in my STREAM).
  
> Along the same lines, I'm already considering adding support for
> atlas_set (set a vector to a constant)

dfill() in my STREAM fills a vector with zeros at an amazing 1020MB/s on my machine. It can
easily be modified for all precisions.


As an example of mixing 3DNow! and FPU instructions to achieve better performance, here is the
source code of dscale() (b[]=c*a[]):


		;PROC: void dscale(double *src,double *dest,double *factor,int *size)
		;
		;(c)2001 Julian Ruhe ruheejih@linux.zrz.tu-berlin.de
		;
		; Computes dest[i] = (*factor) * src[i] for i = 0 .. *size-1.
		;
		; Syntax: NASM, 32-bit code. All four arguments are pointers read
		; from the stack at [ebp+8] .. [ebp+20].
		;
		; Requires: x87, MMX, 3DNow! (femms, prefetch) and movntq/sfence.
		;
		; Clobbers: edx, flags, the x87/MMX register file (femms is executed
		;           before returning, so the x87 stack is left empty).
		;           ebx, ecx, esi and eax are saved and restored.
		;
		; Side effects: overwrites the file-level variables `factor` and
		;           `work`, so the routine is not reentrant.
		;
		; Phases:
		;   1. head - scalar x87 loop until src is 64-byte aligned (if src is
		;             not 8-byte aligned this loop handles every element).
		;   2. main - blocks of 24 doubles: touch the source with MMX loads,
		;             prefetch ahead, scale with x87 into `work`, then stream
		;             `work` to dest with movntq.
		;   3. tail - scalar x87 loop for the remaining (count mod 24) doubles.
		;
		; In phases 2 and 3 the src/dest/work pointers are biased by +15*8 so
		; that every displacement used (-15*8 .. +8*8) fits in a signed byte.

DSCALE_BLOCK	equ 24			;doubles handled per main-loop pass
DSCALE_BIAS	equ 15*8		;pointer bias used by main loop and tail

		;DSCALE_ELEM qword_offset, factor_reg
		;work[offset] = src[offset] * factor_reg (eax = src, esi = work)
%macro DSCALE_ELEM 2
		fld qword [eax+(%1)*8]
		fmul %2
		fstp qword [esi+(%1)*8]
%endmacro

		section .text progbits alloc exec nowrite align=16
		global dscale

dscale:
		push ebp
		mov ebp,esp

		push ecx
		push ebx
		push esi
		push eax

		femms

		mov esi,[ebp+8]			;&src->esi
		mov ebx,[ebp+12]		;&dest->ebx
		mov ecx,[ebp+16]		;&factor->ecx
		mov eax,[ebp+20]
		mov eax,[eax]			;size->eax

		fld qword [ecx]			;st0 = *factor (used by the head loop)
		fst qword [factor]		;keep a copy that survives femms

		test eax,eax
		jle near .done			;nothing to do for size <= 0

		;---- phase 1: scalar until src is 64-byte aligned -------------
.head:
		test esi,63
		jz short .blocks

		fld qword [esi]
		fmul st1			;st0 = src[i] * factor
		fstp qword [ebx]

		add esi,byte 8
		add ebx,byte 8

		dec eax
		jnz short .head
		jmp near .done			;all elements consumed while aligning

		;---- split the remaining count into blocks and a tail ---------
.blocks:
		xor edx,edx
		mov ecx,DSCALE_BLOCK
		div ecx				;eax = block count, edx = tail count

		mov ecx,eax			;block count->ecx
		lea eax,[esi+DSCALE_BIAS]	;biased src->eax
		add ebx,byte DSCALE_BIAS	;biased dest->ebx
		mov esi,work+DSCALE_BIAS	;biased &work->esi

		;The pointers are set up before this check because the tail
		;loop addresses through the biased eax/ebx as well.
		test ecx,ecx
		jz near .tail

		femms

		;---- phase 2: 24 doubles per pass -----------------------------
		align 16
.main_loop:
		;Touch all 24 source qwords with MMX loads (values are discarded).
		movq mm0,[eax-15*8]
		movq mm1,[eax-14*8]
		movq mm2,[eax-13*8]
		movq mm0,[eax-12*8]
		movq mm1,[eax-11*8]
		movq mm2,[eax-10*8]
		movq mm0,[eax-9*8]
		movq mm1,[eax-8*8]

		movq mm2,[eax-7*8]
		movq mm0,[eax-6*8]
		movq mm1,[eax-5*8]
		movq mm2,[eax-4*8]
		movq mm0,[eax-3*8]
		movq mm1,[eax-2*8]
		movq mm2,[eax-1*8]
		movq mm0,[eax-0*8]

		movq mm1,[eax+1*8]
		movq mm2,[eax+2*8]
		movq mm0,[eax+3*8]
		movq mm1,[eax+4*8]
		movq mm2,[eax+5*8]
		movq mm0,[eax+6*8]
		movq mm1,[eax+7*8]
		movq mm2,[eax+8*8]

		;Prefetch the six 64-byte lines that follow the current block.
		prefetch [eax+3*64-15*8]
		prefetch [eax+4*64-15*8]
		prefetch [eax+5*64-15*8]
		prefetch [eax+6*64-15*8]
		prefetch [eax+7*64-15*8]
		prefetch [eax+8*64-15*8]

		femms				;switch MMX -> x87

		fld qword [factor]		;four copies of the factor; after each
		fld st0				;element load they sit in st1..st4
		fld st0
		fld st0

		DSCALE_ELEM -15,st4
		DSCALE_ELEM -14,st3
		DSCALE_ELEM -13,st2
		DSCALE_ELEM -12,st1
		DSCALE_ELEM -11,st4
		DSCALE_ELEM -10,st3
		DSCALE_ELEM -9,st2
		DSCALE_ELEM -8,st1

		DSCALE_ELEM -7,st4
		DSCALE_ELEM -6,st3
		DSCALE_ELEM -5,st2
		DSCALE_ELEM -4,st1
		DSCALE_ELEM -3,st4
		DSCALE_ELEM -2,st3
		DSCALE_ELEM -1,st2
		DSCALE_ELEM -0,st1

		DSCALE_ELEM 1,st4
		DSCALE_ELEM 2,st3
		DSCALE_ELEM 3,st2
		DSCALE_ELEM 4,st1
		DSCALE_ELEM 5,st4
		DSCALE_ELEM 6,st3
		DSCALE_ELEM 7,st2
		DSCALE_ELEM 8,st1

		femms				;switch x87 -> MMX (drops the factor copies)

		;Stream the 24 scaled doubles from work to dest, bypassing the
		;cache; loads run three qwords ahead of the stores.
		movq mm0,[esi-15*8]
		movq mm1,[esi-14*8]
		movq mm2,[esi-13*8]
		movntq [ebx-15*8],mm0
		movq mm3,[esi-12*8]
		movntq [ebx-14*8],mm1
		movq mm4,[esi-11*8]
		movntq [ebx-13*8],mm2
		movq mm5,[esi-10*8]
		movntq [ebx-12*8],mm3
		movq mm0,[esi-9*8]
		movntq [ebx-11*8],mm4
		movq mm1,[esi-8*8]
		movntq [ebx-10*8],mm5
		movq mm2,[esi-7*8]
		movntq [ebx-9*8],mm0
		movq mm3,[esi-6*8]
		movntq [ebx-8*8],mm1
		movq mm4,[esi-5*8]
		movntq [ebx-7*8],mm2
		movq mm5,[esi-4*8]
		movntq [ebx-6*8],mm3
		movq mm0,[esi-3*8]
		movntq [ebx-5*8],mm4
		movq mm1,[esi-2*8]
		movntq [ebx-4*8],mm5
		movq mm2,[esi-1*8]
		movntq [ebx-3*8],mm0
		movq mm3,[esi-0*8]
		movntq [ebx-2*8],mm1
		movq mm4,[esi+1*8]
		movntq [ebx-1*8],mm2
		movq mm5,[esi+2*8]
		movntq [ebx-0*8],mm3
		movq mm0,[esi+3*8]
		movntq [ebx+1*8],mm4
		movq mm1,[esi+4*8]
		movntq [ebx+2*8],mm5
		movq mm2,[esi+5*8]
		movntq [ebx+3*8],mm0
		movq mm3,[esi+6*8]
		movntq [ebx+4*8],mm1
		movq mm4,[esi+7*8]
		movntq [ebx+5*8],mm2
		movq mm5,[esi+8*8]
		movntq [ebx+6*8],mm3
		movntq [ebx+7*8],mm4
		movntq [ebx+8*8],mm5

		add eax,DSCALE_BLOCK*8		;next source block
		add ebx,DSCALE_BLOCK*8		;next destination block
						;(esi is not advanced: work is reused)

		dec ecx
		jnz near .main_loop

		sfence				;make the movntq stores globally visible

		;---- phase 3: remaining (count mod 24) doubles ----------------
.tail:
		test edx,edx
		jz short .done

		femms

		fld qword [factor]
.tail_loop:
		fld qword [eax-15*8]		;eax/ebx still carry the +15*8 bias
		fmul st1
		fstp qword [ebx-15*8]

		add eax,byte 8
		add ebx,byte 8

		dec edx
		jnz short .tail_loop

.done:
		femms				;leave the x87/MMX state empty

		pop eax
		pop esi
		pop ebx
		pop ecx

		leave				;mov esp,ebp / pop ebp
		ret

		;Writable scratch data used by dscale. Because it is static and
		;written on every call, dscale is not reentrant.
		section .data progbits alloc noexec write align=16
		
		;16 qwords (128 bytes) placed ahead of work.
		;NOTE(review): purpose not evident from this file - presumably padding; confirm.
		times 16 dq 0.0
		;Staging buffer: dscale writes the scaled doubles here and then
		;streams them to dest. dscale addresses only the first 24 qwords.
work		times 48 dq 0.0
		
		;Copy of *factor stored by dscale at entry; the 1.2 initial value
		;is overwritten before dscale reads it.
factor		dq 1.2		


Regards

Julian