# To unbundle, sh this file echo ludpiv.f 1>&2 cat >ludpiv.f <<'End of ludpiv.f' double precision a(1001,1001),x(1001),b(1001),v(1001),mflops double precision emax integer ipvt(1001),output c c Danny Sorensen and Jack Dongarra c Argonne National Laboratory c lda = 1001 output = 6 c open(output,file = 'sore1.out') c rewind (output) c write(output,40) 40 format(' lu decomposition timing') do 200 n = 50,1000,50 call matgen(a,lda,n,b) write(output,50)lda,n 50 format(/' sp size of the arrays',i5,' and order is ',i5/) c t1 = second(t) call dpgefa(a,lda,n,ipvt,v,ierr) t2 = second(t) - t1 call lus(a,lda,n,ipvt,x,b) mflops = n mflops = mflops*(5+mflops*(-3+mflops*4)) mflops = mflops/(t2*6.0d6) emax = 0.0 do 111 i = 1,n emax = dmax1(emax,dabs(x(i))) 111 continue if (ierr .ne. 0) emax = -ierr write(output,120) t2,mflops,emax 120 format(' lux parallel version time = ',e12.3,' mflops = ',f9.4, $ ' check = ',e12.3) c 200 continue stop end subroutine matgen(a,lda,n,b) double precision a(lda,*),b(*) c init = 1325 do 30 j = 1,n do 20 i = 1,n init = mod(3125*init,65536) a(i,j) = (init - 32768.0)/16384.0 c if (i .ne. j) then c a(i,j) = 0 c endif c if (j .gt. 1) a(1,j) = j 20 continue c a(j,j) = 100 30 continue do 35 i = 1,n b(i) = 0.0 35 continue do 50 j = 1,n do 40 i = 1,n b(i) = b(i) + a(i,j) 40 continue 50 continue return end subroutine lus (a,lda,n,ipvt,x,b) integer ipvt(*) double precision a(lda,*), x(*), b(*), xk c c purpose: c solve the linear system ax = b given the lu factorization of a (as c computed in lu). c c additional parameters (not parameters of lu): c c x double precision(n), solution of linear system c c b double precision(n), right-hand-side of linear system c c ---------------------------------------------------------------------- c do 10 k = 1, n x(k) = b(k) 10 continue c c do 40 k = 1, n l = ipvt(k) xk = x(l) x(l) = x(k) x(k) = xk do 30 i = k+1, n x(i) = x(i) - a(i,k)*xk 30 continue 40 continue c do 60 k = n, 1, -1 xk = x(k)*a(k,k) do 50 i = 1, k-1 x(i) = x(i) - a(i,k)*xk 50 continue x(k) = xk 60 continue c return end subroutine lux(a,lda,n,ipvt,v,info) c ** lu.f -- lu decomposition c c integer lda,n,ipvt(n),info integer kpiv(20) double precision a(lda,*),v(1),t c info = 0 iblksz = 7 incount = 1 do 40 j = 1, n, iblksz c c search for pivot c ibksm1 = iblksz - 1 if (n-j-ibksm1 .lt. 8) ibksm1 = n-j icount = 1 do 30 k = j, j + ibksm1 v(k:n) = abs(a(k:n,k)) i = firstmaxoffset(v(k:n)) + k t = dabs(a(i,k)) c write(6,*) ' i,t = ',i,t ipvt(k) = i kpiv(icount) = i - k + icount icount = icount + 1 c c test for zero pivot c if (t .lt. 1.0d-14) then info = k go to 50 endif c c interchange rows c do 20 jj = j,j+ibksm1 t = a(k,jj) a(k,jj) = a(i,jj) a(i,jj) = t 20 continue c c form k-th column of l c kp1 = k + 1 t = 1.0d0/a(k,k) incount = incount + 1 a(kp1:n,k) = t*a(kp1:n,k) a(k,k) = t do 25 jj = kp1,j+ibksm1 t = -a(k,jj) a(kp1:n,jj) = a(kp1:n,jj) + t*a(kp1:n,k) 25 continue 30 continue c c form the reduced submatrix in parallel c if (k .le. n) * call reduce(n-j+1,iblksz,lda,a(j,j),kpiv) c c put l back into standard form c c do 110 ii = k-1,j,-1 c kk = ipvt(ii) c do 120 jj = j,ii-1 c t = a(kk,jj) c a(kk,jj) = a(ii,jj) c a(ii,jj) = t c 120 continue c 110 continue if (k .gt. n) go to 50 40 continue 50 continue return end End of ludpiv.f echo lufac.f 1>&2 cat >lufac.f <<'End of lufac.f' subroutine dpgefa(a,lda,n,ipvt,v,info) c*********************************************************************** c c this routine factors a real n by n matrix a into the product of lower and c upper triangular matrices c c a = lu c c where p is a permutation matrix, l is unit lower triangular, and c u is upper triangular. this is a parallel-vector algorithm. c the algorithm is described in "". c c on entry c c a double precision (lda,n) c contains the matrix to be factored c c lda integer c the leading dimension of the array a c c n integer c the order of the matrix contained in a c c on return c c a an upper triangular matrix and the multipliers which were used c to obtain it. c the factorization can be written a = l*u where c l is the product of permutation and unit lower triangular c matrices and u is upper triangular. c c ipvt integer(n) c an integer vector of pivot indices c c v double precision(n) c a work array c c info integer c = 0 normal completion c = k if u(k,k) .eq. 0.0. this is not an error condition for c for this subroutine, but it does indecate that the c triangular solution routine will fail. this is not a c numerically reliable indication of singularity. one c should use the condition estimator from LINPACK to c obtain reliable estimates of near singularity. c c c Danny Sorensen and Jack Dongarra c Argonne National Laboratory c c integer lda,n,ipvt(n),info integer kpiv(20) double precision a(lda,*),v(n),t c info = 0 iblksz = 7 incount = 1 do 40 j = 1, n, iblksz c c search for pivot c ibksm1 = iblksz - 1 if (n-j-ibksm1 .lt. 8) ibksm1 = n-j icount = 1 do 30 k = j, j + ibksm1 v(k:n) = abs(a(k:n,k)) i = firstmaxoffset(v(k:n)) + k t = dabs(a(i,k)) ipvt(k) = i kpiv(icount) = i - k + icount icount = icount + 1 c c test for zero pivot c if (t .lt. 1.0d-14) then info = k go to 50 endif c c interchange rows c do 20 jj = j,j+ibksm1 t = a(k,jj) a(k,jj) = a(i,jj) a(i,jj) = t 20 continue c c form k-th column of l c kp1 = k + 1 t = 1.0d0/a(k,k) incount = incount + 1 a(kp1:n,k) = t*a(kp1:n,k) a(k,k) = t do 25 jj = kp1,j+ibksm1 t = -a(k,jj) a(kp1:n,jj) = a(kp1:n,jj) + t*a(kp1:n,k) 25 continue 30 continue c c form the reduced submatrix in parallel c if (k .le. n) * call reduce(n-j+1,iblksz,lda,a(j,j),kpiv) c c put l back into standard form c do 110 ii = k-1,j,-1 kk = ipvt(ii) do 120 jj = j,ii-1 t = a(kk,jj) a(kk,jj) = a(ii,jj) a(ii,jj) = t 120 continue 110 continue if (k .gt. n) go to 50 40 continue 50 continue return end End of lufac.f echo makefile 1>&2 cat >makefile <<'End of makefile' FILES = lufac.o ludpiv.o smxpyp8.o prepare.o reduce.o second2.o matvec8.o mmul7.o run : $(FILES) fortran -O -AS -recursive $(FILES) -o run .f.o : ; fortran -O -AS -recursive -c $*.f End of makefile echo matvec8.s 1>&2 cat >matvec8.s <<'End of matvec8.s' | Machine Code Listing of matvec.f | | 0 4 8 12 16 20 | subroutine matvec(m,n,lda,a ,x, y ) | double precision a(lda,1),x(1),y(1) | do 100 j = 1,n | y(1:m) = a(1:m,j)*x(j) | 100 continue | return | end | | Danny Sorensen | Argonne National Laboratory | | .data .bss .text _BTEXT: | Literal Pool .globl _matvec_ _matvec_: |> 0000 link a6,#-40 | grab stack space | LINE 2 movl a0@(8),a2 | put address of lda in a2 | LINE 5 movl a0@(4),a1 | put address of n in a1 | LINE 2 movl a0@(20),a5 | put address of y in a5 movl a0@(12),a4 | put address of a in a4 movl a2@,d2 | put value of lda in d2 | LINE 5 movl a0@,a3 | put value of m in a3 movl a3@,d4 | into d4 to set vec length movl a0@(16),a3 | move address of x into a3 moveq #0,d0 | d0 will point to row blocks of a fmoveld d0,fp1 | put 0 in fp1 moveq #-1,d6 | set vector mask to all ones moveq #1,d5 | set stride to one matvec.label_L1: vmoved fp1,v2 | initialize result to 0 movl a1@,d7 | put value of n in d7 subql #1,d7 | initialize loop counter moveq #1,d3 | d3 indexes the x vector movl d0,d1 | d1 indexes the columns of a matvec.label_LE: | LINE 6 fmoved a3@(-8:b)[d3:l:d],fp0 | get jth component xj of x into fp0 vmuadd fp0,a4@(0:b)[d1:l:d],v2,v2 | mult col of a by xj and add to result | LINE 7 addl d5,d3 | increment x index | LINE 6 addl d2,d1 | increment column index subql #1,d7 | decrement loop counter | LINE 9 bges matvec.label_LE |> 006A 6CCC vmoved v2,a5@(0:b)[d0:l:d] | write result to y moveq #32,d7 addl d7,d0 vcnt32 matvec.label_L1 unlk a6 |> 006C 4E5E rts |> 006E 4E75 End of matvec8.s echo mmul7.s 1>&2 cat >mmul7.s <<'End of mmul7.s' | Machine Code Listing of mmul.f | | 0 4 8 12 16 20 24 | subroutine mmul(k,n,iblksz,lda,a, x, y ) | double precision a(lda,1),x(lda,1),y(lda,1) | do 100 j = 1,n | do 50 i = 1,iblksz | y(1:k,j) = y(1:k,j) - a(1:k,i)*x(i,j) | 50 continue | 100 continue | return | end | | Danny Sorensen | Argonne National Laboratory | | .data .bss .text _BTEXT: | Literal Pool .globl _mmul_ _mmul_: |> 0000 link a6,#-40 | grab stack space | LINE 2 movl a0@(12),a2 | put address of lda in a2 | LINE 5 movl a0@(4),a1 | put address of n in a1 | LINE 2 movl a0@(24),a5 | put address of y in a5 movl a0@(16),a4 | put address of a in a4 movl a2@,d2 | put value of lda in d2 | LINE 5 movl a0@,a3 | put value of k movl a3@,d4 | into d4 to set vec length movl a0@(20),a3 | move address of x into a3 moveq #0,d0 | d0 will point to row blocks of a moveq #-1,d6 | set vector mask to all ones moveq #1,d5 | set stride to one mmul.label_L1: movl a1@,d7 | put value of n in d7 subql #1,d7 | initialize loop counter movl d0,d1 | d1 indexes the columns of a vmoved a4@(0:b)[d1:l:d],v0 addl d2,d1 | increment column index vmoved a4@(0:b)[d1:l:d],v1 addl d2,d1 | increment column index vmoved a4@(0:b)[d1:l:d],v2 addl d2,d1 | increment column index vmoved a4@(0:b)[d1:l:d],v3 addl d2,d1 | increment column index vmoved a4@(0:b)[d1:l:d],v4 addl d2,d1 | increment column index vmoved a4@(0:b)[d1:l:d],v5 addl d2,d1 | increment column index vmoved a4@(0:b)[d1:l:d],v6 | load columns of a into v0-v3 movl d0,d1 | d1 indexes the columns of y movl #0,d3 | initialize column index for x mmul.label_LE: | LINE 6 vmoved a5@(0:b)[d1:l:d],v7 | read column of y fnegd a3@(0:b)[d3:l:d],fp0 | move negative of x component into fp0 vmuadd fp0,v0,v7,v7 | multiply column of a by x and add to y fnegd a3@(8:b)[d3:l:d],fp0 | move negative of x component into fp0 vmuadd fp0,v1,v7,v7 | multiply column of a by x and add to y fnegd a3@(16:b)[d3:l:d],fp0 | move negative of x component into fp0 vmuadd fp0,v2,v7,v7 | multiply column of a by x and add to y fnegd a3@(24:b)[d3:l:d],fp0 | move negative of x component into fp0 vmuadd fp0,v3,v7,v7 | multiply column of a by x and add to y fnegd a3@(32:b)[d3:l:d],fp0 | move negative of x component into fp0 vmuadd fp0,v4,v7,v7 | multiply column of a by x and add to y fnegd a3@(40:b)[d3:l:d],fp0 | move negative of x component into fp0 vmuadd fp0,v5,v7,v7 | multiply column of a by x and add to y fnegd a3@(48:b)[d3:l:d],fp0 | move negative of x component into fp0 vmuadd fp0,v6,v7,v7 | multiply column of a by x and add to y | get jth column xj of x into fp0,fp1,fp2,fp3 | mult col of a by xj and add to result vmoved v7,a5@(0:b)[d1:l:d] | write result to y addl d2,d1 | increment column index for y addl d2,d3 | increment column index for x subql #1,d7 | decrement loop counter | LINE 9 bges mmul.label_LE |> 006A 6CCC moveq #32,d7 addl d7,d0 vcnt32 mmul.label_L1 unlk a6 |> 006C 4E5E rts |> 006E 4E75 End of mmul7.s echo prepare.f 1>&2 cat >prepare.f <<'End of prepare.f' subroutine prepar(n,iblksz,ldl,l,x,kpiv) c******************************************************************* c c this routine is called by dpgefa. it prepares for block elimination c by performing pivoting and multiplying through by c the appropriate lower triangular matrix. c c on entry c c n integer c the order of the matrix c c Danny Sorensen c Argonne National Laboratory c CVD$G NOCONCUR c integer ldl,iblksz,n,kpiv(*) double precision l(ldl,*),x(ldl,*) double precision t c write(6,*) ' ldl n ',ldl,n do 40 k = 1,iblksz c c interchange rows c i = kpiv(k) c write(6,*) ' i k ',i,k do 20 j = 1,n t = x(k,j) x(k,j) = x(i,j) x(i,j) = t 20 continue 40 continue do 41 k = 1,iblksz c c modify the columns of x c kp1 = k + 1 do 25 j = 1,n t = -x(k,j) do 24 i = kp1,iblksz x(i,j) = x(i,j) + t*l(i,k) 24 continue 25 continue 41 continue return end End of prepare.f echo reduce.f 1>&2 cat >reduce.f <<'End of reduce.f' subroutine reduce(n,iblksz,lda,a,kpiv) c c parallel version for the alliant c c c Danny Sorensen c Argonne National Laboratory c integer lda,n,kpiv(*) double precision a(lda,*) integer k,nproc,nn(9),jcol(9) nproc = 8 k = n - iblksz c c doall loop c lseg = k/nproc lrem = k - lseg*nproc jcol(1) = iblksz + 1 ksum = 0 do 30 j = 1,nproc inc = 0 if (lrem .gt. 0) inc = 1 nn(j) = lseg + inc jcol(j+1) = jcol(j) + nn(j) lrem = lrem - 1 ksum = ksum + nn(j) c write(6,*) ' k, j, nn(j) ,ksum, ',k,j,nn(j),ksum 30 continue CVD$G CNCALL do 35 j = 1,nproc call prepar(nn(j),iblksz,lda,a(1,1),a(1,jcol(j)),kpiv) 35 continue do 300 j = 1,nproc call mmul(k,nn(j),iblksz,lda,a(jcol(1),1),a(1,jcol(j)), * a(jcol(1),jcol(j))) 300 continue c c c return end End of reduce.f echo second2.f 1>&2 cat >second2.f <<'End of second2.f' real function second(t) real t real t1(2) t = etime(t1) t = t1(1) second = t return end End of second2.f echo smxpyp8.f 1>&2 cat >smxpyp8.f <<'End of smxpyp8.f' subroutine smxpyp (m,y,n,lda,x,a) c c parallel version for the alliant c c c Danny Sorensen c Argonne National Laboratory c integer lda,m,n double precision y(m),x(n),a(lda,n) double precision yy(1000,8) integer nproc,nn(9),jcol(9) nproc = 8 c c original loop c if (n .lt. nproc .or. m .lt. nproc) then do 20 j = 1, n do 10 i = 1, m y(i) = y(i) + x(j)*a(i,j) 10 continue 20 continue else c c doall loop c lseg = n/nproc lrem = n - lseg*nproc jcol(1) = 1 do 30 j = 1,nproc inc = 0 if (lrem .gt. 0) inc = 1 nn(j) = lseg + inc jcol(j+1) = jcol(j) + nn(j) lrem = lrem - 1 30 continue CVD$L CNCALL do 300 j = 1,nproc call matvec(m,nn(j),lda,a(1,jcol(j)),x(jcol(j)),yy(1,j)) 300 continue y(1:m) = y(1:m) + yy(1:m,1) + yy(1:m,2) + yy(1:m,3) * + yy(1:m,4) + yy(1:m,5) + yy(1:m,6) * + yy(1:m,7) + yy(1:m,8) endif c c c return end End of smxpyp8.f