# To unbundle, sh this file
echo ludpiv.f 1>&2
cat >ludpiv.f <<'End of ludpiv.f'
      double precision a(1001,1001),x(1001),b(1001),v(1001),mflops
      double precision emax
      integer ipvt(1001),output
c
c     Danny Sorensen and Jack Dongarra
c     Argonne National Laboratory
c
      lda = 1001
      output = 6
c      open(output,file = 'sore1.out')
c      rewind (output)
c
      write(output,40)
   40 format('  lu decomposition timing')
      do 200 n = 50,1000,50
         call matgen(a,lda,n,b)
         write(output,50)lda,n
   50    format(/' sp size of the arrays',i5,' and order is ',i5/)
c
         t1 = second(t)
         call dpgefa(a,lda,n,ipvt,v,ierr)
         t2 = second(t) - t1
         call lus(a,lda,n,ipvt,x,b)
         mflops = n
         mflops = mflops*(5+mflops*(-3+mflops*4))
         mflops = mflops/(t2*6.0d6)
         emax = 0.0
         do 111 i = 1,n
            emax = dmax1(emax,dabs(x(i)))
  111    continue
         if (ierr .ne. 0) emax = -ierr
         write(output,120) t2,mflops,emax
  120    format(' lux parallel version time = ',e12.3,' mflops = ',f9.4,
     $          ' check = ',e12.3)
c
  200 continue
      stop
      end
      subroutine matgen(a,lda,n,b)
      double precision a(lda,*),b(*)
c
      init = 1325
      do 30 j = 1,n
         do 20 i = 1,n
            init = mod(3125*init,65536)
            a(i,j) = (init - 32768.0)/16384.0
c           if (i .ne. j) then 
c              a(i,j) = 0
c           endif
c           if (j .gt. 1) a(1,j) = j
   20    continue
c           a(j,j) = 100
   30 continue
      do 35 i = 1,n
          b(i) = 0.0
   35 continue
      do 50 j = 1,n
         do 40 i = 1,n
            b(i) = b(i) + a(i,j)
   40    continue
   50 continue
      return
      end
      subroutine lus (a,lda,n,ipvt,x,b)
      integer ipvt(*)
      double precision a(lda,*), x(*), b(*), xk
c
c   purpose:
c     solve the linear system ax = b given the lu factorization of a (as
c     computed in lu).
c
c   additional parameters (not parameters of lu):
c
c     x double precision(n), solution of linear system
c
c     b double precision(n), right-hand-side of linear system
c
c ----------------------------------------------------------------------
c
      do 10 k = 1, n
       x(k) = b(k)
   10 continue
c
c
      do 40 k = 1, n
       l = ipvt(k)
       xk = x(l)
       x(l) = x(k)
       x(k) = xk
       do 30 i = k+1, n
          x(i) = x(i) - a(i,k)*xk
   30    continue
   40 continue
c
      do 60 k = n, 1, -1
         xk = x(k)*a(k,k)
         do 50 i = 1, k-1
            x(i) = x(i) - a(i,k)*xk
   50    continue
         x(k) = xk
   60 continue
c
      return
      end
      subroutine lux(a,lda,n,ipvt,v,info)
c ** lu.f -- lu decomposition
c
c
      integer lda,n,ipvt(n),info
      integer kpiv(20)
      double precision a(lda,*),v(1),t
c
      info = 0
      iblksz = 7
      incount = 1
      do 40 j = 1, n, iblksz
c
c       search for pivot
c
         ibksm1 = iblksz - 1
         if (n-j-ibksm1 .lt. 8) ibksm1 = n-j
         icount = 1
         do 30 k = j, j + ibksm1
            v(k:n) = abs(a(k:n,k))
            i = firstmaxoffset(v(k:n)) + k
            t = dabs(a(i,k))
c           write(6,*) ' i,t = ',i,t
            ipvt(k) = i
            kpiv(icount) = i - k + icount
            icount = icount + 1
c
c           test for zero pivot
c
            if (t .lt. 1.0d-14) then
               info = k
               go to 50
            endif
c
c          interchange rows
c
           do 20 jj = j,j+ibksm1
              t = a(k,jj)
              a(k,jj) = a(i,jj)
              a(i,jj) = t
   20      continue 
c
c          form k-th column of l
c
           kp1 = k + 1
           t = 1.0d0/a(k,k)
           incount = incount + 1
           a(kp1:n,k) = t*a(kp1:n,k)
           a(k,k) = t
           do 25 jj = kp1,j+ibksm1
              t = -a(k,jj)
              a(kp1:n,jj) = a(kp1:n,jj) + t*a(kp1:n,k)
   25      continue 
   30   continue
c
c       form the reduced submatrix in parallel
c
        if (k .le. n) 
     *  call reduce(n-j+1,iblksz,lda,a(j,j),kpiv)
c
c          put l back into standard form
c
c       do 110 ii = k-1,j,-1
c          kk = ipvt(ii)
c          do 120 jj = j,ii-1           
c             t = a(kk,jj)
c             a(kk,jj) = a(ii,jj)
c             a(ii,jj) = t
c 120      continue 
c 110   continue
        if (k .gt. n) go to 50
   40 continue
   50 continue
      return
      end
End of ludpiv.f
echo lufac.f 1>&2
cat >lufac.f <<'End of lufac.f'
      subroutine dpgefa(a,lda,n,ipvt,v,info)
c***********************************************************************
c
c     this routine factors a real n by n matrix a into the product of lower and 
c     upper triangular matrices 
c
c               a = lu
c
c      where p is a permutation matrix, l is unit lower triangular, and 
c      u is upper triangular.  this is a parallel-vector algorithm.  
c      the algorithm is described in "".
c
c      on entry
c   
c         a     double precision (lda,n)
c               contains the matrix to be factored
c
c         lda   integer
c               the leading dimension of the array a
c
c         n     integer
c               the order of the matrix contained in a
c
c     on return
c
c         a     an upper triangular matrix and the multipliers which were used
c               to obtain it. 
c               the factorization can be written  a = l*u where 
c               l is the product of permutation and unit lower triangular 
c               matrices and u is upper triangular.
c
c         ipvt  integer(n)
c               an integer vector of pivot indices
c
c         v     double precision(n)
c               a work array
c
c         info  integer
c               = 0 normal completion
c               = k if u(k,k) .eq. 0.0.  this is not an error condition for 
c                 for this subroutine, but it does indecate that the 
c                 triangular solution routine will fail.  this is not a 
c                 numerically reliable indication of singularity.  one
c                 should use the condition estimator  from LINPACK to 
c                 obtain reliable estimates of near singularity.
c
c
c     Danny Sorensen and Jack Dongarra
c     Argonne National Laboratory
c
c
      integer lda,n,ipvt(n),info
      integer kpiv(20)
      double precision a(lda,*),v(n),t
c
      info = 0
      iblksz = 7
      incount = 1
      do 40 j = 1, n, iblksz
c
c       search for pivot
c
         ibksm1 = iblksz - 1
         if (n-j-ibksm1 .lt. 8) ibksm1 = n-j
         icount = 1
         do 30 k = j, j + ibksm1
            v(k:n) = abs(a(k:n,k))
            i = firstmaxoffset(v(k:n)) + k
            t = dabs(a(i,k))
            ipvt(k) = i
            kpiv(icount) = i - k + icount
            icount = icount + 1
c
c           test for zero pivot
c
            if (t .lt. 1.0d-14) then
               info = k
               go to 50
            endif
c
c          interchange rows
c
           do 20 jj = j,j+ibksm1
              t = a(k,jj)
              a(k,jj) = a(i,jj)
              a(i,jj) = t
   20      continue 
c
c          form k-th column of l
c
           kp1 = k + 1
           t = 1.0d0/a(k,k)
           incount = incount + 1
           a(kp1:n,k) = t*a(kp1:n,k)
           a(k,k) = t
           do 25 jj = kp1,j+ibksm1
              t = -a(k,jj)
              a(kp1:n,jj) = a(kp1:n,jj) + t*a(kp1:n,k)
   25      continue 
   30   continue
c
c       form the reduced submatrix in parallel
c
        if (k .le. n) 
     *  call reduce(n-j+1,iblksz,lda,a(j,j),kpiv)
c
c          put l back into standard form
c
        do 110 ii = k-1,j,-1
           kk = ipvt(ii)
           do 120 jj = j,ii-1           
              t = a(kk,jj)
              a(kk,jj) = a(ii,jj)
              a(ii,jj) = t
  120      continue 
  110   continue
        if (k .gt. n) go to 50
   40 continue
   50 continue
      return
      end
End of lufac.f
echo makefile 1>&2
cat >makefile <<'End of makefile'
FILES = lufac.o ludpiv.o smxpyp8.o prepare.o reduce.o second2.o matvec8.o mmul7.o 
run :	$(FILES)
	fortran -O -AS -recursive $(FILES) -o run

.f.o : ;	fortran -O -AS -recursive -c $*.f
End of makefile
echo matvec8.s 1>&2
cat >matvec8.s <<'End of matvec8.s'
|                        Machine Code Listing of matvec.f 
|
|                       0 4 8   12 16 20
|     subroutine matvec(m,n,lda,a ,x, y )
|     double precision a(lda,1),x(1),y(1)
|     do 100 j = 1,n
|           y(1:m) = a(1:m,j)*x(j)
| 100 continue
|     return
|     end
|
|     Danny Sorensen 
|     Argonne National Laboratory
|
|     
       .data
       .bss
       .text
_BTEXT:

| Literal Pool

       
       .globl _matvec_
_matvec_:                         |> 0000 
       link a6,#-40               |  grab stack space
                                  | LINE 2
       movl a0@(8),a2             |  put address of lda in a2
                                  | LINE 5
       movl a0@(4),a1             |  put address of n in a1
                                  | LINE 2
       movl a0@(20),a5            |  put address of y in a5
       movl a0@(12),a4            |  put address of a in a4
       movl a2@,d2                |  put value of lda in d2
                                  | LINE 5
       movl a0@,a3                |  put value of m in a3
       movl a3@,d4                |  into d4 to set vec length
       movl a0@(16),a3            |  move address of x into a3
       moveq #0,d0                |  d0 will point to row blocks of a     
       fmoveld d0,fp1             |  put 0 in fp1
       moveq #-1,d6               |  set vector mask to all ones
       moveq #1,d5                |  set stride to one
matvec.label_L1:
       vmoved fp1,v2              |  initialize result to 0
       movl a1@,d7                |  put value of n in d7
       subql #1,d7                |  initialize loop counter
       moveq #1,d3                |  d3 indexes the x vector
       movl  d0,d1                |  d1 indexes the columns of a
matvec.label_LE:
                                  | LINE 6
       fmoved a3@(-8:b)[d3:l:d],fp0
                                  | get jth component xj of x into fp0
       vmuadd fp0,a4@(0:b)[d1:l:d],v2,v2                
                                  | mult col of a by xj and add to result
                                  | LINE 7
       addl d5,d3                 | increment x index
                                  | LINE 6
       addl d2,d1                 | increment column index
       subql #1,d7                | decrement loop counter
                                  | LINE 9
       bges matvec.label_LE       |> 006A  6CCC
       vmoved v2,a5@(0:b)[d0:l:d] | write result to y
       moveq  #32,d7
       addl   d7,d0
       vcnt32 matvec.label_L1
       unlk a6                    |> 006C  4E5E
       rts                        |> 006E  4E75
End of matvec8.s
echo mmul7.s 1>&2
cat >mmul7.s <<'End of mmul7.s'

|                        Machine Code Listing of mmul.f 
|
|                     0 4 8      12  16 20 24
|     subroutine mmul(k,n,iblksz,lda,a, x, y )
|     double precision a(lda,1),x(lda,1),y(lda,1)
|     do 100 j = 1,n
|        do 50 i = 1,iblksz
|           y(1:k,j) = y(1:k,j) - a(1:k,i)*x(i,j)
|  50    continue
| 100 continue
|     return
|     end
|
|     Danny Sorensen 
|     Argonne National Laboratory
|
|     
       .data
       .bss
       .text
_BTEXT:

| Literal Pool

       
       .globl _mmul_
_mmul_:                           |> 0000 
       link a6,#-40               |  grab stack space
                                  | LINE 2
       movl a0@(12),a2            |  put address of lda in a2
                                  | LINE 5
       movl a0@(4),a1             |  put address of n in a1
                                  | LINE 2
       movl a0@(24),a5            |  put address of y in a5
       movl a0@(16),a4            |  put address of a in a4
       movl a2@,d2                |  put value of lda in d2
                                  | LINE 5
       movl a0@,a3                |  put value of k 
       movl a3@,d4                |  into d4 to set vec length
       movl a0@(20),a3            |  move address of x into a3
       moveq #0,d0                |  d0 will point to row blocks of a     
       moveq #-1,d6               |  set vector mask to all ones
       moveq #1,d5                |  set stride to one
mmul.label_L1:
       movl a1@,d7                |  put value of n in d7
       subql #1,d7                |  initialize loop counter
       movl  d0,d1                |  d1 indexes the columns of a
       vmoved a4@(0:b)[d1:l:d],v0
       addl d2,d1                 | increment column index
       vmoved a4@(0:b)[d1:l:d],v1
       addl d2,d1                 | increment column index
       vmoved a4@(0:b)[d1:l:d],v2
       addl d2,d1                 | increment column index
       vmoved a4@(0:b)[d1:l:d],v3
       addl d2,d1                 | increment column index
       vmoved a4@(0:b)[d1:l:d],v4
       addl d2,d1                 | increment column index
       vmoved a4@(0:b)[d1:l:d],v5
       addl d2,d1                 | increment column index
       vmoved a4@(0:b)[d1:l:d],v6
                                  | load columns of a into v0-v3
       movl  d0,d1                | d1 indexes the columns of y
       movl #0,d3                 | initialize column index for x
mmul.label_LE:
                                  | LINE 6
       vmoved a5@(0:b)[d1:l:d],v7 | read column of y   
       fnegd a3@(0:b)[d3:l:d],fp0 | move negative of x component into fp0
       vmuadd fp0,v0,v7,v7        | multiply column of a by x and add to y
       fnegd a3@(8:b)[d3:l:d],fp0 | move negative of x component into fp0
       vmuadd fp0,v1,v7,v7        | multiply column of a by x and add to y
       fnegd a3@(16:b)[d3:l:d],fp0 
                                  | move negative of x component into fp0
       vmuadd fp0,v2,v7,v7        | multiply column of a by x and add to y
       fnegd a3@(24:b)[d3:l:d],fp0 
                                  | move negative of x component into fp0
       vmuadd fp0,v3,v7,v7        | multiply column of a by x and add to y
       fnegd a3@(32:b)[d3:l:d],fp0 
                                  | move negative of x component into fp0
       vmuadd fp0,v4,v7,v7        | multiply column of a by x and add to y
       fnegd a3@(40:b)[d3:l:d],fp0 
                                  | move negative of x component into fp0
       vmuadd fp0,v5,v7,v7        | multiply column of a by x and add to y
       fnegd a3@(48:b)[d3:l:d],fp0 
                                  | move negative of x component into fp0
       vmuadd fp0,v6,v7,v7        | multiply column of a by x and add to y
                                  | get jth column xj of x into fp0,fp1,fp2,fp3
                                  | mult col of a by xj and add to result
       vmoved v7,a5@(0:b)[d1:l:d] | write result to y
       addl d2,d1                 | increment column index for y
       addl d2,d3                 | increment column index for x
       subql #1,d7                | decrement loop counter
                                  | LINE 9
       bges mmul.label_LE         |> 006A  6CCC
       moveq  #32,d7
       addl   d7,d0
       vcnt32 mmul.label_L1
       unlk a6                    |> 006C  4E5E
       rts                        |> 006E  4E75
End of mmul7.s
echo prepare.f 1>&2
cat >prepare.f <<'End of prepare.f'
      subroutine prepar(n,iblksz,ldl,l,x,kpiv)
c*******************************************************************
c
c     this routine is called by dpgefa.  it prepares for block elimination
c     by performing pivoting and multiplying through by 
c     the appropriate lower triangular matrix.
c
c     on entry 
c 
c        n     integer
c              the order of the matrix
c
c     Danny Sorensen 
c     Argonne National Laboratory
c
CVD$G NOCONCUR
c
      integer ldl,iblksz,n,kpiv(*)
      double precision l(ldl,*),x(ldl,*)
      double precision t
c     write(6,*) ' ldl n ',ldl,n
      do 40 k = 1,iblksz
c
c        interchange rows
c
         i = kpiv(k)
c        write(6,*) ' i k ',i,k
         do 20 j = 1,n 
            t = x(k,j)
            x(k,j) = x(i,j)
            x(i,j) = t
   20    continue 
   40 continue
      do 41 k = 1,iblksz
c
c        modify the columns of x
c
         kp1 = k + 1
         do 25 j = 1,n
            t = -x(k,j)
            do 24 i = kp1,iblksz
               x(i,j) = x(i,j) + t*l(i,k)
   24       continue
   25    continue 
   41 continue
      return
      end
End of prepare.f
echo reduce.f 1>&2
cat >reduce.f <<'End of reduce.f'
      subroutine reduce(n,iblksz,lda,a,kpiv)
c
c     parallel version for the alliant
c
c
c     Danny Sorensen 
c     Argonne National Laboratory
c
      integer lda,n,kpiv(*)
      double precision a(lda,*)
      integer k,nproc,nn(9),jcol(9)       
      nproc = 8
      k = n - iblksz
c
c           doall loop
c
         lseg = k/nproc
         lrem = k - lseg*nproc
         jcol(1) = iblksz + 1
         ksum = 0
         do 30 j = 1,nproc
            inc = 0
            if (lrem .gt. 0) inc = 1
            nn(j) = lseg + inc
            jcol(j+1) = jcol(j) + nn(j)
            lrem = lrem - 1
             ksum = ksum + nn(j)
c           write(6,*) ' k, j, nn(j) ,ksum, ',k,j,nn(j),ksum
   30    continue
CVD$G CNCALL
         do 35 j = 1,nproc
            call prepar(nn(j),iblksz,lda,a(1,1),a(1,jcol(j)),kpiv)
   35    continue
         do 300 j = 1,nproc
            call mmul(k,nn(j),iblksz,lda,a(jcol(1),1),a(1,jcol(j)),
     *                                   a(jcol(1),jcol(j)))
  300    continue
c
c
c
         return
      end
End of reduce.f
echo second2.f 1>&2
cat >second2.f <<'End of second2.f'
      real function second(t)
      real t
      real t1(2)
      t = etime(t1)
      t = t1(1)
      second = t
      return
      end
    
End of second2.f
echo smxpyp8.f 1>&2
cat >smxpyp8.f <<'End of smxpyp8.f'

      subroutine smxpyp (m,y,n,lda,x,a)
c
c parallel version for the alliant
c
c
c     Danny Sorensen 
c     Argonne National Laboratory
c
      integer lda,m,n
      double precision y(m),x(n),a(lda,n)
      double precision yy(1000,8)
      integer nproc,nn(9),jcol(9)       
      nproc = 8
c
c     original loop
c
      if (n .lt. nproc .or. m .lt. nproc) then
         do 20 j = 1, n
            do 10 i = 1, m
               y(i) = y(i) + x(j)*a(i,j)
   10       continue
   20    continue
      else
c
c           doall loop
c
         lseg = n/nproc
         lrem = n - lseg*nproc
         jcol(1) = 1
         do 30 j = 1,nproc
            inc = 0
            if (lrem .gt. 0) inc = 1
            nn(j) = lseg + inc
            jcol(j+1) = jcol(j) + nn(j)
            lrem = lrem - 1
   30    continue
CVD$L CNCALL
         do 300 j = 1,nproc
            call matvec(m,nn(j),lda,a(1,jcol(j)),x(jcol(j)),yy(1,j))
  300    continue
         y(1:m) = y(1:m) + yy(1:m,1) + yy(1:m,2) + yy(1:m,3)
     *          + yy(1:m,4) + yy(1:m,5) + yy(1:m,6)
     *          + yy(1:m,7) + yy(1:m,8) 
      endif
c
c
c
         return
      end
End of smxpyp8.f