!Homework 14 !Programmer: Chad Brewbaker !Email address: crb002@iastate.edu !Date: April 8, 2004 !Machine used: hpc-class.iastate.edu (Intel Xenon/Myrinet cluster) !Compiler options used: implicit none integer,parameter :: cache=1024,ncache=cache*1024/8,nflush=ncache*4 double precision :: flush(nflush) ! cache = size of secondary cache in Kbytes ! ncache = number of 8 byte elements in the cache ! nflush = number of elements in the array flush that !   is used to flush the cache.  Since the Xeon !   microprocessor may a random replacement !   scheme, nflush is taken to be 4 times the !   number of elements needed to fill the cache. integer,parameter :: n=500, ntrial=10 double precision,dimension(ntrial) :: array_time, max_time, array_perf include "mpif.h" integer,parameter :: dp=mpi_double_precision, comm=mpi_comm_world integer :: p, rank, irank, i, j, ktrial, m, status(mpi_status_size) integer :: ierror, ier,newtype double precision :: t, y(n), u(n), A(n,n), x(n), z(n),Ay(n,n+1) double precision,allocatable,dimension(:,:) :: temp double precision,allocatable,dimension(:,:,:) :: tmp double precision,allocatable,dimension(:) :: yb integer,allocatable,dimension(:)::request, req integer,allocatable,dimension(:,:)::statuses call random_number(flush) call mpi_init(ierror) call mpi_comm_size(comm, p, ierror) m = n/p call mpi_comm_rank(comm, rank, ierror) allocate(temp(m,n+1)) allocate(tmp(m,n+1,p-1)) allocate(yb(m)) allocate(request(p-1)) allocate(req(p-1)) allocate(statuses(mpi_status_size,p-1)) ! Check to determine if p divides n: if (rank == 0) then print *, ' ' print *, 'this program assumes p divides n:' print *, ' p = ', p, ' and n = ', n endif !********* begin master section ************************************** if (rank == 0) then !   Initialize A, x & y on the master call random_number(A) call random_number(x) call random_number(y) Ay(1:n,1:n) = A(1:n,1:n) Ay(1:n,n+1) = y(1:n) ! Calculate z = y + Ax to check for program accuracy: z = y + matmul(A,x) u = y !  preserve the initial value of y endif ! ******** end master section *********************************** ! Initialize the flush array: call random_number(flush) do ktrial = 1, ntrial if (rank == 0) then y = u ! preserve the initial value of y for each trial endif flush = flush + 0.1d0 !  flush the cache call mpi_barrier(comm,ierror) t = mpi_wtime() !   Broadcast x to each processor: call mpi_bcast(x, n, dp, 0, comm, ierror) call MPI_Type_vector(n+1,m,n,dp,newtype,ierror) call MPI_Type_commit(newtype,ierror) !********** begin master section ************************************** if(rank == 0) then !   Comments on using nonblocking routines on processor 0: !   The posting of the sends of the arrays must be first and the mpi_waitall !   for can be placed anywhere since the tmp arrays are not used, so !   it was placed near the end of the work on this processor. !   Immediately after posting the sends, we post the receives. !   Post sending the y and A row blocks to the slave processors by first !   placing this information in the temporary array temp(1:m,1:(n+1)): irank = 1 do i = m+1, n, m call mpi_isend(Ay(i,1), 1, newtype, irank, 1, comm, request(irank), ierror) irank = irank + 1 enddo !   Post the receiving yb from the p-1 slave processors and assemble the new y: do i = 1, p-1 call mpi_irecv(y(i*m+1), m, dp, i, 2,comm, req(i), ierror) enddo !   The master now calculates the first block of y. call dgemv('n', m, n, 1.d0, A, n, x, 1, 1.d0, y, 1) ! optimal !   do j = 1, n ! y(1:m) = y(1:m) + A(1:m,j)*x(j) !   enddo call mpi_waitall(p-1, request, statuses, ierror) ! complete sends call mpi_waitall(p-1, req, statuses, ierror) ! complete receives endif ! ******************** end master section **************************** ! ******************** begin slave section *********************** if (rank > 0) then !   Receive temp from processor 0: call mpi_recv(temp, m*(n+1), dp, 0, 1, comm, status, ier) !   Calculate the new y value in yb(1:m): yb(1:m) = temp(1:m,n+1) call dgemv('n', m, n, 1.d0, temp, m, x, 1, 1.d0, yb, 1) ! optimal !   do j = 1, n ! yb(1:m) = yb(1:m) + temp(1:m,j)*x(j) !   enddo !   Send yb to the master processor: call mpi_send(yb, m, dp, 0, 2, comm, ierror) endif ! *********************** end slave section *************************** array_time(ktrial) = mpi_wtime() - t call mpi_barrier(comm,ierror) ! check answer when ktrial = 1: if (ktrial ==1) then if (rank ==0) then print *, 'Max absolute error = ', maxval(abs(y-z)) endif endif flush(ktrial) = flush(ktrial) + y(ktrial) enddo !   Take the max values of the local arrays array_time !   and place on processor 0: call mpi_reduce(array_time, max_time, ntrial, dp, & mpi_max, 0, comm, ierror) ! ********************* begin master **************************** if (rank == 0) then ! print *,'Max absolute error = ', maxval(abs(y-z)) print *,' ' print *,'Time in Seconds MFLOPS' array_perf = dble(2*n*n)*(1.0d-6)/max_time do ktrial = 1, ntrial print *,max_time(ktrial),' ',array_perf(ktrial) enddo print *,' ' print *,' ' endif ! ******************** end master ********************************** !   To eliminate the possibility of an optimizing compiler !   removing code from the above timing loop.  The barrier makes !   the following print statement execute after the above print !   statements. call mpi_barrier(mpi_comm_world,ierror) if (rank == 0) then print *,'flush(1)+array_time(1)+y(1) = ',flush(1)+array_time(1)+y(1) endif call mpi_finalize(ierror) endprogram !!$this program assumes p divides n: !!$ p = 4 and n = 500 !!$ Max absolute error = 2.41584530158434D-013 !!$ !!$ Time in Seconds MFLOPS !!$ 3.26981544494629D-002 15.2913829058084 !!$ 1.02458000183105D-002 48.8004840135896 !!$ 9.91797447204590D-003 50.4135195557586 !!$ 9.90891456604004D-003 50.4596135800390 !!$ 9.92298126220703D-003 50.3880826525709 !!$ 9.90200042724609D-003 50.4948473466243 !!$ 9.92703437805176D-003 50.3675096668828 !!$ 9.99498367309570D-003 50.0250942226039 !!$ 1.00097656250000D-002 49.9512195121951 !!$ 9.99402999877930D-003 50.0298678372060 !!$ !!$ !!$ flush(1)+array_time(1)+y(1) = 232.776240125174