!Homework 13 !Programmer: Chad Brewbaker !Email address: crb002@iastate.edu !Date: April 8, 2004 !Machine used: hpc-class.iastate.edu (Intel Xenon/Myrinet cluster) !Compiler options used: ! double precision :: A(n,n), y(n), Ay(n,n+1) ! Ay(1:n,1:n) = A(1:n,1:n) ! Ay(1:n,n+1) = y(1:n) implicit none integer,parameter :: cache=1024 ,ncache=cache*1024/8, & nflush=ncache*4 double precision :: flush(nflush) ! cache = size of secondary cache in Kbytes ! ncache = number of 8 byte elements in the cache ! nflush = number of elements in the array flush that ! is used to flush the cache. Since the Xeon ! microprocessor may use a random replacement ! scheme, nflush is taken to be 4 times the ! number of elements needed to fill the cache. integer,parameter :: n=500, ntrial=10 double precision,dimension(ntrial) :: max_time, array_time, array_perf include "mpif.h" integer,parameter :: dp=mpi_double_precision, comm=mpi_comm_world integer :: irank, i, j, ktrial, m, p, rank, newtype,status(mpi_status_size) integer :: ier, ierror double precision :: y(n), A(n,n), x(n), z(n), t,Ay(n,n+1) double precision,allocatable,dimension(:,:) :: temp double precision,allocatable,dimension(:) :: yb call random_number(flush) call mpi_init(ierror) call mpi_comm_size(comm, p, ierror) m = n/p call mpi_comm_rank(comm, rank, ierror) allocate(temp(m,n+1)) allocate(yb(m)) ! Check to determine if p divides n: if (rank .eq. 0) then print *, ' ' print *, 'this program assumes p divides n:' print *, ' p = ', p, ' and n = ', n endif !********* begin master section on processor 0 ******************** if (rank == 0) then ! Initialize A, x & y on the master call random_number(A) call random_number(x) call random_number(y) Ay(1:n,1:n) = A(1:n,1:n) Ay(1:n,n+1) = y(1:n) ! Calculate z = y + Ax to check for program accuracy: z = y + matmul(A,x) endif ! ******** end master section *********************************** call MPI_Type_vector(n+1,m,n,dp,newtype,ierror) call MPI_Type_commit(newtype,ierror) ! Initialize the flush array: call random_number(flush) do ktrial = 1, ntrial flush = flush + 0.1d0 ! flush the cache call mpi_barrier(comm,ierror) t = mpi_wtime() ! Broadcast x to each processor: call mpi_bcast(x(1), n, dp, 0, comm, ierror) !********** begin master section on processor 0 ****************************** if (rank == 0) then ! Send the y and A row blocks to the other processors. irank = 1 do i = m+1, n, m call MPI_Send(Ay(i,1), 1, newtype, irank, 1, comm, ierror) irank = irank + 1 enddo ! The processor 0 now calculates its portion of y = y + Ax: ! do j = 1, n ! y(1:m) = y(1:m) + A(1:m,j)*x(j) ! enddo call dgemv('n', m, n, 1.d0, A(1,1), n, x(1), 1, 1.d0, y(1), 1) ! y(1:m) = y(1:m) + matmul(A(1:m,1:n),x) ! NOT optimal ! Receive yb from the p-1 slave processors and assemble the new y: do i = 1, p-1 call MPI_Recv(yb(1), m, dp, mpi_any_source, 1, comm, status, ierror) j = status(mpi_source)*m + 1 y(j:j+m-1) = yb(1:m) enddo endif ! ******************** end master section **************************** ! ******************** begin slave section, i.e. rank > 0 ************ if (rank > 0) then ! Receive temp from processor 0: call mpi_recv(temp(1,1), m*(n+1), dp, 0, 1, comm, status, ierror) ! Calculate the new y value in yb(1:m). ! yb(1:m) = temp(1:m,n+1) ! do j = 1, n ! yb(1:m) = yb(1:m) + temp(1:m,j)*x(j) ! enddo call dgemv('n', m, n, 1.d0, temp(1,1), m, x, 1, 1.d0, temp(1,n+1), 1) ! yb = yb + matmul(temp,x) ! should be optimal ! Send yb to the master processor: call mpi_send(temp(1,n+1), m, dp, 0, 1, comm, ierror) endif ! *********************** end slave section *************************** array_time(ktrial) = mpi_wtime() - t call mpi_barrier(comm,ierror) flush(ktrial) = flush(ktrial) + y(ktrial) ! Check for errors when ktrial=1: if (ktrial == 1) then if (rank == 0) then print *,'Max absolute error = ', maxval(abs(y-z)) endif endif enddo ! Take the max values of the local arrays array_time ! and place on processor 0: call mpi_reduce(array_time, max_time, ntrial, dp, & mpi_max, 0, comm, ierror) ! ********************* begin master **************************** if (rank == 0) then print *,' ' print *,'Time in Seconds MFLOPS' array_perf = dble(2*n*n)*(1.0d-6)/max_time do ktrial = 1, ntrial print *,max_time(ktrial),' ',array_perf(ktrial) enddo print *,' ' print *,' ' endif ! ******************** end master ********************************** ! To eliminate the possibility of an optimizing compiler ! removing code from the above timing loop.  The barrier makes ! the following print statement execute after the above print ! statements. call mpi_barrier(mpi_comm_world,ierror) if ( rank == 0 ) then print *, 'flush(1) + max_time(1) = ', flush(1)+max_time(1) endif call mpi_finalize(ierror) endprogram !!$vincent% mpirun -np 4 hw13.exe !!$ !!$ this program assumes p divides n: !!$ p = 4 and n = 500 !!$ Max absolute error = 2.41584530158434D-013 !!$ !!$ Time in Seconds MFLOPS !!$ 2.37500667572021D-002 21.0525724037545 !!$ 1.01819038391113D-002 49.1067297335269 !!$ 1.00390911102295D-002 49.8053055311468 !!$ 9.98115539550781D-003 50.0944009172559 !!$ 9.94086265563965D-003 50.2974457368989 !!$ 9.96208190917969D-003 50.1903120811794 !!$ 9.91582870483398D-003 50.4244289492666 !!$ 1.00510120391846D-002 49.7462343145860 !!$ 9.97805595397949D-003 50.1099615301904 !!$ 1.00550651550293D-002 49.7261820078722 !!$ !!$ !!$ flush(1) + max_time(1) = 117.207267615063 !!$vincent%