!_______________________________________________________________________________
! compilation:
!   $ pgf95d prob4_w_issue
! where the alias "pgf95d" is defined for your use on art-1 as:
!   pgf95 -mp -Mcuda=fastmath,cc35,cc50,cc60,fma,unroll,flushz,lineinfo -ta=nvidia \
!         -tp=haswell -fast -Mcudalib=curand -O2 -Minfo=par -mcmodel=medium       \
!         -I$DISLIN/pgf -ldislin !*.f95 -o !*.x -L/usr/local/cuda-9.2/lib64       \
!         -lcufft -lcupti
!_______________________________________________________________________________

module mydata
  use cudafor
  implicit none
  integer, parameter :: N = 1024, M = 1024   ! sizes of the 2-D array
  real         :: grid(0:N+1,0:M+1)          ! grid allocated on the CPU (host)
  real, device :: dgrid(0:N+1,0:M+1)         ! grid allocated on the GPU (device)

contains

  ! CUDA kernel
  attributes(global) subroutine Laplace_gpu (it)
    use cudafor
    integer, value :: it
    integer :: i, j
    i = threadIdx%x                          ! i = 1...N (one thread per point in a row)
    j = blockIdx%x                           ! j = 1...M (one block per row)
    dgrid(i,j) = (dgrid(i,j) + dgrid(i-1,j) + dgrid(i+1,j) + &
                  dgrid(i,j-1) + dgrid(i,j+1))/5.
  end subroutine

  ! array-slice version with OpenMP over rows
  subroutine Laplace_cpu_sl (it)
    integer :: i, it
    !$omp parallel do num_threads(12)
    do i = 1,N
      grid(i,1:M) = (grid(i,1:M) + grid(i,0:M-1) + grid(i,2:M+1) + &
                     grid(i-1,1:M) + grid(i+1,1:M))/5.
    end do
  end subroutine

  ! OpenMP version with explicit loops
  subroutine Laplace_cpu_omp (it)
    integer :: i, j, it
    !$omp parallel do collapse(2) num_threads(12)
    do j = 1,M
      do i = 1,N
        grid(i,j) = ( grid(i,j) + grid(i-1,j) + grid(i+1,j) + &
                      grid(i,j-1) + grid(i,j+1) )/5.
      end do
    end do
  end subroutine

  ! straight PGI-Fortran-compiled version for the CPU
  subroutine Laplace_cpu (it)
    integer :: i, j, it
    do j = 1,M
      do i = 1,N
        grid(i,j) = ( grid(i,j) + grid(i-1,j) + grid(i+1,j) + &
                      grid(i,j-1) + grid(i,j+1) )/5.
      end do
    end do
  end subroutine

  subroutine initialize_grid ()
    ! initialize the grid with the same arbitrary values for every trial
    grid = 0.1                               ! all values set to 0.1
    grid(502:522, 500:524) = 1.              ! some values near the center set to 1.
  end subroutine

end module mydata

!_______________________________________________
!
! main program
!_______________________________________________
program smooth
  use omp_lib
  use mydata
  implicit none
  real    :: g
  integer :: iter

  ! CUDA kernel
  if (cudaSetDevice(1) /= 0) print*,'dev not set!'        ! choosing GPU 1
  call initialize_grid()
  dgrid = grid                                            ! transfer grid to device
  do iter = -1, 10
    call Laplace_gpu <<<M, N>>> (iter)                    ! M blocks of N threads
    if (cudaDeviceSynchronize() /= 0) print*,iter,' nonsync'   ! this line is ok
  end do  ! iter
  grid = dgrid                                            ! transfer results to host
  g = sum(grid(508:512,502:512))            ! a measure of the results on part of the array
  print*,' result =',g,' CUDA kernel'

  ! host code with OpenMP + slices
  call initialize_grid()
  do iter = -1, 10
    call Laplace_cpu_sl (iter)
  end do  ! iter
  g = sum(grid(508:512,502:512))            ! a measure of the results on part of the array
  print*,' result =',g,' host OMP slices'

  ! host code with OpenMP
  call initialize_grid()
  do iter = -1, 10
    call Laplace_cpu_omp (iter)
  end do  ! iter
  g = sum(grid(508:512,502:512))            ! a measure of the results on part of the array
  print*,' result =',g,' host OMP'

  ! straight host code without hand-coded OpenMP
  call initialize_grid()
  do iter = -1, 10
    call Laplace_cpu (iter)
  end do  ! iter
  g = sum(grid(508:512,502:512))            ! a measure of the results on part of the array
  print*,' result =',g,' host (CPU)'

end program smooth
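!_______________________________________________________________________________
! Note: illustrative sketch, not part of the original program.  The 'nonsync'
! test above only reports the integer code returned by cudaDeviceSynchronize.
! Assuming the cudaGetLastError / cudaGetErrorString interfaces provided by the
! cudafor module, the launch itself could also be checked right after the
! chevron call, with the error translated to text, e.g.:
!
!     istat = cudaGetLastError()                 ! error from the launch itself
!     if (istat /= cudaSuccess) print*,' launch: ', trim(cudaGetErrorString(istat))
!     istat = cudaDeviceSynchronize()            ! error raised during kernel execution
!     if (istat /= cudaSuccess) print*,' kernel: ', trim(cudaGetErrorString(istat))
!
! with "integer :: istat" declared alongside iter in the main program.
!_______________________________________________________________________________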