! This file is distributed under the terms of the ! GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt . Implicit real*8 (a-h,o-z) logical parallel integer count1,count2,count_rate,count_max parameter(parallel=.true.,ndat=16,ntime=10) parameter(itest=3) include 'mpif.h' dimension tuma(2),suma(2) ! distributed Fourier space wavefunction REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:,:,:,:) :: zf INTEGER, ALLOCATABLE, DIMENSION(:,:) :: keypdist ! distributed real space wavefunction REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:,:,:,:) :: zr ! distributed real space potential REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:,:) :: pot,rho !$ interface !$ integer ( kind=4 ) function omp_get_num_threads ( ) !$ end function omp_get_num_threads !$ end interface !$ interface !$ integer ( kind=4 ) function omp_get_thread_num ( ) !$ end function omp_get_thread_num !$ end interface INTEGER, DIMENSION(7,104) :: idata data ((idata(i,j),i=1,7),j=1,104) / & 3, 3, 1, 1, 1, 1, 1, 4, 4, 1, 1, 1, 1, 1, & 5, 5, 1, 1, 1, 1, 1, 6, 6, 1, 1, 1, 1, 1, & 8, 8, 1, 1, 1, 1, 1, 9, 3, 3, 1, 1, 1, 1, & 10, 5, 2, 1, 1, 1, 1, & 12, 4, 3, 1, 1, 1, 1, 15, 5, 3, 1, 1, 1, 1, & 16, 4, 4, 1, 1, 1, 1, 18, 6, 3, 1, 1, 1, 1, & 20, 5, 4, 1, 1, 1, 1, 24, 8, 3, 1, 1, 1, 1, & 25, 5, 5, 1, 1, 1, 1, 27, 3, 3, 3, 1, 1, 1, & 30, 6, 5, 1, 1, 1, 1, 32, 8, 4, 1, 1, 1, 1, & 36, 4, 3, 3, 1, 1, 1, 40, 8, 5, 1, 1, 1, 1, & 45, 5, 3, 3, 1, 1, 1, 48, 4, 4, 3, 1, 1, 1, & 50, 5, 5, 2, 1, 1, 1, & 54, 6, 3, 3, 1, 1, 1, 60, 5, 4, 3, 1, 1, 1, & 64, 8, 8, 1, 1, 1, 1, 72, 8, 3, 3, 1, 1, 1, & 75, 5, 5, 3, 1, 1, 1, 80, 5, 4, 4, 1, 1, 1, & 81, 3, 3, 3, 3, 1, 1, 90, 6, 5, 3, 1, 1, 1, & 96, 8, 4, 3, 1, 1, 1, 100, 5, 5, 4, 1, 1, 1, & 108, 4, 3, 3, 3, 1, 1, 120, 8, 5, 3, 1, 1, 1, & 125, 5, 5, 5, 1, 1, 1, 128, 8, 4, 4, 1, 1, 1, & 135, 5, 3, 3, 3, 1, 1, 144, 6, 8, 3, 1, 1, 1, & 150, 6, 5, 5, 1, 1, 1, 160, 8, 5, 4, 1, 1, 1, & 162, 6, 3, 3, 3, 1, 1, 180, 5, 4, 3, 3, 1, 1, & 192, 6, 8, 4, 1, 1, 1, 200, 8, 5, 5, 1, 1, 1, & 216, 8, 3, 3, 3, 1, 1, 225, 5, 5, 3, 3, 1, 1, & 240, 6, 8, 5, 1, 1, 1, 243, 3, 3, 3, 3, 3, 1, & 250, 5, 5, 5, 2, 1, 1, & 256, 8, 8, 4, 1, 1, 1, 270, 6, 5, 3, 3, 1, 1, & 288, 8, 4, 3, 3, 1, 1, 300, 5, 5, 4, 3, 1, 1, & 320, 5, 4, 4, 4, 1, 1, 324, 4, 3, 3, 3, 3, 1, & 360, 8, 5, 3, 3, 1, 1, 375, 5, 5, 5, 3, 1, 1, & 384, 8, 4, 4, 3, 1, 1, 400, 5, 5, 4, 4, 1, 1, & 405, 5, 3, 3, 3, 3, 1, 432, 4, 4, 3, 3, 3, 1, & 450, 6, 5, 5, 3, 1, 1, 480, 8, 5, 4, 3, 1, 1, & 486, 6, 3, 3, 3, 3, 1, 500, 5, 5, 5, 4, 1, 1, & 512, 8, 8, 8, 1, 1, 1, 540, 5, 4, 3, 3, 3, 1, & 576, 4, 4, 4, 3, 3, 1, 600, 8, 5, 5, 3, 1, 1, & 625, 5, 5, 5, 5, 1, 1, 640, 8, 5, 4, 4, 1, 1, & 648, 8, 3, 3, 3, 3, 1, 675, 5, 5, 3, 3, 3, 1, & 720, 5, 4, 4, 3, 3, 1, 729, 3, 3, 3, 3, 3, 3, & 750, 6, 5, 5, 5, 1, 1, 768, 4, 4, 4, 4, 3, 1, & 800, 8, 5, 5, 4, 1, 1, 810, 6, 5, 3, 3, 3, 1, & 864, 8, 4, 3, 3, 3, 1, 900, 5, 5, 4, 3, 3, 1, & 960, 5, 4, 4, 4, 3, 1, 972, 4, 3, 3, 3, 3, 3, & 1000, 8, 5, 5, 5, 1, 1, 1024, 4, 4, 4, 4, 4, 1, & 1080, 8, 5, 3, 3, 3, 1, & 1152, 8, 4, 4, 3, 3, 1, 1200, 5, 5, 4, 4, 3, 1, & 1250, 5, 5, 5, 5, 2, 1, 1280, 5, 4, 4, 4, 4, 1, & 1296, 4, 4, 3, 3, 3, 3, 1350, 6, 5, 5, 3, 3, 1, & 1440, 6, 5, 4, 4, 3, 1, 1458, 6, 3, 3, 3, 3, 3, & 1500, 5, 5, 5, 4, 3, 1, 1536, 8, 4, 4, 4, 3, 1, & 1600, 8, 8, 5, 5, 1, 1, 1620, 5, 4, 3, 3, 3, 3, & 1728, 8, 8, 3, 3, 3, 1, 1800, 8, 5, 5, 3, 3, 1, & 1920, 8, 5, 4, 4, 3, 1, 1944, 8, 3, 3, 3, 3, 3, & 2000, 5, 5, 5, 4, 4, 1, 2048, 8, 4, 4, 4, 4, 1 / ! Start MPI in parallel version if (parallel) then call MPI_INIT(ierr) call MPI_COMM_RANK(MPI_COMM_WORLD,iproc,ierr) call MPI_COMM_SIZE(MPI_COMM_WORLD,nproc,ierr) ! write(6,*) 'mpi started',iproc,nproc else nproc=1 iproc=0 endif ! Start OpenMP npr=1 !$omp parallel default(private) shared(npr) !$ iam=omp_get_thread_num() !$ if (iam.eq.0) npr=omp_get_num_threads() !$omp end parallel if (iproc.eq.0) then open(unit=66,file='results',status='unknown') write(66,*) ' nproc ',nproc,' nthreads ',npr write(66,*) ' ndat ',ndat,' ntime ',ntime endif ! Dimension m1,m2,m3 of the Fourier space cube ! (Real space dimensions n1,n2,n3 are twice as large) ! There are 3 ways to set the dimension of the FFT. ! Uncomment the one you want !! 1) Hard coded dimensions m1=64 ; m2=64 ; m3=64 !! 2) Read dimensions ! if (iproc.eq.0) then ! write(6,*) 'm1=? , m2=? , m3=?' ! read*, m1 , m2 , m3 ! endif ! if (parallel) then ! call MPI_BCAST(m1,1,MPI_INTEGER,0,MPI_COMM_WORLD,ierr) ! call MPI_BCAST(m2,1,MPI_INTEGER,0,MPI_COMM_WORLD,ierr) ! call MPI_BCAST(m3,1,MPI_INTEGER,0,MPI_COMM_WORLD,ierr) ! write(6,*) iproc, m1 , m2 , m3 ! endif !! 3) loop over all possible dimensions ! do 9999,k1=1,104 ! m1=idata(1,k1) ! do 9999,k2=1,104 ! m2=idata(1,k2) ! do 9999,k3=1,104 ! m3=idata(1,k3) ! if (m1*m2*m3.le.64**3 .and. max(m1,m2,m3).le.128) then ! if (iproc.eq.0) then ! write(6,*) '' ! write(6,*) 'k1,k2,k3',k1,k2,k3 ! write(6,*) 'm1,m2,m3',m1,m2,m3 ! write(6,*) '' ! endif ! Fourier space grid dimensions (suitable for number of processors) md1=m1+0; md3=m3+0 md2=m2+0 150 if (nproc*(md2/nproc).lt.m2) then md2=md2+1 goto 150 endif ! real space grid dimension (suitable for number of processors) if (itest.eq.1) then n1=m1; n2=m2; n3=m3 else n1=2*m1; n2=2*m2; n3=2*m3 endif ! add buffer to avoid cache trashing nd1=n1+0; nd2=n2+0 nd3=n3+0 250 if (nproc*(nd3/nproc).lt.n3) then nd3=nd3+1 goto 250 endif if (itest.eq.1) then nd2=md2 md3=nd3 md1=nd1 endif if (itest.eq.1) then if (iproc.eq.0) write(6,*) 'testing ordinary FFT' allocate(zr(2,nd1,nd2,nd3/nproc,ndat),zf(2,nd1,nd3,nd2/nproc,ndat)) else if (itest.eq.2) then if (iproc.eq.0) write(6,*) 'testing wavefunction FFT' allocate(zr(2,nd1,nd2,nd3/nproc,ndat),zf(2,md1,md3,md2/nproc,ndat)) else if (itest.eq.3) then if (iproc.eq.0) write(6,*) 'testing APPLYPOT' allocate(zf(2,md1,md3,md2/nproc,ndat),pot(nd1,nd2,nd3/nproc)) do 68,i3=1,nd3/nproc do 68,i2=1,nd2 do 68,i1=1,nd1 68 pot(i1,i2,i3)=1.d0 else if (itest.eq.4) then if (iproc.eq.0) write(6,*) 'testing Accumulation of rho' allocate(zf(2,md1,md3,md2/nproc,ndat),rho(nd1,nd2,nd3/nproc)) else if (iproc.eq.0) write(6,*) 'UNDEFINED itest' if (parallel) call MPI_ABORT(MPI_COMM_WORLD,ierr) stop endif if (itest.ge.2) then allocate(keypdist(2,n2/2)) do i2=1,n2/2 ! linear distribution ! keypdist(1,i2)=(i2-1)/(md2/nproc)+1 ! keypdist(2,i2)=mod(i2-1,md2/nproc)+1 ! modulo distribution keypdist(1,i2)=mod(i2-1,nproc)+1 keypdist(2,i2)=(i2-1)/nproc+1 if (iproc.eq.0) write(6,*) 'i2,keypdis',i2,keypdist(1,i2),keypdist(2,i2) enddo endif if (iproc.eq.0) then Print*,' ' Print*,' ' 46 format(a,3(2x,i4)) write(6,46) 'FFT TEST for n1 ,n2 ,n3 =',n1,n2,n3 write(6,46) ' nd1 ,nd2 ,nd3 =',nd1,nd2,nd3 write(6,46) ' md1 ,md2 ,md3 =',md1,md2,md3 write(6,*) ' nproc ',nproc,' nthreads ',npr write(6,*) ' ndat ',ndat,' ntime ',ntime Print*,' ' endif do idat=1,ndat if (itest.eq.1) then call initf1(idat,m1,m2,m3,md1,md2,md3,iproc,nproc,zf(1,1,1,1,idat)) else call initf(idat,m1,m2,m3,md1,md2,md3,iproc,nproc,keypdist,zf(1,1,1,1,idat)) endif enddo call MPI_BARRIER(MPI_COMM_WORLD,ierr) !Correctness if (itest.eq.1) then call back(ndat,n1,n2,n3,nd1,nd2,nd3,nproc,iproc,zf,zr) ! call slice(nproc,iproc,n1,n2,n3,n1,n2,n3,nd1,nd2,nd3,nd1,nd2,nd3,zf,zr) call forw(ndat,n1,n2,n3,nd1,nd2,nd3,nproc,iproc,zr,zf) else if (itest.eq.2) then call back_wf(ndat,n1,n2,n3,nd1,nd2,nd3,md1,md2,md3,nproc,iproc,keypdist,zf,zr) write(6,*) 'BACK finished' ! call slice(nproc,iproc,m1,m2,m3,n1,n2,n3,md1,md2,md3,nd1,nd2,nd3,zf,zr) call forw_wf(ndat,n1,n2,n3,nd1,nd2,nd3,md1,md2,md3,nproc,iproc,keypdist,zr,zf) else if (itest.eq.3) then call applypot(ndat,n1,n2,n3,nd1,nd2,nd3,md1,md2,md3,nproc,iproc,pot,keypdist,zf) else if (itest.eq.4) then call accrho(ndat,n1,n2,n3,nd1,nd2,nd3,md1,md2,md3,nproc,iproc,keypdist,zf,rho) endif scal=1.d0/(n1*n2*n3) if (itest.lt.4) then do idat=1,ndat if (itest.eq.1) then call vgl1(idat,scal,m1,m2,m3,md1,md2,md3,iproc,nproc,zf(1,1,1,1,idat),tum) else call vgl(idat,scal,m1,m2,m3,md1,md2,md3,iproc,nproc,keypdist,zf(1,1,1,1,idat),tum) endif if (nproc.gt.1) then call MPI_REDUCE(tum,sum,1,MPI_double_precision, & MPI_SUM,0,MPI_COMM_WORLD,ierr) else sum=tum endif if (iproc.eq.0) then write(6,*) 'error back-forward',idat,sqrt(sum) if (sqrt(sum).gt.1.d-10) then write(6,*) 'ALGEBRAIC ERROR' if (parallel) call MPI_ABORT(MPI_COMM_WORLD,ierr) stop endif endif enddo else call fnorm(ndat,m1,m2,m3,md1,md2,md3,iproc,nproc,keypdist,zf,tuma(1)) call rnorm(n1,n2,n3,nd1,nd2,nd3,iproc,nproc,rho,tuma(2)) if (nproc.gt.1) then call MPI_REDUCE(tuma,suma,2,MPI_double_precision, & MPI_SUM,0,MPI_COMM_WORLD,ierr) else suma(1)=tuma(1) suma(2)=tuma(2) endif if (iproc.eq.0) then tt=abs(1.d0-scal*suma(2)/suma(1)) write(6,*) 'rel DIFF. FOURIER, REAL NORM',tt if (tt.gt.1.d-10) then write(6,*) 'ALGEBRAIC ERROR' if (parallel) call MPI_ABORT(MPI_COMM_WORLD,ierr) stop endif endif endif if (nproc.gt.1) call MPI_BARRIER(MPI_COMM_WORLD,ierr) ! Timings call cpu_time(t1) call system_clock(count1,count_rate,count_max) if (itest.eq.1) then do irep=1,ntime call back(ndat,n1,n2,n3,nd1,nd2,nd3,nproc,iproc,zf,zr) call forw(ndat,n1,n2,n3,nd1,nd2,nd3,nproc,iproc,zr,zf) enddo else if (itest.eq.2) then do irep=1,ntime call back_wf(ndat,n1,n2,n3,nd1,nd2,nd3,md1,md2,md3,nproc,iproc,keypdist,zf,zr) call forw_wf(ndat,n1,n2,n3,nd1,nd2,nd3,md1,md2,md3,nproc,iproc,keypdist,zr,zf) enddo else if (itest.eq.3) then do irep=1,ntime call applypot(ndat,n1,n2,n3,nd1,nd2,nd3,md1,md2,md3,nproc,iproc,pot,keypdist,zf) enddo else if (itest.eq.4) then do irep=1,ntime call accrho(ndat,n1,n2,n3,nd1,nd2,nd3,md1,md2,md3,nproc,iproc,keypdist,zf,rho) enddo endif call cpu_time(t2) call system_clock(count2,count_rate,count_max) time=(t2-t1)/(ntime*ndat) tela=(count2-count1)/(float(count_rate)*ntime*ndat) write(6,'(a,x,i4,2(x,e10.3))') 'iproc, CPU, elapsed time for forw + backw', & iproc,time,tela if (time*tela.ne.0.d0) then if (itest.eq.1) then flops=10.d0*(n1*n2*n3)* & (log(1.d0*n2)/log(2.d0)+log(1.d0*n1)/log(2.d0)+log(1.d0*n3)/log(2.d0)) else if (itest.eq.2) then flops=(10.d0/4.d0)*(n1*n2*n3)* & (log(1.d0*n2)/log(2.d0)+2*log(1.d0*n1)/log(2.d0)+4*log(1.d0*n3)/log(2.d0)) else if (itest.eq.3) then flops=2*n1*n2*n3 + (10.d0/4.d0)*(n1*n2*n3)* & (log(1.d0*n2)/log(2.d0)+2*log(1.d0*n1)/log(2.d0)+4*log(1.d0*n3)/log(2.d0)) else if (itest.eq.4) then flops=4*n1*n2*n3 + (5.d0/4.d0)*(n1*n2*n3)* & (log(1.d0*n2)/log(2.d0)+2*log(1.d0*n1)/log(2.d0)+4*log(1.d0*n3)/log(2.d0)) endif if (iproc.eq.0) write(6,'(a,x,i4,(x,e12.5))') & 'iproc, Flops forw + backw',iproc,flops if (iproc.eq.0) write(6,'(a,2(x,f10.0))') 'speed (Mflops)', & 1.d-6*flops/time,1.d-6*flops/tela if (iproc.eq.0) write(66,'(i4,3(x,i4),2(x,e10.3),(x,f10.0))') & iproc,m1,m2,m3,time,tela,1.d-6*flops/tela endif if (itest.eq.1) then deallocate(zr,zf) else if (itest.eq.2) then deallocate(zr,zf,keypdist) else if (itest.eq.3) then deallocate(zf,pot,keypdist) else if (itest.eq.4) then deallocate(zf,rho,keypdist) endif ! endif !9999 continue if (parallel) call MPI_FINALIZE(ierr) end subroutine initf(idat,m1,m2,m3,md1,md2,md3,iproc,nproc,keypdist,zf) implicit real*8 (a-h,o-z) dimension zf(2,md1,md3,md2/nproc),keypdist(2,m2) call zero(2*md1*md2*md3/nproc,zf) do 9766,j2=1,m2 kp2=keypdist(1,j2) k2=keypdist(2,j2) if (kp2-1.eq.iproc) then do 9763,j3=1,m3 do 9763,j1=1,m1 zf(1,j1,j3,k2) =sin(1.2d0*j1+2.3d0*j2+3.4d0*j3)*sqrt(.5d0*idat) zf(2,j1,j3,k2) =sin(5.6d0*j1+7.8d0*j2+9.0d0*j3) 9763 continue endif 9766 continue return end subroutine initf1(idat,m1,m2,m3,md1,md2,md3,iproc,nproc,zf) implicit real*8 (a-h,o-z) dimension zf(2,0:md1-1,0:md3-1,0:md2/nproc-1) do 9766,jp2=0,md2/nproc-1 j2=iproc*(md2/nproc)+jp2 if (j2.le.m2-1) then do 9763,j3=0,m3-1 do 9763,j1=0,m1-1 zf(1,j1,j3,jp2) =sin(1.2d0*j1+2.3d0*j2+3.4d0*j3)*sqrt(.5d0*idat) zf(2,j1,j3,jp2) =sin(5.6d0*j1+7.8d0*j2+9.0d0*j3) 9763 continue endif 9766 continue return end subroutine vgl(idat,scal,m1,m2,m3,md1,md2,md3,iproc,nproc,keypdist,zf,sum) implicit real*8 (a-h,o-z) dimension zf(2,md1,md3,md2/nproc),keypdist(2,m2) sum=0.d0 do 9766,j2=1,m2 kp2=keypdist(1,j2) k2=keypdist(2,j2) if (kp2-1.eq.iproc) then do 9763,j3=1,m3 do 9763,j1=1,m1 t1=sin(1.2d0*j1+2.3d0*j2+3.4d0*j3)*sqrt(.5d0*idat) t2=sin(5.6d0*j1+7.8d0*j2+9.0d0*j3) diff=(scal*zf(1,j1,j3,k2)-t1)**2+(scal*zf(2,j1,j3,k2)-t2)**2 sum=sum+diff if (diff.gt.1.d-10) write(6,'(3(i3),6(x,e12.5))') & j1,j3,k2,scal*zf(1,j1,j3,k2),t1,scal*zf(2,j1,j3,k2),t2 9763 continue endif 9766 continue return end subroutine vgl1(idat,scal,m1,m2,m3,md1,md2,md3,iproc,nproc,zf,sum) implicit real*8 (a-h,o-z) dimension zf(2,0:md1-1,0:md3-1,0:md2/nproc-1) sum=0.d0 do 9766,jp2=0,md2/nproc-1 j2=iproc*(md2/nproc)+jp2 if (j2.le.m2-1) then do 9763,j3=0,m3-1 do 9763,j1=0,m1-1 t1=sin(1.2d0*j1+2.3d0*j2+3.4d0*j3)*sqrt(.5d0*idat) t2=sin(5.6d0*j1+7.8d0*j2+9.0d0*j3) diff=(scal*zf(1,j1,j3,jp2)-t1)**2+(scal*zf(2,j1,j3,jp2)-t2)**2 sum=sum+diff if (diff.gt.1.d-10) write(6,'(3(i3),6(x,e12.5))') & j1,j3,jp2,scal*zf(1,j1,j3,jp2),t1,scal*zf(2,j1,j3,jp2),t2 9763 continue endif 9766 continue return end subroutine rnorm(n1,n2,n3,nd1,nd2,nd3,iproc,nproc,rho,sum) implicit real*8 (a-h,o-z) dimension rho(nd1,nd2,nd3/nproc) sum=0.d0 do 9766,jp3=1,nd3/nproc j3=iproc*(nd3/nproc)+jp3 if (j3.le.n3) then do 9763,j2=1,n2 do 9763,j1=1,n1 sum=sum+rho(j1,j2,jp3) 9763 continue endif 9766 continue return end subroutine fnorm(ndat,m1,m2,m3,md1,md2,md3,iproc,nproc,keypdist,zf,sum) implicit real*8 (a-h,o-z) dimension zf(2,md1,md3,md2/nproc,ndat),keypdist(2,m2) sum=0.d0 do 9766,idat=1,ndat do 9766,j2=1,m2 kp2=keypdist(1,j2) k2=keypdist(2,j2) if (kp2-1.eq.iproc) then do 9763,j3=1,m3 do 9763,j1=1,m1 sum=sum+zf(1,j1,j3,k2,idat)**2+zf(2,j1,j3,k2,idat)**2 9763 continue endif 9766 continue return end ! FFT PART ----------------------------------------------------------------- subroutine back(ndat,n1,n2,n3,nd1,nd2,nd3,nproc,iproc,zf,zr) ! Adopt standard convention that isign=1 for backward transform ! CALCULATES THE DISCRETE FOURIERTRANSFORM ZF(I1,I2,I3)= ! S_(j1,j2,j3) EXP(isign*i*2*pi*(j1*i1/n1+j2*i2/n2+j3*i3/n3)) ZR(j1,j2,j3) ! in parallel using MPI/OpenMP and BLAS library calls. ! ! INPUT: ! ZF: input array ! real(F(i1,i2,i3,idat))=ZF(1,i1,i2,i3,idat) ! imag(F(i1,i2,i3,idat))=ZF(2,i1,i2,i3,idat) ! i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat ! OUTPUT: ! ZR: output array ! ZR(1,i1,i2,i3,idat)=real(R(i1,i2,i3,idat)) ! ZR(2,i1,i2,i3,idat)=imag(R(i1,i2,i3,idat)) ! i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat ! nproc: number of processors used as returned by MPI_COMM_SIZE ! iproc: [0:nproc-1] number of processor as returned by MPI_COMM_RANK ! n1,n2,n3: logical dimension of the transform. As transform lengths ! most products of the prime factors 2,3,5 are allowed. ! The detailed table with allowed transform lengths can ! be found in subroutine CTRIG ! nd1,nd2,nd3: Dimension of ZF and ZR ! ! PERFORMANCE CONSIDERATIONS: ! The maximum number of processors that can reasonably be used is max(n2,n3) ! It is very important to find the optimal ! value of NCACHE. NCACHE determines the size of the work array ZW, that ! has to fit into cache. It has therefore to be chosen to equal roughly ! half the size of the physical cache in units of real*8 numbers. ! The optimal value of ncache can easily be determined by numerical ! experimentation. A too large value of ncache leads to a dramatic ! and sudden decrease of performance, a too small value to a to a ! slow and less dramatic decrease of performance. If NCACHE is set ! to a value so small, that not even a single one dimensional transform ! can be done in the workarray zw, the program stops with an error message. ! ! RESTRICTIONS on USAGE ! Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994 ! Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999 ! Copyright (C) 2002 Stefan Goedecker, CEA Grenoble ! This file is distributed under the terms of the ! GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt . implicit real*8 (a-h,o-z) include 'mpif.h' ! real space input REAL(KIND=8), DIMENSION(2,nd1,nd2,nd3/nproc,ndat) :: zr ! Fourier space output REAL(KIND=8), DIMENSION(2,nd1,nd3,nd2/nproc,ndat) :: zf ! work arrays for transpositions REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:,:) :: zt ! work arrays for MPI REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:,:,:,:) :: zmpi1 REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:,:,:) :: zmpi2 ! cache work array REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:,:) :: zw ! FFT work arrays REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:) :: trig1,trig2,trig3 INTEGER, ALLOCATABLE, DIMENSION(:) :: after1,now1,before1, & after2,now2,before2,after3,now3,before3 ! buffer arrays REAL(KIND=8), ALLOCATABLE, DIMENSION(:) :: buf1,buf2,buf3 !$ interface !$ integer ( kind=4 ) function omp_get_num_threads ( ) !$ end function omp_get_num_threads !$ end interface !$ interface !$ integer ( kind=4 ) function omp_get_thread_num ( ) !$ end function omp_get_thread_num !$ end interface ! find cache size that gives optimal performance on machine ncache=4*1024 ! check input if (nd1.lt.n1) stop 'ERROR:nd1' if (nd2.lt.n2) stop 'ERROR:nd2' if (nd3.lt.n3) stop 'ERROR:nd3' if (mod(nd3,nproc).ne.0) stop 'ERROR:nd3' if (mod(nd2,nproc).ne.0) stop 'ERROR:nd2' lock=0 !$omp parallel default(private) & !$omp shared(ndat,n1,n2,n3,nd1,nd2,nd3,iproc,nproc,ncache,zr,zf,lock) iam=0 npr=1 !$ iam=omp_get_thread_num() !$ npr=omp_get_num_threads() ! pagesize + L2 cacheline size + L1 cacheline size nbuf=1024+8+4 lzt=n2 if (mod(n2,2).eq.0) lzt=lzt+1 if (mod(n2,4).eq.0) lzt=lzt+1 !$omp critical allocate(trig1(2,2048),after1(7),now1(7),before1(7), & trig2(2,2048),after2(7),now2(7),before2(7), & trig3(2,2048),after3(7),now3(7),before3(7), & zw(2,ncache/4,2),buf1(nbuf),zt(2,lzt,n1), & buf2(nbuf),zmpi2(2,n1,nd2/nproc,nd3),buf3(nbuf)) if (nproc.gt.1) allocate(zmpi1(2,n1,nd2/nproc,nd3/nproc,nproc)) !$omp end critical call ctrig(n3,trig3,after3,before3,now3,1,ic3) call ctrig(n1,trig1,after1,before1,now1,1,ic1) call ctrig(n2,trig2,after2,before2,now2,1,ic2) !$omp do do 12345,idat=1,ndat ! transform along z axis ! input: I1,I3,J2,(Jp2) lot=ncache/(4*n3) if (lot.lt.1) stop 'enlarge ncache for z' do 3333,j2=1,nd2/nproc ! if (iproc*(nd2/nproc)+j2.le.n2) then do 3000,i1=1,n1,lot ma=i1 mb=min(i1+(lot-1),n1) nfft=mb-ma+1 ! input: I1,I3,J2,(Jp2) call fill(nd1,nd3,lot,nfft,n3,zf(1,i1,1,j2,idat),zw(1,1,1)) inzee=1 do i=1,ic3 call fftstp(lot,nfft,n3,lot,n3,zw(1,1,inzee),zw(1,1,3-inzee), & trig3,after3(i),now3(i),before3(i),1) inzee=3-inzee enddo ! input: I1,i3,J2,(Jp2) call scramble(i1,j2,lot,nfft,n1,n3,nd2,nproc,nd3,zw(1,1,inzee),zmpi2) ! output: I1,J2,i3,(Jp2) 3000 continue ! endif 3333 continue ! Interprocessor data transposition ! input: I1,J2,j3,jp3,(Jp2) if (nproc.gt.1) then 11 continue !$omp flush(lock) if (mod(lock,npr).ne.iam) goto 11 call MPI_ALLTOALL(zmpi2,2*n1*(nd2/nproc)*(nd3/nproc), & MPI_double_precision, & zmpi1,2*n1*(nd2/nproc)*(nd3/nproc), & MPI_double_precision,MPI_COMM_WORLD,ierr) lock=lock+1 !$omp flush(lock) ! output: I1,J2,j3,Jp2,(jp3) endif do 1212,j3=1,nd3/nproc if (iproc*(nd3/nproc)+j3.le.n3) then Jp2st=1 J2st=1 ! transform along x axis lot=ncache/(4*n1) if (lot.lt.1) stop 'enlarge ncache for x' do 1000,j=1,n2,lot ma=j mb=min(j+(lot-1),n2) nfft=mb-ma+1 ! input: I1,J2,j3,Jp2,(jp3) if (nproc.eq.1) then call mpiswitch(j3,nfft,Jp2st,J2st,lot,n1,nd2,nd3,nproc,zmpi2,zw(1,1,1)) else call mpiswitch(j3,nfft,Jp2st,J2st,lot,n1,nd2,nd3,nproc,zmpi1,zw(1,1,1)) endif ! output: J2,Jp2,I1,j3,(jp3) ! input: I2,I1,j3,(jp3) inzee=1 do i=1,ic1-1 call fftstp(lot,nfft,n1,lot,n1,zw(1,1,inzee),zw(1,1,3-inzee), & trig1,after1(i),now1(i),before1(i),1) inzee=3-inzee enddo i=ic1 call fftstp(lot,nfft,n1,lzt,n1,zw(1,1,inzee),zt(1,j,1), & trig1,after1(i),now1(i),before1(i),1) ! output: I2,i1,j3,(jp3) 1000 continue ! transform along y axis lot=ncache/(4*n2) if (lot.lt.1) stop 'enlarge ncache for y' do 2000,j=1,n1,lot ma=j mb=min(j+(lot-1),n1) nfft=mb-ma+1 ! input: I2,i1,j3,(jp3) call switch(nfft,n2,lot,n1,lzt,zt(1,1,j),zw(1,1,1)) ! output: i1,I2,j3,(jp3) inzee=1 do i=1,ic2-1 call fftstp(lot,nfft,n2,lot,n2,zw(1,1,inzee),zw(1,1,3-inzee), & trig2,after2(i),now2(i),before2(i),1) inzee=3-inzee enddo i=ic2 call fftstp(lot,nfft,n2,nd1,nd2,zw(1,1,inzee),zr(1,j,1,j3,idat), & trig2,after2(i),now2(i),before2(i),1) 2000 continue ! output: i1,i2,j3,(jp3) endif 1212 continue 12345 continue !$omp enddo deallocate(trig1,after1,now1,before1, & trig2,after2,now2,before2, & trig3,after3,now3,before3, & buf1,zmpi2,buf2,zw,buf3,zt) if (nproc.gt.1) deallocate(zmpi1) !$omp end parallel return end subroutine switch(nfft,n2,lot,n1,lzt,zt,zw) implicit real*8 (a-h,o-z) dimension zw(2,lot,n2),zt(2,lzt,n1) do 200,j=1,nfft do 100,i=1,n2 zw(1,j,i)=zt(1,i,j) zw(2,j,i)=zt(2,i,j) 100 continue 200 continue return end subroutine mpiswitch(j3,nfft,Jp2st,J2st,lot,n1,nd2,nd3,nproc,zmpi1,zw) implicit real*8 (a-h,o-z) dimension zmpi1(2,n1,nd2/nproc,nd3/nproc,nproc),zw(2,lot,n1) mfft=0 do 300,Jp2=Jp2st,nproc do 200,J2=J2st,nd2/nproc mfft=mfft+1 if (mfft.gt.nfft) then Jp2st=Jp2 J2st=J2 return endif do 100,I1=1,n1 zw(1,mfft,I1)=zmpi1(1,I1,J2,j3,Jp2) zw(2,mfft,I1)=zmpi1(2,I1,J2,j3,Jp2) 100 continue 200 continue J2st=1 300 continue end subroutine fill(nd1,nd3,lot,nfft,n3,zf,zw) implicit real*8 (a-h,o-z) dimension zw(2,lot,n3),zf(2,nd1,nd3) do 100,i3=1,n3 do 100,i1=1,nfft zw(1,i1,i3)=zf(1,i1,i3) zw(2,i1,i3)=zf(2,i1,i3) 100 continue return end subroutine forw(ndat,n1,n2,n3,nd1,nd2,nd3,nproc,iproc,zr,zf) ! Adopt standard convention that isign=-1 for forward transform ! CALCULATES THE DISCRETE FOURIERTRANSFORM ZF(I1,I2,I3)= ! S_(j1,j2,j3) EXP(isign*i*2*pi*(j1*i1/n1+j2*i2/n2+j3*i3/n3)) ZR(j1,j2,j3) ! in parallel using MPI/OpenMP and BLAS library calls. ! INPUT: ! ZR: output array ! ZR(1,i1,i2,i3,idat)=real(R(i1,i2,i3,idat)) ! ZR(2,i1,i2,i3,idat)=imag(R(i1,i2,i3,idat)) ! i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat ! OUTPUT: ! ZF: input array ! real(F(i1,i2,i3,idat))=ZF(1,i1,i2,i3,idat) ! imag(F(i1,i2,i3,idat))=ZF(2,i1,i2,i3,idat) ! i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat ! nproc: number of processors used as returned by MPI_COMM_SIZE ! iproc: [0:nproc-1] number of processor as returned by MPI_COMM_RANK ! n1,n2,n3: logical dimension of the transform. As transform lengths ! most products of the prime factors 2,3,5 are allowed. ! The detailed table with allowed transform lengths can ! be found in subroutine CTRIG ! nd1,nd2,nd3: Dimension of ZR and ZF ! ! PERFORMANCE CONSIDERATIONS: ! The maximum number of processors that can reasonably be used is max(n2,n3) ! ! It is very important to find the optimal ! value of NCACHE. NCACHE determines the size of the work array ZW, that ! has to fit into cache. It has therefore to be chosen to equal roughly ! half the size of the physical cache in units of real*8 numbers. ! The optimal value of ncache can easily be determined by numerical ! experimentation. A too large value of ncache leads to a dramatic ! and sudden decrease of performance, a too small value to a to a ! slow and less dramatic decrease of performance. If NCACHE is set ! to a value so small, that not even a single one dimensional transform ! can be done in the workarray zw, the program stops with an error message. ! ! RESTRICTIONS on USAGE ! Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994 ! Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999 ! Copyright (C) 2002 Stefan Goedecker, CEA Grenoble ! This file is distributed under the terms of the ! GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt . implicit real*8 (a-h,o-z) include 'mpif.h' ! real space input REAL(KIND=8), DIMENSION(2,nd1,nd2,nd3/nproc,ndat) :: zr ! Fourier space output REAL(KIND=8), DIMENSION(2,nd1,nd3,nd2/nproc,ndat) :: zf ! work arrays for transpositions REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:,:) :: zt ! work arrays for MPI REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:,:,:,:) :: zmpi1 REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:,:,:) :: zmpi2 ! cache work array REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:,:) :: zw ! FFT work arrays REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:) :: trig1,trig2,trig3 INTEGER, ALLOCATABLE, DIMENSION(:) :: after1,now1,before1, & after2,now2,before2,after3,now3,before3 ! buffer arrays REAL(KIND=8), ALLOCATABLE, DIMENSION(:) :: buf1,buf2,buf3 !$ interface !$ integer ( kind=4 ) function omp_get_num_threads ( ) !$ end function omp_get_num_threads !$ end interface !$ interface !$ integer ( kind=4 ) function omp_get_thread_num ( ) !$ end function omp_get_thread_num !$ end interface ! find cache size that gives optimal performance on machine ncache=4*1024 ! check input if (nd1.lt.n1) stop 'ERROR:nd1' if (nd2.lt.n2) stop 'ERROR:nd2' if (nd3.lt.n3) stop 'ERROR:nd3' if (mod(nd3,nproc).ne.0) stop 'ERROR:nd3' if (mod(nd2,nproc).ne.0) stop 'ERROR:nd2' lock=0 !$omp parallel default(private) & !$omp shared(ndat,n1,n2,n3,nd1,nd2,nd3,iproc,nproc,ncache,zr,zf,lock) iam=0 npr=1 !$ iam=omp_get_thread_num() !$ npr=omp_get_num_threads() ! pagesize + L2 cacheline size + L1 cacheline size nbuf=1024+8+4 lzt=n2 if (mod(n2,2).eq.0) lzt=lzt+1 if (mod(n2,4).eq.0) lzt=lzt+1 !$omp critical allocate(trig1(2,2048),after1(7),now1(7),before1(7), & trig2(2,2048),after2(7),now2(7),before2(7), & trig3(2,2048),after3(7),now3(7),before3(7), & zw(2,ncache/4,2),buf1(nbuf),zt(2,lzt,n1), & buf2(nbuf),zmpi2(2,n1,nd2/nproc,nd3),buf3(nbuf)) if (nproc.gt.1) allocate(zmpi1(2,n1,nd2/nproc,nd3/nproc,nproc)) !$omp end critical call ctrig(n2,trig2,after2,before2,now2,-1,ic2) call ctrig(n1,trig1,after1,before1,now1,-1,ic1) call ctrig(n3,trig3,after3,before3,now3,-1,ic3) !$omp do do 12345,idat=1,ndat do 1212,j3=1,nd3/nproc if (iproc*(nd3/nproc)+j3.le.n3) then Jp2st=1 J2st=1 ! transform along y axis ! input: i1,i2,j3,(jp3) lot=ncache/(4*n2) if (lot.lt.1) stop 'enlarge ncache for y' do 2000,j=1,n1,lot ma=j mb=min(j+(lot-1),n1) nfft=mb-ma+1 i=1 call fftstp(nd1,nfft,nd2,lot,n2,zr(1,j,1,j3,idat),zw(1,1,1), & trig2,after2(i),now2(i),before2(i),-1) inzee=1 do i=2,ic2 call fftstp(lot,nfft,n2,lot,n2,zw(1,1,inzee),zw(1,1,3-inzee), & trig2,after2(i),now2(i),before2(i),-1) inzee=3-inzee enddo ! input: i1,I2,j3,(jp3) call unswitch(nfft,n2,lot,n1,lzt,zw(1,1,inzee),zt(1,1,j)) ! output: I2,i1,j3,(jp3) 2000 continue ! transform along x axis ! input: I2,i1,j3,(jp3) lot=ncache/(4*n1) if (lot.lt.1) stop 'enlarge ncache for x' do 1000,j=1,n2,lot ma=j mb=min(j+(lot-1),n2) nfft=mb-ma+1 i=1 call fftstp(lzt,nfft,n1,lot,n1,zt(1,j,1),zw(1,1,1), & trig1,after1(i),now1(i),before1(i),-1) inzee=1 do i=2,ic1 call fftstp(lot,nfft,n1,lot,n1,zw(1,1,inzee),zw(1,1,3-inzee), & trig1,after1(i),now1(i),before1(i),-1) inzee=3-inzee enddo ! output: I2,I1,j3,(jp3) ! input: J2,Jp2,I1,j3,(jp3) ! write(6,*) 'J2st,Jp2st',J2st,Jp2st if (nproc.eq.1) then call unmpiswitch(j3,nfft,Jp2st,J2st,lot,n1,nd2,nd3,nproc,zw(1,1,inzee),zmpi2) else call unmpiswitch(j3,nfft,Jp2st,J2st,lot,n1,nd2,nd3,nproc,zw(1,1,inzee),zmpi1) endif ! output: I1,J2,j3,Jp2,(jp3) 1000 continue endif 1212 continue ! Interprocessor data transposition ! intput: I1,J2,j3,Jp2,(jp3) if (nproc.gt.1) then 11 continue !$omp flush(lock) if (mod(lock,npr).ne.iam) goto 11 call MPI_ALLTOALL(zmpi1,2*n1*(nd2/nproc)*(nd3/nproc), & MPI_double_precision, & zmpi2,2*n1*(nd2/nproc)*(nd3/nproc), & MPI_double_precision,MPI_COMM_WORLD,ierr) lock=lock+1 !$omp flush(lock) endif ! output: I1,J2,j3,jp3,(Jp2) ! transform along z axis ! input: I1,J2,i3,(Jp2) lot=ncache/(4*n3) if (lot.lt.1) stop 'enlarge ncache for z' do 3333,j2=1,nd2/nproc ! if (iproc*(nd2/nproc)+j2.le.n2) then do 3000,i1=1,n1,lot ma=i1 mb=min(i1+(lot-1),n1) nfft=mb-ma+1 ! input: I1,J2,i3,(Jp2) call unscramble(i1,j2,lot,nfft,n1,n3,nd2,nproc,nd3,zmpi2,zw(1,1,1)) ! output: I1,i3,J2,(Jp2) inzee=1 do i=1,ic3 call fftstp(lot,nfft,n3,lot,n3,zw(1,1,inzee),zw(1,1,3-inzee), & trig3,after3(i),now3(i),before3(i),-1) inzee=3-inzee enddo call unfill(nd1,nd3,lot,nfft,n3,zw(1,1,inzee),zf(1,i1,1,j2,idat)) ! output: I1,I3,J2,(Jp2) 3000 continue ! endif 3333 continue 12345 continue !$omp enddo deallocate(trig1,after1,now1,before1, & trig2,after2,now2,before2, & trig3,after3,now3,before3, & buf1,zmpi2,buf2,zw,buf3,zt) if (nproc.gt.1) deallocate(zmpi1) !$omp end parallel return end subroutine unswitch(nfft,n2,lot,n1,lzt,zw,zt) implicit real*8 (a-h,o-z) dimension zw(2,lot,n2),zt(2,lzt,n1) do 100,j=1,nfft do 100,i=1,n2 zt(1,i,j)=zw(1,j,i) zt(2,i,j)=zw(2,j,i) 100 continue return end subroutine unmpiswitch(j3,nfft,Jp2st,J2st,lot,n1,nd2,nd3,nproc,zw,zmpi1) implicit real*8 (a-h,o-z) dimension zmpi1(2,n1,nd2/nproc,nd3/nproc,nproc),zw(2,lot,n1) mfft=0 do 300,Jp2=Jp2st,nproc do 200,J2=J2st,nd2/nproc mfft=mfft+1 if (mfft.gt.nfft) then Jp2st=Jp2 J2st=J2 return endif do 100,I1=1,n1 zmpi1(1,I1,J2,j3,Jp2)=zw(1,mfft,I1) zmpi1(2,I1,J2,j3,Jp2)=zw(2,mfft,I1) 100 continue 200 continue J2st=1 300 continue end subroutine unfill(nd1,nd3,lot,nfft,n3,zw,zf) implicit real*8 (a-h,o-z) dimension zw(2,lot,n3),zf(2,nd1,nd3) do 100,i3=1,n3 do 100,i1=1,nfft zf(1,i1,i3)=zw(1,i1,i3) zf(2,i1,i3)=zw(2,i1,i3) 100 continue return end subroutine back_wf(ndat,n1,n2,n3,nd1,nd2,nd3,md1,md2,md3,nproc,iproc,keypdist,zf,zr) ! Does multiple 3-dim backward FFT's from Fourier into real space ! Adopt standard convention that isign=1 for backward transform ! CALCULATES THE DISCRETE FOURIERTRANSFORM ZF(I1,I2,I3)= ! S_(j1,j2,j3) EXP(isign*i*2*pi*(j1*i1/n1+j2*i2/n2+j3*i3/n3)) ZR(j1,j2,j3) ! in parallel using MPI/OpenMP and BLAS library calls. ! INPUT: ! ZF: input array ! real(F(i1,i2,i3,idat))=ZF(1,i1,i2,i3,idat) ! imag(F(i1,i2,i3,idat))=ZF(2,i1,i2,i3,idat) ! i1=1,n1/2 , i2=1,n2/2 , i3=1,n3/2 , idat=1,ndat ! keypdist: The i2-th plane is stored on node keypdist(1,i2) = 1, ... , nproc ! The i2-th plane is the keypdist(2,i2)-th = 1, ... ,nd2/nproc on this node ! OUTPUT: ! ZR: output array ! ZR(1,i1,i2,i3,idat)=real(R(i1,i2,i3,idat)) ! ZR(2,i1,i2,i3,idat)=imag(R(i1,i2,i3,idat)) ! i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat ! nproc: number of processors used as returned by MPI_COMM_SIZE ! iproc: [0:nproc-1] number of processor as returned by MPI_COMM_RANK ! n1,n2,n3: logical dimension of the transform. As transform lengths ! most products of the prime factors 2,3,5 are allowed. ! The detailed table with allowed transform lengths can ! be found in subroutine CTRIG ! md1,md2,md3: Dimension of ZF ! nd1,nd2,nd3: Dimension of ZR ! ! PERFORMANCE CONSIDERATIONS: ! The maximum number of processors that can reasonably be used is max(n2/2,n3/2) ! ! It is very important to find the optimal ! value of NCACHE. NCACHE determines the size of the work array ZW, that ! has to fit into cache. It has therefore to be chosen to equal roughly ! half the size of the physical cache in units of real*8 numbers. ! The optimal value of ncache can easily be determined by numerical ! experimentation. A too large value of ncache leads to a dramatic ! and sudden decrease of performance, a too small value to a to a ! slow and less dramatic decrease of performance. If NCACHE is set ! to a value so small, that not even a single one dimensional transform ! can be done in the workarray zw, the program stops with an error message. ! ! RESTRICTIONS on USAGE ! Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994 ! Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999 ! Copyright (C) 2002 Stefan Goedecker, CEA Grenoble ! This file is distributed under the terms of the ! GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt . implicit real*8 (a-h,o-z) include 'mpif.h' ! real space input REAL(KIND=8), DIMENSION(2,nd1,nd2,nd3/nproc,ndat) :: zr ! Fourier space output REAL(KIND=8), DIMENSION(2,md1,md3,md2/nproc,ndat) :: zf INTEGER, DIMENSION(2,n2/2) :: keypdist ! work arrays for transpositions REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:,:) :: zt ! work arrays for MPI REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:,:,:,:) :: zmpi1 REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:,:,:) :: zmpi2 ! cache work array REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:,:) :: zw ! FFT work arrays REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:) :: trig1,trig2,trig3 INTEGER, ALLOCATABLE, DIMENSION(:) :: after1,now1,before1, & after2,now2,before2,after3,now3,before3 ! buffer arrays REAL(KIND=8), ALLOCATABLE, DIMENSION(:) :: buf1,buf2,buf3 !$ interface !$ integer ( kind=4 ) function omp_get_num_threads ( ) !$ end function omp_get_num_threads !$ end interface !$ interface !$ integer ( kind=4 ) function omp_get_thread_num ( ) !$ end function omp_get_thread_num !$ end interface ! find cache size that gives optimal performance on machine ncache=4*1024 ! check input if (mod(n1,2).ne.0) stop 'ERROR:n1' if (mod(n2,2).ne.0) stop 'ERROR:n2' if (mod(n3,2).ne.0) stop 'ERROR:n3' if (nd1.lt.n1) stop 'ERROR:nd1' if (nd2.lt.n2) stop 'ERROR:nd2' if (nd3.lt.n3) stop 'ERROR:nd3' if (md1.lt.n1/2) stop 'ERROR:md1' if (md2.lt.n2/2) stop 'ERROR:md2' if (md3.lt.n3/2) stop 'ERROR:md3' if (mod(nd3,nproc).ne.0) stop 'ERROR:nd3' if (mod(md2,nproc).ne.0) stop 'ERROR:md2' lock=0 !$omp parallel default(private) & !$omp shared(ndat,n1,n2,n3,nd1,nd2,nd3,md1,md2,md3,iproc,nproc,ncache,keypdist,zr,zf,lock) iam=0 npr=1 !$ iam=omp_get_thread_num() !$ npr=omp_get_num_threads() ! pagesize + L2 cacheline size + L1 cacheline size nbuf=1024+8+4 lzt=n2/2 if (mod(n2/2,2).eq.0) lzt=lzt+1 if (mod(n2/2,4).eq.0) lzt=lzt+1 !$omp critical allocate(trig1(2,2048),after1(7),now1(7),before1(7), & trig2(2,2048),after2(7),now2(7),before2(7), & trig3(2,2048),after3(7),now3(7),before3(7), & zw(2,ncache/4,2),buf1(nbuf),zt(2,lzt,n1), & buf2(nbuf),zmpi2(2,n1,nd2/nproc,nd3),buf3(nbuf)) if (nproc.gt.1) allocate(zmpi1(2,n1,nd2/nproc,nd3/nproc,nproc)) !$omp end critical call ctrig(n3,trig3,after3,before3,now3,1,ic3) call ctrig(n1,trig1,after1,before1,now1,1,ic1) call ctrig(n2,trig2,after2,before2,now2,1,ic2) !$omp do do 12345,idat=1,ndat ! transform along z axis ! input: I1,I3,J2,(Jp2) lot=ncache/(4*n3) if (lot.lt.1) then write(6,*) & 'encache has to be enlarged to be able to hold at' // & 'least one 1-d FFT of this size even though this will' // & 'reduce the performance for shorter transform lengths' stop endif do 3333,j2=1,md2/nproc ! if (iproc*(md2/nproc)+j2.le.n2/2) then do 3000,i1=1,(n1/2),lot ma=i1 mb=min(i1+(lot-1),(n1/2)) nfft=mb-ma+1 ! input: I1,I3,J2,(Jp2) call fill_corn(md1,md3,lot,nfft,n3,zf(1,i1,1,j2,idat),zw(1,1,1)) inzee=1 do i=1,ic3 call fftstp(lot,nfft,n3,lot,n3,zw(1,1,inzee),zw(1,1,3-inzee), & trig3,after3(i),now3(i),before3(i),1) inzee=3-inzee enddo ! input: I1,i3,J2,(Jp2) call scramble(i1,j2,lot,nfft,n1/2,n3,md2,nproc,nd3,zw(1,1,inzee),zmpi2) ! output: I1,J2,i3,(Jp2) 3000 continue ! endif 3333 continue ! Interprocessor data transposition ! input: I1,J2,j3,jp3,(Jp2) if (nproc.gt.1) then 11 continue !$omp flush(lock) if (mod(lock,npr).ne.iam) goto 11 call MPI_ALLTOALL(zmpi2,n1*(md2/nproc)*(nd3/nproc), & MPI_double_precision, & zmpi1,n1*(md2/nproc)*(nd3/nproc), & MPI_double_precision,MPI_COMM_WORLD,ierr) lock=lock+1 !$omp flush(lock) ! output: I1,J2,j3,Jp2,(jp3) endif do 1212,j3=1,nd3/nproc if (iproc*(nd3/nproc)+j3.le.n3) then Jp2st=1 J2st=1 ! transform along x axis lot=ncache/(4*n1) if (lot.lt.1) then write(6,*) & 'encache has to be enlarged to be able to hold at' // & 'least one 1-d FFT of this size even though this will' // & 'reduce the performance for shorter transform lengths' stop endif do 1000,j=1,n2/2,lot ma=j mb=min(j+(lot-1),n2/2) nfft=mb-ma+1 ! input: I1,J2,j3,Jp2,(jp3) if (nproc.eq.1) then call mpiswitch_corn(j3,nfft,Jp2st,J2st,lot,n1,n2,md2,nd3,nproc,keypdist,zmpi2,zw(1,1,1)) else call mpiswitch_corn(j3,nfft,Jp2st,J2st,lot,n1,n2,md2,nd3,nproc,keypdist,zmpi1,zw(1,1,1)) endif ! output: J2,Jp2,I1,j3,(jp3) ! input: I2,I1,j3,(jp3) inzee=1 do i=1,ic1-1 call fftstp(lot,nfft,n1,lot,n1,zw(1,1,inzee),zw(1,1,3-inzee), & trig1,after1(i),now1(i),before1(i),1) inzee=3-inzee enddo i=ic1 call fftstp(lot,nfft,n1,lzt,n1,zw(1,1,inzee),zt(1,j,1), & trig1,after1(i),now1(i),before1(i),1) ! output: I2,i1,j3,(jp3) 1000 continue ! transform along y axis lot=ncache/(4*n2) if (lot.lt.1) then write(6,*) & 'encache has to be enlarged to be able to hold at' // & 'least one 1-d FFT of this size even though this will' // & 'reduce the performance for shorter transform lengths' stop endif do 2000,j=1,n1,lot ma=j mb=min(j+(lot-1),n1) nfft=mb-ma+1 ! input: I2,i1,j3,(jp3) call switch_corn(nfft,n2,lot,n1,lzt,zt(1,1,j),zw(1,1,1)) ! output: i1,I2,j3,(jp3) inzee=1 do i=1,ic2-1 call fftstp(lot,nfft,n2,lot,n2,zw(1,1,inzee),zw(1,1,3-inzee), & trig2,after2(i),now2(i),before2(i),1) inzee=3-inzee enddo i=ic2 call fftstp(lot,nfft,n2,nd1,nd2,zw(1,1,inzee),zr(1,j,1,j3,idat), & trig2,after2(i),now2(i),before2(i),1) 2000 continue ! output: i1,i2,j3,(jp3) endif 1212 continue 12345 continue !$omp enddo deallocate(trig1,after1,now1,before1, & trig2,after2,now2,before2, & trig3,after3,now3,before3, & buf1,zmpi2,buf2,zw,buf3,zt) if (nproc.gt.1) deallocate(zmpi1) !$omp end parallel return end subroutine forw_wf(ndat,n1,n2,n3,nd1,nd2,nd3,md1,md2,md3,nproc,iproc,keypdist,zr,zf) ! Does multiple 3-dim backward FFT's from real into Fourier space ! Adopt standard convention that isign=-1 for forward transform ! CALCULATES THE DISCRETE FOURIERTRANSFORM ZF(I1,I2,I3)= ! S_(j1,j2,j3) EXP(isign*i*2*pi*(j1*i1/n1+j2*i2/n2+j3*i3/n3)) ZR(j1,j2,j3) ! in parallel using MPI/OpenMP and BLAS library calls. ! INPUT: ! ZR: output array ! ZR(1,i1,i2,i3,idat)=real(R(i1,i2,i3,idat)) ! ZR(2,i1,i2,i3,idat)=imag(R(i1,i2,i3,idat)) ! i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat ! keypdist: The i2-th plane is stored on node keypdist(1,i2) = 1, ... ,nproc ! The i2-th plane is the keypdist(2,i2)-th = 1, ... ,nd2/nproc on this node ! OUTPUT: ! ZF: input array ! real(F(i1,i2,i3,idat))=ZF(1,i1,i2,i3,idat) ! imag(F(i1,i2,i3,idat))=ZF(2,i1,i2,i3,idat) ! i1=1,n1/2 , i2=1,n2/2 , i3=1,n3/2 , idat=1,ndat ! nproc: number of processors used as returned by MPI_COMM_SIZE ! iproc: [0:nproc-1] number of processor as returned by MPI_COMM_RANK ! n1,n2,n3: logical dimension of the transform. As transform lengths ! most products of the prime factors 2,3,5 are allowed. ! The detailed table with allowed transform lengths can ! be found in subroutine CTRIG ! md1,md2,md3: Dimension of ZF ! nd1,nd2,nd3: Dimension of ZR ! ! PERFORMANCE CONSIDERATIONS: ! The maximum number of processors that can reasonably be used is max(n2/2,n3/2) ! ! It is very important to find the optimal ! value of NCACHE. NCACHE determines the size of the work array ZW, that ! has to fit into cache. It has therefore to be chosen to equal roughly ! half the size of the physical cache in units of real*8 numbers. ! The optimal value of ncache can easily be determined by numerical ! experimentation. A too large value of ncache leads to a dramatic ! and sudden decrease of performance, a too small value to a to a ! slow and less dramatic decrease of performance. If NCACHE is set ! to a value so small, that not even a single one dimensional transform ! can be done in the workarray zw, the program stops with an error message. ! ! RESTRICTIONS on USAGE ! Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994 ! Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999 ! Copyright (C) 2002 Stefan Goedecker, CEA Grenoble ! This file is distributed under the terms of the ! GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt . implicit real*8 (a-h,o-z) include 'mpif.h' ! real space input REAL(KIND=8), DIMENSION(2,nd1,nd2,nd3/nproc,ndat) :: zr ! Fourier space output REAL(KIND=8), DIMENSION(2,md1,md3,md2/nproc,ndat) :: zf INTEGER, DIMENSION(2,n2/2) :: keypdist ! work arrays for transpositions REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:,:) :: zt ! work arrays for MPI REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:,:,:,:) :: zmpi1 REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:,:,:) :: zmpi2 ! cache work array REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:,:) :: zw ! FFT work arrays REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:) :: trig1,trig2,trig3 INTEGER, ALLOCATABLE, DIMENSION(:) :: after1,now1,before1, & after2,now2,before2,after3,now3,before3 ! buffer arrays REAL(KIND=8), ALLOCATABLE, DIMENSION(:) :: buf1,buf2,buf3 !$ interface !$ integer ( kind=4 ) function omp_get_num_threads ( ) !$ end function omp_get_num_threads !$ end interface !$ interface !$ integer ( kind=4 ) function omp_get_thread_num ( ) !$ end function omp_get_thread_num !$ end interface ! find cache size that gives optimal performance on machine ncache=4*1024 ! check input if (mod(n1,2).ne.0) stop 'ERROR:n1' if (mod(n2,2).ne.0) stop 'ERROR:n2' if (mod(n3,2).ne.0) stop 'ERROR:n3' if (nd1.lt.n1) stop 'ERROR:nd1' if (nd2.lt.n2) stop 'ERROR:nd2' if (nd3.lt.n3) stop 'ERROR:nd3' if (md1.lt.n1/2) stop 'ERROR:md1' if (md2.lt.n2/2) stop 'ERROR:md2' if (md3.lt.n3/2) stop 'ERROR:md3' if (mod(nd3,nproc).ne.0) stop 'ERROR:nd3' if (mod(md2,nproc).ne.0) stop 'ERROR:md2' lock=0 !$omp parallel default(private) & !$omp shared(ndat,n1,n2,n3,nd1,nd2,nd3,md1,md2,md3,iproc,nproc,ncache,keypdist,zr,zf,lock) iam=0 npr=1 !$ iam=omp_get_thread_num() !$ npr=omp_get_num_threads() ! pagesize + L2 cacheline size + L1 cacheline size nbuf=1024+8+4 lzt=n2/2 if (mod(n2/2,2).eq.0) lzt=lzt+1 if (mod(n2/2,4).eq.0) lzt=lzt+1 !$omp critical allocate(trig1(2,2048),after1(7),now1(7),before1(7), & trig2(2,2048),after2(7),now2(7),before2(7), & trig3(2,2048),after3(7),now3(7),before3(7), & zw(2,ncache/4,2),buf1(nbuf),zt(2,lzt,n1), & buf2(nbuf),zmpi2(2,n1,nd2/nproc,nd3),buf3(nbuf)) if (nproc.gt.1) allocate(zmpi1(2,n1,nd2/nproc,nd3/nproc,nproc)) !$omp end critical call ctrig(n2,trig2,after2,before2,now2,-1,ic2) call ctrig(n1,trig1,after1,before1,now1,-1,ic1) call ctrig(n3,trig3,after3,before3,now3,-1,ic3) !$omp do do 12345,idat=1,ndat do 1212,j3=1,nd3/nproc if (iproc*(nd3/nproc)+j3.le.n3) then Jp2st=1 J2st=1 ! transform along y axis ! input: i1,i2,j3,(jp3) lot=ncache/(4*n2) if (lot.lt.1) then write(6,*) & 'encache has to be enlarged to be able to hold at' // & 'least one 1-d FFT of this size even though this will' // & 'reduce the performance for shorter transform lengths' stop endif do 2000,j=1,n1,lot ma=j mb=min(j+(lot-1),n1) nfft=mb-ma+1 i=1 call fftstp(nd1,nfft,nd2,lot,n2,zr(1,j,1,j3,idat),zw(1,1,1), & trig2,after2(i),now2(i),before2(i),-1) inzee=1 do i=2,ic2 call fftstp(lot,nfft,n2,lot,n2,zw(1,1,inzee),zw(1,1,3-inzee), & trig2,after2(i),now2(i),before2(i),-1) inzee=3-inzee enddo ! input: i1,I2,j3,(jp3) call unswitch_corn(nfft,n2,lot,n1,lzt,zw(1,1,inzee),zt(1,1,j)) ! output: I2,i1,j3,(jp3) 2000 continue ! transform along x axis ! input: I2,i1,j3,(jp3) lot=ncache/(4*n1) if (lot.lt.1) then write(6,*) & 'encache has to be enlarged to be able to hold at' // & 'least one 1-d FFT of this size even though this will' // & 'reduce the performance for shorter transform lengths' stop endif do 1000,j=1,n2/2,lot ma=j mb=min(j+(lot-1),n2/2) nfft=mb-ma+1 i=1 call fftstp(lzt,nfft,n1,lot,n1,zt(1,j,1),zw(1,1,1), & trig1,after1(i),now1(i),before1(i),-1) inzee=1 do i=2,ic1 call fftstp(lot,nfft,n1,lot,n1,zw(1,1,inzee),zw(1,1,3-inzee), & trig1,after1(i),now1(i),before1(i),-1) inzee=3-inzee enddo ! output: I2,I1,j3,(jp3) ! input: J2,Jp2,I1,j3,(jp3) ! write(6,*) 'J2st,Jp2st',J2st,Jp2st if (nproc.eq.1) then call unmpiswitch_corn(j3,nfft,Jp2st,J2st,lot,n1,n2,md2,nd3,nproc,keypdist,zw(1,1,inzee),zmpi2) else call unmpiswitch_corn(j3,nfft,Jp2st,J2st,lot,n1,n2,md2,nd3,nproc,keypdist,zw(1,1,inzee),zmpi1) ! output: I1,J2,j3,Jp2,(jp3) endif 1000 continue endif 1212 continue ! Interprocessor data transposition ! intput: I1,J2,j3,Jp2,(jp3) if (nproc.gt.1) then 11 continue !$omp flush(lock) if (mod(lock,npr).ne.iam) goto 11 call MPI_ALLTOALL(zmpi1,n1*(md2/nproc)*(nd3/nproc), & MPI_double_precision, & zmpi2,n1*(md2/nproc)*(nd3/nproc), & MPI_double_precision,MPI_COMM_WORLD,ierr) lock=lock+1 !$omp flush(lock) endif ! output: I1,J2,j3,jp3,(Jp2) ! transform along z axis ! input: I1,J2,i3,(Jp2) lot=ncache/(4*n3) if (lot.lt.1) then write(6,*) & 'encache has to be enlarged to be able to hold at' // & 'least one 1-d FFT of this size even though this will' // & 'reduce the performance for shorter transform lengths' stop endif do 3333,j2=1,md2/nproc ! if (iproc*(md2/nproc)+j2.le.n2/2) then do 3000,i1=1,(n1/2),lot ma=i1 mb=min(i1+(lot-1),(n1/2)) nfft=mb-ma+1 ! input: I1,J2,i3,(Jp2) call unscramble(i1,j2,lot,nfft,n1/2,n3,md2,nproc,nd3,zmpi2,zw(1,1,1)) ! output: I1,i3,J2,(Jp2) inzee=1 do i=1,ic3 call fftstp(lot,nfft,n3,lot,n3,zw(1,1,inzee),zw(1,1,3-inzee), & trig3,after3(i),now3(i),before3(i),-1) inzee=3-inzee enddo call unfill_corn(md1,md3,lot,nfft,n3,zw(1,1,inzee),zf(1,i1,1,j2,idat)) ! output: I1,I3,J2,(Jp2) 3000 continue ! endif 3333 continue 12345 continue !$omp enddo deallocate(trig1,after1,now1,before1, & trig2,after2,now2,before2, & trig3,after3,now3,before3, & buf1,zmpi2,buf2,zw,buf3,zt) if (nproc.gt.1) deallocate(zmpi1) !$omp end parallel return end subroutine applypot(ndat,n1,n2,n3,nd1,nd2,nd3,md1,md2,md3,nproc,iproc,pot,keypdist,zf) ! Applies the local real space potential to multiple wafefunctions in Fourier space ! ZF: Wavefunction (input/output) ! real(F(i1,i2,i3,idat))=ZF(1,i1,i2,i3,idat) ! imag(F(i1,i2,i3,idat))=ZF(2,i1,i2,i3,idat) ! i1=1,n1/2 , i2=1,n2/2 , i3=1,n3/2 , idat=1,ndat ! POT: Potential ! POT(i1,i2,i3) ! i1=1,n1 , i2=1,n2 , i3=1,n3 ! nproc: number of processors used as returned by MPI_COMM_SIZE ! iproc: [0:nproc-1] number of processor as returned by MPI_COMM_RANK ! n1,n2,n3: logical dimension of the transform. As transform lengths ! most products of the prime factors 2,3,5 are allowed. ! The detailed table with allowed transform lengths can ! be found in subroutine CTRIG ! md1,md2,md3: Dimension of ZF ! nd1,nd2,nd3: Dimension of POT ! ! PERFORMANCE CONSIDERATIONS: ! The maximum number of processors that can reasonably be used is max(n2/2,n3/2) ! ! It is very important to find the optimal ! value of NCACHE. NCACHE determines the size of the work array ZW, that ! has to fit into cache. It has therefore to be chosen to equal roughly ! half the size of the physical cache in units of real*8 numbers. ! The optimal value of ncache can easily be determined by numerical ! experimentation. A too large value of ncache leads to a dramatic ! and sudden decrease of performance, a too small value to a to a ! slow and less dramatic decrease of performance. If NCACHE is set ! to a value so small, that not even a single one dimensional transform ! can be done in the workarray zw, the program stops with an error message. ! ! RESTRICTIONS on USAGE ! Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994 ! Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999 ! Copyright (C) 2002 Stefan Goedecker, CEA Grenoble ! This file is distributed under the terms of the ! GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt . implicit real*8 (a-h,o-z) include 'mpif.h' ! real space input REAL(KIND=8), DIMENSION(nd1,nd2,nd3/nproc) :: pot ! Fourier space output REAL(KIND=8), DIMENSION(2,md1,md3,md2/nproc,ndat) :: zf INTEGER, DIMENSION(2,n2/2) :: keypdist ! work arrays for transpositions REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:,:) :: zt ! work arrays for MPI REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:,:,:,:) :: zmpi1 REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:,:,:) :: zmpi2 ! cache work array REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:,:) :: zw ! FFT work arrays REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:) :: btrig1,btrig2,btrig3 REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:) :: ftrig1,ftrig2,ftrig3 INTEGER, ALLOCATABLE, DIMENSION(:) :: after1,now1,before1, & after2,now2,before2,after3,now3,before3 ! buffer arrays REAL(KIND=8), ALLOCATABLE, DIMENSION(:) :: buf1,buf2,buf3 !$ interface !$ integer ( kind=4 ) function omp_get_num_threads ( ) !$ end function omp_get_num_threads !$ end interface !$ interface !$ integer ( kind=4 ) function omp_get_thread_num ( ) !$ end function omp_get_thread_num !$ end interface ! check input if (mod(n1,2).ne.0) stop 'ERROR:n1' if (mod(n2,2).ne.0) stop 'ERROR:n2' if (mod(n3,2).ne.0) stop 'ERROR:n3' if (nd1.lt.n1) stop 'ERROR:nd1' if (nd2.lt.n2) stop 'ERROR:nd2' if (nd3.lt.n3) stop 'ERROR:nd3' if (md1.lt.n1/2) stop 'ERROR:md1' if (md2.lt.n2/2) stop 'ERROR:md2' if (md3.lt.n3/2) stop 'ERROR:md3' if (mod(nd3,nproc).ne.0) stop 'ERROR:nd3' if (mod(md2,nproc).ne.0) stop 'ERROR:md2' lock=0 !$omp parallel default(private) & !$omp shared(ndat,n1,n2,n3,nd1,nd2,nd3,md1,md2,md3,iproc,nproc,ncache,pot,keypdist,zf,lock) iam=0 npr=1 !$ iam=omp_get_thread_num() !$ npr=omp_get_num_threads() !$ if (nproc.gt.1 .and. npr.ne.2 .and. npr.ne.4) & !$ stop 'no communication scheduling provided in applypot' ncache=4*1024 ! pagesize + L2 cacheline size + L1 cacheline size nbuf=1024+8+4 lzt=n2/2 if (mod(n2/2,2).eq.0) lzt=lzt+1 if (mod(n2/2,4).eq.0) lzt=lzt+1 !$omp critical allocate(btrig1(2,2048),ftrig1(2,2048),after1(7),now1(7),before1(7), & btrig2(2,2048),ftrig2(2,2048),after2(7),now2(7),before2(7), & btrig3(2,2048),ftrig3(2,2048),after3(7),now3(7),before3(7), & zw(2,ncache/4,2),buf1(nbuf),zt(2,lzt,n1), & buf2(nbuf),zmpi2(2,n1,nd2/nproc,nd3),buf3(nbuf)) if (nproc.gt.1) allocate(zmpi1(2,n1,nd2/nproc,nd3/nproc,nproc)) !$omp end critical call ctrig(n3,btrig3,after3,before3,now3,1,ic3) call ctrig(n1,btrig1,after1,before1,now1,1,ic1) call ctrig(n2,btrig2,after2,before2,now2,1,ic2) do j=1,n1 ftrig1(1,j)= btrig1(1,j) ftrig1(2,j)=-btrig1(2,j) enddo do j=1,n2 ftrig2(1,j)= btrig2(1,j) ftrig2(2,j)=-btrig2(2,j) enddo do j=1,n3 ftrig3(1,j)= btrig3(1,j) ftrig3(2,j)=-btrig3(2,j) enddo ndatmod=((ndat+npr-1)/npr)*npr ! write(6,*) 'ndatmod',ndatmod do 12345,idat=1,ndatmod if (mod(idat-1,npr).eq.iam) then ! write(6,*) 'IN iam,idat',iam,idat if (idat.le.ndat) then ! transform along z axis ! input: I1,I3,J2,(Jp2) lot=ncache/(4*n3) if (lot.lt.1) then write(6,*) & 'encache has to be enlarged to be able to hold at' // & 'least one 1-d FFT of this size even though this will' // & 'reduce the performance for shorter transform lengths' stop endif do 3331,j2=1,md2/nproc ! if (iproc*(md2/nproc)+j2.le.n2/2) then do 3001,i1=1,(n1/2),lot ma=i1 mb=min(i1+(lot-1),(n1/2)) nfft=mb-ma+1 ! input: I1,I3,J2,(Jp2) call fill_corn(md1,md3,lot,nfft,n3,zf(1,i1,1,j2,idat),zw(1,1,1)) inzee=1 do i=1,ic3 call fftstp(lot,nfft,n3,lot,n3,zw(1,1,inzee),zw(1,1,3-inzee), & btrig3,after3(i),now3(i),before3(i),1) inzee=3-inzee enddo ! input: I1,i3,J2,(Jp2) call scramble(i1,j2,lot,nfft,n1/2,n3,md2,nproc,nd3,zw(1,1,inzee),zmpi2) ! output: I1,J2,i3,(Jp2) 3001 continue ! endif 3331 continue endif ! Interprocessor data transposition ! input: I1,J2,j3,jp3,(Jp2) if (nproc.gt.1) then 11 continue !$omp flush(lock) ! communication scheduling if ( (npr.eq.4 .and. iam.eq.0 .and. lock.eq.0) .or. & (npr.eq.4 .and. iam.eq.1 .and. lock.eq.1) .or. & (npr.eq.4 .and. iam.eq.2 .and. lock.eq.4) .or. & (npr.eq.4 .and. iam.eq.3 .and. lock.eq.5) .or. & (npr.eq.2 .and. iam.eq.0 .and. lock.eq.0) .or. & (npr.eq.2 .and. iam.eq.1 .and. lock.eq.2) .or. & npr.eq.1 ) then ! write(6,'(a,6(x,i3))') 'Applypot 1 ALLTOALL',nproc,iproc,npr,iam,lock,idat if (idat.le.ndat) & call MPI_ALLTOALL(zmpi2,n1*(md2/nproc)*(nd3/nproc), & MPI_double_precision, & zmpi1,n1*(md2/nproc)*(nd3/nproc), & MPI_double_precision,MPI_COMM_WORLD,ierr) lock=lock+1 if (npr.eq.4 .and. idat.eq.2) lock=lock+2 if (npr.eq.2 .and. idat.eq.1) lock=lock+1 lock=mod(lock,2*npr) ! write(6,'(a,6(x,i3))') 'new lock 1', nproc,iproc,npr,iam,lock !$omp flush(lock) else goto 11 endif ! output: I1,J2,j3,Jp2,(jp3) endif if (idat.le.ndat) then do 1212,j3=1,nd3/nproc if (iproc*(nd3/nproc)+j3.le.n3) then Jp2stb=1 J2stb=1 Jp2stf=1 J2stf=1 ! transform along x axis lot=ncache/(4*n1) if (lot.lt.1) then write(6,*) & 'encache has to be enlarged to be able to hold at' // & 'least one 1-d FFT of this size even though this will' // & 'reduce the performance for shorter transform lengths' stop endif do 1001,j=1,n2/2,lot ma=j mb=min(j+(lot-1),n2/2) nfft=mb-ma+1 ! input: I1,J2,j3,Jp2,(jp3) if (nproc.eq.1) then call mpiswitch_corn(j3,nfft,Jp2stb,J2stb,lot,n1,n2,md2,nd3,nproc,keypdist,zmpi2,zw(1,1,1)) else call mpiswitch_corn(j3,nfft,Jp2stb,J2stb,lot,n1,n2,md2,nd3,nproc,keypdist,zmpi1,zw(1,1,1)) endif ! output: J2,Jp2,I1,j3,(jp3) ! input: I2,I1,j3,(jp3) inzee=1 do i=1,ic1-1 call fftstp(lot,nfft,n1,lot,n1,zw(1,1,inzee),zw(1,1,3-inzee), & btrig1,after1(i),now1(i),before1(i),1) inzee=3-inzee enddo i=ic1 call fftstp(lot,nfft,n1,lzt,n1,zw(1,1,inzee),zt(1,j,1), & btrig1,after1(i),now1(i),before1(i),1) ! output: I2,i1,j3,(jp3) 1001 continue ! transform along y axis lot=ncache/(4*n2) if (lot.lt.1) then write(6,*) & 'encache has to be enlarged to be able to hold at' // & 'least one 1-d FFT of this size even though this will' // & 'reduce the performance for shorter transform lengths' stop endif do 2000,j=1,n1,lot ma=j mb=min(j+(lot-1),n1) nfft=mb-ma+1 ! input: I2,i1,j3,(jp3) call switch_corn(nfft,n2,lot,n1,lzt,zt(1,1,j),zw(1,1,1)) ! output: i1,I2,j3,(jp3) inzee=1 do i=1,ic2 call fftstp(lot,nfft,n2,lot,n2,zw(1,1,inzee),zw(1,1,3-inzee), & btrig2,after2(i),now2(i),before2(i),1) inzee=3-inzee enddo ! output: i1,i2,j3,(jp3) !Multiply with potential in real space call multpot(nd1,nd2,n2,lot,nfft,pot(j,1,j3),zw(1,1,inzee)) ! TRANSFORM BACK IN FOURIER SPACE ! transform along y axis ! input: i1,i2,j3,(jp3) do i=1,ic2 call fftstp(lot,nfft,n2,lot,n2,zw(1,1,inzee),zw(1,1,3-inzee), & ftrig2,after2(i),now2(i),before2(i),-1) inzee=3-inzee enddo ! input: i1,I2,j3,(jp3) call unswitch_corn(nfft,n2,lot,n1,lzt,zw(1,1,inzee),zt(1,1,j)) ! output: I2,i1,j3,(jp3) 2000 continue ! transform along x axis ! input: I2,i1,j3,(jp3) lot=ncache/(4*n1) if (lot.lt.1) then write(6,*) & 'encache has to be enlarged to be able to hold at' // & 'least one 1-d FFT of this size even though this will' // & 'reduce the performance for shorter transform lengths' stop endif do 1002,j=1,n2/2,lot ma=j mb=min(j+(lot-1),n2/2) nfft=mb-ma+1 i=1 call fftstp(lzt,nfft,n1,lot,n1,zt(1,j,1),zw(1,1,1), & ftrig1,after1(i),now1(i),before1(i),-1) inzee=1 do i=2,ic1 call fftstp(lot,nfft,n1,lot,n1,zw(1,1,inzee),zw(1,1,3-inzee), & ftrig1,after1(i),now1(i),before1(i),-1) inzee=3-inzee enddo ! output: I2,I1,j3,(jp3) ! input: J2,Jp2,I1,j3,(jp3) if (nproc.eq.1) then call unmpiswitch_corn(j3,nfft,Jp2stf,J2stf,lot,n1,n2,md2,nd3,nproc,keypdist,zw(1,1,inzee),zmpi2) else call unmpiswitch_corn(j3,nfft,Jp2stf,J2stf,lot,n1,n2,md2,nd3,nproc,keypdist,zw(1,1,inzee),zmpi1) endif ! output: I1,J2,j3,Jp2,(jp3) 1002 continue endif 1212 continue endif ! Interprocessor data transposition ! intput: I1,J2,j3,Jp2,(jp3) if (nproc.gt.1) then 22 continue !$omp flush(lock) ! communication scheduling if ( (npr.eq.4 .and. iam.eq.0 .and. lock.eq.6) .or. & (npr.eq.4 .and. iam.eq.1 .and. lock.eq.7) .or. & (npr.eq.4 .and. iam.eq.2 .and. lock.eq.2) .or. & (npr.eq.4 .and. iam.eq.3 .and. lock.eq.3) .or. & (npr.eq.2 .and. iam.eq.0 .and. lock.eq.3) .or. & (npr.eq.2 .and. iam.eq.1 .and. lock.eq.1) .or. & npr.eq.1 ) then ! write(6,'(a,6(x,i3))') 'Applypot 2 ALLTOALL',nproc,iproc,npr,iam,lock,idat if (idat.le.ndat) & call MPI_ALLTOALL(zmpi1,n1*(md2/nproc)*(nd3/nproc), & MPI_double_precision, & zmpi2,n1*(md2/nproc)*(nd3/nproc), & MPI_double_precision,MPI_COMM_WORLD,ierr) lock=lock+1 if (npr.eq.4 .and. idat.eq.ndatmod-2) lock=lock+2 if (npr.eq.2 .and. idat.eq.ndatmod-1) lock=lock+1 lock=mod(lock,2*npr) ! write(6,'(a,6(x,i3))') 'new lock 2', nproc,iproc,npr,iam,lock !$omp flush(lock) else goto 22 endif ! output: I1,J2,j3,jp3,(Jp2) endif ! transform along z axis ! input: I1,J2,i3,(Jp2) if (idat.le.ndat) then lot=ncache/(4*n3) if (lot.lt.1) then write(6,*) & 'encache has to be enlarged to be able to hold at' // & 'least one 1-d FFT of this size even though this will' // & 'reduce the performance for shorter transform lengths' stop endif do 3332,j2=1,md2/nproc ! if (iproc*(md2/nproc)+j2.le.n2/2) then do 3002,i1=1,(n1/2),lot ma=i1 mb=min(i1+(lot-1),(n1/2)) nfft=mb-ma+1 ! input: I1,J2,i3,(Jp2) call unscramble(i1,j2,lot,nfft,n1/2,n3,md2,nproc,nd3,zmpi2,zw(1,1,1)) ! output: I1,i3,J2,(Jp2) inzee=1 do i=1,ic3 call fftstp(lot,nfft,n3,lot,n3,zw(1,1,inzee),zw(1,1,3-inzee), & ftrig3,after3(i),now3(i),before3(i),-1) inzee=3-inzee enddo call unfill_corn(md1,md3,lot,nfft,n3,zw(1,1,inzee),zf(1,i1,1,j2,idat)) ! output: I1,I3,J2,(Jp2) 3002 continue ! endif 3332 continue endif endif 12345 continue deallocate(btrig1,ftrig1,after1,now1,before1, & btrig2,ftrig2,after2,now2,before2, & btrig3,ftrig3,after3,now3,before3, & buf1,zmpi2,buf2,zw,buf3,zt) if (nproc.gt.1) deallocate(zmpi1) !$omp end parallel return end subroutine multpot(nd1,nd2,n2,lot,nfft,pot,zw) implicit real*8 (a-h,o-z) dimension zw(2,lot,n2),pot(nd1,nd2) ! n2 should be multiple of 2 if (2*(n2/2).ne.n2) stop 'multport' do 10, i2=1,n2-1,2 do 10, j=1,nfft zw(1,j,i2+0)=zw(1,j,i2+0)*pot(j,i2+0) zw(2,j,i2+0)=zw(2,j,i2+0)*pot(j,i2+0) zw(1,j,i2+1)=zw(1,j,i2+1)*pot(j,i2+1) zw(2,j,i2+1)=zw(2,j,i2+1)*pot(j,i2+1) 10 continue return end subroutine accrho(ndat,n1,n2,n3,nd1,nd2,nd3,md1,md2,md3,nproc,iproc,keypdist,zf,rho) ! Accumulates the real space density rho from the ndat wavefunctions zf ! by transforming zf into real space and adding all the amplitudes squared ! INPUT: ! ZF: input array ! real(F(i1,i2,i3,idat))=ZF(1,i1,i2,i3,idat) ! imag(F(i1,i2,i3,idat))=ZF(2,i1,i2,i3,idat) ! i1=1,n1/2 , i2=1,n2/2 , i3=1,n3/2 , idat=1,ndat ! keypdist: The i2-th plane is stored on node keypdist(1,i2) = 1, ... ,nproc ! The i2-th plane is the keypdist(2,i2)-th = 1, ... ,nd2/nproc on this node ! OUTPUT: ! RHO(i1,i2,i3) ! i1=1,n1 , i2=1,n2 , i3=1,n3 ! nproc: number of processors used as returned by MPI_COMM_SIZE ! iproc: [0:nproc-1] number of processor as returned by MPI_COMM_RANK ! n1,n2,n3: logical dimension of the transform. As transform lengths ! most products of the prime factors 2,3,5 are allowed. ! The detailed table with allowed transform lengths can ! be found in subroutine CTRIG ! nd1,nd2,nd3: Dimension of ZR ! ! PERFORMANCE CONSIDERATIONS: ! The maximum number of processors that can reasonably be used is max(n2/2,n3/2) ! ! It is very important to find the optimal ! value of NCACHE. NCACHE determines the size of the work array ZW, that ! has to fit into cache. It has therefore to be chosen to equal roughly ! half the size of the physical cache in units of real*8 numbers. ! The optimal value of ncache can easily be determined by numerical ! experimentation. A too large value of ncache leads to a dramatic ! and sudden decrease of performance, a too small value to a to a ! slow and less dramatic decrease of performance. If NCACHE is set ! to a value so small, that not even a single one dimensional transform ! can be done in the workarray zw, the program stops with an error message. ! ! RESTRICTIONS on USAGE ! Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994 ! Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999 ! Copyright (C) 2002 Stefan Goedecker, CEA Grenoble ! This file is distributed under the terms of the ! GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt . implicit real*8 (a-h,o-z) include 'mpif.h' ! real space input REAL(KIND=8), DIMENSION(nd1,nd2,nd3/nproc) :: rho REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:,:) :: rhopart ! Fourier space output REAL(KIND=8), DIMENSION(2,md1,md3,md2/nproc,ndat) :: zf INTEGER, DIMENSION(2,n2/2) :: keypdist ! work arrays for transpositions REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:,:) :: zt ! work arrays for MPI REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:,:,:,:) :: zmpi1 REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:,:,:) :: zmpi2 ! cache work array REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:,:) :: zw ! FFT work arrays REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:) :: trig1,trig2,trig3 INTEGER, ALLOCATABLE, DIMENSION(:) :: after1,now1,before1, & after2,now2,before2,after3,now3,before3 ! buffer arrays REAL(KIND=8), ALLOCATABLE, DIMENSION(:) :: buf1,buf2,buf3 !$ interface !$ integer ( kind=4 ) function omp_get_num_threads ( ) !$ end function omp_get_num_threads !$ end interface !$ interface !$ integer ( kind=4 ) function omp_get_thread_num ( ) !$ end function omp_get_thread_num !$ end interface ! find cache size that gives optimal performance on machine ncache=4*1024 ! check input if (mod(n1,2).ne.0) stop 'ERROR:n1' if (mod(n2,2).ne.0) stop 'ERROR:n2' if (mod(n3,2).ne.0) stop 'ERROR:n3' if (nd1.lt.n1) stop 'ERROR:nd1' if (nd2.lt.n2) stop 'ERROR:nd2' if (nd3.lt.n3) stop 'ERROR:nd3' if (md1.lt.n1/2) stop 'ERROR:md1' if (md2.lt.n2/2) stop 'ERROR:md2' if (md3.lt.n3/2) stop 'ERROR:md3' if (mod(nd3,nproc).ne.0) stop 'ERROR:nd3' if (mod(md2,nproc).ne.0) stop 'ERROR:md2' lock=0 !$omp parallel default(private) & !$omp shared(ndat,n1,n2,n3,nd1,nd2,nd3,md1,md2,md3,iproc,nproc,ncache,rho,keypdist,zf,lock) iam=0 npr=1 !$ iam=omp_get_thread_num() !$ npr=omp_get_num_threads() ! pagesize + L2 cacheline size + L1 cacheline size nbuf=1024+8+4 lzt=n2/2 if (mod(n2/2,2).eq.0) lzt=lzt+1 if (mod(n2/2,4).eq.0) lzt=lzt+1 !$omp critical allocate(trig1(2,2048),after1(7),now1(7),before1(7), & trig2(2,2048),after2(7),now2(7),before2(7), & trig3(2,2048),after3(7),now3(7),before3(7), & zw(2,ncache/4,2),buf1(nbuf),zt(2,lzt,n1), & buf2(nbuf),zmpi2(2,n1,nd2/nproc,nd3),buf3(nbuf)) if (nproc.gt.1) allocate(zmpi1(2,n1,nd2/nproc,nd3/nproc,nproc)) if (npr.gt.1) allocate(rhopart(nd1,nd2,nd3/nproc)) !$omp end critical if (npr.gt.1) call zero(nd1*nd2*nd3/nproc,rhopart) call zero(nd1*nd2*nd3/nproc,rho) call ctrig(n3,trig3,after3,before3,now3,1,ic3) call ctrig(n1,trig1,after1,before1,now1,1,ic1) call ctrig(n2,trig2,after2,before2,now2,1,ic2) !$omp do do 12345,idat=1,ndat ! transform along z axis ! input: I1,I3,J2,(Jp2) lot=ncache/(4*n3) if (lot.lt.1) then write(6,*) & 'encache has to be enlarged to be able to hold at' // & 'least one 1-d FFT of this size even though this will' // & 'reduce the performance for shorter transform lengths' stop endif do 3333,j2=1,(md2/nproc) ! if (iproc*(md2/nproc)+j2.le.n2/2) then do 3000,i1=1,(n1/2),lot ma=i1 mb=min(i1+(lot-1),(n1/2)) nfft=mb-ma+1 ! input: I1,I3,J2,(Jp2) call fill_corn(md1,md3,lot,nfft,n3,zf(1,i1,1,j2,idat),zw(1,1,1)) inzee=1 do i=1,ic3 call fftstp(lot,nfft,n3,lot,n3,zw(1,1,inzee),zw(1,1,3-inzee), & trig3,after3(i),now3(i),before3(i),1) inzee=3-inzee enddo ! input: I1,i3,J2,(Jp2) call scramble(i1,j2,lot,nfft,n1/2,n3,md2,nproc,nd3,zw(1,1,inzee),zmpi2) ! output: I1,J2,i3,(Jp2) 3000 continue ! endif 3333 continue ! Interprocessor data transposition ! input: I1,J2,j3,jp3,(Jp2) if (nproc.gt.1) then 11 continue !$omp flush(lock) if (mod(lock,npr).ne.iam) goto 11 call MPI_ALLTOALL(zmpi2,n1*(md2/nproc)*(nd3/nproc), & MPI_double_precision, & zmpi1,n1*(md2/nproc)*(nd3/nproc), & MPI_double_precision,MPI_COMM_WORLD,ierr) lock=lock+1 !$omp flush(lock) ! output: I1,J2,j3,Jp2,(jp3) endif do 1212,j3=1,nd3/nproc if (iproc*(nd3/nproc)+j3.le.n3) then Jp2st=1 J2st=1 ! transform along x axis lot=ncache/(4*n1) if (lot.lt.1) then write(6,*) & 'encache has to be enlarged to be able to hold at' // & 'least one 1-d FFT of this size even though this will' // & 'reduce the performance for shorter transform lengths' stop endif do 1000,j=1,n2/2,lot ma=j mb=min(j+(lot-1),n2/2) nfft=mb-ma+1 ! input: I1,J2,j3,Jp2,(jp3) if (nproc.eq.1) then call mpiswitch_corn(j3,nfft,Jp2st,J2st,lot,n1,n2,md2,nd3,nproc,keypdist,zmpi2,zw(1,1,1)) else call mpiswitch_corn(j3,nfft,Jp2st,J2st,lot,n1,n2,md2,nd3,nproc,keypdist,zmpi1,zw(1,1,1)) endif ! output: J2,Jp2,I1,j3,(jp3) ! input: I2,I1,j3,(jp3) inzee=1 do i=1,ic1-1 call fftstp(lot,nfft,n1,lot,n1,zw(1,1,inzee),zw(1,1,3-inzee), & trig1,after1(i),now1(i),before1(i),1) inzee=3-inzee enddo i=ic1 call fftstp(lot,nfft,n1,lzt,n1,zw(1,1,inzee),zt(1,j,1), & trig1,after1(i),now1(i),before1(i),1) ! output: I2,i1,j3,(jp3) 1000 continue ! transform along y axis lot=ncache/(4*n2) if (lot.lt.1) then write(6,*) & 'encache has to be enlarged to be able to hold at' // & 'least one 1-d FFT of this size even though this will' // & 'reduce the performance for shorter transform lengths' stop endif do 2000,j=1,n1,lot ma=j mb=min(j+(lot-1),n1) nfft=mb-ma+1 ! input: I2,i1,j3,(jp3) call switch_corn(nfft,n2,lot,n1,lzt,zt(1,1,j),zw(1,1,1)) ! output: i1,I2,j3,(jp3) inzee=1 do i=1,ic2 call fftstp(lot,nfft,n2,lot,n2,zw(1,1,inzee),zw(1,1,3-inzee), & trig2,after2(i),now2(i),before2(i),1) inzee=3-inzee enddo ! Densities per thread if (npr.eq.1) then call addrho(nd1,nd2,n2,lot,nfft,zw(1,1,inzee),rho(j,1,j3)) else call addrho(nd1,nd2,n2,lot,nfft,zw(1,1,inzee),rhopart(j,1,j3)) endif 2000 continue ! output: i1,i2,j3,(jp3) endif 1212 continue 12345 continue !$omp enddo !$omp critical ! Sum total density from the partial densities per thread if (npr.gt.1) then do 644, i3=1,nd3/nproc j3=iproc*(nd3/nproc)+i3 if (j3.le.n3) then do 643, i2=1,n2 do 643, i1=1,n1 rho(i1,i2,i3)=rho(i1,i2,i3)+rhopart(i1,i2,i3) 643 continue endif 644 continue endif !$omp end critical deallocate(trig1,after1,now1,before1, & trig2,after2,now2,before2, & trig3,after3,now3,before3, & buf1,zmpi2,buf2,zw,buf3,zt) if (nproc.gt.1) deallocate(zmpi1) if (npr.gt.1) deallocate(rhopart) !$omp end parallel return end subroutine addrho(nd1,nd2,n2,lot,nfft,zw,rhopart) implicit real*8 (a-h,o-z) dimension zw(2,lot,n2),rhopart(nd1,nd2) ! n2 should be multiple of 2 if (2*(n2/2).ne.n2) stop 'addrho' do 10, i2=1,n2-1,2 do 10, j=1,nfft rhopart(j,i2+0)=rhopart(j,i2+0)+zw(1,j,i2+0)**2+zw(2,j,i2+0)**2 rhopart(j,i2+1)=rhopart(j,i2+1)+zw(1,j,i2+1)**2+zw(2,j,i2+1)**2 10 continue return end subroutine switch_corn(nfft,n2,lot,n1,lzt,zt,zw) implicit real*8 (a-h,o-z) dimension zw(2,lot,n2),zt(2,lzt,n1) ! WARNING: Assuming that high frequencies are in the corners ! and that n2 is multiple of 4 ! Low frequencies do 100,j=1,nfft do 100,i=n2/4+1,3*n2/4 zw(1,j,i)=zt(1,i-n2/4,j) zw(2,j,i)=zt(2,i-n2/4,j) 100 continue ! High frequencies do 90,i=1,n2/4 do 90,j=1,nfft zw(1,j,i)=0.d0 zw(2,j,i)=0.d0 90 continue do 110,i=3*n2/4+1,n2 do 110,j=1,nfft zw(1,j,i)=0.d0 zw(2,j,i)=0.d0 110 continue return end subroutine mpiswitch_corn(j3,nfft,Jp2stb,J2stb,lot,n1,n2,md2,nd3,nproc,keypdist,zmpi1,zw) implicit real*8 (a-h,o-z) dimension zmpi1(2,n1/2,md2/nproc,nd3/nproc,nproc),zw(2,lot,n1),keypdist(2,n2/2) ! WARNING: Assuming that high frequencies are in the corners ! and that n1 is multiple of 4 mfft=0 do 300,Jp2=Jp2stb,nproc do 200,J2=J2stb,md2/nproc mfft=mfft+1 if (mfft.gt.nfft) then Jp2stb=Jp2 J2stb=J2 return endif do 90,I1=1,n1/4 zw(1,mfft,I1)=0.d0 zw(2,mfft,I1)=0.d0 90 continue i2=(Jp2-1)*(md2/nproc)+J2 kp2=keypdist(1,i2) k2=keypdist(2,i2) ! write(6,*) 'mpiswitch', jp2,j2,i2,kp2,k2 ! if (jp2.ne.kp2 .or. j2.ne.k2) stop 'mpiswitch' do 100,I1=n1/4+1,3*n1/4 zw(1,mfft,I1)=zmpi1(1,I1-n1/4,k2,j3,kp2) zw(2,mfft,I1)=zmpi1(2,I1-n1/4,k2,j3,kp2) 100 continue do 110,I1=3*n1/4+1,n1 zw(1,mfft,I1)=0.d0 zw(2,mfft,I1)=0.d0 110 continue 200 continue J2stb=1 300 continue end subroutine fill_corn(md1,md3,lot,nfft,n3,zf,zw) implicit real*8 (a-h,o-z) dimension zw(2,lot,n3),zf(2,md1,md3) ! WARNING: Assuming that high frequencies are in the corners ! and that n3 is multiple of 4 do 90,i3=1,n3/4 do i1=1,nfft zw(1,i1,i3)=0.d0 zw(2,i1,i3)=0.d0 enddo 90 continue do 100,i3=n3/4+1,3*n3/4 do i1=1,nfft zw(1,i1,i3)=zf(1,i1,i3-n3/4) zw(2,i1,i3)=zf(2,i1,i3-n3/4) enddo 100 continue do 110,i3=3*n3/4+1,n3 do i1=1,nfft zw(1,i1,i3)=0.d0 zw(2,i1,i3)=0.d0 enddo 110 continue return end subroutine scramble(i1,j2,lot,nfft,n1,n3,md2,nproc,nd3,zw,zmpi2) implicit real*8 (a-h,o-z) dimension zw(2,lot,n3),zmpi2(2,n1,md2/nproc,nd3) do 100,i3=1,n3 do 100,i=0,nfft-1 zmpi2(1,i1+i,j2,i3)=zw(1,i+1,i3) zmpi2(2,i1+i,j2,i3)=zw(2,i+1,i3) 100 continue return end subroutine unswitch_corn(nfft,n2,lot,n1,lzt,zw,zt) implicit real*8 (a-h,o-z) dimension zw(2,lot,n2),zt(2,lzt,n1) ! WARNING: Assuming that high frequencies are in the corners ! and that n2 is multiple of 4 ! Low frequencies do 100,j=1,nfft do 100,i=n2/4+1,3*n2/4 zt(1,i-n2/4,j)=zw(1,j,i) zt(2,i-n2/4,j)=zw(2,j,i) 100 continue return end subroutine unmpiswitch_corn(j3,nfft,Jp2stf,J2stf,lot,n1,n2,md2,nd3,nproc,keypdist,zw,zmpi1) implicit real*8 (a-h,o-z) dimension zmpi1(2,n1/2,md2/nproc,nd3/nproc,nproc),zw(2,lot,n1),keypdist(2,n2/2) ! WARNING: Assuming that high frequencies are in the corners ! and that n1 is multiple of 4 mfft=0 do 300,Jp2=Jp2stf,nproc do 200,J2=J2stf,md2/nproc mfft=mfft+1 if (mfft.gt.nfft) then Jp2stf=Jp2 J2stf=J2 return endif i2=(Jp2-1)*(md2/nproc)+J2 kp2=keypdist(1,i2) k2=keypdist(2,i2) ! write(6,*) 'unmpiswitch', jp2,j2,i2,kp2,k2 ! if (jp2.ne.kp2 .or. j2.ne.k2) stop 'unmpiswitch' do 100,I1=n1/4+1,3*n1/4 zmpi1(1,I1-n1/4,k2,j3,kp2)=zw(1,mfft,I1) zmpi1(2,I1-n1/4,k2,j3,kp2)=zw(2,mfft,I1) 100 continue 200 continue J2stf=1 300 continue end subroutine unscramble(i1,j2,lot,nfft,n1,n3,md2,nproc,nd3,zmpi2,zw) implicit real*8 (a-h,o-z) dimension zw(2,lot,n3),zmpi2(2,n1,md2/nproc,nd3) do 100,i3=1,n3 do 100,i=0,nfft-1 zw(1,i+1,i3)=zmpi2(1,i1+i,j2,i3) zw(2,i+1,i3)=zmpi2(2,i1+i,j2,i3) 100 continue return end subroutine unfill_corn(md1,md3,lot,nfft,n3,zw,zf) implicit real*8 (a-h,o-z) dimension zw(2,lot,n3),zf(2,md1,md3) ! WARNING: Assuming that high frequencies are in the corners ! and that n3 is multiple of 4 do 100,i3=n3/4+1,3*n3/4 do i1=1,nfft zf(1,i1,i3-n3/4)=zw(1,i1,i3) zf(2,i1,i3-n3/4)=zw(2,i1,i3) enddo 100 continue return end subroutine ctrig(n,trig,after,before,now,isign,ic) ! RESTRICTIONS on USAGE ! Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994 ! Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999 ! Copyright (C) 2002 Stefan Goedecker, CEA Grenoble ! This file is distributed under the terms of the ! GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt . ! Different factorizations can affect the performance ! Factoring 64 as 4*4*4 might for example be faster on some machines than 8*8. implicit real*8 (a-h,o-z) integer after,before dimension now(7),after(7),before(7),trig(2,2048) INTEGER, DIMENSION(7,104) :: idata ! The factor 6 is only allowed in the first place! data ((idata(i,j),i=1,7),j=1,104) / & 3, 3, 1, 1, 1, 1, 1, 4, 4, 1, 1, 1, 1, 1, & 5, 5, 1, 1, 1, 1, 1, 6, 6, 1, 1, 1, 1, 1, & 8, 8, 1, 1, 1, 1, 1, 9, 3, 3, 1, 1, 1, 1, & 10, 5, 2, 1, 1, 1, 1, & 12, 4, 3, 1, 1, 1, 1, 15, 5, 3, 1, 1, 1, 1, & 16, 4, 4, 1, 1, 1, 1, 18, 6, 3, 1, 1, 1, 1, & 20, 5, 4, 1, 1, 1, 1, 24, 8, 3, 1, 1, 1, 1, & 25, 5, 5, 1, 1, 1, 1, 27, 3, 3, 3, 1, 1, 1, & 30, 6, 5, 1, 1, 1, 1, 32, 8, 4, 1, 1, 1, 1, & 36, 4, 3, 3, 1, 1, 1, 40, 8, 5, 1, 1, 1, 1, & 45, 5, 3, 3, 1, 1, 1, 48, 4, 4, 3, 1, 1, 1, & 50, 5, 5, 2, 1, 1, 1, & 54, 6, 3, 3, 1, 1, 1, 60, 5, 4, 3, 1, 1, 1, & 64, 8, 8, 1, 1, 1, 1, 72, 8, 3, 3, 1, 1, 1, & 75, 5, 5, 3, 1, 1, 1, 80, 5, 4, 4, 1, 1, 1, & 81, 3, 3, 3, 3, 1, 1, 90, 6, 5, 3, 1, 1, 1, & 96, 8, 4, 3, 1, 1, 1, 100, 5, 5, 4, 1, 1, 1, & 108, 4, 3, 3, 3, 1, 1, 120, 8, 5, 3, 1, 1, 1, & 125, 5, 5, 5, 1, 1, 1, 128, 8, 4, 4, 1, 1, 1, & 135, 5, 3, 3, 3, 1, 1, 144, 6, 8, 3, 1, 1, 1, & 150, 6, 5, 5, 1, 1, 1, 160, 8, 5, 4, 1, 1, 1, & 162, 6, 3, 3, 3, 1, 1, 180, 5, 4, 3, 3, 1, 1, & 192, 6, 8, 4, 1, 1, 1, 200, 8, 5, 5, 1, 1, 1, & 216, 8, 3, 3, 3, 1, 1, 225, 5, 5, 3, 3, 1, 1, & 240, 6, 8, 5, 1, 1, 1, 243, 3, 3, 3, 3, 3, 1, & 250, 5, 5, 5, 2, 1, 1, & 256, 8, 8, 4, 1, 1, 1, 270, 6, 5, 3, 3, 1, 1, & 288, 8, 4, 3, 3, 1, 1, 300, 5, 5, 4, 3, 1, 1, & 320, 5, 4, 4, 4, 1, 1, 324, 4, 3, 3, 3, 3, 1, & 360, 8, 5, 3, 3, 1, 1, 375, 5, 5, 5, 3, 1, 1, & 384, 8, 4, 4, 3, 1, 1, 400, 5, 5, 4, 4, 1, 1, & 405, 5, 3, 3, 3, 3, 1, 432, 4, 4, 3, 3, 3, 1, & 450, 6, 5, 5, 3, 1, 1, 480, 8, 5, 4, 3, 1, 1, & 486, 6, 3, 3, 3, 3, 1, 500, 5, 5, 5, 4, 1, 1, & 512, 8, 8, 8, 1, 1, 1, 540, 5, 4, 3, 3, 3, 1, & 576, 4, 4, 4, 3, 3, 1, 600, 8, 5, 5, 3, 1, 1, & 625, 5, 5, 5, 5, 1, 1, 640, 8, 5, 4, 4, 1, 1, & 648, 8, 3, 3, 3, 3, 1, 675, 5, 5, 3, 3, 3, 1, & 720, 5, 4, 4, 3, 3, 1, 729, 3, 3, 3, 3, 3, 3, & 750, 6, 5, 5, 5, 1, 1, 768, 4, 4, 4, 4, 3, 1, & 800, 8, 5, 5, 4, 1, 1, 810, 6, 5, 3, 3, 3, 1, & 864, 8, 4, 3, 3, 3, 1, 900, 5, 5, 4, 3, 3, 1, & 960, 5, 4, 4, 4, 3, 1, 972, 4, 3, 3, 3, 3, 3, & 1000, 8, 5, 5, 5, 1, 1, 1024, 4, 4, 4, 4, 4, 1, & 1080, 8, 5, 3, 3, 3, 1, & 1152, 8, 4, 4, 3, 3, 1, 1200, 5, 5, 4, 4, 3, 1, & 1250, 5, 5, 5, 5, 2, 1, 1280, 5, 4, 4, 4, 4, 1, & 1296, 4, 4, 3, 3, 3, 3, 1350, 6, 5, 5, 3, 3, 1, & 1440, 6, 5, 4, 4, 3, 1, 1458, 6, 3, 3, 3, 3, 3, & 1500, 5, 5, 5, 4, 3, 1, 1536, 8, 4, 4, 4, 3, 1, & 1600, 8, 8, 5, 5, 1, 1, 1620, 5, 4, 3, 3, 3, 3, & 1728, 8, 8, 3, 3, 3, 1, 1800, 8, 5, 5, 3, 3, 1, & 1920, 8, 5, 4, 4, 3, 1, 1944, 8, 3, 3, 3, 3, 3, & 2000, 5, 5, 5, 4, 4, 1, 2048, 8, 4, 4, 4, 4, 1 / do 111,i=1,104 if (n.eq.idata(1,i)) then ic=0 do 11,j=1,6 itt=idata(1+j,i) if (itt.gt.1) then ic=ic+1 now(j)=idata(1+j,i) else goto 1000 endif 11 continue goto 1000 endif 111 continue print*,'VALUE OF',n,'NOT ALLOWED FOR FFT, ALLOWED VALUES ARE:' 37 format(15(i5)) write(6,37) (idata(1,i),i=1,104) stop 1000 continue after(1)=1 before(ic)=1 do 22,i=2,ic after(i)=after(i-1)*now(i-1) 22 before(ic-i+1)=before(ic-i+2)*now(ic-i+2) 12 format(6(i3)) ! write(6,12) (after(i),i=1,ic) ! write(6,12) (now(i),i=1,ic) ! write(6,12) (before(i),i=1,ic) twopi=6.283185307179586d0 angle=isign*twopi/n if (mod(n,2).eq.0) then nh=n/2 trig(1,1)=1.d0 trig(2,1)=0.d0 trig(1,nh+1)=-1.d0 trig(2,nh+1)=0.d0 do 40,i=1,nh-1 trigc=cos(i*angle) trigs=sin(i*angle) trig(1,i+1)=trigc trig(2,i+1)=trigs trig(1,n-i+1)=trigc trig(2,n-i+1)=-trigs 40 continue else nh=(n-1)/2 trig(1,1)=1.d0 trig(2,1)=0.d0 do 20,i=1,nh trigc=cos(i*angle) trigs=sin(i*angle) trig(1,i+1)=trigc trig(2,i+1)=trigs trig(1,n-i+1)=trigc trig(2,n-i+1)=-trigs 20 continue endif return end subroutine fftstp(mm,nfft,m,nn,n,zin,zout,trig,after,now,before,isign) ! RESTRICTIONS on USAGE ! Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994 ! Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999 ! Copyright (C) 2002 Stefan Goedecker, CEA Grenoble ! This file is distributed under the terms of the ! GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt . implicit real*8 (a-h,o-z) integer after,before,atn,atb dimension trig(2,2048),zin(2,mm,m),zout(2,nn,n) atn=after*now atb=after*before ! sqrt(.5d0) rt2i=0.7071067811865475d0 if (now.eq.2) then ia=1 nin1=ia-after nout1=ia-atn do 2001,ib=1,before nin1=nin1+after nin2=nin1+atb nout1=nout1+atn nout2=nout1+after do 2001,j=1,nfft r1=zin(1,j,nin1) s1=zin(2,j,nin1) r2=zin(1,j,nin2) s2=zin(2,j,nin2) zout(1,j,nout1)= r2 + r1 zout(2,j,nout1)= s2 + s1 zout(1,j,nout2)= r1 - r2 zout(2,j,nout2)= s1 - s2 2001 continue do 2000,ia=2,after ias=ia-1 if (2*ias.eq.after) then if (isign.eq.1) then nin1=ia-after nout1=ia-atn do 2010,ib=1,before nin1=nin1+after nin2=nin1+atb nout1=nout1+atn nout2=nout1+after do 2010,j=1,nfft r1=zin(1,j,nin1) s1=zin(2,j,nin1) r2=zin(2,j,nin2) s2=zin(1,j,nin2) zout(1,j,nout1)= r1 - r2 zout(2,j,nout1)= s2 + s1 zout(1,j,nout2)= r2 + r1 zout(2,j,nout2)= s1 - s2 2010 continue else nin1=ia-after nout1=ia-atn do 2020,ib=1,before nin1=nin1+after nin2=nin1+atb nout1=nout1+atn nout2=nout1+after do 2020,j=1,nfft r1=zin(1,j,nin1) s1=zin(2,j,nin1) r2=zin(2,j,nin2) s2=zin(1,j,nin2) zout(1,j,nout1)= r2 + r1 zout(2,j,nout1)= s1 - s2 zout(1,j,nout2)= r1 - r2 zout(2,j,nout2)= s2 + s1 2020 continue endif else if (4*ias.eq.after) then if (isign.eq.1) then nin1=ia-after nout1=ia-atn do 2030,ib=1,before nin1=nin1+after nin2=nin1+atb nout1=nout1+atn nout2=nout1+after do 2030,j=1,nfft r1=zin(1,j,nin1) s1=zin(2,j,nin1) r=zin(1,j,nin2) s=zin(2,j,nin2) r2=(r - s)*rt2i s2=(r + s)*rt2i zout(1,j,nout1)= r2 + r1 zout(2,j,nout1)= s2 + s1 zout(1,j,nout2)= r1 - r2 zout(2,j,nout2)= s1 - s2 2030 continue else nin1=ia-after nout1=ia-atn do 2040,ib=1,before nin1=nin1+after nin2=nin1+atb nout1=nout1+atn nout2=nout1+after do 2040,j=1,nfft r1=zin(1,j,nin1) s1=zin(2,j,nin1) r=zin(1,j,nin2) s=zin(2,j,nin2) r2=(r + s)*rt2i s2=(s - r)*rt2i zout(1,j,nout1)= r2 + r1 zout(2,j,nout1)= s2 + s1 zout(1,j,nout2)= r1 - r2 zout(2,j,nout2)= s1 - s2 2040 continue endif else if (4*ias.eq.3*after) then if (isign.eq.1) then nin1=ia-after nout1=ia-atn do 2050,ib=1,before nin1=nin1+after nin2=nin1+atb nout1=nout1+atn nout2=nout1+after do 2050,j=1,nfft r1=zin(1,j,nin1) s1=zin(2,j,nin1) r=zin(1,j,nin2) s=zin(2,j,nin2) r2=(r + s)*rt2i s2=(r - s)*rt2i zout(1,j,nout1)= r1 - r2 zout(2,j,nout1)= s2 + s1 zout(1,j,nout2)= r2 + r1 zout(2,j,nout2)= s1 - s2 2050 continue else nin1=ia-after nout1=ia-atn do 2060,ib=1,before nin1=nin1+after nin2=nin1+atb nout1=nout1+atn nout2=nout1+after do 2060,j=1,nfft r1=zin(1,j,nin1) s1=zin(2,j,nin1) r=zin(1,j,nin2) s=zin(2,j,nin2) r2=(s - r)*rt2i s2=(r + s)*rt2i zout(1,j,nout1)= r2 + r1 zout(2,j,nout1)= s1 - s2 zout(1,j,nout2)= r1 - r2 zout(2,j,nout2)= s2 + s1 2060 continue endif else itrig=ias*before+1 cr2=trig(1,itrig) ci2=trig(2,itrig) nin1=ia-after nout1=ia-atn do 2090,ib=1,before nin1=nin1+after nin2=nin1+atb nout1=nout1+atn nout2=nout1+after do 2090,j=1,nfft r1=zin(1,j,nin1) s1=zin(2,j,nin1) r=zin(1,j,nin2) s=zin(2,j,nin2) r2=r*cr2 - s*ci2 s2=r*ci2 + s*cr2 zout(1,j,nout1)= r2 + r1 zout(2,j,nout1)= s2 + s1 zout(1,j,nout2)= r1 - r2 zout(2,j,nout2)= s1 - s2 2090 continue endif 2000 continue else if (now.eq.4) then if (isign.eq.1) then ia=1 nin1=ia-after nout1=ia-atn do 4001,ib=1,before nin1=nin1+after nin2=nin1+atb nin3=nin2+atb nin4=nin3+atb nout1=nout1+atn nout2=nout1+after nout3=nout2+after nout4=nout3+after do 4001,j=1,nfft r1=zin(1,j,nin1) s1=zin(2,j,nin1) r2=zin(1,j,nin2) s2=zin(2,j,nin2) r3=zin(1,j,nin3) s3=zin(2,j,nin3) r4=zin(1,j,nin4) s4=zin(2,j,nin4) r=r1 + r3 s=r2 + r4 zout(1,j,nout1) = r + s zout(1,j,nout3) = r - s r=r1 - r3 s=s2 - s4 zout(1,j,nout2) = r - s zout(1,j,nout4) = r + s r=s1 + s3 s=s2 + s4 zout(2,j,nout1) = r + s zout(2,j,nout3) = r - s r=s1 - s3 s=r2 - r4 zout(2,j,nout2) = r + s zout(2,j,nout4) = r - s 4001 continue do 4000,ia=2,after ias=ia-1 if (2*ias.eq.after) then nin1=ia-after nout1=ia-atn do 4010,ib=1,before nin1=nin1+after nin2=nin1+atb nin3=nin2+atb nin4=nin3+atb nout1=nout1+atn nout2=nout1+after nout3=nout2+after nout4=nout3+after do 4010,j=1,nfft r1=zin(1,j,nin1) s1=zin(2,j,nin1) r=zin(1,j,nin2) s=zin(2,j,nin2) r2=(r-s)*rt2i s2=(r+s)*rt2i r3=zin(2,j,nin3) s3=zin(1,j,nin3) r=zin(1,j,nin4) s=zin(2,j,nin4) r4=(r + s)*rt2i s4=(r - s)*rt2i r=r1 - r3 s=r2 - r4 zout(1,j,nout1) = r + s zout(1,j,nout3) = r - s r=r1 + r3 s=s2 - s4 zout(1,j,nout2) = r - s zout(1,j,nout4) = r + s r=s1 + s3 s=s2 + s4 zout(2,j,nout1) = r + s zout(2,j,nout3) = r - s r=s1 - s3 s=r2 + r4 zout(2,j,nout2) = r + s zout(2,j,nout4) = r - s 4010 continue else itt=ias*before itrig=itt+1 cr2=trig(1,itrig) ci2=trig(2,itrig) itrig=itrig+itt cr3=trig(1,itrig) ci3=trig(2,itrig) itrig=itrig+itt cr4=trig(1,itrig) ci4=trig(2,itrig) nin1=ia-after nout1=ia-atn do 4020,ib=1,before nin1=nin1+after nin2=nin1+atb nin3=nin2+atb nin4=nin3+atb nout1=nout1+atn nout2=nout1+after nout3=nout2+after nout4=nout3+after do 4020,j=1,nfft r1=zin(1,j,nin1) s1=zin(2,j,nin1) r=zin(1,j,nin2) s=zin(2,j,nin2) r2=r*cr2 - s*ci2 s2=r*ci2 + s*cr2 r=zin(1,j,nin3) s=zin(2,j,nin3) r3=r*cr3 - s*ci3 s3=r*ci3 + s*cr3 r=zin(1,j,nin4) s=zin(2,j,nin4) r4=r*cr4 - s*ci4 s4=r*ci4 + s*cr4 r=r1 + r3 s=r2 + r4 zout(1,j,nout1) = r + s zout(1,j,nout3) = r - s r=r1 - r3 s=s2 - s4 zout(1,j,nout2) = r - s zout(1,j,nout4) = r + s r=s1 + s3 s=s2 + s4 zout(2,j,nout1) = r + s zout(2,j,nout3) = r - s r=s1 - s3 s=r2 - r4 zout(2,j,nout2) = r + s zout(2,j,nout4) = r - s 4020 continue endif 4000 continue else ia=1 nin1=ia-after nout1=ia-atn do 4101,ib=1,before nin1=nin1+after nin2=nin1+atb nin3=nin2+atb nin4=nin3+atb nout1=nout1+atn nout2=nout1+after nout3=nout2+after nout4=nout3+after do 4101,j=1,nfft r1=zin(1,j,nin1) s1=zin(2,j,nin1) r2=zin(1,j,nin2) s2=zin(2,j,nin2) r3=zin(1,j,nin3) s3=zin(2,j,nin3) r4=zin(1,j,nin4) s4=zin(2,j,nin4) r=r1 + r3 s=r2 + r4 zout(1,j,nout1) = r + s zout(1,j,nout3) = r - s r=r1 - r3 s=s2 - s4 zout(1,j,nout2) = r + s zout(1,j,nout4) = r - s r=s1 + s3 s=s2 + s4 zout(2,j,nout1) = r + s zout(2,j,nout3) = r - s r=s1 - s3 s=r2 - r4 zout(2,j,nout2) = r - s zout(2,j,nout4) = r + s 4101 continue do 4100,ia=2,after ias=ia-1 if (2*ias.eq.after) then nin1=ia-after nout1=ia-atn do 4110,ib=1,before nin1=nin1+after nin2=nin1+atb nin3=nin2+atb nin4=nin3+atb nout1=nout1+atn nout2=nout1+after nout3=nout2+after nout4=nout3+after do 4110,j=1,nfft r1=zin(1,j,nin1) s1=zin(2,j,nin1) r=zin(1,j,nin2) s=zin(2,j,nin2) r2=(r + s)*rt2i s2=(s - r)*rt2i r3=zin(2,j,nin3) s3=zin(1,j,nin3) r=zin(1,j,nin4) s=zin(2,j,nin4) r4=(s - r)*rt2i s4=(r + s)*rt2i r=r1 + r3 s=r2 + r4 zout(1,j,nout1) = r + s zout(1,j,nout3) = r - s r=r1 - r3 s=s2 + s4 zout(1,j,nout2) = r + s zout(1,j,nout4) = r - s r=s1 - s3 s=s2 - s4 zout(2,j,nout1) = r + s zout(2,j,nout3) = r - s r=s1 + s3 s=r2 - r4 zout(2,j,nout2) = r - s zout(2,j,nout4) = r + s 4110 continue else itt=ias*before itrig=itt+1 cr2=trig(1,itrig) ci2=trig(2,itrig) itrig=itrig+itt cr3=trig(1,itrig) ci3=trig(2,itrig) itrig=itrig+itt cr4=trig(1,itrig) ci4=trig(2,itrig) nin1=ia-after nout1=ia-atn do 4120,ib=1,before nin1=nin1+after nin2=nin1+atb nin3=nin2+atb nin4=nin3+atb nout1=nout1+atn nout2=nout1+after nout3=nout2+after nout4=nout3+after do 4120,j=1,nfft r1=zin(1,j,nin1) s1=zin(2,j,nin1) r=zin(1,j,nin2) s=zin(2,j,nin2) r2=r*cr2 - s*ci2 s2=r*ci2 + s*cr2 r=zin(1,j,nin3) s=zin(2,j,nin3) r3=r*cr3 - s*ci3 s3=r*ci3 + s*cr3 r=zin(1,j,nin4) s=zin(2,j,nin4) r4=r*cr4 - s*ci4 s4=r*ci4 + s*cr4 r=r1 + r3 s=r2 + r4 zout(1,j,nout1) = r + s zout(1,j,nout3) = r - s r=r1 - r3 s=s2 - s4 zout(1,j,nout2) = r + s zout(1,j,nout4) = r - s r=s1 + s3 s=s2 + s4 zout(2,j,nout1) = r + s zout(2,j,nout3) = r - s r=s1 - s3 s=r2 - r4 zout(2,j,nout2) = r - s zout(2,j,nout4) = r + s 4120 continue endif 4100 continue endif else if (now.eq.8) then if (isign.eq.-1) then ia=1 nin1=ia-after nout1=ia-atn do 8120,ib=1,before nin1=nin1+after nin2=nin1+atb nin3=nin2+atb nin4=nin3+atb nin5=nin4+atb nin6=nin5+atb nin7=nin6+atb nin8=nin7+atb nout1=nout1+atn nout2=nout1+after nout3=nout2+after nout4=nout3+after nout5=nout4+after nout6=nout5+after nout7=nout6+after nout8=nout7+after do 8120,j=1,nfft r1=zin(1,j,nin1) s1=zin(2,j,nin1) r2=zin(1,j,nin2) s2=zin(2,j,nin2) r3=zin(1,j,nin3) s3=zin(2,j,nin3) r4=zin(1,j,nin4) s4=zin(2,j,nin4) r5=zin(1,j,nin5) s5=zin(2,j,nin5) r6=zin(1,j,nin6) s6=zin(2,j,nin6) r7=zin(1,j,nin7) s7=zin(2,j,nin7) r8=zin(1,j,nin8) s8=zin(2,j,nin8) r=r1 + r5 s=r3 + r7 ap=r + s am=r - s r=r2 + r6 s=r4 + r8 bp=r + s bm=r - s r=s1 + s5 s=s3 + s7 cp=r + s cm=r - s r=s2 + s6 s=s4 + s8 dp=r + s dm=r - s zout(1,j,nout1) = ap + bp zout(2,j,nout1) = cp + dp zout(1,j,nout5) = ap - bp zout(2,j,nout5) = cp - dp zout(1,j,nout3) = am + dm zout(2,j,nout3) = cm - bm zout(1,j,nout7) = am - dm zout(2,j,nout7) = cm + bm r=r1 - r5 s=s3 - s7 ap=r + s am=r - s r=s1 - s5 s=r3 - r7 bp=r + s bm=r - s r=s4 - s8 s=r2 - r6 cp=r + s cm=r - s r=s2 - s6 s=r4 - r8 dp=r + s dm=r - s r = ( cp + dm)*rt2i s = ( dm - cp)*rt2i cp= ( cm + dp)*rt2i dp = ( cm - dp)*rt2i zout(1,j,nout2) = ap + r zout(2,j,nout2) = bm + s zout(1,j,nout6) = ap - r zout(2,j,nout6) = bm - s zout(1,j,nout4) = am + cp zout(2,j,nout4) = bp + dp zout(1,j,nout8) = am - cp zout(2,j,nout8) = bp - dp 8120 continue do 8000,ia=2,after ias=ia-1 itt=ias*before itrig=itt+1 cr2=trig(1,itrig) ci2=trig(2,itrig) itrig=itrig+itt cr3=trig(1,itrig) ci3=trig(2,itrig) itrig=itrig+itt cr4=trig(1,itrig) ci4=trig(2,itrig) itrig=itrig+itt cr5=trig(1,itrig) ci5=trig(2,itrig) itrig=itrig+itt cr6=trig(1,itrig) ci6=trig(2,itrig) itrig=itrig+itt cr7=trig(1,itrig) ci7=trig(2,itrig) itrig=itrig+itt cr8=trig(1,itrig) ci8=trig(2,itrig) nin1=ia-after nout1=ia-atn do 8020,ib=1,before nin1=nin1+after nin2=nin1+atb nin3=nin2+atb nin4=nin3+atb nin5=nin4+atb nin6=nin5+atb nin7=nin6+atb nin8=nin7+atb nout1=nout1+atn nout2=nout1+after nout3=nout2+after nout4=nout3+after nout5=nout4+after nout6=nout5+after nout7=nout6+after nout8=nout7+after do 8020,j=1,nfft r1=zin(1,j,nin1) s1=zin(2,j,nin1) r=zin(1,j,nin2) s=zin(2,j,nin2) r2=r*cr2 - s*ci2 s2=r*ci2 + s*cr2 r=zin(1,j,nin3) s=zin(2,j,nin3) r3=r*cr3 - s*ci3 s3=r*ci3 + s*cr3 r=zin(1,j,nin4) s=zin(2,j,nin4) r4=r*cr4 - s*ci4 s4=r*ci4 + s*cr4 r=zin(1,j,nin5) s=zin(2,j,nin5) r5=r*cr5 - s*ci5 s5=r*ci5 + s*cr5 r=zin(1,j,nin6) s=zin(2,j,nin6) r6=r*cr6 - s*ci6 s6=r*ci6 + s*cr6 r=zin(1,j,nin7) s=zin(2,j,nin7) r7=r*cr7 - s*ci7 s7=r*ci7 + s*cr7 r=zin(1,j,nin8) s=zin(2,j,nin8) r8=r*cr8 - s*ci8 s8=r*ci8 + s*cr8 r=r1 + r5 s=r3 + r7 ap=r + s am=r - s r=r2 + r6 s=r4 + r8 bp=r + s bm=r - s r=s1 + s5 s=s3 + s7 cp=r + s cm=r - s r=s2 + s6 s=s4 + s8 dp=r + s dm=r - s zout(1,j,nout1) = ap + bp zout(2,j,nout1) = cp + dp zout(1,j,nout5) = ap - bp zout(2,j,nout5) = cp - dp zout(1,j,nout3) = am + dm zout(2,j,nout3) = cm - bm zout(1,j,nout7) = am - dm zout(2,j,nout7) = cm + bm r=r1 - r5 s=s3 - s7 ap=r + s am=r - s r=s1 - s5 s=r3 - r7 bp=r + s bm=r - s r=s4 - s8 s=r2 - r6 cp=r + s cm=r - s r=s2 - s6 s=r4 - r8 dp=r + s dm=r - s r = ( cp + dm)*rt2i s = ( dm - cp)*rt2i cp= ( cm + dp)*rt2i dp = ( cm - dp)*rt2i zout(1,j,nout2) = ap + r zout(2,j,nout2) = bm + s zout(1,j,nout6) = ap - r zout(2,j,nout6) = bm - s zout(1,j,nout4) = am + cp zout(2,j,nout4) = bp + dp zout(1,j,nout8) = am - cp zout(2,j,nout8) = bp - dp 8020 continue 8000 continue else ia=1 nin1=ia-after nout1=ia-atn do 8121,ib=1,before nin1=nin1+after nin2=nin1+atb nin3=nin2+atb nin4=nin3+atb nin5=nin4+atb nin6=nin5+atb nin7=nin6+atb nin8=nin7+atb nout1=nout1+atn nout2=nout1+after nout3=nout2+after nout4=nout3+after nout5=nout4+after nout6=nout5+after nout7=nout6+after nout8=nout7+after do 8121,j=1,nfft r1=zin(1,j,nin1) s1=zin(2,j,nin1) r2=zin(1,j,nin2) s2=zin(2,j,nin2) r3=zin(1,j,nin3) s3=zin(2,j,nin3) r4=zin(1,j,nin4) s4=zin(2,j,nin4) r5=zin(1,j,nin5) s5=zin(2,j,nin5) r6=zin(1,j,nin6) s6=zin(2,j,nin6) r7=zin(1,j,nin7) s7=zin(2,j,nin7) r8=zin(1,j,nin8) s8=zin(2,j,nin8) r=r1 + r5 s=r3 + r7 ap=r + s am=r - s r=r2 + r6 s=r4 + r8 bp=r + s bm=r - s r=s1 + s5 s=s3 + s7 cp=r + s cm=r - s r=s2 + s6 s=s4 + s8 dp=r + s dm=r - s zout(1,j,nout1) = ap + bp zout(2,j,nout1) = cp + dp zout(1,j,nout5) = ap - bp zout(2,j,nout5) = cp - dp zout(1,j,nout3) = am - dm zout(2,j,nout3) = cm + bm zout(1,j,nout7) = am + dm zout(2,j,nout7) = cm - bm r= r1 - r5 s=-s3 + s7 ap=r + s am=r - s r=s1 - s5 s=r7 - r3 bp=r + s bm=r - s r=-s4 + s8 s= r2 - r6 cp=r + s cm=r - s r=-s2 + s6 s= r4 - r8 dp=r + s dm=r - s r = ( cp + dm)*rt2i s = ( cp - dm)*rt2i cp= ( cm + dp)*rt2i dp= ( dp - cm)*rt2i zout(1,j,nout2) = ap + r zout(2,j,nout2) = bm + s zout(1,j,nout6) = ap - r zout(2,j,nout6) = bm - s zout(1,j,nout4) = am + cp zout(2,j,nout4) = bp + dp zout(1,j,nout8) = am - cp zout(2,j,nout8) = bp - dp 8121 continue do 8001,ia=2,after ias=ia-1 itt=ias*before itrig=itt+1 cr2=trig(1,itrig) ci2=trig(2,itrig) itrig=itrig+itt cr3=trig(1,itrig) ci3=trig(2,itrig) itrig=itrig+itt cr4=trig(1,itrig) ci4=trig(2,itrig) itrig=itrig+itt cr5=trig(1,itrig) ci5=trig(2,itrig) itrig=itrig+itt cr6=trig(1,itrig) ci6=trig(2,itrig) itrig=itrig+itt cr7=trig(1,itrig) ci7=trig(2,itrig) itrig=itrig+itt cr8=trig(1,itrig) ci8=trig(2,itrig) nin1=ia-after nout1=ia-atn do 8021,ib=1,before nin1=nin1+after nin2=nin1+atb nin3=nin2+atb nin4=nin3+atb nin5=nin4+atb nin6=nin5+atb nin7=nin6+atb nin8=nin7+atb nout1=nout1+atn nout2=nout1+after nout3=nout2+after nout4=nout3+after nout5=nout4+after nout6=nout5+after nout7=nout6+after nout8=nout7+after do 8021,j=1,nfft r1=zin(1,j,nin1) s1=zin(2,j,nin1) r=zin(1,j,nin2) s=zin(2,j,nin2) r2=r*cr2 - s*ci2 s2=r*ci2 + s*cr2 r=zin(1,j,nin3) s=zin(2,j,nin3) r3=r*cr3 - s*ci3 s3=r*ci3 + s*cr3 r=zin(1,j,nin4) s=zin(2,j,nin4) r4=r*cr4 - s*ci4 s4=r*ci4 + s*cr4 r=zin(1,j,nin5) s=zin(2,j,nin5) r5=r*cr5 - s*ci5 s5=r*ci5 + s*cr5 r=zin(1,j,nin6) s=zin(2,j,nin6) r6=r*cr6 - s*ci6 s6=r*ci6 + s*cr6 r=zin(1,j,nin7) s=zin(2,j,nin7) r7=r*cr7 - s*ci7 s7=r*ci7 + s*cr7 r=zin(1,j,nin8) s=zin(2,j,nin8) r8=r*cr8 - s*ci8 s8=r*ci8 + s*cr8 r=r1 + r5 s=r3 + r7 ap=r + s am=r - s r=r2 + r6 s=r4 + r8 bp=r + s bm=r - s r=s1 + s5 s=s3 + s7 cp=r + s cm=r - s r=s2 + s6 s=s4 + s8 dp=r + s dm=r - s zout(1,j,nout1) = ap + bp zout(2,j,nout1) = cp + dp zout(1,j,nout5) = ap - bp zout(2,j,nout5) = cp - dp zout(1,j,nout3) = am - dm zout(2,j,nout3) = cm + bm zout(1,j,nout7) = am + dm zout(2,j,nout7) = cm - bm r= r1 - r5 s=-s3 + s7 ap=r + s am=r - s r=s1 - s5 s=r7 - r3 bp=r + s bm=r - s r=-s4 + s8 s= r2 - r6 cp=r + s cm=r - s r=-s2 + s6 s= r4 - r8 dp=r + s dm=r - s r = ( cp + dm)*rt2i s = ( cp - dm)*rt2i cp= ( cm + dp)*rt2i dp= ( dp - cm)*rt2i zout(1,j,nout2) = ap + r zout(2,j,nout2) = bm + s zout(1,j,nout6) = ap - r zout(2,j,nout6) = bm - s zout(1,j,nout4) = am + cp zout(2,j,nout4) = bp + dp zout(1,j,nout8) = am - cp zout(2,j,nout8) = bp - dp 8021 continue 8001 continue endif else if (now.eq.3) then ! .5d0*sqrt(3.d0) bb=isign*0.8660254037844387d0 ia=1 nin1=ia-after nout1=ia-atn do 3001,ib=1,before nin1=nin1+after nin2=nin1+atb nin3=nin2+atb nout1=nout1+atn nout2=nout1+after nout3=nout2+after do 3001,j=1,nfft r1=zin(1,j,nin1) s1=zin(2,j,nin1) r2=zin(1,j,nin2) s2=zin(2,j,nin2) r3=zin(1,j,nin3) s3=zin(2,j,nin3) r=r2 + r3 s=s2 + s3 zout(1,j,nout1) = r + r1 zout(2,j,nout1) = s + s1 r1=r1 - .5d0*r s1=s1 - .5d0*s r2=bb*(r2-r3) s2=bb*(s2-s3) zout(1,j,nout2) = r1 - s2 zout(2,j,nout2) = s1 + r2 zout(1,j,nout3) = r1 + s2 zout(2,j,nout3) = s1 - r2 3001 continue do 3000,ia=2,after ias=ia-1 if (4*ias.eq.3*after) then if (isign.eq.1) then nin1=ia-after nout1=ia-atn do 3010,ib=1,before nin1=nin1+after nin2=nin1+atb nin3=nin2+atb nout1=nout1+atn nout2=nout1+after nout3=nout2+after do 3010,j=1,nfft r1=zin(1,j,nin1) s1=zin(2,j,nin1) r2=zin(2,j,nin2) s2=zin(1,j,nin2) r3=zin(1,j,nin3) s3=zin(2,j,nin3) r=r3 + r2 s=s2 - s3 zout(1,j,nout1) = r1 - r zout(2,j,nout1) = s + s1 r1=r1 + .5d0*r s1=s1 - .5d0*s r2=bb*(r2-r3) s2=bb*(s2+s3) zout(1,j,nout2) = r1 - s2 zout(2,j,nout2) = s1 - r2 zout(1,j,nout3) = r1 + s2 zout(2,j,nout3) = s1 + r2 3010 continue else nin1=ia-after nout1=ia-atn do 3020,ib=1,before nin1=nin1+after nin2=nin1+atb nin3=nin2+atb nout1=nout1+atn nout2=nout1+after nout3=nout2+after do 3020,j=1,nfft r1=zin(1,j,nin1) s1=zin(2,j,nin1) r2=zin(2,j,nin2) s2=zin(1,j,nin2) r3=zin(1,j,nin3) s3=zin(2,j,nin3) r=r2 - r3 s=s2 + s3 zout(1,j,nout1) = r + r1 zout(2,j,nout1) = s1 - s r1=r1 - .5d0*r s1=s1 + .5d0*s r2=bb*(r2+r3) s2=bb*(s2-s3) zout(1,j,nout2) = r1 + s2 zout(2,j,nout2) = s1 + r2 zout(1,j,nout3) = r1 - s2 zout(2,j,nout3) = s1 - r2 3020 continue endif else if (8*ias.eq.3*after) then if (isign.eq.1) then nin1=ia-after nout1=ia-atn do 3030,ib=1,before nin1=nin1+after nin2=nin1+atb nin3=nin2+atb nout1=nout1+atn nout2=nout1+after nout3=nout2+after do 3030,j=1,nfft r1=zin(1,j,nin1) s1=zin(2,j,nin1) r=zin(1,j,nin2) s=zin(2,j,nin2) r2=(r - s)*rt2i s2=(r + s)*rt2i r3=zin(2,j,nin3) s3=zin(1,j,nin3) r=r2 - r3 s=s2 + s3 zout(1,j,nout1) = r + r1 zout(2,j,nout1) = s + s1 r1=r1 - .5d0*r s1=s1 - .5d0*s r2=bb*(r2+r3) s2=bb*(s2-s3) zout(1,j,nout2) = r1 - s2 zout(2,j,nout2) = s1 + r2 zout(1,j,nout3) = r1 + s2 zout(2,j,nout3) = s1 - r2 3030 continue else nin1=ia-after nout1=ia-atn do 3040,ib=1,before nin1=nin1+after nin2=nin1+atb nin3=nin2+atb nout1=nout1+atn nout2=nout1+after nout3=nout2+after do 3040,j=1,nfft r1=zin(1,j,nin1) s1=zin(2,j,nin1) r=zin(1,j,nin2) s=zin(2,j,nin2) r2=(r + s)*rt2i s2=(s - r)*rt2i r3=zin(2,j,nin3) s3=zin(1,j,nin3) r=r2 + r3 s=s2 - s3 zout(1,j,nout1) = r + r1 zout(2,j,nout1) = s + s1 r1=r1 - .5d0*r s1=s1 - .5d0*s r2=bb*(r2-r3) s2=bb*(s2+s3) zout(1,j,nout2) = r1 - s2 zout(2,j,nout2) = s1 + r2 zout(1,j,nout3) = r1 + s2 zout(2,j,nout3) = s1 - r2 3040 continue endif else itt=ias*before itrig=itt+1 cr2=trig(1,itrig) ci2=trig(2,itrig) itrig=itrig+itt cr3=trig(1,itrig) ci3=trig(2,itrig) nin1=ia-after nout1=ia-atn do 3090,ib=1,before nin1=nin1+after nin2=nin1+atb nin3=nin2+atb nout1=nout1+atn nout2=nout1+after nout3=nout2+after do 3090,j=1,nfft r1=zin(1,j,nin1) s1=zin(2,j,nin1) r=zin(1,j,nin2) s=zin(2,j,nin2) r2=r*cr2 - s*ci2 s2=r*ci2 + s*cr2 r=zin(1,j,nin3) s=zin(2,j,nin3) r3=r*cr3 - s*ci3 s3=r*ci3 + s*cr3 r=r2 + r3 s=s2 + s3 zout(1,j,nout1) = r + r1 zout(2,j,nout1) = s + s1 r1=r1 - .5d0*r s1=s1 - .5d0*s r2=bb*(r2-r3) s2=bb*(s2-s3) zout(1,j,nout2) = r1 - s2 zout(2,j,nout2) = s1 + r2 zout(1,j,nout3) = r1 + s2 zout(2,j,nout3) = s1 - r2 3090 continue endif 3000 continue else if (now.eq.5) then ! cos(2.d0*pi/5.d0) cos2=0.3090169943749474d0 ! cos(4.d0*pi/5.d0) cos4=-0.8090169943749474d0 ! sin(2.d0*pi/5.d0) sin2=isign*0.9510565162951536d0 ! sin(4.d0*pi/5.d0) sin4=isign*0.5877852522924731d0 ia=1 nin1=ia-after nout1=ia-atn do 5001,ib=1,before nin1=nin1+after nin2=nin1+atb nin3=nin2+atb nin4=nin3+atb nin5=nin4+atb nout1=nout1+atn nout2=nout1+after nout3=nout2+after nout4=nout3+after nout5=nout4+after do 5001,j=1,nfft r1=zin(1,j,nin1) s1=zin(2,j,nin1) r2=zin(1,j,nin2) s2=zin(2,j,nin2) r3=zin(1,j,nin3) s3=zin(2,j,nin3) r4=zin(1,j,nin4) s4=zin(2,j,nin4) r5=zin(1,j,nin5) s5=zin(2,j,nin5) r25 = r2 + r5 r34 = r3 + r4 s25 = s2 - s5 s34 = s3 - s4 zout(1,j,nout1) = r1 + r25 + r34 r = r1 + cos2*r25 + cos4*r34 s = sin2*s25 + sin4*s34 zout(1,j,nout2) = r - s zout(1,j,nout5) = r + s r = r1 + cos4*r25 + cos2*r34 s = sin4*s25 - sin2*s34 zout(1,j,nout3) = r - s zout(1,j,nout4) = r + s r25 = r2 - r5 r34 = r3 - r4 s25 = s2 + s5 s34 = s3 + s4 zout(2,j,nout1) = s1 + s25 + s34 r = s1 + cos2*s25 + cos4*s34 s = sin2*r25 + sin4*r34 zout(2,j,nout2) = r + s zout(2,j,nout5) = r - s r = s1 + cos4*s25 + cos2*s34 s = sin4*r25 - sin2*r34 zout(2,j,nout3) = r + s zout(2,j,nout4) = r - s 5001 continue do 5000,ia=2,after ias=ia-1 if (8*ias.eq.5*after) then if (isign.eq.1) then nin1=ia-after nout1=ia-atn do 5010,ib=1,before nin1=nin1+after nin2=nin1+atb nin3=nin2+atb nin4=nin3+atb nin5=nin4+atb nout1=nout1+atn nout2=nout1+after nout3=nout2+after nout4=nout3+after nout5=nout4+after do 5010,j=1,nfft r1=zin(1,j,nin1) s1=zin(2,j,nin1) r=zin(1,j,nin2) s=zin(2,j,nin2) r2=(r - s)*rt2i s2=(r + s)*rt2i r3=zin(2,j,nin3) s3=zin(1,j,nin3) r=zin(1,j,nin4) s=zin(2,j,nin4) r4=(r + s)*rt2i s4=(r - s)*rt2i r5=zin(1,j,nin5) s5=zin(2,j,nin5) r25 = r2 - r5 r34 = r3 + r4 s25 = s2 + s5 s34 = s3 - s4 zout(1,j,nout1) = r1 + r25 - r34 r = r1 + cos2*r25 - cos4*r34 s = sin2*s25 + sin4*s34 zout(1,j,nout2) = r - s zout(1,j,nout5) = r + s r = r1 + cos4*r25 - cos2*r34 s = sin4*s25 - sin2*s34 zout(1,j,nout3) = r - s zout(1,j,nout4) = r + s r25 = r2 + r5 r34 = r4 - r3 s25 = s2 - s5 s34 = s3 + s4 zout(2,j,nout1) = s1 + s25 + s34 r = s1 + cos2*s25 + cos4*s34 s = sin2*r25 + sin4*r34 zout(2,j,nout2) = r + s zout(2,j,nout5) = r - s r = s1 + cos4*s25 + cos2*s34 s = sin4*r25 - sin2*r34 zout(2,j,nout3) = r + s zout(2,j,nout4) = r - s 5010 continue else nin1=ia-after nout1=ia-atn do 5020,ib=1,before nin1=nin1+after nin2=nin1+atb nin3=nin2+atb nin4=nin3+atb nin5=nin4+atb nout1=nout1+atn nout2=nout1+after nout3=nout2+after nout4=nout3+after nout5=nout4+after do 5020,j=1,nfft r1=zin(1,j,nin1) s1=zin(2,j,nin1) r=zin(1,j,nin2) s=zin(2,j,nin2) r2=(r + s)*rt2i s2=(s - r)*rt2i r3=zin(2,j,nin3) s3=zin(1,j,nin3) r=zin(1,j,nin4) s=zin(2,j,nin4) r4=(s - r)*rt2i s4=(r + s)*rt2i r5=zin(1,j,nin5) s5=zin(2,j,nin5) r25 = r2 - r5 r34 = r3 + r4 s25 = s2 + s5 s34 = s4 - s3 zout(1,j,nout1) = r1 + r25 + r34 r = r1 + cos2*r25 + cos4*r34 s = sin2*s25 + sin4*s34 zout(1,j,nout2) = r - s zout(1,j,nout5) = r + s r = r1 + cos4*r25 + cos2*r34 s = sin4*s25 - sin2*s34 zout(1,j,nout3) = r - s zout(1,j,nout4) = r + s r25 = r2 + r5 r34 = r3 - r4 s25 = s2 - s5 s34 = s3 + s4 zout(2,j,nout1) = s1 + s25 - s34 r = s1 + cos2*s25 - cos4*s34 s = sin2*r25 + sin4*r34 zout(2,j,nout2) = r + s zout(2,j,nout5) = r - s r = s1 + cos4*s25 - cos2*s34 s = sin4*r25 - sin2*r34 zout(2,j,nout3) = r + s zout(2,j,nout4) = r - s 5020 continue endif else ias=ia-1 itt=ias*before itrig=itt+1 cr2=trig(1,itrig) ci2=trig(2,itrig) itrig=itrig+itt cr3=trig(1,itrig) ci3=trig(2,itrig) itrig=itrig+itt cr4=trig(1,itrig) ci4=trig(2,itrig) itrig=itrig+itt cr5=trig(1,itrig) ci5=trig(2,itrig) nin1=ia-after nout1=ia-atn do 5100,ib=1,before nin1=nin1+after nin2=nin1+atb nin3=nin2+atb nin4=nin3+atb nin5=nin4+atb nout1=nout1+atn nout2=nout1+after nout3=nout2+after nout4=nout3+after nout5=nout4+after do 5100,j=1,nfft r1=zin(1,j,nin1) s1=zin(2,j,nin1) r=zin(1,j,nin2) s=zin(2,j,nin2) r2=r*cr2 - s*ci2 s2=r*ci2 + s*cr2 r=zin(1,j,nin3) s=zin(2,j,nin3) r3=r*cr3 - s*ci3 s3=r*ci3 + s*cr3 r=zin(1,j,nin4) s=zin(2,j,nin4) r4=r*cr4 - s*ci4 s4=r*ci4 + s*cr4 r=zin(1,j,nin5) s=zin(2,j,nin5) r5=r*cr5 - s*ci5 s5=r*ci5 + s*cr5 r25 = r2 + r5 r34 = r3 + r4 s25 = s2 - s5 s34 = s3 - s4 zout(1,j,nout1) = r1 + r25 + r34 r = r1 + cos2*r25 + cos4*r34 s = sin2*s25 + sin4*s34 zout(1,j,nout2) = r - s zout(1,j,nout5) = r + s r = r1 + cos4*r25 + cos2*r34 s = sin4*s25 - sin2*s34 zout(1,j,nout3) = r - s zout(1,j,nout4) = r + s r25 = r2 - r5 r34 = r3 - r4 s25 = s2 + s5 s34 = s3 + s4 zout(2,j,nout1) = s1 + s25 + s34 r = s1 + cos2*s25 + cos4*s34 s = sin2*r25 + sin4*r34 zout(2,j,nout2) = r + s zout(2,j,nout5) = r - s r = s1 + cos4*s25 + cos2*s34 s = sin4*r25 - sin2*r34 zout(2,j,nout3) = r + s zout(2,j,nout4) = r - s 5100 continue endif 5000 continue else if (now.eq.6) then ! .5d0*sqrt(3.d0) bb=isign*0.8660254037844387d0 ia=1 nin1=ia-after nout1=ia-atn do 6120,ib=1,before nin1=nin1+after nin2=nin1+atb nin3=nin2+atb nin4=nin3+atb nin5=nin4+atb nin6=nin5+atb nout1=nout1+atn nout2=nout1+after nout3=nout2+after nout4=nout3+after nout5=nout4+after nout6=nout5+after do 6120,j=1,nfft r2=zin(1,j,nin3) s2=zin(2,j,nin3) r3=zin(1,j,nin5) s3=zin(2,j,nin5) r=r2 + r3 s=s2 + s3 r1=zin(1,j,nin1) s1=zin(2,j,nin1) ur1 = r + r1 ui1 = s + s1 r1=r1 - .5d0*r s1=s1 - .5d0*s r=r2-r3 s=s2-s3 ur2 = r1 - s*bb ui2 = s1 + r*bb ur3 = r1 + s*bb ui3 = s1 - r*bb r2=zin(1,j,nin6) s2=zin(2,j,nin6) r3=zin(1,j,nin2) s3=zin(2,j,nin2) r=r2 + r3 s=s2 + s3 r1=zin(1,j,nin4) s1=zin(2,j,nin4) vr1 = r + r1 vi1 = s + s1 r1=r1 - .5d0*r s1=s1 - .5d0*s r=r2-r3 s=s2-s3 vr2 = r1 - s*bb vi2 = s1 + r*bb vr3 = r1 + s*bb vi3 = s1 - r*bb zout(1,j,nout1)=ur1+vr1 zout(2,j,nout1)=ui1+vi1 zout(1,j,nout5)=ur2+vr2 zout(2,j,nout5)=ui2+vi2 zout(1,j,nout3)=ur3+vr3 zout(2,j,nout3)=ui3+vi3 zout(1,j,nout4)=ur1-vr1 zout(2,j,nout4)=ui1-vi1 zout(1,j,nout2)=ur2-vr2 zout(2,j,nout2)=ui2-vi2 zout(1,j,nout6)=ur3-vr3 zout(2,j,nout6)=ui3-vi3 6120 continue else stop 'error fftstp' endif return end subroutine zero(n,zmpi) implicit real*8 (a-h,o-z) dimension zmpi(n) do 10,i=1,n 10 zmpi(i)=0.d0 return end subroutine slice(nproc,iproc,m1,m2,m3,n1,n2,n3,md1,md2,md3,nd1,nd2,nd3,zf,zr) ! Subroutine was not tested after modifications! ! Write sliced Fourrier and Real space arrays for plotting purposes implicit real*8 (a-h,o-z) include 'mpif.h' dimension zf(2,md1,md3,md2/nproc),zr(2,nd1,nd2,nd3/nproc) REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:,:,:) :: zrall REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:,:,:) :: zfall allocate(zrall(2,nd1,nd2,nd3),zfall(2,md1,md3,md2)) if (nproc.gt.1) then call MPI_GATHER(zr,2*nd1*nd2*nd3/nproc,MPI_double_precision, & zrall,2*nd1*nd2*nd3/nproc,MPI_double_precision, & 0,MPI_COMM_WORLD,ierr) call MPI_GATHER(zf,2*md1*md2*md3/nproc,MPI_double_precision, & zfall,2*md1*md2*md3/nproc,MPI_double_precision, & 0,MPI_COMM_WORLD,ierr) else call mycopy(2*nd1*nd2*nd3,zr,zrall) call mycopy(2*md1*md2*md3,zf,zfall) endif if (iproc.eq.0) then ! input zr: i3=1 do i2=1,n2 do i1=1,n1 tt=-(-1)**(i1+i2+i3) write(120,'(2(i4),2(x,e12.5))') i1,i2,tt*zrall(1,i1,i2,i3), & tt*zrall(2,i1,i2,i3) enddo write(120,*) ' ' enddo i2=1 do i3=1,n3 do i1=1,n1 tt=-(-1)**(i1+i2+i3) write(130,'(2(i4),2(x,e12.5))') i1,i3,tt*zrall(1,i1,i2,i3), & tt*zrall(2,i1,i2,i3) enddo write(130,*) ' ' enddo i1=1 do i3=1,n3 do i2=1,n2 tt=-(-1)**(i1+i2+i3) write(230,'(2(i4),2(x,e12.5))') i2,i3,tt*zrall(1,i1,i2,i3), & tt*zrall(2,i1,i2,i3) enddo write(230,*) ' ' enddo ! output zf: i3=m3/2+1 do i2=1,m2 do i1=1,m1 write(12,'(2(i4),2(x,e12.5))') & i1,i2,zfall(1,i1,i3,i2),zfall(2,i1,i3,i2) enddo write(12,*) ' ' enddo i2=m2/2+1 do i3=1,m3 do i1=1,m1 write(13,'(2(i4),2(x,e12.5))') & i1,i3,zfall(1,i1,i3,i2),zfall(2,i1,i3,i2) enddo write(13,*) ' ' enddo i1=m1/2+1 do i3=1,m3 do i2=1,m2 write(23,'(2(i4),2(x,e12.5))') & i2,i3,zfall(1,i1,i3,i2),zfall(2,i1,i3,i2) enddo write(23,*) ' ' enddo endif deallocate(zrall,zfall) return end subroutine mycopy(n,x,y) implicit real*8 (a-h,o-z) dimension x(n),y(n) do i=1,n y(i)=x(i) enddo return end